|
4 | 4 |
|
5 | 5 | enable_xorbits = False |
6 | 6 |
|
7 | | - |
8 | 7 | if enable_xorbits: |
9 | 8 | import xorbits.pandas as pd |
10 | 9 | import xorbits.numpy as np |
11 | 10 | import xorbits |
12 | | - xorbits.init(n_worker=1, n_cpu=2) |
| 11 | + xorbits.init() |
13 | 12 | else: |
14 | 13 | import pandas as pd |
15 | 14 | import numpy as np |
@@ -69,7 +68,7 @@ def get_hugging_face_dataset(name): |
69 | 68 | sample_rate_option = st.sidebar.slider('Select sample rate', value=0.05, min_value=0.1, max_value=1.0, step=0.1) |
70 | 69 |
|
71 | 70 | tab0, tab1, tab2, tab3, tab4, tab5 = st.tabs( |
72 | | - ["Introduction", "Junk Data🤖", "Contamination🧹", "Short Documents🌐", "Biased Content🛡️", "Duplication🔍"]) |
| 71 | + ["Introduction", "Junk Data🤖", "Short Documents🌐", "Biased Content🛡️", "Contamination🧹", "Duplication🔍"]) |
73 | 72 | with tab0: |
74 | 73 |
|
75 | 74 | st.markdown( |
@@ -205,7 +204,120 @@ def impurity(text, min_len=10): |
205 | 204 | ) |
206 | 205 |
|
207 | 206 |
|
208 | | -with tab2: |
| 207 | +with tab2: |
| 208 | + st.header('Toxic Content') |
| 209 | + st.markdown(''' |
| 210 | +It is crucial when training language models to screen the pre-training datasets and apply tools |
| 211 | +that exclude toxic content. This practice helps |
| 212 | +prevent the models from exhibiting bias or generating harmful content in downstream applications. |
| 213 | +
|
| 214 | +One approach to addressing this issue is to scan the text for **offensive words**. |
| 215 | +For instance, the creators of the C4 dataset have implemented such a |
| 216 | +filtering mechanism. The following code references the |
| 217 | +[word list](https://github.com/LDNOOBW/List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words/blob/master/en) that they open-sourced. |
| 218 | +
|
| 219 | +The following code utilizes the word list to quantify the "biased content ratio" in the dataset. |
| 220 | +
|
| 221 | + ''') |
| 222 | + |
| 223 | + metrics, code = st.tabs(['Metrics', 'Code']) |
| 224 | + with metrics: |
| 225 | + with st.spinner('Calculating toxic ratio...'): |
| 226 | + df = datasets['train'] |
| 227 | + |
| 228 | + with open('./List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words', 'r') as f: |
| 229 | + lines = f.readlines() |
| 230 | + |
| 231 | + banned_words = [line.rstrip('\n') for line in lines] |
| 232 | + df['banned_words_in_text'] = df['text'].apply(lambda text: [word for word in banned_words if word in text.lower().split()]) |
| 233 | + df['matches'] = df['banned_words_in_text'].apply(lambda words: len(words) > 0) |
| 234 | + total_num_docs = len(df) |
| 235 | + biased_num_docs = df['matches'].sum() |
| 236 | + biased_content_ratio = biased_num_docs / total_num_docs |
| 237 | + col1, col2, col3 = st.columns(3) |
| 238 | + |
| 239 | + col1.metric(label="Total Doc Count", value="%d" % total_num_docs) |
| 240 | + col2.metric(label="Biased Doc Count", value="%d" % biased_num_docs) |
| 241 | + col3.metric(label="Biased Ratio", value="%.2f%%" % (biased_content_ratio * 100)) |
| 242 | + st.dataframe(df[df['matches']][['text', 'banned_words_in_text']][:20]) |
| 243 | + with code: |
| 244 | + st.code( |
| 245 | + ''' |
| 246 | + with open('./List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words', 'r') as f: |
| 247 | + lines = f.readlines() |
| 248 | +
|
| 249 | + banned_words = [line.rstrip('\n') for line in lines] |
| 250 | + df['banned_words_in_text'] = df['text'].apply(lambda text: [word for word in banned_words if word in text.lower().split()]) |
| 251 | + total_num_docs = len(df) |
| 252 | + df['matches'] = df['banned_words_in_text'].apply(lambda words: len(words) > 0) |
| 253 | + biased_num_docs = df['matches'].sum() |
| 254 | + biased_content_ratio = biased_num_docs / total_num_docs |
| 255 | + ''' |
| 256 | + ) |
| 257 | + |
| 258 | + |
| 259 | + |
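For reference, here is a minimal standalone sketch of the same word-list check outside the Streamlit app. It assumes the English list from the linked repository has been saved locally under the path used above, and it substitutes a small hypothetical DataFrame for `datasets['train']`; the whole-token match is done with a set intersection, but it is otherwise the same check as in the code tab.

```python
import pandas as pd

# Hypothetical stand-in for datasets['train']; any DataFrame with a 'text' column works.
df = pd.DataFrame({'text': [
    'A perfectly ordinary sentence about data cleaning.',
    'Another harmless document used for illustration.',
]})

# Assumes the LDNOOBW English word list has been downloaded to this local path.
with open('./List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words', 'r') as f:
    banned_words = set(line.rstrip('\n') for line in f if line.strip())

def banned_words_in(text):
    # Whole-token matches against the lowercased text, equivalent to the
    # `word in text.lower().split()` check above, done via a set intersection.
    return sorted(banned_words & set(text.lower().split()))

df['banned_words_in_text'] = df['text'].apply(banned_words_in)
df['matches'] = df['banned_words_in_text'].apply(bool)
print('Biased content ratio: %.2f%%' % (100 * df['matches'].mean()))
```

As with the app code, only exact whitespace-separated tokens are matched, so punctuation-attached or multi-word entries in the list will not be flagged.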
| 260 | +with tab3: |
| 261 | + st.header("Too-Short Documents") |
| 262 | + |
| 263 | + st.markdown(''' |
| 264 | +The aim of language modeling is to learn to generate text based on preceding tokens. |
| 265 | +Removing extremely short documents (those with fewer than roughly |
| 266 | + 100 tokens) from the corpus can reduce noise and give the model longer stretches of |
| 267 | + contiguous text in which to learn dependencies. |
| 268 | +
|
| 269 | +
|
| 270 | + The code below uses the Hugging Face Transformers library to tokenize the text and then calculates |
| 271 | + the proportion of documents in the dataset that are "too short". This example converts text into tokens with the |
| 272 | + BERT tokenizer; choose a tokenizer that matches your own model. |
| 273 | + ''') |
| 274 | + metrics, code = st.tabs(['Metrics', 'Code']) |
| 275 | + |
| 276 | + with metrics: |
| 277 | + with st.spinner('Calculating too-short ratio...'): |
| 278 | + from transformers import BertTokenizer |
| 279 | + |
| 280 | + tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') |
| 281 | + |
| 282 | + df = datasets['train'] |
| 283 | + # Create a new column with the number of tokens for each text |
| 284 | + df['text_length'] = df['text'].apply(lambda text: len(tokenizer.tokenize(text))) |
| 285 | + total_num_docs = len(df) |
| 286 | + too_short_docs = len(df[df['text_length'] < 100]) |
| 287 | + too_short_doc_ratio = too_short_docs / total_num_docs |
| 288 | + |
| 289 | + col1, col2, col3 = st.columns(3) |
| 290 | + col1.metric(label="Too-Short Doc Count", value="%d" % too_short_docs) |
| 291 | + col2.metric(label="Total Doc Count", value="%d" % total_num_docs) |
| 292 | + col3.metric(label="Too Short Doc Ratio", value="%.2f%%" % (too_short_doc_ratio * 100)) |
| 293 | + |
| 294 | + # col1, _ = st.columns([2, 1]) |
| 295 | + |
| 296 | + # import seaborn as sns |
| 297 | + # import matplotlib.pyplot as plt |
| 298 | + # fig, ax = plt.subplots(figsize=(10, 5)) |
| 299 | + # ax.set_title('Distribution of text length (in tokens)') |
| 300 | + # sns.histplot(data=df, x='text_length', ax=ax) |
| 301 | + # plt.axvline(100, color='r', linestyle='--') |
| 302 | + # col1.pyplot(fig) |
| 303 | + with code: |
| 304 | + st.code( |
| 305 | + ''' |
| 306 | + from transformers import BertTokenizer |
| 307 | +
|
| 308 | + tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') |
| 309 | +
|
| 310 | + df = datasets['train'] |
| 311 | + # Create a new column with the number of tokens for each text |
| 312 | + df['text_length'] = df['text'].apply(lambda text: len(tokenizer.tokenize(text))) |
| 313 | + total_num_docs = len(df) |
| 314 | + too_short_docs = len(df[df['text_length'] < 100]) |
| 315 | + too_short_doc_ratio = too_short_docs / total_num_docs |
| 316 | + ''' |
| 317 | + ) |
| 318 | + |
| 319 | + |
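For reference, a minimal standalone sketch of the same token-length check outside the Streamlit app. It assumes network access to fetch the `bert-base-uncased` tokenizer from the Hugging Face Hub and uses a small hypothetical DataFrame in place of `datasets['train']`.

```python
import pandas as pd
from transformers import BertTokenizer

# Downloads the 'bert-base-uncased' vocabulary from the Hugging Face Hub on first use.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Hypothetical stand-in for datasets['train']; any DataFrame with a 'text' column works.
df = pd.DataFrame({'text': [
    'Short snippet.',
    'A somewhat longer document about data quality. ' * 40,
]})

# Count tokens per document and flag anything under the 100-token threshold.
df['text_length'] = df['text'].apply(lambda text: len(tokenizer.tokenize(text)))
too_short_doc_ratio = (df['text_length'] < 100).mean()
print('Too-short doc ratio: %.2f%%' % (100 * too_short_doc_ratio))
```

The 100-token cutoff mirrors the threshold used in the app; any subword tokenizer can be substituted as long as the threshold is interpreted in that tokenizer's units.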
| 320 | +with tab4: |
209 | 321 | st.header('Contamination') |
210 | 322 |
|
211 | 323 | st.markdown(''' |
@@ -309,117 +421,6 @@ def process_data(df): |
309 | 421 | ''' |
310 | 422 | ) |
311 | 423 |
|
312 | | -with tab3: |
313 | | - st.header("Too-Short Documents") |
314 | | - |
315 | | - st.markdown(''' |
316 | | -The aim of language modeling is to master the generation of text based on preceding tokens. |
317 | | -In this scenario, eliminating extremely brief documents (text consisting of fewer than approximately |
318 | | - 100 tokens) from the corpus could aid in the reduction of noise, by producing contiguous text to |
319 | | - model dependencies within the text. |
320 | | -
|
321 | | -
|
322 | | - Use the Hugging Face Transformers library to tokenize text and then calculate the proportion |
323 | | - of documents that are "too short" in a dataset. This example converts text into tokens that the BERT |
324 | | - model can understand. Choose a tokenizer for your model. |
325 | | - ''') |
326 | | - metrics, code = st.tabs(['Metrics', 'Code']) |
327 | | - |
328 | | - with metrics: |
329 | | - with st.spinner('Calculating too-short ratio...'): |
330 | | - from transformers import BertTokenizer |
331 | | - |
332 | | - tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') |
333 | | - |
334 | | - df = datasets['train'] |
335 | | - # Create a new column with the number of tokens for each text |
336 | | - df['text_length'] = df['text'].apply(lambda text: len(tokenizer.tokenize(text))) |
337 | | - total_num_docs = len(df) |
338 | | - too_short_docs = len(df[df['text_length'] < 100]) |
339 | | - too_short_doc_ratio = too_short_docs / total_num_docs |
340 | | - |
341 | | - col1, col2, col3 = st.columns(3) |
342 | | - col1.metric(label="Too-Short Doc Count", value="%d" % too_short_docs) |
343 | | - col2.metric(label="Total Doc Count", value="%d" % total_num_docs) |
344 | | - col3.metric(label="Too Short Doc Ratio", value="%.2f%%" % (too_short_doc_ratio * 100)) |
345 | | - |
346 | | - # col1, _ = st.columns([2, 1]) |
347 | | - |
348 | | - # import seaborn as sns |
349 | | - # import matplotlib.pyplot as plt |
350 | | - # fig, ax = plt.subplots(figsize=(10, 5)) |
351 | | - # ax.set_title('Distribution of text length (in tokens)') |
352 | | - # sns.histplot(data=df, x='text_length', ax=ax) |
353 | | - # plt.axvline(100, color='r', linestyle='--') |
354 | | - # col1.pyplot(fig) |
355 | | - with code: |
356 | | - st.code( |
357 | | - ''' |
358 | | - from transformers import BertTokenizer |
359 | | -
|
360 | | - tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') |
361 | | -
|
362 | | - df = datasets['train'] |
363 | | - # Create a new column with the number of tokens for each text |
364 | | - df['text_length'] = df['text'].apply(lambda text: len(tokenizer.tokenize(text))) |
365 | | - total_num_docs = len(df) |
366 | | - too_short_docs = len(df[df['text_length'] < 100]) |
367 | | - too_short_doc_ratio = too_short_docs / total_num_docs |
368 | | - ''' |
369 | | - ) |
370 | | - |
371 | | -with tab4: |
372 | | - st.header('Toxic Content') |
373 | | - st.markdown(''' |
374 | | -It is crucial in the training of language models to be vigilant and potentially apply tools |
375 | | -to exclude toxic content from the pre-training datasets. This practice helps to |
376 | | -prevent the models from demonstrating bias or generating detrimental content in subsequent applications. |
377 | | -
|
378 | | -One approach to address this issue is by scanning the text for **offensive words**. |
379 | | -For instance, the creators of the C4 dataset have implemented such a |
380 | | -filtering mechanism. The follow code references this |
381 | | -[word ](https://github.com/LDNOOBW/List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words/blob/master/en) that they open source. |
382 | | -
|
383 | | -The following code utilizes the word list to quantify the "biased content ratio" in the dataset. |
384 | | -
|
385 | | - ''') |
386 | | - |
387 | | - metrics, code = st.tabs(['Metrics', 'Code']) |
388 | | - with metrics: |
389 | | - with st.spinner('Calculating toxic ratio...'): |
390 | | - df = datasets['train'] |
391 | | - |
392 | | - with open('./List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words', 'r') as f: |
393 | | - lines = f.readlines() |
394 | | - |
395 | | - banned_words = [line.rstrip('\n') for line in lines] |
396 | | - df['banned_words_in_text'] = df['text'].apply(lambda text: [word for word in banned_words if word in text.lower().split()]) |
397 | | - df['matches'] = df['banned_words_in_text'].apply(lambda words: len(words) > 0) |
398 | | - total_num_docs = len(df) |
399 | | - biased_num_docs = df['matches'].sum() |
400 | | - biased_content_ratio = biased_num_docs / total_num_docs |
401 | | - col1, col2, col3 = st.columns(3) |
402 | | - |
403 | | - col1.metric(label="Total Doc Count", value="%d" % total_num_docs) |
404 | | - col2.metric(label="Biased Doc Count", value="%d" % biased_num_docs) |
405 | | - col3.metric(label="Biased Ratio", value="%.2f%%" % (biased_content_ratio * 100)) |
406 | | - st.dataframe(df[df['matches']][['text', 'banned_words_in_text']][:20]) |
407 | | - with code: |
408 | | - st.code( |
409 | | - ''' |
410 | | - with open('./List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words', 'r') as f: |
411 | | - lines = f.readlines() |
412 | | -
|
413 | | - banned_words = [line.rstrip('\n') for line in lines] |
414 | | - df['banned_words_in_text'] = df['text'].apply(lambda text: [word for word in banned_words if word in text.lower().split()]) |
415 | | - total_num_docs = len(df) |
416 | | - df['matches'] = df['banned_words_in_text'].apply(lambda words: len(words) > 0) |
417 | | - biased_num_docs = df['matches'].sum() |
418 | | - biased_content_ratio = biased_num_docs / total_num_docs |
419 | | - ''' |
420 | | - ) |
421 | | - |
422 | | - |
423 | 424 |
|
424 | 425 | with tab5: |
425 | 426 | st.header("Duplication") |
|