
Commit 7dd8c7b

change ord
1 parent 89d10bc commit 7dd8c7b

File tree

1 file changed: +116 -115 lines

app.py (116 additions & 115 deletions)

@@ -4,12 +4,11 @@

enable_xorbits = False

-
if enable_xorbits:
    import xorbits.pandas as pd
    import xorbits.numpy as np
    import xorbits
-    xorbits.init(n_worker=1, n_cpu=2)
+    xorbits.init()
else:
    import pandas as pd
    import numpy as np
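
Note on the init change in this hunk: xorbits.init(n_worker=1, n_cpu=2) pins the local cluster to fixed resources, while the bare xorbits.init() leaves worker and CPU counts to Xorbits' defaults. A minimal sketch of the two styles, assuming a local Xorbits install; the toy DataFrame is illustrative and not part of the commit:

    import xorbits
    import xorbits.pandas as pd

    # Old call in this diff: pin the local cluster explicitly.
    # xorbits.init(n_worker=1, n_cpu=2)

    # New call: let Xorbits choose resource defaults for this machine.
    xorbits.init()

    df = pd.DataFrame({"text": ["hello", "world"]})  # toy data, illustrative only
    print(len(df))
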
@@ -69,7 +68,7 @@ def get_hugging_face_dataset(name):
sample_rate_option = st.sidebar.slider('Select sample rate', value=0.05, min_value=0.1, max_value=1.0, step=0.1)

tab0, tab1, tab2, tab3, tab4, tab5 = st.tabs(
-    ["Introduction", "Junk Data🤖", "Contamination🧹", "Short Documents🌐", "Biased Content🛡️", "Duplication🔍"])
+    ["Introduction", "Junk Data🤖", "Short Documents🌐", "Biased Content🛡️", "Contamination🧹", "Duplication🔍"])
with tab0:

    st.markdown(
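
The label reorder above matters because st.tabs returns one container per label, in the order given, so the Nth label is always paired with the Nth `with` block; moving "Contamination🧹" later in the list keeps the labels aligned with the reordered tab bodies further down in this diff. A small sketch of that pairing (label names here are illustrative only):

    import streamlit as st

    # st.tabs returns containers in the same order as its label list, so
    # reordering labels without reordering the `with` blocks would put
    # content under the wrong tab.
    tab_a, tab_b = st.tabs(["First label", "Second label"])
    with tab_a:
        st.write("shown under 'First label'")
    with tab_b:
        st.write("shown under 'Second label'")
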
@@ -205,7 +204,120 @@ def impurity(text, min_len=10):
    )


-with tab2:
+with tab2:
+    st.header('Toxic Content')
+    st.markdown('''
+    It is crucial in the training of language models to be vigilant and potentially apply tools
+    to exclude toxic content from the pre-training datasets. This practice helps to
+    prevent the models from demonstrating bias or generating detrimental content in subsequent applications.
+
+    One approach to address this issue is by scanning the text for **offensive words**.
+    For instance, the creators of the C4 dataset have implemented such a
+    filtering mechanism. The follow code references this
+    [word ](https://github.com/LDNOOBW/List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words/blob/master/en) that they open source.
+
+    The following code utilizes the word list to quantify the "biased content ratio" in the dataset.
+
+    ''')
+
+    metrics, code = st.tabs(['Metrics', 'Code'])
+    with metrics:
+        with st.spinner('Calculating toxic ratio...'):
+            df = datasets['train']
+
+            with open('./List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words', 'r') as f:
+                lines = f.readlines()
+
+            banned_words = [line.rstrip('\n') for line in lines]
+            df['banned_words_in_text'] = df['text'].apply(lambda text: [word for word in banned_words if word in text.lower().split()])
+            df['matches'] = df['banned_words_in_text'].apply(lambda words: len(words) > 0)
+            total_num_docs = len(df)
+            biased_num_docs = df['matches'].sum()
+            biased_content_ratio = biased_num_docs / total_num_docs
+            col1, col2, col3 = st.columns(3)
+
+            col1.metric(label="Total Doc Count", value="%d" % total_num_docs)
+            col2.metric(label="Biased Doc Count", value="%d" % biased_num_docs)
+            col3.metric(label="Biased Ratio", value="%.2f%%" % (biased_content_ratio * 100))
+            st.dataframe(df[df['matches']][['text', 'banned_words_in_text']][:20])
+    with code:
+        st.code(
+            '''
+            with open('./List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words', 'r') as f:
+                lines = f.readlines()
+
+            banned_words = [line.rstrip('\n') for line in lines]
+            df['banned_words_in_text'] = df['text'].apply(lambda text: [word for word in banned_words if word in text.lower().split()])
+            total_num_docs = len(df)
+            df['matches'] = df['banned_words_in_text'].apply(lambda words: len(words) > 0)
+            biased_num_docs = df['matches'].sum()
+            biased_content_ratio = biased_num_docs / total_num_docs
+            '''
+        )
+
+
+
+with tab3:
+    st.header("Too-Short Documents")
+
+    st.markdown('''
+    The aim of language modeling is to master the generation of text based on preceding tokens.
+    In this scenario, eliminating extremely brief documents (text consisting of fewer than approximately
+    100 tokens) from the corpus could aid in the reduction of noise, by producing contiguous text to
+    model dependencies within the text.
+
+
+    Use the Hugging Face Transformers library to tokenize text and then calculate the proportion
+    of documents that are "too short" in a dataset. This example converts text into tokens that the BERT
+    model can understand. Choose a tokenizer for your model.
+    ''')
+    metrics, code = st.tabs(['Metrics', 'Code'])
+
+    with metrics:
+        with st.spinner('Calculating too-short ratio...'):
+            from transformers import BertTokenizer
+
+            tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+
+            df = datasets['train']
+            # Create a new column with the number of tokens for each text
+            df['text_length'] = df['text'].apply(lambda text: len(tokenizer.tokenize(text)))
+            total_num_docs = len(df)
+            too_short_docs = len(df[df['text_length'] < 100])
+            too_short_doc_ratio = too_short_docs / total_num_docs
+
+            col1, col2, col3 = st.columns(3)
+            col1.metric(label="Too-Short Doc Count", value="%d" % too_short_docs)
+            col2.metric(label="Total Doc Count", value="%d" % total_num_docs)
+            col3.metric(label="Too Short Doc Ratio", value="%.2f%%" % (too_short_doc_ratio * 100))
+
+            # col1, _ = st.columns([2, 1])
+
+            # import seaborn as sns
+            # import matplotlib.pyplot as plt
+            # fig, ax = plt.subplots(figsize=(10, 5))
+            # ax.set_title('Distribution of text length (in tokens)')
+            # sns.histplot(data=df, x='text_length', ax=ax)
+            # plt.axvline(100, color='r', linestyle='--')
+            # col1.pyplot(fig)
+    with code:
+        st.code(
+            '''
+            from transformers import BertTokenizer
+
+            tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+
+            df = datasets['train']
+            # Create a new column with the number of tokens for each text
+            df['text_length'] = df['text'].apply(lambda text: len(tokenizer.tokenize(text)))
+            total_num_docs = len(df)
+            too_short_docs = len(df[df['text_length'] < 100])
+            too_short_doc_ratio = too_short_docs / total_num_docs
+            '''
+        )
+
+
+with tab4:
    st.header('Contamination')

    st.markdown('''
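
Outside the Streamlit UI, the biased-content metric added above reduces to a membership test of whitespace tokens against the LDNOOBW word list. A minimal standalone sketch, assuming pandas is installed, the word list is saved to the same local path the app uses, and a toy DataFrame with a 'text' column stands in for datasets['train']; converting the list to a set is an optional speed-up, not part of the commit:

    import pandas as pd

    # Toy stand-in for datasets['train']; illustrative only.
    df = pd.DataFrame({"text": ["a perfectly clean sentence", "another harmless document"]})

    # Word list from github.com/LDNOOBW/List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words
    with open('./List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words', 'r') as f:
        banned_words = set(line.rstrip('\n') for line in f)

    # Same check as the diff (a document matches if any lowercased whitespace token
    # is on the list), iterated over tokens instead of the word list for speed.
    # Punctuation attached to a token prevents a match, exactly as in the app code.
    df['banned_words_in_text'] = df['text'].apply(
        lambda text: [w for w in text.lower().split() if w in banned_words])
    df['matches'] = df['banned_words_in_text'].apply(lambda words: len(words) > 0)

    biased_content_ratio = df['matches'].sum() / len(df)
    print("biased content ratio: %.2f%%" % (biased_content_ratio * 100))
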
@@ -309,117 +421,6 @@ def process_data(df):
            '''
        )

-with tab3:
-    st.header("Too-Short Documents")
-
-    st.markdown('''
-    The aim of language modeling is to master the generation of text based on preceding tokens.
-    In this scenario, eliminating extremely brief documents (text consisting of fewer than approximately
-    100 tokens) from the corpus could aid in the reduction of noise, by producing contiguous text to
-    model dependencies within the text.
-
-
-    Use the Hugging Face Transformers library to tokenize text and then calculate the proportion
-    of documents that are "too short" in a dataset. This example converts text into tokens that the BERT
-    model can understand. Choose a tokenizer for your model.
-    ''')
-    metrics, code = st.tabs(['Metrics', 'Code'])
-
-    with metrics:
-        with st.spinner('Calculating too-short ratio...'):
-            from transformers import BertTokenizer
-
-            tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-
-            df = datasets['train']
-            # Create a new column with the number of tokens for each text
-            df['text_length'] = df['text'].apply(lambda text: len(tokenizer.tokenize(text)))
-            total_num_docs = len(df)
-            too_short_docs = len(df[df['text_length'] < 100])
-            too_short_doc_ratio = too_short_docs / total_num_docs
-
-            col1, col2, col3 = st.columns(3)
-            col1.metric(label="Too-Short Doc Count", value="%d" % too_short_docs)
-            col2.metric(label="Total Doc Count", value="%d" % total_num_docs)
-            col3.metric(label="Too Short Doc Ratio", value="%.2f%%" % (too_short_doc_ratio * 100))
-
-            # col1, _ = st.columns([2, 1])
-
-            # import seaborn as sns
-            # import matplotlib.pyplot as plt
-            # fig, ax = plt.subplots(figsize=(10, 5))
-            # ax.set_title('Distribution of text length (in tokens)')
-            # sns.histplot(data=df, x='text_length', ax=ax)
-            # plt.axvline(100, color='r', linestyle='--')
-            # col1.pyplot(fig)
-    with code:
-        st.code(
-            '''
-            from transformers import BertTokenizer
-
-            tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-
-            df = datasets['train']
-            # Create a new column with the number of tokens for each text
-            df['text_length'] = df['text'].apply(lambda text: len(tokenizer.tokenize(text)))
-            total_num_docs = len(df)
-            too_short_docs = len(df[df['text_length'] < 100])
-            too_short_doc_ratio = too_short_docs / total_num_docs
-            '''
-        )
-
-with tab4:
-    st.header('Toxic Content')
-    st.markdown('''
-    It is crucial in the training of language models to be vigilant and potentially apply tools
-    to exclude toxic content from the pre-training datasets. This practice helps to
-    prevent the models from demonstrating bias or generating detrimental content in subsequent applications.
-
-    One approach to address this issue is by scanning the text for **offensive words**.
-    For instance, the creators of the C4 dataset have implemented such a
-    filtering mechanism. The follow code references this
-    [word ](https://github.com/LDNOOBW/List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words/blob/master/en) that they open source.
-
-    The following code utilizes the word list to quantify the "biased content ratio" in the dataset.
-
-    ''')
-
-    metrics, code = st.tabs(['Metrics', 'Code'])
-    with metrics:
-        with st.spinner('Calculating toxic ratio...'):
-            df = datasets['train']
-
-            with open('./List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words', 'r') as f:
-                lines = f.readlines()
-
-            banned_words = [line.rstrip('\n') for line in lines]
-            df['banned_words_in_text'] = df['text'].apply(lambda text: [word for word in banned_words if word in text.lower().split()])
-            df['matches'] = df['banned_words_in_text'].apply(lambda words: len(words) > 0)
-            total_num_docs = len(df)
-            biased_num_docs = df['matches'].sum()
-            biased_content_ratio = biased_num_docs / total_num_docs
-            col1, col2, col3 = st.columns(3)
-
-            col1.metric(label="Total Doc Count", value="%d" % total_num_docs)
-            col2.metric(label="Biased Doc Count", value="%d" % biased_num_docs)
-            col3.metric(label="Biased Ratio", value="%.2f%%" % (biased_content_ratio * 100))
-            st.dataframe(df[df['matches']][['text', 'banned_words_in_text']][:20])
-    with code:
-        st.code(
-            '''
-            with open('./List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words', 'r') as f:
-                lines = f.readlines()
-
-            banned_words = [line.rstrip('\n') for line in lines]
-            df['banned_words_in_text'] = df['text'].apply(lambda text: [word for word in banned_words if word in text.lower().split()])
-            total_num_docs = len(df)
-            df['matches'] = df['banned_words_in_text'].apply(lambda words: len(words) > 0)
-            biased_num_docs = df['matches'].sum()
-            biased_content_ratio = biased_num_docs / total_num_docs
-            '''
-        )
-
-

with tab5:
    st.header("Duplication")
