Skip to content

Commit 61c9c8b

Browse files
committed
smaller sample size
1 parent 1b561d3 commit 61c9c8b

File tree

1 file changed

+2
-5
lines changed

1 file changed

+2
-5
lines changed

app.py

Lines changed: 2 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -5,13 +5,11 @@
55
enable_xorbits = False
66

77
if enable_xorbits:
8-
import xorbits.pandas as pd
9-
import xorbits.numpy as np
108
import xorbits
119
xorbits.init()
10+
import xorbits.pandas as pd
1211
else:
1312
import pandas as pd
14-
import numpy as np
1513

1614
st.set_page_config(page_title="Analyzing Text Corpus on Hugging Face", page_icon=":bar_chart:", layout="wide")
1715
st.sidebar.title('A Tool for Analyzing Text Corpus on Hugging Face')
@@ -65,7 +63,7 @@ def get_hugging_face_dataset(name):
6563
hf_datasets = get_hugging_face_dataset(dataset_name)
6664
subsets = set([x['config'] for x in hf_datasets['parquet_files']])
6765
subset_option = st.sidebar.selectbox("Choose a subset", subsets)
68-
sample_rate_option = st.sidebar.slider('Select sample rate', value=0.05, min_value=0.1, max_value=1.0, step=0.1)
66+
sample_rate_option = st.sidebar.slider('Select sample rate', value=0.01, min_value=0.1, max_value=1.0, step=0.1)
6967

7068
tab0, tab1, tab2, tab3, tab4, tab5 = st.tabs(
7169
["Introduction", "Junk Data🤖", "Biased Content🛡️", "Short Documents🌐", "Contamination🧹", "Duplication🔍"])
@@ -159,7 +157,6 @@ def get_hugging_face_dataset(name):
159157

160158
with st.spinner('Calculating impurity ratio...'):
161159
df = datasets['train']
162-
163160
import re
164161
RE_SUSPICIOUS = re.compile(r'[&#<>{}\[\]\\]')
165162

0 commit comments

Comments
 (0)