 enable_xorbits = False

 if enable_xorbits:
-    import xorbits.pandas as pd
-    import xorbits.numpy as np
     import xorbits
     xorbits.init()
+    import xorbits.pandas as pd
 else:
     import pandas as pd
-    import numpy as np

 st.set_page_config(page_title="Analyzing Text Corpus on Hugging Face", page_icon=":bar_chart:", layout="wide")
 st.sidebar.title('A Tool for Analyzing Text Corpus on Hugging Face')
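After this hunk, the module conditionally imports either Xorbits' pandas-compatible API or plain pandas. A minimal standalone sketch of that end state, reconstructed from the hunk rather than copied verbatim from the file:

enable_xorbits = False

if enable_xorbits:
    # Start the Xorbits runtime first, then use its pandas-compatible DataFrame API.
    import xorbits
    xorbits.init()
    import xorbits.pandas as pd
else:
    # Fall back to plain pandas when Xorbits is disabled.
    import pandas as pd

Either branch binds the same name pd, so the rest of the app can call the usual pandas API unchanged.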
@@ -65,7 +63,7 @@ def get_hugging_face_dataset(name): |
 hf_datasets = get_hugging_face_dataset(dataset_name)
 subsets = set([x['config'] for x in hf_datasets['parquet_files']])
 subset_option = st.sidebar.selectbox("Choose a subset", subsets)
-sample_rate_option = st.sidebar.slider('Select sample rate', value=0.05, min_value=0.1, max_value=1.0, step=0.1)
+sample_rate_option = st.sidebar.slider('Select sample rate', value=0.01, min_value=0.1, max_value=1.0, step=0.1)

 tab0, tab1, tab2, tab3, tab4, tab5 = st.tabs(
     ["Introduction", "Junk Data🤖", "Biased Content🛡️", "Short Documents🌐", "Contamination🧹", "Duplication🔍"])
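The slider's default drops from 0.05 to 0.01 here; note that with min_value=0.1 the new default sits below the slider's own minimum, which Streamlit normally rejects with an exception at runtime. Downstream, a sample rate like this is typically applied by subsampling the loaded split before the per-tab analyses run; a minimal sketch, assuming the split is held in a DataFrame named df (that name is an assumption, not taken from the file):

# Hypothetical use of the selected sample rate; `df` is assumed to hold the loaded split.
sampled = df.sample(frac=sample_rate_option, random_state=42)

DataFrame.sample(frac=...) is standard pandas, and xorbits.pandas aims to mirror the same API, so the sketch applies to either import branch.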
@@ -159,7 +157,6 @@ def get_hugging_face_dataset(name): |

 with st.spinner('Calculating impurity ratio...'):
     df = datasets['train']
-
     import re
     RE_SUSPICIOUS = re.compile(r'[&#<>{}\[\]\\]')

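RE_SUSPICIOUS matches characters that usually signal markup or scraping leftovers (&, #, angle brackets, braces, square brackets, backslashes). A common way to turn it into the impurity ratio the spinner refers to is to score each document by its share of suspicious characters; the sketch below assumes the text lives in a column named 'text' and is an illustration, not the file's actual code:

# Sketch of a per-document impurity score: suspicious characters / total characters.
# The 'text' column name and the helper itself are assumptions for illustration.
def impurity(text, min_len=10):
    """Share of suspicious characters; 0.0 for missing or very short text."""
    if text is None or len(text) < min_len:
        return 0.0
    return len(RE_SUSPICIOUS.findall(text)) / len(text)

df['impurity'] = df['text'].apply(impurity)

From there the app could report, for example, the mean score or the fraction of documents above a chosen threshold.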
|
|