Commit 92f6688

add sample
1 parent 61c9c8b commit 92f6688

File tree

1 file changed (+22, -19 lines)

app.py

Lines changed: 22 additions & 19 deletions
```diff
@@ -1,15 +1,7 @@
 import streamlit as st
 import requests
-import os
 
-enable_xorbits = False
-
-if enable_xorbits:
-    import xorbits
-    xorbits.init()
-    import xorbits.pandas as pd
-else:
-    import pandas as pd
+enable_xorbits = True
 
 st.set_page_config(page_title="Analyzing Text Corpus on Hugging Face", page_icon=":bar_chart:", layout="wide")
 st.sidebar.title('A Tool for Analyzing Text Corpus on Hugging Face')
```
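
The deleted top-level import block is not gone: it reappears inside `load_dataset` in the next hunk, and the flag it checks is now `True` by default. A minimal sketch of the flag-gated backend swap, assuming xorbits is installed (`xorbits.pandas` mirrors the pandas API, so code downstream of the import works unchanged):

```python
# Flag-gated backend selection: xorbits.pandas is API-compatible with
# pandas, so either branch binds a usable `pd`. Assumes xorbits is installed.
enable_xorbits = True

if enable_xorbits:
    import xorbits
    xorbits.init()               # start the local Xorbits runtime
    import xorbits.pandas as pd  # drop-in replacement for pandas
else:
    import pandas as pd

# The same call works against either backend.
df = pd.DataFrame({"text": ["alpha", "beta", "gamma"]})
print(len(df))
```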
```diff
@@ -25,28 +17,39 @@
 
 #@st.cache_data
 def load_dataset(j, name, fraction):
+    import os
+
+    if enable_xorbits:
+        import xorbits
+        xorbits.init()
+        import xorbits.pandas as pd
+    else:
+        import pandas as pd
 
-    if not os.path.exists('train.gzip'):
+    if not os.path.exists('%s-train.gzip' % name):
         with st.spinner('Downloading file from remote server'):
             import pandas
             train_urls = [f['url'] for f in j['parquet_files'] if f['config'] == name and f['split'] == 'train']
             train_dataset = pandas.concat([pandas.read_parquet(url, engine='pyarrow') for url in train_urls], ignore_index=True)
-            train_dataset.to_parquet('train.gzip')
+            train_dataset.to_parquet('%s-train.gzip' % name)
 
-    if not os.path.exists('test.gzip'):
+    if not os.path.exists('%s-test.gzip' % name):
         with st.spinner('Downloading file from remote server'):
             import pandas
             test_urls = [f['url'] for f in j['parquet_files'] if f['config'] == name and f['split'] == 'validation']
             test_dataset = pandas.concat([pandas.read_parquet(url, engine='pyarrow') for url in test_urls], ignore_index=True)
-            test_dataset.to_parquet('test.gzip')
+            test_dataset.to_parquet('%s-test.gzip' % name)
 
-    train_dataset = pd.read_parquet('train.gzip', engine='pyarrow')
+    train_dataset = pd.read_parquet('%s-train.gzip' % name, engine='pyarrow')
+    test_dataset = pd.read_parquet('%s-test.gzip' % name, engine='pyarrow')
 
-    test_dataset = pd.read_parquet('test.gzip', engine='pyarrow')
+    if enable_xorbits:
+        train_dataset.rebalance()
+        test_dataset.rebalance()
 
     dataset = {
-        "train": train_dataset[:int(len(train_dataset)*fraction)],
-        "test": test_dataset[:int(len(test_dataset)*fraction)],
+        "train": train_dataset.sample(frac=fraction),
+        "test": test_dataset.sample(frac=fraction),
     }
 
     return dataset
```
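
Three separate changes land in this hunk: the parquet cache is keyed by config name (`'%s-train.gzip' % name`), so switching datasets no longer reads back a stale shared `train.gzip`; the Xorbits frames get a `rebalance()` call (an Xorbits-specific method which, as I understand it, redistributes data across workers); and the returned subsets switch from a prefix slice to `sample(frac=...)`, a random fraction rather than the first rows. A small sketch of the slicing change, shown with plain pandas since `DataFrame.sample` takes the same `frac` argument on both backends:

```python
# Old vs. new subset selection, sketched with plain pandas; DataFrame.sample
# has the same signature in xorbits.pandas, so the new call is backend-neutral.
import pandas as pd

df = pd.DataFrame({"text": [f"doc {i}" for i in range(10)]})
fraction = 0.3

prefix = df[:int(len(df) * fraction)]  # old: always the first 30% of rows
sampled = df.sample(frac=fraction)     # new: a random 30% of rows

print(len(prefix), len(sampled))       # 3 3
```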
```diff
@@ -351,9 +354,9 @@ def impurity(text, min_len=10):
 
 def process_data(df):
     minhashes = {}
-    for idx, r in df.iterrows():
+    for idx, text in enumerate(df['text']):
         minhash = MinHash(num_perm=128)
-        for d in ngrams(r['text'], 13):
+        for d in ngrams(text, 13):
             s = "".join(d).encode('utf-8')
             minhash.update(s)
         minhashes[idx] = minhash
```
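
Swapping `df.iterrows()` for `enumerate(df['text'])` touches only the one column the loop uses and avoids building a Series per row; it also sidesteps row iteration, which an xorbits.pandas frame may not support the way pandas does. A self-contained sketch of the loop, assuming `MinHash` is datasketch's and `ngrams` is `nltk.util`'s (neither import appears in this diff):

```python
# Character-level MinHash fingerprints, one per document. Assumes the
# surrounding file uses datasketch.MinHash and nltk.util.ngrams, since the
# diff does not show those imports.
from datasketch import MinHash
from nltk.util import ngrams

texts = ["the quick brown fox jumps over the lazy dog"]  # stand-in corpus
minhashes = {}

for idx, text in enumerate(texts):       # plain iteration, no iterrows()
    minhash = MinHash(num_perm=128)
    for d in ngrams(text, 13):           # sliding 13-character shingles
        minhash.update("".join(d).encode('utf-8'))
    minhashes[idx] = minhash

# Identical documents hash to identical signatures.
print(minhashes[0].jaccard(minhashes[0]))  # 1.0
```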
