@@ -1,15 +1,7 @@
 import streamlit as st
 import requests
-import os
 
-enable_xorbits = False
-
-if enable_xorbits:
-    import xorbits
-    xorbits.init()
-    import xorbits.pandas as pd
-else:
-    import pandas as pd
+enable_xorbits = True
 
 st.set_page_config(page_title="Analyzing Text Corpus on Hugging Face", page_icon=":bar_chart:", layout="wide")
 st.sidebar.title('A Tool for Analyzing Text Corpus on Hugging Face')
@@ -25,28 +17,39 @@
 
 #@st.cache_data
 def load_dataset(j, name, fraction):
+    import os
+
+    if enable_xorbits:
+        import xorbits
+        xorbits.init()
+        import xorbits.pandas as pd
+    else:
+        import pandas as pd
 
-    if not os.path.exists('train.gzip'):
+    if not os.path.exists('%s-train.gzip' % name):
         with st.spinner('Downloading file from remote server'):
             import pandas
             train_urls = [f['url'] for f in j['parquet_files'] if f['config'] == name and f['split'] == 'train']
             train_dataset = pandas.concat([pandas.read_parquet(url, engine='pyarrow') for url in train_urls], ignore_index=True)
-            train_dataset.to_parquet('train.gzip')
+            train_dataset.to_parquet('%s-train.gzip' % name)
 
-    if not os.path.exists('test.gzip'):
+    if not os.path.exists('%s-test.gzip' % name):
        with st.spinner('Downloading file from remote server'):
            import pandas
            test_urls = [f['url'] for f in j['parquet_files'] if f['config'] == name and f['split'] == 'validation']
            test_dataset = pandas.concat([pandas.read_parquet(url, engine='pyarrow') for url in test_urls], ignore_index=True)
-            test_dataset.to_parquet('test.gzip')
+            test_dataset.to_parquet('%s-test.gzip' % name)
 
-    train_dataset = pd.read_parquet('train.gzip', engine='pyarrow')
+    train_dataset = pd.read_parquet('%s-train.gzip' % name, engine='pyarrow')
+    test_dataset = pd.read_parquet('%s-test.gzip' % name, engine='pyarrow')
 
-    test_dataset = pd.read_parquet('test.gzip', engine='pyarrow')
+    if enable_xorbits:
+        train_dataset = train_dataset.rebalance()
+        test_dataset = test_dataset.rebalance()
 
     dataset = {
-        "train": train_dataset[:int(len(train_dataset)*fraction)],
-        "test": test_dataset[:int(len(test_dataset)*fraction)],
+        "train": train_dataset.sample(frac=fraction),
+        "test": test_dataset.sample(frac=fraction),
     }
 
     return dataset
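
For context, load_dataset consumes the JSON returned by the Hugging Face datasets-server parquet listing (the j['parquet_files'] entries filtered above). A minimal usage sketch follows; the endpoint URL and the blog_authorship_corpus dataset/config name are illustrative assumptions, not part of this commit:

import requests

# Assumed example dataset; any dataset exposing train/validation parquet files works.
dataset_name = "blog_authorship_corpus"
resp = requests.get("https://datasets-server.huggingface.co/parquet",
                    params={"dataset": dataset_name})
j = resp.json()  # provides the 'parquet_files' list that load_dataset filters by config and split

# Load 10% of the train and validation splits through the function above.
dataset = load_dataset(j, name=dataset_name, fraction=0.1)
print(len(dataset["train"]), len(dataset["test"]))
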
@@ -351,9 +354,9 @@ def impurity(text, min_len=10): |
 
 def process_data(df):
     minhashes = {}
-    for idx, r in df.iterrows():
+    for idx, text in enumerate(df['text']):
         minhash = MinHash(num_perm=128)
-        for d in ngrams(r['text'], 13):
+        for d in ngrams(text, 13):
             s = "".join(d).encode('utf-8')
             minhash.update(s)
         minhashes[idx] = minhash
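
The per-row MinHash signatures collected above are the usual input for near-duplicate detection. A minimal sketch of how they could be indexed and queried with datasketch's MinHashLSH, assuming minhashes is the dict built by process_data (this commit does not show how the signatures are consumed):

from datasketch import MinHashLSH

# Index every row's MinHash; 0.85 is an arbitrary example Jaccard threshold.
lsh = MinHashLSH(threshold=0.85, num_perm=128)
for idx, minhash in minhashes.items():
    lsh.insert(str(idx), minhash)

# Rows whose 13-gram MinHash signatures are near-identical to some other row.
duplicates = {idx: [m for m in lsh.query(mh) if m != str(idx)]
              for idx, mh in minhashes.items()}
duplicates = {k: v for k, v in duplicates.items() if v}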