From 15c53ebe903bedec04b3fe4971188e0535a00c96 Mon Sep 17 00:00:00 2001 From: Sahan Paliskara Date: Mon, 23 Jun 2025 16:45:07 -0400 Subject: [PATCH 1/9] Add deduplication logic --- README.md | 15 ++ dedup.py | 398 +++++++++++++++++++++++++++++++++++++++++++++++++ export.py | 28 +++- test_dedup.py | 402 ++++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 842 insertions(+), 1 deletion(-) create mode 100644 dedup.py create mode 100644 test_dedup.py diff --git a/README.md b/README.md index 54fe6b8..0113e64 100644 --- a/README.md +++ b/README.md @@ -34,3 +34,18 @@ python export.py The script will create a directory at the specified output path containing the dataset in Parquet format. If `--output_dir` is not provided, it will save to `dataset` in the current working directory. +## Tests +The deduplication scripts can be tested by running +```bash +python test_dedup.py +# if you have pytest you can run +python -m pytest test_dedup.py -v +``` +To test things we actually create a fake dataset. Here are the features of it +The test creates a 50-entry dataset with: +- **Exact duplicates**: First 5 entries use identical code +- **Fuzzy duplicates**: Next 5 entries use similar code with small variations +- **Multiple run modes**: `leaderboard`, `test`, `benchmark` +- **Mixed success states**: Both `True` and `False` values for `run_passed` +- **Realistic struct data**: Complex nested structures for `run_result`, `run_compilation`, `run_meta`, and `run_system_info` +- **Proper timestamps**: All timestamp fields include timezone information diff --git a/dedup.py b/dedup.py new file mode 100644 index 0000000..b316e38 --- /dev/null +++ b/dedup.py @@ -0,0 +1,398 @@ +# script to dedup a huggingface dataset + +from datasets import load_dataset +import tqdm +from collections import defaultdict +import hashlib +from typing import Dict, List, Tuple, Union + +import datasketch +import pandas as pd + +def remove_duplicates(data_dict: Dict[str, Dict[bool, Dict[Union[float, int], List[Dict]]]]): + """ + Remove exact duplicates from the nested data structure returned by get_sorted_hf_data. 
+ + Args: + data_dict: Nested dictionary structure from get_sorted_hf_data + + Returns: + Dictionary with same structure but duplicates removed + """ + deduplicated_dict = {} + + for run_mode, score_duration_dict in tqdm.tqdm(data_dict.items(), desc="Processing run modes"): + deduplicated_dict[run_mode] = {} + + for run_success, run_success_dict in tqdm.tqdm(score_duration_dict.items(), desc=f"Processing {run_mode}", leave=False): + deduplicated_dict[run_mode][run_success] = {} + for score_duration, rows in tqdm.tqdm(run_success_dict.items(), desc=f"Processing {run_mode}", leave=False): + # Use a dictionary to track unique entries by their content hash + unique_entries = {} + + for row in tqdm.tqdm(rows, desc=f"Processing {run_mode} {score_duration}", leave=False): + # Create a hash of the relevant content (assuming 'input' or similar field exists) + # If the row has an 'input' field, use that; otherwise use the entire row + content = row.get('code', "") + content_hash = hashlib.sha256(content.encode()).hexdigest() + + if content_hash not in unique_entries: + unique_entries[content_hash] = row + else: + # If duplicate found, keep the one with better metrics + existing_row = unique_entries[content_hash] + + # For leaderboard mode with successful runs, prefer higher scores + if run_mode == 'leaderboard' and row.get('run_passed') == True: + if row.get('run_score', 0) > existing_row.get('run_score', 0): + unique_entries[content_hash] = row + # For other cases, prefer shorter duration (faster execution) + else: + existing_duration = existing_row.get('run_meta', {}).get('duration', float('inf')) + current_duration = row.get('run_meta', {}).get('duration', float('inf')) + if current_duration < existing_duration: + unique_entries[content_hash] = row + + deduplicated_dict[run_mode][run_success][score_duration] = list(unique_entries.values()) + + return deduplicated_dict + + +def create_minhashes( + documents: List[Dict[str, str]], + ngram_size: int = 5, + bands: int = 20, + rows_per_band: int = 128, +) -> Tuple[Dict[str, datasketch.MinHash], int]: + """ + Create MinHash signatures for a list of documents with LSH bands configuration. 
+ + Args: + documents: List of dictionaries, each containing 'submission_id' and 'input' keys + num_permutations: Number of hash functions to use (default: 100) + ngram_size: Size of n-grams to generate from input text (default: 3) + bands: Number of bands for LSH (default: 20) + + Returns: + Tuple containing: + - Dictionary mapping document submission_ids to their MinHash signatures + - Rows per band (num_permutations / bands) + + Raises: + ValueError: If num_permutations is not divisible by bands + """ + + num_permutations = rows_per_band * bands + + def generate_ngrams(text: str, n: int) -> List[str]: + """Generate n-grams from input text.""" + return [text[i : i + n] for i in range(len(text) - n + 1)] + + # Initialize result dictionary + minhash_dict = {} + # Process each document + for doc in tqdm.tqdm(documents, desc="Creating minhashes"): + minhash = datasketch.MinHash(num_perm=num_permutations) + submission_id = doc["submission_id"] + text = doc["code"].lower() # Convert to lowercase for consistency + + # Generate n-grams + ngrams = generate_ngrams(text, ngram_size) + for ngram in ngrams: + minhash.update(ngram.encode("utf8")) + + minhash_dict[submission_id] = minhash + + return minhash_dict + + +# 16 bands with 128 rows +def create_similarity_matrix( + minhashes: Dict[str, datasketch.MinHash], + rows_per_band: int, + num_bands: int, + threshold: float, +) -> Dict[str, List[str]]: + lsh = datasketch.MinHashLSH(threshold=threshold, num_perm=num_bands * rows_per_band) + print(f"num_perm: {num_bands*rows_per_band}") + similarity_matrix = {} + for submission_id, minhash in tqdm.tqdm(minhashes.items(), desc="Inserting minhashes into LSH"): + lsh.insert(submission_id, minhash) + for submission_id, minhash in tqdm.tqdm(minhashes.items(), desc="Querying LSH"): + similar_submission_ids = lsh.query(minhash) + similarity_matrix[submission_id] = similar_submission_ids + for submission_id, similar_submission_ids in tqdm.tqdm( + similarity_matrix.items(), desc="Removing self-similarities" + ): + if submission_id in similar_submission_ids: + similar_submission_ids.remove(submission_id) + return similarity_matrix + + +def filter_matrix( + similarity_matrix: Dict[str, List[str]] +) -> set: + good_submission_ids = set() + processed = set() + + for submission_id, similar_submission_ids in similarity_matrix.items(): + if submission_id in processed: + continue + + # Find all submissions in the similarity cluster + cluster = {submission_id} + cluster.update(similar_submission_ids) + + # Keep the one with the largest ID (tiebreaker) + keeper = max(cluster) + good_submission_ids.add(keeper) + + # Mark all in cluster as processed + processed.update(cluster) + + return good_submission_ids + + +def fuzzy_filter( + data_dict: Dict[str, Dict[bool, Dict[Union[float, int], List[Dict]]]], + threshold: float = 0.7, + ngram_size: int = 5, + bands: int = 16, + rows_per_band: int = 128, +) -> Dict[str, Dict[bool, Dict[Union[float, int], List[Dict]]]]: + + total_categories = 0 + for run_mode, run_success_dict in data_dict.items(): + for run_success, score_duration_dict in run_success_dict.items(): + for score_duration, rows in score_duration_dict.items(): + total_categories += 1 + + deduped_data = defaultdict(lambda: defaultdict(lambda: defaultdict(list))) + current_category = 0 + for run_mode, run_success_dict in data_dict.items(): + for run_success, score_duration_dict in run_success_dict.items(): + for score_duration, rows in score_duration_dict.items(): + print(f"Processing {run_mode} {run_success} 
{score_duration} {len(rows)}") + print(f"This is {current_category} of {total_categories}") + current_category += 1 + deduped_data[run_mode][run_success][score_duration] = _fuzzy_filter(rows, threshold, ngram_size, bands, rows_per_band) + + return deduped_data + +def _fuzzy_filter( + data_list: List[Dict], + threshold: float = 0.7, + ngram_size: int = 5, + bands: int = 16, + rows_per_band: int = 128, +) -> List[Dict]: + """ + Apply fuzzy deduplication to the nested data structure returned by get_sorted_hf_data. + + Args: + data_dict: Nested dictionary structure from get_sorted_hf_data + threshold: Similarity threshold for LSH + ngram_size: Size of n-grams for MinHash + bands: Number of bands for LSH + rows_per_band: Rows per band for LSH + create_histogram: Whether to create similarity histogram + + Returns: + Dictionary with same structure but fuzzy duplicates removed + """ + # Flatten the data for processing + + # Create documents for MinHash processing + + if len(data_list) <= 1: + return data_list + + all_documents = [] + for i, row in tqdm.tqdm(enumerate(data_list), desc="Creating documents for MinHash"): + # Use 'input' field if available, otherwise use a string representation + content = row.get('code', str(row)) + document = { + "submission_id": str(i), + "code": content, + "original_row": row + } + all_documents.append(document) + + # Apply fuzzy deduplication + minhashes = create_minhashes( + all_documents, ngram_size=ngram_size, bands=bands, rows_per_band=rows_per_band + ) + similarity_matrix = create_similarity_matrix( + minhashes, rows_per_band=rows_per_band, num_bands=bands, threshold=threshold + ) + + good_submission_ids = filter_matrix(similarity_matrix) + + # Keep only the documents that passed the filter + good_documents = [all_documents[int(submission_id)]["original_row"] for submission_id in good_submission_ids] + + # Reconstruct the nested structure + return good_documents + +def get_hf_data() -> Dict[str, Dict[Union[float, int], List[Dict]]]: + # Login using e.g. 
`huggingface-cli login` to access this dataset + ds = load_dataset("GPUMODE/kernelbot-data", "submissions") + + # we should divide things up into type + # run_mode + # run_sucess + # if run_mode is leaderboard then use score + # otherwise use run_meta[duration] + + + data = ds['train'] + + run_mode_dict = defaultdict(list) + run_success_dict = defaultdict(lambda: defaultdict(list)) + run_duration_dict = defaultdict(lambda: defaultdict(lambda: defaultdict(list))) + + for _, row in tqdm.tqdm(enumerate(data), desc="Processing dataset rows"): + run_mode = row['run_mode'] + run_mode_dict[run_mode].append(row) + + for run_mode, rows in tqdm.tqdm(run_mode_dict.items(), desc="Processing run modes"): + for row in tqdm.tqdm(rows, desc=f"Processing {run_mode} success/failure", leave=False): + run_success_dict[run_mode][row['run_passed']].append(row) + + for run_mode, mode_dict in tqdm.tqdm(run_success_dict.items(), desc="Processing success/failure groups"): + for run_success, rows in tqdm.tqdm(mode_dict.items(), desc=f"Processing {run_mode}", leave=False): + for row in tqdm.tqdm(rows, desc=f"Processing {run_mode} {run_success} rows", leave=False): + if run_mode == 'leaderboard' and run_success == True: + rounded_score = round(float(row['run_score']), 4) + run_duration_dict[run_mode][run_success][rounded_score].append(row) + else: + rounded_duration = round(float(row['run_meta']['duration']), 0) + run_duration_dict[run_mode][run_success][rounded_duration].append(row) + + return run_duration_dict + +def convert_df_to_dict(df: pd.DataFrame) -> Dict[str, Dict[bool, Dict[Union[float, int], List[Dict]]]]: + """ + Convert a pandas DataFrame to a nested dictionary structure. + + Args: + df: pandas DataFrame + + Returns: + Nested dictionary structure + """ + data_dict = defaultdict(lambda: defaultdict(lambda: defaultdict(list))) + for _, row in tqdm.tqdm(df.iterrows(), desc="Processing DataFrame rows"): + run_mode = row['run_mode'] + run_success = row['run_passed'] + score_duration = row['run_meta']['duration'] + data_dict[run_mode][run_success][score_duration].append(row) + return data_dict + +def flatten_data(data_dict: Dict[str, Dict[Union[float, int], List[Dict]]]) -> List[Dict]: + """ + Flatten the nested data structure to a list of documents with metadata. + + Args: + data_dict: Nested dictionary structure from get_sorted_hf_data + + Returns: + List of documents with additional metadata fields + """ + flattened = [] + for run_mode, run_success_dict in tqdm.tqdm(data_dict.items(), desc="Flattening data"): + for run_success, score_duration_dict in run_success_dict.items(): + for score_duration, rows in score_duration_dict.items(): + for row in tqdm.tqdm(rows, desc=f"Processing {run_mode} {score_duration}", leave=False): + # Add metadata to each row + row_with_metadata = row.copy() + row_with_metadata['_run_mode'] = run_mode + row_with_metadata['_run_success'] = run_success + row_with_metadata['_score_duration'] = score_duration + flattened.append(row_with_metadata) + return flattened + +def count_items(data_dict: Dict[str, Dict[bool, Dict[Union[float, int], List[Dict]]]]) -> int: + """ + Count total number of items in the nested data structure. 
+ + Args: + data_dict: Nested dictionary structure from get_sorted_hf_data + + Returns: + Total number of items + """ + total = 0 + for run_mode in data_dict.values(): + for run_success_dict in run_mode.values(): + for rows in run_success_dict.values(): + total += len(rows) + return total + + +def example_usage(): + """ + Example of how to use the deduplication functions with get_hf_data output. + """ + # Load the data + data = get_hf_data() + + print(f"Original data has {count_items(data)} total items") + + # Remove exact duplicates + deduplicated_data = remove_duplicates(data) + print(f"After exact deduplication: {count_items(deduplicated_data)} items") + + # Apply fuzzy deduplication + fuzzy_deduplicated_data = fuzzy_filter( + deduplicated_data, + threshold=0.8, # High threshold for more strict deduplication + ngram_size=5, + bands=16, + rows_per_band=128 + ) + # convert to df + flattened_data = flatten_data(fuzzy_deduplicated_data) + df = pd.DataFrame(flattened_data) + + return df + +def dedup_df(df: pd.DataFrame) -> pd.DataFrame: + """ + Deduplicate a pandas DataFrame. + + Args: + df: pandas DataFrame + """ + # convert to dict + data_dict = convert_df_to_dict(df) + # deduplicate + deduplicated_data = fuzzy_filter(data_dict, threshold=0.8, ngram_size=5, bands=16, rows_per_band=128) + # convert to df + flattened_data = flatten_data(deduplicated_data) + df = pd.DataFrame(flattened_data) + return df + +def create_parquet_file(data_dict: Dict[str, Dict[Union[float, int], List[Dict]]], filename: str): + """ + Create a Parquet file from the nested data structure. + + Args: + data_dict: Nested dictionary structure from get_sorted_hf_data + filename: Name of the output Parquet file + """ + # Flatten the data + flattened_data = flatten_data(data_dict) + + # Create a pandas DataFrame from the flattened data + df = pd.DataFrame(flattened_data) + # Convert the DataFrame to a Parquet file + df.to_parquet(filename, index=False) + + + +def main(): + example_usage() + +if __name__ == "__main__": + main() diff --git a/export.py b/export.py index b4e3cdb..8c534ae 100644 --- a/export.py +++ b/export.py @@ -5,6 +5,7 @@ from datasets import Dataset from dotenv import load_dotenv from sqlalchemy import create_engine, text +from dedup import dedup_df load_dotenv() @@ -199,11 +200,27 @@ def main(output_dir): submissions_dataset.to_parquet(submissions_output_path) print(f"Submissions dataset successfully saved to {submissions_output_path}") + # Deduplicate submissions + print("Applying deduplication to submissions...") + try: + deduplicated_submissions_df = dedup_df(submissions_df.copy()) + deduplicated_submissions_path = os.path.join(output_dir, "deduplicated_submissions.parquet") + + # Convert to dataset and save + deduplicated_submissions_dataset = Dataset.from_pandas(deduplicated_submissions_df) + deduplicated_submissions_dataset.to_parquet(deduplicated_submissions_path) + print(f"Deduplicated submissions dataset successfully saved to {deduplicated_submissions_path}") + print(f"Original submissions: {len(submissions_df)}, After deduplication: {len(deduplicated_submissions_df)}") + + except Exception as e: + print(f"Warning: Deduplication failed with error: {e}") + print("Proceeding without deduplication...") + deduplicated_submissions_df = submissions_df.copy() + # Filter for and save successful submissions from the anonymized data if 'run_passed' in submissions_df.columns: print("Creating successful submissions dataset...") successful_submissions_df = submissions_df[submissions_df['run_passed'] == 
True].copy() - # Convert to dataset and save successful_submissions_dataset = Dataset.from_pandas(successful_submissions_df) successful_output_path = os.path.join( @@ -215,6 +232,15 @@ def main(output_dir): f"{successful_output_path}" ) + # Create deduplicated successful submissions + print("Creating deduplicated successful submissions dataset...") + deduplicated_successful_submissions_df = deduplicated_submissions_df[deduplicated_submissions_df['run_passed'] == True].copy() + deduplicated_successful_submissions_dataset = Dataset.from_pandas(deduplicated_successful_submissions_df) + deduplicated_successful_submissions_path = os.path.join(output_dir, "deduplicated_successful_submissions.parquet") + deduplicated_successful_submissions_dataset.to_parquet(deduplicated_successful_submissions_path) + print(f"Deduplicated successful submissions dataset successfully saved to {deduplicated_successful_submissions_path}") + print(f"Original successful submissions: {len(successful_submissions_df)}, After deduplication: {len(deduplicated_successful_submissions_df)}") + if __name__ == "__main__": parser = argparse.ArgumentParser(description="Export leaderboard data to a Hugging Face dataset.") diff --git a/test_dedup.py b/test_dedup.py new file mode 100644 index 0000000..f5ea811 --- /dev/null +++ b/test_dedup.py @@ -0,0 +1,402 @@ +#!/usr/bin/env python3 +""" +Unit tests for the deduplication pipeline. +Tests the end-to-end flow with fake data matching the database schema. +""" + +import unittest +import pandas as pd +import numpy as np +from datetime import datetime, timezone +import random +from typing import Dict, List, Any +import tempfile +import os +import sys + +# Import the functions we want to test +try: + from dedup import ( + remove_duplicates, + fuzzy_filter, + convert_df_to_dict, + flatten_data, + dedup_df, + count_items, + create_parquet_file + ) +except ImportError as e: + print(f"Import error: {e}") + print("Some functions may not be available for testing") + + +class TestDedupEndToEnd(unittest.TestCase): + + def setUp(self): + """Set up test fixtures with fake data matching the schema.""" + random.seed(42) # For reproducible tests + np.random.seed(42) + self.fake_data = self.create_fake_dataset(50) + self.df = pd.DataFrame(self.fake_data) + + def create_fake_dataset(self, num_entries: int) -> List[Dict[str, Any]]: + """Create a fake dataset with the required schema fields.""" + fake_data = [] + + # Sample code snippets (some duplicates for testing) + code_samples = [ + "def hello_world():\n print('Hello World')", + "import numpy as np\nx = np.array([1, 2, 3])", + "for i in range(10):\n print(i)", + "class MyClass:\n def __init__(self):\n pass", + "def fibonacci(n):\n if n <= 1:\n return n\n return fibonacci(n-1) + fibonacci(n-2)", + "import pandas as pd\ndf = pd.DataFrame({'a': [1, 2, 3]})", + "def quicksort(arr):\n if len(arr) <= 1:\n return arr", + "x = [1, 2, 3, 4, 5]\ny = [i**2 for i in x]", + "try:\n result = 10 / 0\nexcept ZeroDivisionError:\n print('Error')", + "def hello_world():\n print('Hello World')", # Exact duplicate + ] + + run_modes = ['leaderboard', 'benchmark', 'test'] + file_names = ['solution.py', 'main.py', 'algorithm.py', 'test.py'] + + for i in range(num_entries): + # Create base timestamp + base_time = datetime(2024, 1, 1, tzinfo=timezone.utc) + submission_time = base_time.replace( + day=random.randint(1, 28), + hour=random.randint(0, 23), + minute=random.randint(0, 59) + ) + + # Select code (with some duplicates) + code = random.choice(code_samples) + if i < 5: # 
First 5 entries use the same code for exact duplicate testing + code = code_samples[0] + elif i < 10: # Next 5 use slightly modified versions for fuzzy testing + code = code_samples[0] + f"\n# Comment {i}" + + run_mode = random.choice(run_modes) + run_passed = random.choice([True, False]) + + # Generate run score based on mode and success + if run_mode == 'leaderboard' and run_passed: + run_score = round(random.uniform(0.1, 1.0), 4) + else: + run_score = 0.0 if not run_passed else round(random.uniform(0.1, 0.8), 4) + + # Create the entry matching the database schema + entry = { + 'submission_id': i + 1000, + 'leaderboard_id': random.randint(1, 10), + 'user_id': random.randint(100, 999), + 'submission_time': submission_time, + 'file_name': random.choice(file_names), + 'code': code, + 'code_id': i + 2000, + 'run_id': i + 3000, + 'run_start_time': submission_time, + 'run_end_time': submission_time.replace( + second=random.randint(1, 59) + ), + 'run_mode': run_mode, + 'run_score': run_score, + 'run_passed': run_passed, + 'run_result': { + 'benchmark-count': random.randint(1, 10), + 'benchmark.0.best': f'benchmark_{random.randint(1, 100)}.txt', + 'benchmark.0.err': '', + 'benchmark.0.mean': round(random.uniform(0.1, 2.0), 6), + 'benchmark.0.report': f'report_{i}.json' + }, + 'run_compilation': { + 'command': 'python', + 'exit_code': 0 if run_passed else random.randint(1, 255), + 'nvcc_found': random.choice([True, False]), + 'nvcc_version': f'11.{random.randint(0, 8)}', + 'stderr': '' if run_passed else f'Error message {i}', + 'stdout': f'Output {i}', + 'success': run_passed + }, + 'run_meta': { + 'command': 'python solution.py', + 'duration': round(random.uniform(0.1, 10.0), 3), + 'exit_code': 0 if run_passed else random.randint(1, 255), + 'stderr': '' if run_passed else f'Runtime error {i}', + 'stdout': f'Runtime output {i}', + 'success': run_passed + }, + 'run_system_info': { + 'cpu': f'Intel Core i{random.randint(5, 9)}', + 'gpu': random.choice(['NVIDIA RTX 3080', 'NVIDIA RTX 4090', 'None']), + 'platform': random.choice(['linux', 'darwin', 'win32']), + 'torch': f'2.{random.randint(0, 3)}.{random.randint(0, 9)}' + } + } + fake_data.append(entry) + + return fake_data + + def test_dataframe_creation(self): + """Test that the fake dataset creates a valid DataFrame.""" + self.assertEqual(len(self.df), 50) + + # Check required columns exist (matching the schema in the image) + required_columns = [ + 'submission_id', 'leaderboard_id', 'user_id', 'submission_time', + 'file_name', 'code', 'code_id', 'run_id', 'run_start_time', + 'run_end_time', 'run_mode', 'run_score', 'run_passed', + 'run_result', 'run_compilation', 'run_meta', 'run_system_info' + ] + + for col in required_columns: + self.assertIn(col, self.df.columns, f"Missing required column: {col}") + + # Check data types + self.assertTrue(self.df['submission_id'].dtype in ['int64', 'int32']) + self.assertTrue(self.df['run_passed'].dtype == 'bool') + self.assertTrue(self.df['run_score'].dtype in ['float64', 'float32']) + + # Verify struct fields exist + sample_row = self.df.iloc[0] + self.assertIsInstance(sample_row['run_result'], dict) + self.assertIsInstance(sample_row['run_compilation'], dict) + self.assertIsInstance(sample_row['run_meta'], dict) + self.assertIsInstance(sample_row['run_system_info'], dict) + + def test_convert_df_to_dict(self): + """Test conversion from DataFrame to nested dictionary structure.""" + try: + data_dict = convert_df_to_dict(self.df) + + # Check structure + self.assertIsInstance(data_dict, dict) + + # Should have 
run_mode keys + run_modes = set(self.df['run_mode'].unique()) + self.assertEqual(set(data_dict.keys()), run_modes) + + # Check nested structure + for run_mode in data_dict: + self.assertIsInstance(data_dict[run_mode], dict) + for run_success in data_dict[run_mode]: + self.assertIsInstance(data_dict[run_mode][run_success], dict) + for score_duration in data_dict[run_mode][run_success]: + self.assertIsInstance( + data_dict[run_mode][run_success][score_duration], + list + ) + except NameError: + self.skipTest("convert_df_to_dict function not available") + + def test_exact_deduplication(self): + """Test exact duplicate removal.""" + try: + data_dict = convert_df_to_dict(self.df) + original_count = count_items(data_dict) + + deduplicated_data = remove_duplicates(data_dict) + deduplicated_count = count_items(deduplicated_data) + + # Should have fewer or equal items after deduplication + self.assertLessEqual(deduplicated_count, original_count) + + # Structure should be preserved + self.assertEqual(set(data_dict.keys()), set(deduplicated_data.keys())) + + except NameError as e: + self.skipTest(f"Required functions not available: {e}") + + def test_fuzzy_deduplication_small(self): + """Test fuzzy duplicate removal with small threshold for faster testing.""" + try: + data_dict = convert_df_to_dict(self.df) + original_count = count_items(data_dict) + + # Use small parameters for faster testing + fuzzy_deduplicated_data = fuzzy_filter( + data_dict, + threshold=0.5, # Lower threshold for faster testing + ngram_size=3, # Smaller ngram size + bands=4, # Fewer bands + rows_per_band=32 # Fewer rows per band + ) + + fuzzy_count = count_items(fuzzy_deduplicated_data) + + # Should have fewer or equal items after fuzzy deduplication + self.assertLessEqual(fuzzy_count, original_count) + + # Structure should be preserved + self.assertEqual(set(data_dict.keys()), set(fuzzy_deduplicated_data.keys())) + + except NameError as e: + self.skipTest(f"Required functions not available: {e}") + + def test_flatten_and_reconstruct(self): + """Test flattening and reconstruction of data.""" + try: + data_dict = convert_df_to_dict(self.df) + original_count = count_items(data_dict) + + # Flatten + flattened_data = flatten_data(data_dict) + self.assertEqual(len(flattened_data), original_count) + + # Check metadata fields were added + if flattened_data: + sample_row = flattened_data[0] + self.assertIn('_run_mode', sample_row) + self.assertIn('_run_success', sample_row) + self.assertIn('_score_duration', sample_row) + + except NameError as e: + self.skipTest(f"Required functions not available: {e}") + + def test_dedup_df_end_to_end(self): + """Test the complete deduplication pipeline.""" + try: + original_length = len(self.df) + + # Run the complete deduplication pipeline + deduplicated_df = dedup_df(self.df) + + # Should return a DataFrame + self.assertIsInstance(deduplicated_df, pd.DataFrame) + + # Should have fewer or equal rows + self.assertLessEqual(len(deduplicated_df), original_length) + + # Should preserve required columns + required_columns = ['submission_id', 'code', 'run_mode', 'run_passed'] + for col in required_columns: + self.assertIn(col, deduplicated_df.columns) + + # Check data integrity + self.assertFalse(deduplicated_df.empty, "Deduplicated DataFrame should not be empty") + + except NameError as e: + self.skipTest(f"dedup_df function not available: {e}") + + def test_parquet_creation(self): + """Test Parquet file creation.""" + try: + data_dict = convert_df_to_dict(self.df) + + with 
tempfile.NamedTemporaryFile(suffix='.parquet', delete=False) as tmp_file: + try: + create_parquet_file(data_dict, tmp_file.name) + + # Check file was created + self.assertTrue(os.path.exists(tmp_file.name)) + + # Check file is not empty + self.assertGreater(os.path.getsize(tmp_file.name), 0) + + # Try to read the file back + df_from_parquet = pd.read_parquet(tmp_file.name) + self.assertIsInstance(df_from_parquet, pd.DataFrame) + self.assertGreater(len(df_from_parquet), 0) + + finally: + # Clean up + if os.path.exists(tmp_file.name): + os.unlink(tmp_file.name) + + except NameError as e: + self.skipTest(f"Required functions not available: {e}") + + def test_data_consistency_after_deduplication(self): + """Test that data remains consistent after deduplication.""" + try: + # Create dataset with known duplicates + duplicate_data = [] + + # Add the same code 3 times with different metadata + base_entry = self.fake_data[0].copy() + for i in range(3): + entry = base_entry.copy() + entry['submission_id'] = 9000 + i + entry['run_id'] = 9100 + i + duplicate_data.append(entry) + + # Add to main dataset + test_data = self.fake_data + duplicate_data + test_df = pd.DataFrame(test_data) + + original_length = len(test_df) + deduplicated_df = dedup_df(test_df) + + # Should have removed at least 2 duplicates + self.assertLess(len(deduplicated_df), original_length) + + # Check that essential fields are preserved + self.assertTrue(all(col in deduplicated_df.columns for col in + ['submission_id', 'code', 'run_mode', 'run_passed'])) + + except NameError as e: + self.skipTest(f"Required functions not available: {e}") + + def test_schema_compliance(self): + """Test that the fake dataset matches the expected schema from the database.""" + # Test all required fields exist and have correct types + + # Test BIGINT fields + bigint_fields = ['submission_id', 'leaderboard_id', 'user_id', 'code_id', 'run_id'] + for field in bigint_fields: + self.assertTrue(self.df[field].dtype in ['int64', 'int32'], + f"{field} should be integer type") + + # Test VARCHAR fields + varchar_fields = ['file_name', 'code', 'run_mode'] + for field in varchar_fields: + self.assertTrue(self.df[field].dtype == 'object', + f"{field} should be string/object type") + + # Test TIMESTAMP fields + timestamp_fields = ['submission_time', 'run_start_time', 'run_end_time'] + for field in timestamp_fields: + # Check that all values are datetime objects with timezone + sample_value = self.df[field].iloc[0] + self.assertIsInstance(sample_value, datetime) + self.assertIsNotNone(sample_value.tzinfo) + + # Test DOUBLE field + self.assertTrue(self.df['run_score'].dtype in ['float64', 'float32']) + + # Test BOOLEAN field + self.assertTrue(self.df['run_passed'].dtype == 'bool') + + # Test STRUCT fields + struct_fields = ['run_result', 'run_compilation', 'run_meta', 'run_system_info'] + for field in struct_fields: + # All values should be dictionaries + self.assertTrue(all(isinstance(val, dict) for val in self.df[field])) + + def test_duplicate_detection(self): + """Test that we can detect exact and near duplicates in the dataset.""" + # Count exact duplicates by code + code_counts = self.df['code'].value_counts() + exact_duplicates = code_counts[code_counts > 1] + + # Should have some exact duplicates (first 5 entries) + self.assertGreater(len(exact_duplicates), 0, "Should have exact duplicates for testing") + + # Check that fuzzy duplicates exist (entries with similar code) + similar_code_count = 0 + base_code = "def hello_world():\n print('Hello World')" + for 
code in self.df['code']: + if base_code in code and code != base_code: + similar_code_count += 1 + + self.assertGreater(similar_code_count, 0, "Should have fuzzy duplicates for testing") + + +if __name__ == '__main__': + # Add some helpful output + print("Running deduplication pipeline tests...") + print(f"Python version: {sys.version}") + print(f"Pandas version: {pd.__version__}") + + # Run the tests + unittest.main(verbosity=2) \ No newline at end of file From 2ad5e808d56243da49aa25a44dbac861628263d7 Mon Sep 17 00:00:00 2001 From: Sahan Paliskara Date: Mon, 23 Jun 2025 17:10:24 -0400 Subject: [PATCH 2/9] magic number removal --- dedup.py | 118 ++++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 100 insertions(+), 18 deletions(-) diff --git a/dedup.py b/dedup.py index b316e38..8f209a6 100644 --- a/dedup.py +++ b/dedup.py @@ -9,6 +9,82 @@ import datasketch import pandas as pd +# ============================================================================= +# DEDUPLICATION CONFIGURATION CONSTANTS +# ============================================================================= + +# Fuzzy Deduplication Parameters +FUZZY_SIMILARITY_THRESHOLD = 0.8 +""" +Jaccard similarity threshold for considering two documents as duplicates. +Range: 0.0 to 1.0 +- 0.8 = High threshold, only very similar documents are considered duplicates +- 0.7 = Medium threshold, moderately similar documents are duplicates +- 0.5 = Low threshold, loosely similar documents are duplicates +Higher values = more strict deduplication, fewer items removed +""" + +NGRAM_SIZE = 5 +""" +Size of character n-grams used for MinHash fingerprinting. +- Smaller values (3-4): More sensitive to small changes, better for short text +- Larger values (5-7): Less sensitive to minor variations, better for longer text +- Too small: May create false positives (different texts seem similar) +- Too large: May miss actual duplicates with small variations +""" + +LSH_BANDS = 16 +""" +Number of bands for Locality Sensitive Hashing (LSH). +Used to speed up similarity detection by grouping similar hashes. +- More bands = faster but less accurate similarity detection +- Fewer bands = slower but more accurate similarity detection +Must divide evenly into ROWS_PER_BAND * LSH_BANDS = total permutations +""" + +ROWS_PER_BAND = 128 +""" +Number of rows per band in LSH configuration. +Total MinHash permutations = ROWS_PER_BAND * LSH_BANDS +- More rows per band = higher precision, may miss some similar pairs +- Fewer rows per band = higher recall, may include more false positives +Default: 128 rows × 16 bands = 2048 total permutations +""" + +# Score Processing Parameters +LEADERBOARD_SCORE_PRECISION = 4 +""" +Number of decimal places to round leaderboard scores when grouping submissions. +Used to group submissions with very similar scores together. +- Higher precision (more decimal places): More granular grouping +- Lower precision (fewer decimal places): Broader grouping of similar scores +""" + +DURATION_PRECISION = 0 +""" +Number of decimal places to round execution duration (in seconds). +Used to group submissions with similar execution times. 
+- 0: Round to nearest second (1.7s → 2s) +- 1: Round to nearest 0.1s (1.73s → 1.7s) +""" + +# ============================================================================= +# CONFIGURATION SUMMARY +# ============================================================================= +""" +Current deduplication configuration: +├─ Similarity Detection: 0.8 threshold (strict) +├─ Text Fingerprinting: 5-character n-grams +├─ LSH Performance: 16 bands × 128 rows = 2048 permutations +├─ Score Grouping: 4 decimal places for leaderboard scores +└─ Duration Grouping: 0 decimal places for execution times + +To adjust deduplication sensitivity: +- Increase FUZZY_SIMILARITY_THRESHOLD (0.8→0.9) for stricter deduplication +- Decrease FUZZY_SIMILARITY_THRESHOLD (0.8→0.7) for more aggressive deduplication +- Adjust NGRAM_SIZE for different text lengths (3-4 for short, 5-7 for long) +""" + def remove_duplicates(data_dict: Dict[str, Dict[bool, Dict[Union[float, int], List[Dict]]]]): """ Remove exact duplicates from the nested data structure returned by get_sorted_hf_data. @@ -60,9 +136,9 @@ def remove_duplicates(data_dict: Dict[str, Dict[bool, Dict[Union[float, int], Li def create_minhashes( documents: List[Dict[str, str]], - ngram_size: int = 5, - bands: int = 20, - rows_per_band: int = 128, + ngram_size: int = NGRAM_SIZE, + bands: int = LSH_BANDS, + rows_per_band: int = ROWS_PER_BAND, ) -> Tuple[Dict[str, datasketch.MinHash], int]: """ Create MinHash signatures for a list of documents with LSH bands configuration. @@ -155,10 +231,10 @@ def filter_matrix( def fuzzy_filter( data_dict: Dict[str, Dict[bool, Dict[Union[float, int], List[Dict]]]], - threshold: float = 0.7, - ngram_size: int = 5, - bands: int = 16, - rows_per_band: int = 128, + threshold: float = FUZZY_SIMILARITY_THRESHOLD, + ngram_size: int = NGRAM_SIZE, + bands: int = LSH_BANDS, + rows_per_band: int = ROWS_PER_BAND, ) -> Dict[str, Dict[bool, Dict[Union[float, int], List[Dict]]]]: total_categories = 0 @@ -181,10 +257,10 @@ def fuzzy_filter( def _fuzzy_filter( data_list: List[Dict], - threshold: float = 0.7, - ngram_size: int = 5, - bands: int = 16, - rows_per_band: int = 128, + threshold: float = FUZZY_SIMILARITY_THRESHOLD, + ngram_size: int = NGRAM_SIZE, + bands: int = LSH_BANDS, + rows_per_band: int = ROWS_PER_BAND, ) -> List[Dict]: """ Apply fuzzy deduplication to the nested data structure returned by get_sorted_hf_data. 
@@ -263,10 +339,10 @@ def get_hf_data() -> Dict[str, Dict[Union[float, int], List[Dict]]]: for run_success, rows in tqdm.tqdm(mode_dict.items(), desc=f"Processing {run_mode}", leave=False): for row in tqdm.tqdm(rows, desc=f"Processing {run_mode} {run_success} rows", leave=False): if run_mode == 'leaderboard' and run_success == True: - rounded_score = round(float(row['run_score']), 4) + rounded_score = round(float(row['run_score']), LEADERBOARD_SCORE_PRECISION) run_duration_dict[run_mode][run_success][rounded_score].append(row) else: - rounded_duration = round(float(row['run_meta']['duration']), 0) + rounded_duration = round(float(row['run_meta']['duration']), DURATION_PRECISION) run_duration_dict[run_mode][run_success][rounded_duration].append(row) return run_duration_dict @@ -346,10 +422,10 @@ def example_usage(): # Apply fuzzy deduplication fuzzy_deduplicated_data = fuzzy_filter( deduplicated_data, - threshold=0.8, # High threshold for more strict deduplication - ngram_size=5, - bands=16, - rows_per_band=128 + threshold=FUZZY_SIMILARITY_THRESHOLD, + ngram_size=NGRAM_SIZE, + bands=LSH_BANDS, + rows_per_band=ROWS_PER_BAND ) # convert to df flattened_data = flatten_data(fuzzy_deduplicated_data) @@ -367,7 +443,13 @@ def dedup_df(df: pd.DataFrame) -> pd.DataFrame: # convert to dict data_dict = convert_df_to_dict(df) # deduplicate - deduplicated_data = fuzzy_filter(data_dict, threshold=0.8, ngram_size=5, bands=16, rows_per_band=128) + deduplicated_data = fuzzy_filter( + data_dict, + threshold=FUZZY_SIMILARITY_THRESHOLD, + ngram_size=NGRAM_SIZE, + bands=LSH_BANDS, + rows_per_band=ROWS_PER_BAND + ) # convert to df flattened_data = flatten_data(deduplicated_data) df = pd.DataFrame(flattened_data) From a30f85d0353b6347aef785e827edc7f25f97bae9 Mon Sep 17 00:00:00 2001 From: Sahan Paliskara Date: Tue, 24 Jun 2025 08:43:42 -0400 Subject: [PATCH 3/9] Add deduplicated datasets --- dedup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dedup.py b/dedup.py index 8f209a6..f6f3449 100644 --- a/dedup.py +++ b/dedup.py @@ -118,9 +118,9 @@ def remove_duplicates(data_dict: Dict[str, Dict[bool, Dict[Union[float, int], Li # If duplicate found, keep the one with better metrics existing_row = unique_entries[content_hash] - # For leaderboard mode with successful runs, prefer higher scores + # For leaderboard mode with successful runs, prefer lower scores / faster times if run_mode == 'leaderboard' and row.get('run_passed') == True: - if row.get('run_score', 0) > existing_row.get('run_score', 0): + if row.get('run_score', 0) < existing_row.get('run_score', 0): unique_entries[content_hash] = row # For other cases, prefer shorter duration (faster execution) else: From 16567f949cecc27b946ff352d90f2323681e97a4 Mon Sep 17 00:00:00 2001 From: Benjamin Horowitz Date: Mon, 24 Nov 2025 15:58:03 -0800 Subject: [PATCH 4/9] Updates for 2nd comptetition This change modifies the extraction processes so that it uses a lot less memory. In particular, the process no longer loads the whole dataset into memory before exporting to parquet files. Instead, it processes the dataset into small, incremental parquet files, and then consolidates these files into a single file as the final step. 
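[Editor's note] In rough terms, the new exporter follows the streaming pattern below — a minimal sketch assuming pandas and pyarrow, with illustrative file names rather than the exporter's real ones; the actual code in export.py additionally handles hex-encoded code columns, nullable integer types, and empty structs before writing each part.

```python
# Sketch only: stream chunks to small parquet part files, then merge them under a
# unified schema so no single step holds the full dataset in memory.
import glob
import os

import pyarrow as pa
import pyarrow.parquet as pq


def write_parts(chunks, out_dir):
    """Write each pandas DataFrame chunk to its own small parquet part file."""
    os.makedirs(out_dir, exist_ok=True)
    for i, chunk in enumerate(chunks):
        part_path = os.path.join(out_dir, f"part_{i:05d}.parquet")
        pq.write_table(pa.Table.from_pandas(chunk), part_path)


def consolidate(out_dir, final_path):
    """Merge the part files into one parquet file, reading one part at a time."""
    parts = sorted(glob.glob(os.path.join(out_dir, "part_*.parquet")))
    # Unify schemas first so parts with missing struct fields can still be merged.
    schema = pa.unify_schemas([pq.ParquetFile(p).schema_arrow for p in parts])
    with pq.ParquetWriter(final_path, schema) as writer:
        for p in parts:
            # Casting fills any fields missing from a given part with nulls.
            writer.write_table(pq.read_table(p).cast(schema))
```

The diff below implements this same idea, with the chunks supplied by a streaming database read (pd.read_sql_query(..., chunksize=...)) rather than an in-memory list.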
--- .gitignore | 2 + dataset/README.md | 8 +- export.py | 259 +++++++++++++++++++++++++++++++--------------- requirements.txt | 2 - 4 files changed, 183 insertions(+), 88 deletions(-) diff --git a/.gitignore b/.gitignore index 4c49bd7..76ffd48 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,3 @@ .env +dataset/*.parquet +dataset/submissions/* \ No newline at end of file diff --git a/dataset/README.md b/dataset/README.md index 6084549..52550e2 100644 --- a/dataset/README.md +++ b/dataset/README.md @@ -11,7 +11,13 @@ tags: license: mit --- -If you use GPUMODE/amd-kernels-2025 in your work, please cite: +This is the dataset that was created from the first and second AMD $100K kernel competitions, containing roughly 110K kernels for fp8-gemm, moe, mla, all2all, gemm+reducescatter, and allgather+gemm optimized to run on MI300. Learn more at gpumode.com/v2/news + +To see the full list of kernel competitions we've ran and are running you can checkout https://github.com/gpu-mode/reference-kernels which also contains details on reference kernels and their input shapes and distributions + +We are planning on adding kernels optimized for NVFP4 on Blackwell next + +If you use this dataset in your work, please cite: ```bibtex @inproceedings{ diff --git a/export.py b/export.py index b4e3cdb..6d3078f 100644 --- a/export.py +++ b/export.py @@ -1,10 +1,11 @@ import argparse +import gc import os -import numpy as np import pandas as pd -from datasets import Dataset from dotenv import load_dotenv from sqlalchemy import create_engine, text +import pyarrow as pa +import pyarrow.parquet as pq load_dotenv() @@ -24,32 +25,32 @@ DATABASE_URL = f"postgresql://{DB_USER}:{DB_PASSWORD}@{DB_HOST}:{DB_PORT}/{DB_NAME}" # The leaderboard IDs to export -LEADERBOARD_IDS = [463, 430, 399, 398] +LEADERBOARD_IDS = [463, 430, 399, 398, 563, 564, 565] -def fetch_leaderboards(engine, leaderboard_ids) -> Dataset: +def fetch_and_save_leaderboards(engine, leaderboard_ids, output_path): """ - Fetches and processes leaderboard data from the database. + Fetches leaderboard data from the database and saves it directly to parquet. This function queries the database for specific leaderboards, selecting key fields and fetching all associated GPU types for each leaderboard - using a subquery. + using a subquery. It saves the leaderboards directly to parquet. Args: engine: The SQLAlchemy engine instance for database connection. leaderboard_ids: A list of integer IDs for the leaderboards to fetch. Returns: - A Hugging Face `Dataset` object containing the leaderboard data. + The number of leaderboards. 
""" - print("Fetching leaderboards...") + print("Fetching and saving leaderboards...") query = text(""" SELECT id, name, - deadline, + deadline AT TIME ZONE 'UTC' AS deadline, task->>'lang' AS lang, - task->>'description' AS description, + description, task->'files'->>'reference.py' AS reference, ( SELECT array_agg(gpu_type) @@ -60,10 +61,45 @@ def fetch_leaderboards(engine, leaderboard_ids) -> Dataset: WHERE id = ANY(:leaderboard_ids) """) df = pd.read_sql_query(query, engine, params={'leaderboard_ids': leaderboard_ids}) - return Dataset.from_pandas(df) + df.to_parquet(output_path, index=False) + print(f"Leaderboards saved to {output_path}") + + +def anonymize_users_in_db(engine, leaderboard_ids): + """Create a temporary mapping table in the database.""" + with engine.begin() as conn: + # Create temporary table with anonymized IDs + conn.execute(text(""" + CREATE TEMP TABLE user_mapping AS + SELECT + user_id as original_user_id, + ROW_NUMBER() OVER (ORDER BY RANDOM()) as anonymized_user_id + FROM ( + SELECT DISTINCT user_id + FROM leaderboard.submission + WHERE leaderboard_id = ANY(:leaderboard_ids) + ) AS distinct_users + """), {'leaderboard_ids': leaderboard_ids}) + + +def handle_empty_structs(df): + """ + Replace empty struct/dict values with None to avoid PyArrow serialization errors. + + PyArrow cannot write empty struct types to Parquet. This function checks + columns that contain dict/struct values and replaces empty ones with None. + """ + for col in df.columns: + if df[col].dtype == 'object': + # Check if column contains dict-like objects + sample = df[col].dropna().head(1) + if len(sample) > 0 and isinstance(sample.iloc[0], dict): + # Replace empty dicts with None + df[col] = df[col].apply(lambda x: None if isinstance(x, dict) and len(x) == 0 else x) + return df -def fetch_submissions(engine, leaderboard_ids) -> Dataset: +def fetch_and_save_submissions(engine, leaderboard_ids, output_path, chunksize=8192): """ Fetches and processes submission data from the database. @@ -76,23 +112,20 @@ def fetch_submissions(engine, leaderboard_ids) -> Dataset: engine: The SQLAlchemy engine instance for database connection. leaderboard_ids: A list of integer IDs for the leaderboards whose submissions are to be fetched. - - Returns: - A Hugging Face `Dataset` object containing the submissions data. 
""" print("Fetching submissions...") - query = text(""" + query = """ SELECT s.id AS submission_id, s.leaderboard_id, - s.user_id, - s.submission_time, + um.anonymized_user_id AS user_id, + s.submission_time AT TIME ZONE 'UTC' AS submission_time, s.file_name, c.code, c.id AS code_id, r.id AS run_id, - r.start_time AS run_start_time, - r.end_time AS run_end_time, + r.start_time AT TIME ZONE 'UTC' AS run_start_time, + r.end_time AT TIME ZONE 'UTC' AS run_end_time, r.mode AS run_mode, r.score AS run_score, r.passed AS run_passed, @@ -101,12 +134,61 @@ def fetch_submissions(engine, leaderboard_ids) -> Dataset: r.meta as run_meta, r.system_info AS run_system_info FROM leaderboard.submission AS s - JOIN leaderboard.runs AS r ON s.id = r.submission_id + LEFT JOIN leaderboard.runs AS r ON s.id = r.submission_id JOIN leaderboard.code_files AS c ON s.code_id = c.id + LEFT JOIN user_mapping um ON s.user_id = um.original_user_id WHERE s.leaderboard_id = ANY(:leaderboard_ids) - """) - df = pd.read_sql_query(query, engine, params={'leaderboard_ids': leaderboard_ids}) - return Dataset.from_pandas(df) + """ + + part = 0 + + with engine.connect().execution_options(stream_results=True) as conn: + for chunk_df in pd.read_sql_query( + text(query), + conn, + params={'leaderboard_ids': leaderboard_ids}, + chunksize=chunksize + ): + # Decode hex values code column + if 'code' in chunk_df.columns: + chunk_df['code'] = chunk_df['code'].apply(decode_hex_if_needed) + + # Convert nullable integer columns to consistent types + # This prevents type mismatches when some chunks have all NULLs + nullable_int_cols = ['run_id', 'code_id', 'submission_id', 'leaderboard_id', 'user_id'] + for col in nullable_int_cols: + if col in chunk_df.columns: + chunk_df[col] = chunk_df[col].astype('Int64') + + # Handle empty structs that PyArrow can't serialize + chunk_df = handle_empty_structs(chunk_df) + + # Convert to arrow table + table = pa.Table.from_pandas(chunk_df) + + # Write chunk as separate parquet file + filename = os.path.join(output_path, f"submissions_part_{part:05d}.parquet") + pq.write_table(table, filename) + + print(f" Wrote {len(chunk_df)} submissions to part {part}") + + # Filter for and save successful submissions + if 'run_passed' in chunk_df.columns: + success_mask = chunk_df['run_passed'] == True + if success_mask.any(): + success_df = chunk_df[success_mask] + success_table = pa.Table.from_pandas(success_df) + success_filename = os.path.join(output_path, f"successful_submissions_part_{part:05d}.parquet") + pq.write_table(success_table, success_filename) + print(f" Wrote {len(success_df)} successful submissions to part {part}") + del success_df, success_table, success_mask + + del chunk_df, table + gc.collect() + + part += 1 + + print(f"Submissions saved to {part} parquet files in {output_path}") def decode_hex_if_needed(code_val: str) -> str: @@ -132,6 +214,53 @@ def decode_hex_if_needed(code_val: str) -> str: return code_val +def consolidate_parquet_files(input_dir, pattern, output_file): + """ + Consolidates multiple parquet part files into a single parquet file. 
+ + Args: + input_dir: Directory containing the parquet part files + pattern: Glob pattern to match the part files (e.g., "submissions_part_*.parquet") + output_file: Path to the output consolidated parquet file + """ + import glob + + # Find all matching parquet files + part_files = sorted(glob.glob(os.path.join(input_dir, pattern))) + + if not part_files: + print(f" No files found matching pattern {pattern}") + return + + print(f" Consolidating {len(part_files)} {pattern} files into {output_file}...") + + # First pass: Read only schemas (not data) from all files to unify them + schemas = [] + for part_file in part_files: + parquet_file = pq.ParquetFile(part_file) + schemas.append(parquet_file.schema_arrow) + + # Unify schemas across all tables to handle struct field variations + unified_schema = pa.unify_schemas(schemas) + + # Second pass: Read each file, cast to unified schema, and write incrementally + total_rows = 0 + with pq.ParquetWriter(output_file, unified_schema) as writer: + for part_file in part_files: + # Read one file at a time + table = pq.read_table(part_file) + + # Cast to unified schema (fills missing fields with nulls) + unified_table = table.cast(unified_schema) + + # Write to output file + writer.write_table(unified_table) + + total_rows += len(unified_table) + + print(f" Done! Consolidated {len(part_files)} files ({total_rows} total rows)") + + def main(output_dir): """ Orchestrates the data export process. @@ -140,80 +269,40 @@ def main(output_dir): and submission data, anonymizes user IDs, and saves the results to separate Parquet files: `leaderboards.parquet`, `submissions.parquet`, and `successful_submissions.parquet`. The user ID mapping is not saved. + Temporary files are not deleted and should be manually removed if + desired. Args: output_dir (str): The local directory path to save the Parquet files. 
""" engine = create_engine(DATABASE_URL) - rng = np.random.default_rng() # Ensure the output directory exists os.makedirs(output_dir, exist_ok=True) # Fetch and save leaderboards - leaderboards_dataset = fetch_leaderboards(engine, LEADERBOARD_IDS) leaderboards_output_path = os.path.join(output_dir, "leaderboards.parquet") - leaderboards_dataset.to_parquet(leaderboards_output_path) - print(f"Leaderboards dataset successfully saved to {leaderboards_output_path}") + fetch_and_save_leaderboards(engine, LEADERBOARD_IDS, leaderboards_output_path) + + anonymize_users_in_db(engine, LEADERBOARD_IDS) # Fetch submissions - submissions_dataset = fetch_submissions(engine, LEADERBOARD_IDS) - submissions_df = submissions_dataset.to_pandas() - - # Decode hexadecimal 'code' values - if 'code' in submissions_df.columns: - print("Decoding 'code' column from hexadecimal where necessary...") - submissions_df['code'] = submissions_df['code'].apply(decode_hex_if_needed) - - # Anonymize user IDs if submissions exist - if not submissions_df.empty and 'user_id' in submissions_df.columns: - print("Anonymizing user IDs...") - unique_user_ids = submissions_df['user_id'].unique() - num_unique_users = len(unique_user_ids) - - # Create a randomly permuted mapping in memory - permuted_ids = rng.permutation(range(1, num_unique_users + 1)) - user_map_df = pd.DataFrame({ - 'original_user_id': unique_user_ids, - 'anonymized_user_id': permuted_ids - }) - - # Replace original user IDs with anonymized IDs - original_cols = list(submissions_df.columns) - user_id_index = original_cols.index('user_id') - - submissions_df = submissions_df.merge(user_map_df, left_on='user_id', right_on='original_user_id') - submissions_df = submissions_df.drop(columns=['user_id', 'original_user_id']) - submissions_df = submissions_df.rename(columns={'anonymized_user_id': 'user_id'}) - - # Restore original column order - new_order = [col for col in original_cols if col != 'user_id'] - new_order.insert(user_id_index, 'user_id') - submissions_df = submissions_df[new_order] - - # Convert back to a dataset - submissions_dataset = Dataset.from_pandas(submissions_df) - - # Save the submissions dataset (anonymized or original if empty) - submissions_output_path = os.path.join(output_dir, "submissions.parquet") - submissions_dataset.to_parquet(submissions_output_path) - print(f"Submissions dataset successfully saved to {submissions_output_path}") - - # Filter for and save successful submissions from the anonymized data - if 'run_passed' in submissions_df.columns: - print("Creating successful submissions dataset...") - successful_submissions_df = submissions_df[submissions_df['run_passed'] == True].copy() - - # Convert to dataset and save - successful_submissions_dataset = Dataset.from_pandas(successful_submissions_df) - successful_output_path = os.path.join( - output_dir, "successful_submissions.parquet" - ) - successful_submissions_dataset.to_parquet(successful_output_path) - print( - "Successful submissions dataset successfully saved to " - f"{successful_output_path}" - ) + submissions_output_path = os.path.join(output_dir, "submissions") + os.makedirs(submissions_output_path, exist_ok=True) + fetch_and_save_submissions(engine, LEADERBOARD_IDS, submissions_output_path) + + # Consolidate part files into single parquet files + consolidate_parquet_files( + submissions_output_path, + "submissions_part_*.parquet", + os.path.join(output_dir, "submissions.parquet") + ) + + consolidate_parquet_files( + submissions_output_path, + 
"successful_submissions_part_*.parquet", + os.path.join(output_dir, "successful_submissions.parquet") + ) if __name__ == "__main__": diff --git a/requirements.txt b/requirements.txt index 0698625..3b5b938 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,5 @@ -datasets pandas psycopg2-binary SQLAlchemy pyarrow python-dotenv -numpy \ No newline at end of file From b39b2928ccedb2bf765e4f18c0ae154c0653ed66 Mon Sep 17 00:00:00 2001 From: Sahan Paliskara Date: Mon, 23 Jun 2025 16:45:07 -0400 Subject: [PATCH 5/9] Add deduplication logic --- README.md | 15 ++ dedup.py | 398 +++++++++++++++++++++++++++++++++++++++++++++++++ export.py | 31 ++++ test_dedup.py | 402 ++++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 846 insertions(+) create mode 100644 dedup.py create mode 100644 test_dedup.py diff --git a/README.md b/README.md index 54fe6b8..0113e64 100644 --- a/README.md +++ b/README.md @@ -34,3 +34,18 @@ python export.py The script will create a directory at the specified output path containing the dataset in Parquet format. If `--output_dir` is not provided, it will save to `dataset` in the current working directory. +## Tests +The deduplication scripts can be tested by running +```bash +python test_dedup.py +# if you have pytest you can run +python -m pytest test_dedup.py -v +``` +To test things we actually create a fake dataset. Here are the features of it +The test creates a 50-entry dataset with: +- **Exact duplicates**: First 5 entries use identical code +- **Fuzzy duplicates**: Next 5 entries use similar code with small variations +- **Multiple run modes**: `leaderboard`, `test`, `benchmark` +- **Mixed success states**: Both `True` and `False` values for `run_passed` +- **Realistic struct data**: Complex nested structures for `run_result`, `run_compilation`, `run_meta`, and `run_system_info` +- **Proper timestamps**: All timestamp fields include timezone information diff --git a/dedup.py b/dedup.py new file mode 100644 index 0000000..b316e38 --- /dev/null +++ b/dedup.py @@ -0,0 +1,398 @@ +# script to dedup a huggingface dataset + +from datasets import load_dataset +import tqdm +from collections import defaultdict +import hashlib +from typing import Dict, List, Tuple, Union + +import datasketch +import pandas as pd + +def remove_duplicates(data_dict: Dict[str, Dict[bool, Dict[Union[float, int], List[Dict]]]]): + """ + Remove exact duplicates from the nested data structure returned by get_sorted_hf_data. 
+ + Args: + data_dict: Nested dictionary structure from get_sorted_hf_data + + Returns: + Dictionary with same structure but duplicates removed + """ + deduplicated_dict = {} + + for run_mode, score_duration_dict in tqdm.tqdm(data_dict.items(), desc="Processing run modes"): + deduplicated_dict[run_mode] = {} + + for run_success, run_success_dict in tqdm.tqdm(score_duration_dict.items(), desc=f"Processing {run_mode}", leave=False): + deduplicated_dict[run_mode][run_success] = {} + for score_duration, rows in tqdm.tqdm(run_success_dict.items(), desc=f"Processing {run_mode}", leave=False): + # Use a dictionary to track unique entries by their content hash + unique_entries = {} + + for row in tqdm.tqdm(rows, desc=f"Processing {run_mode} {score_duration}", leave=False): + # Create a hash of the relevant content (assuming 'input' or similar field exists) + # If the row has an 'input' field, use that; otherwise use the entire row + content = row.get('code', "") + content_hash = hashlib.sha256(content.encode()).hexdigest() + + if content_hash not in unique_entries: + unique_entries[content_hash] = row + else: + # If duplicate found, keep the one with better metrics + existing_row = unique_entries[content_hash] + + # For leaderboard mode with successful runs, prefer higher scores + if run_mode == 'leaderboard' and row.get('run_passed') == True: + if row.get('run_score', 0) > existing_row.get('run_score', 0): + unique_entries[content_hash] = row + # For other cases, prefer shorter duration (faster execution) + else: + existing_duration = existing_row.get('run_meta', {}).get('duration', float('inf')) + current_duration = row.get('run_meta', {}).get('duration', float('inf')) + if current_duration < existing_duration: + unique_entries[content_hash] = row + + deduplicated_dict[run_mode][run_success][score_duration] = list(unique_entries.values()) + + return deduplicated_dict + + +def create_minhashes( + documents: List[Dict[str, str]], + ngram_size: int = 5, + bands: int = 20, + rows_per_band: int = 128, +) -> Tuple[Dict[str, datasketch.MinHash], int]: + """ + Create MinHash signatures for a list of documents with LSH bands configuration. 
+ + Args: + documents: List of dictionaries, each containing 'submission_id' and 'input' keys + num_permutations: Number of hash functions to use (default: 100) + ngram_size: Size of n-grams to generate from input text (default: 3) + bands: Number of bands for LSH (default: 20) + + Returns: + Tuple containing: + - Dictionary mapping document submission_ids to their MinHash signatures + - Rows per band (num_permutations / bands) + + Raises: + ValueError: If num_permutations is not divisible by bands + """ + + num_permutations = rows_per_band * bands + + def generate_ngrams(text: str, n: int) -> List[str]: + """Generate n-grams from input text.""" + return [text[i : i + n] for i in range(len(text) - n + 1)] + + # Initialize result dictionary + minhash_dict = {} + # Process each document + for doc in tqdm.tqdm(documents, desc="Creating minhashes"): + minhash = datasketch.MinHash(num_perm=num_permutations) + submission_id = doc["submission_id"] + text = doc["code"].lower() # Convert to lowercase for consistency + + # Generate n-grams + ngrams = generate_ngrams(text, ngram_size) + for ngram in ngrams: + minhash.update(ngram.encode("utf8")) + + minhash_dict[submission_id] = minhash + + return minhash_dict + + +# 16 bands with 128 rows +def create_similarity_matrix( + minhashes: Dict[str, datasketch.MinHash], + rows_per_band: int, + num_bands: int, + threshold: float, +) -> Dict[str, List[str]]: + lsh = datasketch.MinHashLSH(threshold=threshold, num_perm=num_bands * rows_per_band) + print(f"num_perm: {num_bands*rows_per_band}") + similarity_matrix = {} + for submission_id, minhash in tqdm.tqdm(minhashes.items(), desc="Inserting minhashes into LSH"): + lsh.insert(submission_id, minhash) + for submission_id, minhash in tqdm.tqdm(minhashes.items(), desc="Querying LSH"): + similar_submission_ids = lsh.query(minhash) + similarity_matrix[submission_id] = similar_submission_ids + for submission_id, similar_submission_ids in tqdm.tqdm( + similarity_matrix.items(), desc="Removing self-similarities" + ): + if submission_id in similar_submission_ids: + similar_submission_ids.remove(submission_id) + return similarity_matrix + + +def filter_matrix( + similarity_matrix: Dict[str, List[str]] +) -> set: + good_submission_ids = set() + processed = set() + + for submission_id, similar_submission_ids in similarity_matrix.items(): + if submission_id in processed: + continue + + # Find all submissions in the similarity cluster + cluster = {submission_id} + cluster.update(similar_submission_ids) + + # Keep the one with the largest ID (tiebreaker) + keeper = max(cluster) + good_submission_ids.add(keeper) + + # Mark all in cluster as processed + processed.update(cluster) + + return good_submission_ids + + +def fuzzy_filter( + data_dict: Dict[str, Dict[bool, Dict[Union[float, int], List[Dict]]]], + threshold: float = 0.7, + ngram_size: int = 5, + bands: int = 16, + rows_per_band: int = 128, +) -> Dict[str, Dict[bool, Dict[Union[float, int], List[Dict]]]]: + + total_categories = 0 + for run_mode, run_success_dict in data_dict.items(): + for run_success, score_duration_dict in run_success_dict.items(): + for score_duration, rows in score_duration_dict.items(): + total_categories += 1 + + deduped_data = defaultdict(lambda: defaultdict(lambda: defaultdict(list))) + current_category = 0 + for run_mode, run_success_dict in data_dict.items(): + for run_success, score_duration_dict in run_success_dict.items(): + for score_duration, rows in score_duration_dict.items(): + print(f"Processing {run_mode} {run_success} 
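The MinHash/LSH machinery above (`create_minhashes`, `create_similarity_matrix`) is built on the `datasketch` library. A compact sketch of the same flow on three hypothetical snippets, with `num_perm` shrunk from the pipeline's 16 bands × 128 rows = 2048 to keep the demo cheap; LSH bucketing is probabilistic, so treat the printed result as indicative rather than guaranteed:

```python
import datasketch

def char_ngrams(text, n=5):
    return [text[i:i + n] for i in range(len(text) - n + 1)]

docs = {
    "a": "def hello_world():\n    message = 'Hello World'\n    print(message)\n    return message\n",
    "b": "def hello_world():\n    message = 'Hello World'\n    print(message)\n    return message\n# extra comment\n",
    "c": "import numpy as np\nx = np.arange(10)\nprint(x.sum())\n",
}

num_perm = 256  # demo-sized; the real pipeline uses far more permutations
minhashes = {}
for key, text in docs.items():
    m = datasketch.MinHash(num_perm=num_perm)
    for gram in char_ngrams(text.lower()):
        m.update(gram.encode("utf8"))
    minhashes[key] = m

lsh = datasketch.MinHashLSH(threshold=0.7, num_perm=num_perm)
for key, m in minhashes.items():
    lsh.insert(key, m)

# "a" should typically land in the same bucket as "b" but not "c";
# drop the self-match, as the pipeline does.
hits = [k for k in lsh.query(minhashes["a"]) if k != "a"]
print(hits)
```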
{score_duration} {len(rows)}") + print(f"This is {current_category} of {total_categories}") + current_category += 1 + deduped_data[run_mode][run_success][score_duration] = _fuzzy_filter(rows, threshold, ngram_size, bands, rows_per_band) + + return deduped_data + +def _fuzzy_filter( + data_list: List[Dict], + threshold: float = 0.7, + ngram_size: int = 5, + bands: int = 16, + rows_per_band: int = 128, +) -> List[Dict]: + """ + Apply fuzzy deduplication to the nested data structure returned by get_sorted_hf_data. + + Args: + data_dict: Nested dictionary structure from get_sorted_hf_data + threshold: Similarity threshold for LSH + ngram_size: Size of n-grams for MinHash + bands: Number of bands for LSH + rows_per_band: Rows per band for LSH + create_histogram: Whether to create similarity histogram + + Returns: + Dictionary with same structure but fuzzy duplicates removed + """ + # Flatten the data for processing + + # Create documents for MinHash processing + + if len(data_list) <= 1: + return data_list + + all_documents = [] + for i, row in tqdm.tqdm(enumerate(data_list), desc="Creating documents for MinHash"): + # Use 'input' field if available, otherwise use a string representation + content = row.get('code', str(row)) + document = { + "submission_id": str(i), + "code": content, + "original_row": row + } + all_documents.append(document) + + # Apply fuzzy deduplication + minhashes = create_minhashes( + all_documents, ngram_size=ngram_size, bands=bands, rows_per_band=rows_per_band + ) + similarity_matrix = create_similarity_matrix( + minhashes, rows_per_band=rows_per_band, num_bands=bands, threshold=threshold + ) + + good_submission_ids = filter_matrix(similarity_matrix) + + # Keep only the documents that passed the filter + good_documents = [all_documents[int(submission_id)]["original_row"] for submission_id in good_submission_ids] + + # Reconstruct the nested structure + return good_documents + +def get_hf_data() -> Dict[str, Dict[Union[float, int], List[Dict]]]: + # Login using e.g. 
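`filter_matrix`, shown a couple of hunks up, collapses each LSH cluster down to a single representative, using the largest submission id as the tiebreaker. A condensed restatement of that step on a hypothetical similarity matrix:

```python
# Hypothetical LSH query results: each id maps to the ids it collides with.
similarity_matrix = {
    "3": ["7"],
    "7": ["3"],
    "5": [],
}

good, processed = set(), set()
for sub_id, similar in similarity_matrix.items():
    if sub_id in processed:
        continue
    cluster = {sub_id, *similar}
    good.add(max(cluster))      # keep one representative per cluster (largest id wins)
    processed.update(cluster)

print(sorted(good))  # ['5', '7']
```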
`huggingface-cli login` to access this dataset + ds = load_dataset("GPUMODE/kernelbot-data", "submissions") + + # we should divide things up into type + # run_mode + # run_sucess + # if run_mode is leaderboard then use score + # otherwise use run_meta[duration] + + + data = ds['train'] + + run_mode_dict = defaultdict(list) + run_success_dict = defaultdict(lambda: defaultdict(list)) + run_duration_dict = defaultdict(lambda: defaultdict(lambda: defaultdict(list))) + + for _, row in tqdm.tqdm(enumerate(data), desc="Processing dataset rows"): + run_mode = row['run_mode'] + run_mode_dict[run_mode].append(row) + + for run_mode, rows in tqdm.tqdm(run_mode_dict.items(), desc="Processing run modes"): + for row in tqdm.tqdm(rows, desc=f"Processing {run_mode} success/failure", leave=False): + run_success_dict[run_mode][row['run_passed']].append(row) + + for run_mode, mode_dict in tqdm.tqdm(run_success_dict.items(), desc="Processing success/failure groups"): + for run_success, rows in tqdm.tqdm(mode_dict.items(), desc=f"Processing {run_mode}", leave=False): + for row in tqdm.tqdm(rows, desc=f"Processing {run_mode} {run_success} rows", leave=False): + if run_mode == 'leaderboard' and run_success == True: + rounded_score = round(float(row['run_score']), 4) + run_duration_dict[run_mode][run_success][rounded_score].append(row) + else: + rounded_duration = round(float(row['run_meta']['duration']), 0) + run_duration_dict[run_mode][run_success][rounded_duration].append(row) + + return run_duration_dict + +def convert_df_to_dict(df: pd.DataFrame) -> Dict[str, Dict[bool, Dict[Union[float, int], List[Dict]]]]: + """ + Convert a pandas DataFrame to a nested dictionary structure. + + Args: + df: pandas DataFrame + + Returns: + Nested dictionary structure + """ + data_dict = defaultdict(lambda: defaultdict(lambda: defaultdict(list))) + for _, row in tqdm.tqdm(df.iterrows(), desc="Processing DataFrame rows"): + run_mode = row['run_mode'] + run_success = row['run_passed'] + score_duration = row['run_meta']['duration'] + data_dict[run_mode][run_success][score_duration].append(row) + return data_dict + +def flatten_data(data_dict: Dict[str, Dict[Union[float, int], List[Dict]]]) -> List[Dict]: + """ + Flatten the nested data structure to a list of documents with metadata. + + Args: + data_dict: Nested dictionary structure from get_sorted_hf_data + + Returns: + List of documents with additional metadata fields + """ + flattened = [] + for run_mode, run_success_dict in tqdm.tqdm(data_dict.items(), desc="Flattening data"): + for run_success, score_duration_dict in run_success_dict.items(): + for score_duration, rows in score_duration_dict.items(): + for row in tqdm.tqdm(rows, desc=f"Processing {run_mode} {score_duration}", leave=False): + # Add metadata to each row + row_with_metadata = row.copy() + row_with_metadata['_run_mode'] = run_mode + row_with_metadata['_run_success'] = run_success + row_with_metadata['_score_duration'] = score_duration + flattened.append(row_with_metadata) + return flattened + +def count_items(data_dict: Dict[str, Dict[bool, Dict[Union[float, int], List[Dict]]]]) -> int: + """ + Count total number of items in the nested data structure. 
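`get_hf_data` buckets rows by `run_mode`, then `run_passed`, then a rounded score (passing leaderboard runs) or a rounded duration (everything else), so deduplication only ever compares rows that share a bucket. A small sketch of that grouping with made-up rows:

```python
from collections import defaultdict

rows = [
    {"run_mode": "leaderboard", "run_passed": True,  "run_score": 0.91237, "run_meta": {"duration": 1.7}},
    {"run_mode": "leaderboard", "run_passed": True,  "run_score": 0.91241, "run_meta": {"duration": 2.4}},
    {"run_mode": "test",        "run_passed": False, "run_score": 0.0,     "run_meta": {"duration": 3.2}},
]

grouped = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
for row in rows:
    if row["run_mode"] == "leaderboard" and row["run_passed"]:
        key = round(float(row["run_score"]), 4)              # score bucket
    else:
        key = round(float(row["run_meta"]["duration"]), 0)   # duration bucket
    grouped[row["run_mode"]][row["run_passed"]][key].append(row)

# Both leaderboard rows round to the same score bucket, so they compete during dedup.
print(list(grouped["leaderboard"][True].keys()))  # [0.9124]
```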
+ + Args: + data_dict: Nested dictionary structure from get_sorted_hf_data + + Returns: + Total number of items + """ + total = 0 + for run_mode in data_dict.values(): + for run_success_dict in run_mode.values(): + for rows in run_success_dict.values(): + total += len(rows) + return total + + +def example_usage(): + """ + Example of how to use the deduplication functions with get_hf_data output. + """ + # Load the data + data = get_hf_data() + + print(f"Original data has {count_items(data)} total items") + + # Remove exact duplicates + deduplicated_data = remove_duplicates(data) + print(f"After exact deduplication: {count_items(deduplicated_data)} items") + + # Apply fuzzy deduplication + fuzzy_deduplicated_data = fuzzy_filter( + deduplicated_data, + threshold=0.8, # High threshold for more strict deduplication + ngram_size=5, + bands=16, + rows_per_band=128 + ) + # convert to df + flattened_data = flatten_data(fuzzy_deduplicated_data) + df = pd.DataFrame(flattened_data) + + return df + +def dedup_df(df: pd.DataFrame) -> pd.DataFrame: + """ + Deduplicate a pandas DataFrame. + + Args: + df: pandas DataFrame + """ + # convert to dict + data_dict = convert_df_to_dict(df) + # deduplicate + deduplicated_data = fuzzy_filter(data_dict, threshold=0.8, ngram_size=5, bands=16, rows_per_band=128) + # convert to df + flattened_data = flatten_data(deduplicated_data) + df = pd.DataFrame(flattened_data) + return df + +def create_parquet_file(data_dict: Dict[str, Dict[Union[float, int], List[Dict]]], filename: str): + """ + Create a Parquet file from the nested data structure. + + Args: + data_dict: Nested dictionary structure from get_sorted_hf_data + filename: Name of the output Parquet file + """ + # Flatten the data + flattened_data = flatten_data(data_dict) + + # Create a pandas DataFrame from the flattened data + df = pd.DataFrame(flattened_data) + # Convert the DataFrame to a Parquet file + df.to_parquet(filename, index=False) + + + +def main(): + example_usage() + +if __name__ == "__main__": + main() diff --git a/export.py b/export.py index 6d3078f..d7cffdf 100644 --- a/export.py +++ b/export.py @@ -6,6 +6,7 @@ from sqlalchemy import create_engine, text import pyarrow as pa import pyarrow.parquet as pq +from dedup import dedup_df load_dotenv() @@ -304,6 +305,36 @@ def main(output_dir): os.path.join(output_dir, "successful_submissions.parquet") ) + # Apply deduplication to submissions + print("Applying deduplication to submissions...") + submissions_parquet_path = os.path.join(output_dir, "submissions.parquet") + try: + submissions_df = pd.read_parquet(submissions_parquet_path) + original_count = len(submissions_df) + + deduplicated_submissions_df = dedup_df(submissions_df.copy()) + deduplicated_submissions_path = os.path.join(output_dir, "deduplicated_submissions.parquet") + deduplicated_submissions_df.to_parquet(deduplicated_submissions_path, index=False) + + print(f"Deduplicated submissions saved to {deduplicated_submissions_path}") + print(f"Original submissions: {original_count}, After deduplication: {len(deduplicated_submissions_df)}") + + # Create deduplicated successful submissions + if 'run_passed' in deduplicated_submissions_df.columns: + print("Creating deduplicated successful submissions...") + deduplicated_successful_df = deduplicated_submissions_df[deduplicated_submissions_df['run_passed'] == True].copy() + deduplicated_successful_path = os.path.join(output_dir, "deduplicated_successful_submissions.parquet") + deduplicated_successful_df.to_parquet(deduplicated_successful_path, 
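`dedup_df` is the entry point export.py calls. A hedged usage sketch on a two-row frame containing only the columns the function touches; note that the returned frame also carries the `_run_mode`, `_run_success`, and `_score_duration` metadata columns added by `flatten_data`:

```python
import pandas as pd
from dedup import dedup_df

# Two rows with identical code in the same (run_mode, run_passed, duration) bucket.
df = pd.DataFrame([
    {"submission_id": 1, "code": "print('hi')", "run_mode": "test",
     "run_passed": True, "run_score": 0.0, "run_meta": {"duration": 1.0}},
    {"submission_id": 2, "code": "print('hi')", "run_mode": "test",
     "run_passed": True, "run_score": 0.0, "run_meta": {"duration": 1.0}},
])

deduped = dedup_df(df)
print(len(df), "->", len(deduped))  # expect 2 -> 1
```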
index=False) + + successful_parquet_path = os.path.join(output_dir, "successful_submissions.parquet") + successful_df = pd.read_parquet(successful_parquet_path) + print(f"Deduplicated successful submissions saved to {deduplicated_successful_path}") + print(f"Original successful: {len(successful_df)}, After deduplication: {len(deduplicated_successful_df)}") + + except Exception as e: + print(f"Warning: Deduplication failed with error: {e}") + print("Proceeding without deduplication...") + if __name__ == "__main__": parser = argparse.ArgumentParser(description="Export leaderboard data to a Hugging Face dataset.") diff --git a/test_dedup.py b/test_dedup.py new file mode 100644 index 0000000..f5ea811 --- /dev/null +++ b/test_dedup.py @@ -0,0 +1,402 @@ +#!/usr/bin/env python3 +""" +Unit tests for the deduplication pipeline. +Tests the end-to-end flow with fake data matching the database schema. +""" + +import unittest +import pandas as pd +import numpy as np +from datetime import datetime, timezone +import random +from typing import Dict, List, Any +import tempfile +import os +import sys + +# Import the functions we want to test +try: + from dedup import ( + remove_duplicates, + fuzzy_filter, + convert_df_to_dict, + flatten_data, + dedup_df, + count_items, + create_parquet_file + ) +except ImportError as e: + print(f"Import error: {e}") + print("Some functions may not be available for testing") + + +class TestDedupEndToEnd(unittest.TestCase): + + def setUp(self): + """Set up test fixtures with fake data matching the schema.""" + random.seed(42) # For reproducible tests + np.random.seed(42) + self.fake_data = self.create_fake_dataset(50) + self.df = pd.DataFrame(self.fake_data) + + def create_fake_dataset(self, num_entries: int) -> List[Dict[str, Any]]: + """Create a fake dataset with the required schema fields.""" + fake_data = [] + + # Sample code snippets (some duplicates for testing) + code_samples = [ + "def hello_world():\n print('Hello World')", + "import numpy as np\nx = np.array([1, 2, 3])", + "for i in range(10):\n print(i)", + "class MyClass:\n def __init__(self):\n pass", + "def fibonacci(n):\n if n <= 1:\n return n\n return fibonacci(n-1) + fibonacci(n-2)", + "import pandas as pd\ndf = pd.DataFrame({'a': [1, 2, 3]})", + "def quicksort(arr):\n if len(arr) <= 1:\n return arr", + "x = [1, 2, 3, 4, 5]\ny = [i**2 for i in x]", + "try:\n result = 10 / 0\nexcept ZeroDivisionError:\n print('Error')", + "def hello_world():\n print('Hello World')", # Exact duplicate + ] + + run_modes = ['leaderboard', 'benchmark', 'test'] + file_names = ['solution.py', 'main.py', 'algorithm.py', 'test.py'] + + for i in range(num_entries): + # Create base timestamp + base_time = datetime(2024, 1, 1, tzinfo=timezone.utc) + submission_time = base_time.replace( + day=random.randint(1, 28), + hour=random.randint(0, 23), + minute=random.randint(0, 59) + ) + + # Select code (with some duplicates) + code = random.choice(code_samples) + if i < 5: # First 5 entries use the same code for exact duplicate testing + code = code_samples[0] + elif i < 10: # Next 5 use slightly modified versions for fuzzy testing + code = code_samples[0] + f"\n# Comment {i}" + + run_mode = random.choice(run_modes) + run_passed = random.choice([True, False]) + + # Generate run score based on mode and success + if run_mode == 'leaderboard' and run_passed: + run_score = round(random.uniform(0.1, 1.0), 4) + else: + run_score = 0.0 if not run_passed else round(random.uniform(0.1, 0.8), 4) + + # Create the entry matching the database schema + entry 
= { + 'submission_id': i + 1000, + 'leaderboard_id': random.randint(1, 10), + 'user_id': random.randint(100, 999), + 'submission_time': submission_time, + 'file_name': random.choice(file_names), + 'code': code, + 'code_id': i + 2000, + 'run_id': i + 3000, + 'run_start_time': submission_time, + 'run_end_time': submission_time.replace( + second=random.randint(1, 59) + ), + 'run_mode': run_mode, + 'run_score': run_score, + 'run_passed': run_passed, + 'run_result': { + 'benchmark-count': random.randint(1, 10), + 'benchmark.0.best': f'benchmark_{random.randint(1, 100)}.txt', + 'benchmark.0.err': '', + 'benchmark.0.mean': round(random.uniform(0.1, 2.0), 6), + 'benchmark.0.report': f'report_{i}.json' + }, + 'run_compilation': { + 'command': 'python', + 'exit_code': 0 if run_passed else random.randint(1, 255), + 'nvcc_found': random.choice([True, False]), + 'nvcc_version': f'11.{random.randint(0, 8)}', + 'stderr': '' if run_passed else f'Error message {i}', + 'stdout': f'Output {i}', + 'success': run_passed + }, + 'run_meta': { + 'command': 'python solution.py', + 'duration': round(random.uniform(0.1, 10.0), 3), + 'exit_code': 0 if run_passed else random.randint(1, 255), + 'stderr': '' if run_passed else f'Runtime error {i}', + 'stdout': f'Runtime output {i}', + 'success': run_passed + }, + 'run_system_info': { + 'cpu': f'Intel Core i{random.randint(5, 9)}', + 'gpu': random.choice(['NVIDIA RTX 3080', 'NVIDIA RTX 4090', 'None']), + 'platform': random.choice(['linux', 'darwin', 'win32']), + 'torch': f'2.{random.randint(0, 3)}.{random.randint(0, 9)}' + } + } + fake_data.append(entry) + + return fake_data + + def test_dataframe_creation(self): + """Test that the fake dataset creates a valid DataFrame.""" + self.assertEqual(len(self.df), 50) + + # Check required columns exist (matching the schema in the image) + required_columns = [ + 'submission_id', 'leaderboard_id', 'user_id', 'submission_time', + 'file_name', 'code', 'code_id', 'run_id', 'run_start_time', + 'run_end_time', 'run_mode', 'run_score', 'run_passed', + 'run_result', 'run_compilation', 'run_meta', 'run_system_info' + ] + + for col in required_columns: + self.assertIn(col, self.df.columns, f"Missing required column: {col}") + + # Check data types + self.assertTrue(self.df['submission_id'].dtype in ['int64', 'int32']) + self.assertTrue(self.df['run_passed'].dtype == 'bool') + self.assertTrue(self.df['run_score'].dtype in ['float64', 'float32']) + + # Verify struct fields exist + sample_row = self.df.iloc[0] + self.assertIsInstance(sample_row['run_result'], dict) + self.assertIsInstance(sample_row['run_compilation'], dict) + self.assertIsInstance(sample_row['run_meta'], dict) + self.assertIsInstance(sample_row['run_system_info'], dict) + + def test_convert_df_to_dict(self): + """Test conversion from DataFrame to nested dictionary structure.""" + try: + data_dict = convert_df_to_dict(self.df) + + # Check structure + self.assertIsInstance(data_dict, dict) + + # Should have run_mode keys + run_modes = set(self.df['run_mode'].unique()) + self.assertEqual(set(data_dict.keys()), run_modes) + + # Check nested structure + for run_mode in data_dict: + self.assertIsInstance(data_dict[run_mode], dict) + for run_success in data_dict[run_mode]: + self.assertIsInstance(data_dict[run_mode][run_success], dict) + for score_duration in data_dict[run_mode][run_success]: + self.assertIsInstance( + data_dict[run_mode][run_success][score_duration], + list + ) + except NameError: + self.skipTest("convert_df_to_dict function not available") + + def 
test_exact_deduplication(self): + """Test exact duplicate removal.""" + try: + data_dict = convert_df_to_dict(self.df) + original_count = count_items(data_dict) + + deduplicated_data = remove_duplicates(data_dict) + deduplicated_count = count_items(deduplicated_data) + + # Should have fewer or equal items after deduplication + self.assertLessEqual(deduplicated_count, original_count) + + # Structure should be preserved + self.assertEqual(set(data_dict.keys()), set(deduplicated_data.keys())) + + except NameError as e: + self.skipTest(f"Required functions not available: {e}") + + def test_fuzzy_deduplication_small(self): + """Test fuzzy duplicate removal with small threshold for faster testing.""" + try: + data_dict = convert_df_to_dict(self.df) + original_count = count_items(data_dict) + + # Use small parameters for faster testing + fuzzy_deduplicated_data = fuzzy_filter( + data_dict, + threshold=0.5, # Lower threshold for faster testing + ngram_size=3, # Smaller ngram size + bands=4, # Fewer bands + rows_per_band=32 # Fewer rows per band + ) + + fuzzy_count = count_items(fuzzy_deduplicated_data) + + # Should have fewer or equal items after fuzzy deduplication + self.assertLessEqual(fuzzy_count, original_count) + + # Structure should be preserved + self.assertEqual(set(data_dict.keys()), set(fuzzy_deduplicated_data.keys())) + + except NameError as e: + self.skipTest(f"Required functions not available: {e}") + + def test_flatten_and_reconstruct(self): + """Test flattening and reconstruction of data.""" + try: + data_dict = convert_df_to_dict(self.df) + original_count = count_items(data_dict) + + # Flatten + flattened_data = flatten_data(data_dict) + self.assertEqual(len(flattened_data), original_count) + + # Check metadata fields were added + if flattened_data: + sample_row = flattened_data[0] + self.assertIn('_run_mode', sample_row) + self.assertIn('_run_success', sample_row) + self.assertIn('_score_duration', sample_row) + + except NameError as e: + self.skipTest(f"Required functions not available: {e}") + + def test_dedup_df_end_to_end(self): + """Test the complete deduplication pipeline.""" + try: + original_length = len(self.df) + + # Run the complete deduplication pipeline + deduplicated_df = dedup_df(self.df) + + # Should return a DataFrame + self.assertIsInstance(deduplicated_df, pd.DataFrame) + + # Should have fewer or equal rows + self.assertLessEqual(len(deduplicated_df), original_length) + + # Should preserve required columns + required_columns = ['submission_id', 'code', 'run_mode', 'run_passed'] + for col in required_columns: + self.assertIn(col, deduplicated_df.columns) + + # Check data integrity + self.assertFalse(deduplicated_df.empty, "Deduplicated DataFrame should not be empty") + + except NameError as e: + self.skipTest(f"dedup_df function not available: {e}") + + def test_parquet_creation(self): + """Test Parquet file creation.""" + try: + data_dict = convert_df_to_dict(self.df) + + with tempfile.NamedTemporaryFile(suffix='.parquet', delete=False) as tmp_file: + try: + create_parquet_file(data_dict, tmp_file.name) + + # Check file was created + self.assertTrue(os.path.exists(tmp_file.name)) + + # Check file is not empty + self.assertGreater(os.path.getsize(tmp_file.name), 0) + + # Try to read the file back + df_from_parquet = pd.read_parquet(tmp_file.name) + self.assertIsInstance(df_from_parquet, pd.DataFrame) + self.assertGreater(len(df_from_parquet), 0) + + finally: + # Clean up + if os.path.exists(tmp_file.name): + os.unlink(tmp_file.name) + + except NameError as e: + 
self.skipTest(f"Required functions not available: {e}") + + def test_data_consistency_after_deduplication(self): + """Test that data remains consistent after deduplication.""" + try: + # Create dataset with known duplicates + duplicate_data = [] + + # Add the same code 3 times with different metadata + base_entry = self.fake_data[0].copy() + for i in range(3): + entry = base_entry.copy() + entry['submission_id'] = 9000 + i + entry['run_id'] = 9100 + i + duplicate_data.append(entry) + + # Add to main dataset + test_data = self.fake_data + duplicate_data + test_df = pd.DataFrame(test_data) + + original_length = len(test_df) + deduplicated_df = dedup_df(test_df) + + # Should have removed at least 2 duplicates + self.assertLess(len(deduplicated_df), original_length) + + # Check that essential fields are preserved + self.assertTrue(all(col in deduplicated_df.columns for col in + ['submission_id', 'code', 'run_mode', 'run_passed'])) + + except NameError as e: + self.skipTest(f"Required functions not available: {e}") + + def test_schema_compliance(self): + """Test that the fake dataset matches the expected schema from the database.""" + # Test all required fields exist and have correct types + + # Test BIGINT fields + bigint_fields = ['submission_id', 'leaderboard_id', 'user_id', 'code_id', 'run_id'] + for field in bigint_fields: + self.assertTrue(self.df[field].dtype in ['int64', 'int32'], + f"{field} should be integer type") + + # Test VARCHAR fields + varchar_fields = ['file_name', 'code', 'run_mode'] + for field in varchar_fields: + self.assertTrue(self.df[field].dtype == 'object', + f"{field} should be string/object type") + + # Test TIMESTAMP fields + timestamp_fields = ['submission_time', 'run_start_time', 'run_end_time'] + for field in timestamp_fields: + # Check that all values are datetime objects with timezone + sample_value = self.df[field].iloc[0] + self.assertIsInstance(sample_value, datetime) + self.assertIsNotNone(sample_value.tzinfo) + + # Test DOUBLE field + self.assertTrue(self.df['run_score'].dtype in ['float64', 'float32']) + + # Test BOOLEAN field + self.assertTrue(self.df['run_passed'].dtype == 'bool') + + # Test STRUCT fields + struct_fields = ['run_result', 'run_compilation', 'run_meta', 'run_system_info'] + for field in struct_fields: + # All values should be dictionaries + self.assertTrue(all(isinstance(val, dict) for val in self.df[field])) + + def test_duplicate_detection(self): + """Test that we can detect exact and near duplicates in the dataset.""" + # Count exact duplicates by code + code_counts = self.df['code'].value_counts() + exact_duplicates = code_counts[code_counts > 1] + + # Should have some exact duplicates (first 5 entries) + self.assertGreater(len(exact_duplicates), 0, "Should have exact duplicates for testing") + + # Check that fuzzy duplicates exist (entries with similar code) + similar_code_count = 0 + base_code = "def hello_world():\n print('Hello World')" + for code in self.df['code']: + if base_code in code and code != base_code: + similar_code_count += 1 + + self.assertGreater(similar_code_count, 0, "Should have fuzzy duplicates for testing") + + +if __name__ == '__main__': + # Add some helpful output + print("Running deduplication pipeline tests...") + print(f"Python version: {sys.version}") + print(f"Pandas version: {pd.__version__}") + + # Run the tests + unittest.main(verbosity=2) \ No newline at end of file From 216d47a05f1a23eec88dfd8394d617dd0d838818 Mon Sep 17 00:00:00 2001 From: Sahan Paliskara Date: Mon, 23 Jun 2025 17:10:24 -0400 
Subject: [PATCH 6/9] magic number removal --- dedup.py | 118 ++++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 100 insertions(+), 18 deletions(-) diff --git a/dedup.py b/dedup.py index b316e38..8f209a6 100644 --- a/dedup.py +++ b/dedup.py @@ -9,6 +9,82 @@ import datasketch import pandas as pd +# ============================================================================= +# DEDUPLICATION CONFIGURATION CONSTANTS +# ============================================================================= + +# Fuzzy Deduplication Parameters +FUZZY_SIMILARITY_THRESHOLD = 0.8 +""" +Jaccard similarity threshold for considering two documents as duplicates. +Range: 0.0 to 1.0 +- 0.8 = High threshold, only very similar documents are considered duplicates +- 0.7 = Medium threshold, moderately similar documents are duplicates +- 0.5 = Low threshold, loosely similar documents are duplicates +Higher values = more strict deduplication, fewer items removed +""" + +NGRAM_SIZE = 5 +""" +Size of character n-grams used for MinHash fingerprinting. +- Smaller values (3-4): More sensitive to small changes, better for short text +- Larger values (5-7): Less sensitive to minor variations, better for longer text +- Too small: May create false positives (different texts seem similar) +- Too large: May miss actual duplicates with small variations +""" + +LSH_BANDS = 16 +""" +Number of bands for Locality Sensitive Hashing (LSH). +Used to speed up similarity detection by grouping similar hashes. +- More bands = faster but less accurate similarity detection +- Fewer bands = slower but more accurate similarity detection +Must divide evenly into ROWS_PER_BAND * LSH_BANDS = total permutations +""" + +ROWS_PER_BAND = 128 +""" +Number of rows per band in LSH configuration. +Total MinHash permutations = ROWS_PER_BAND * LSH_BANDS +- More rows per band = higher precision, may miss some similar pairs +- Fewer rows per band = higher recall, may include more false positives +Default: 128 rows × 16 bands = 2048 total permutations +""" + +# Score Processing Parameters +LEADERBOARD_SCORE_PRECISION = 4 +""" +Number of decimal places to round leaderboard scores when grouping submissions. +Used to group submissions with very similar scores together. +- Higher precision (more decimal places): More granular grouping +- Lower precision (fewer decimal places): Broader grouping of similar scores +""" + +DURATION_PRECISION = 0 +""" +Number of decimal places to round execution duration (in seconds). +Used to group submissions with similar execution times. 
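`FUZZY_SIMILARITY_THRESHOLD` is a Jaccard-similarity cutoff over character n-gram sets, and MinHash/LSH only estimates that value. A quick way to see what the number means for two hypothetical snippets:

```python
def ngram_set(text, n=5):
    text = text.lower()
    return {text[i:i + n] for i in range(len(text) - n + 1)}

a = "def hello_world():\n    print('Hello World')"
b = a + "\n# trailing comment"

sa, sb = ngram_set(a), ngram_set(b)
jaccard = len(sa & sb) / len(sa | sb)
print(round(jaccard, 3))  # compare against FUZZY_SIMILARITY_THRESHOLD; MinHash + LSH approximate this
```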
+- 0: Round to nearest second (1.7s → 2s) +- 1: Round to nearest 0.1s (1.73s → 1.7s) +""" + +# ============================================================================= +# CONFIGURATION SUMMARY +# ============================================================================= +""" +Current deduplication configuration: +├─ Similarity Detection: 0.8 threshold (strict) +├─ Text Fingerprinting: 5-character n-grams +├─ LSH Performance: 16 bands × 128 rows = 2048 permutations +├─ Score Grouping: 4 decimal places for leaderboard scores +└─ Duration Grouping: 0 decimal places for execution times + +To adjust deduplication sensitivity: +- Increase FUZZY_SIMILARITY_THRESHOLD (0.8→0.9) for stricter deduplication +- Decrease FUZZY_SIMILARITY_THRESHOLD (0.8→0.7) for more aggressive deduplication +- Adjust NGRAM_SIZE for different text lengths (3-4 for short, 5-7 for long) +""" + def remove_duplicates(data_dict: Dict[str, Dict[bool, Dict[Union[float, int], List[Dict]]]]): """ Remove exact duplicates from the nested data structure returned by get_sorted_hf_data. @@ -60,9 +136,9 @@ def remove_duplicates(data_dict: Dict[str, Dict[bool, Dict[Union[float, int], Li def create_minhashes( documents: List[Dict[str, str]], - ngram_size: int = 5, - bands: int = 20, - rows_per_band: int = 128, + ngram_size: int = NGRAM_SIZE, + bands: int = LSH_BANDS, + rows_per_band: int = ROWS_PER_BAND, ) -> Tuple[Dict[str, datasketch.MinHash], int]: """ Create MinHash signatures for a list of documents with LSH bands configuration. @@ -155,10 +231,10 @@ def filter_matrix( def fuzzy_filter( data_dict: Dict[str, Dict[bool, Dict[Union[float, int], List[Dict]]]], - threshold: float = 0.7, - ngram_size: int = 5, - bands: int = 16, - rows_per_band: int = 128, + threshold: float = FUZZY_SIMILARITY_THRESHOLD, + ngram_size: int = NGRAM_SIZE, + bands: int = LSH_BANDS, + rows_per_band: int = ROWS_PER_BAND, ) -> Dict[str, Dict[bool, Dict[Union[float, int], List[Dict]]]]: total_categories = 0 @@ -181,10 +257,10 @@ def fuzzy_filter( def _fuzzy_filter( data_list: List[Dict], - threshold: float = 0.7, - ngram_size: int = 5, - bands: int = 16, - rows_per_band: int = 128, + threshold: float = FUZZY_SIMILARITY_THRESHOLD, + ngram_size: int = NGRAM_SIZE, + bands: int = LSH_BANDS, + rows_per_band: int = ROWS_PER_BAND, ) -> List[Dict]: """ Apply fuzzy deduplication to the nested data structure returned by get_sorted_hf_data. 
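The two precision constants translate directly into `round()` calls when bucket keys are built; a short worked example:

```python
LEADERBOARD_SCORE_PRECISION = 4
DURATION_PRECISION = 0

print(round(0.91237, LEADERBOARD_SCORE_PRECISION))  # 0.9124 -> score bucket for passing leaderboard runs
print(round(1.73, DURATION_PRECISION))              # 2.0    -> duration bucket for everything else
```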
@@ -263,10 +339,10 @@ def get_hf_data() -> Dict[str, Dict[Union[float, int], List[Dict]]]: for run_success, rows in tqdm.tqdm(mode_dict.items(), desc=f"Processing {run_mode}", leave=False): for row in tqdm.tqdm(rows, desc=f"Processing {run_mode} {run_success} rows", leave=False): if run_mode == 'leaderboard' and run_success == True: - rounded_score = round(float(row['run_score']), 4) + rounded_score = round(float(row['run_score']), LEADERBOARD_SCORE_PRECISION) run_duration_dict[run_mode][run_success][rounded_score].append(row) else: - rounded_duration = round(float(row['run_meta']['duration']), 0) + rounded_duration = round(float(row['run_meta']['duration']), DURATION_PRECISION) run_duration_dict[run_mode][run_success][rounded_duration].append(row) return run_duration_dict @@ -346,10 +422,10 @@ def example_usage(): # Apply fuzzy deduplication fuzzy_deduplicated_data = fuzzy_filter( deduplicated_data, - threshold=0.8, # High threshold for more strict deduplication - ngram_size=5, - bands=16, - rows_per_band=128 + threshold=FUZZY_SIMILARITY_THRESHOLD, + ngram_size=NGRAM_SIZE, + bands=LSH_BANDS, + rows_per_band=ROWS_PER_BAND ) # convert to df flattened_data = flatten_data(fuzzy_deduplicated_data) @@ -367,7 +443,13 @@ def dedup_df(df: pd.DataFrame) -> pd.DataFrame: # convert to dict data_dict = convert_df_to_dict(df) # deduplicate - deduplicated_data = fuzzy_filter(data_dict, threshold=0.8, ngram_size=5, bands=16, rows_per_band=128) + deduplicated_data = fuzzy_filter( + data_dict, + threshold=FUZZY_SIMILARITY_THRESHOLD, + ngram_size=NGRAM_SIZE, + bands=LSH_BANDS, + rows_per_band=ROWS_PER_BAND + ) # convert to df flattened_data = flatten_data(deduplicated_data) df = pd.DataFrame(flattened_data) From 89573b4b80107071d752612242951b02d8e571f2 Mon Sep 17 00:00:00 2001 From: Sahan Paliskara Date: Tue, 24 Jun 2025 08:43:42 -0400 Subject: [PATCH 7/9] Add deduplicated datasets --- dedup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dedup.py b/dedup.py index 8f209a6..f6f3449 100644 --- a/dedup.py +++ b/dedup.py @@ -118,9 +118,9 @@ def remove_duplicates(data_dict: Dict[str, Dict[bool, Dict[Union[float, int], Li # If duplicate found, keep the one with better metrics existing_row = unique_entries[content_hash] - # For leaderboard mode with successful runs, prefer higher scores + # For leaderboard mode with successful runs, prefer lower scores / faster times if run_mode == 'leaderboard' and row.get('run_passed') == True: - if row.get('run_score', 0) > existing_row.get('run_score', 0): + if row.get('run_score', 0) < existing_row.get('run_score', 0): unique_entries[content_hash] = row # For other cases, prefer shorter duration (faster execution) else: From d32c79ca2855ea2d27af77e80f326db33a2d69e6 Mon Sep 17 00:00:00 2001 From: Sahan Paliskara Date: Fri, 28 Nov 2025 13:55:46 -0500 Subject: [PATCH 8/9] remove test --- test_dedup.py | 402 -------------------------------------------------- 1 file changed, 402 deletions(-) delete mode 100644 test_dedup.py diff --git a/test_dedup.py b/test_dedup.py deleted file mode 100644 index f5ea811..0000000 --- a/test_dedup.py +++ /dev/null @@ -1,402 +0,0 @@ -#!/usr/bin/env python3 -""" -Unit tests for the deduplication pipeline. -Tests the end-to-end flow with fake data matching the database schema. 
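The PATCH 7/9 hunk above flips the leaderboard comparison so that, among exact duplicates, the lower score (i.e. the faster time) survives. A condensed restatement of the tie-break with made-up rows:

```python
def keep_better(existing, candidate, run_mode):
    # Condensed tie-break: lower score wins for passing leaderboard runs,
    # shorter duration wins in every other bucket.
    if run_mode == "leaderboard" and candidate.get("run_passed"):
        better = candidate.get("run_score", 0) < existing.get("run_score", 0)
    else:
        better = (candidate.get("run_meta", {}).get("duration", float("inf"))
                  < existing.get("run_meta", {}).get("duration", float("inf")))
    return candidate if better else existing

winner = keep_better({"run_passed": True, "run_score": 1.84},
                     {"run_passed": True, "run_score": 1.52},
                     "leaderboard")
print(winner["run_score"])  # 1.52 -- the faster submission survives
```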
-""" - -import unittest -import pandas as pd -import numpy as np -from datetime import datetime, timezone -import random -from typing import Dict, List, Any -import tempfile -import os -import sys - -# Import the functions we want to test -try: - from dedup import ( - remove_duplicates, - fuzzy_filter, - convert_df_to_dict, - flatten_data, - dedup_df, - count_items, - create_parquet_file - ) -except ImportError as e: - print(f"Import error: {e}") - print("Some functions may not be available for testing") - - -class TestDedupEndToEnd(unittest.TestCase): - - def setUp(self): - """Set up test fixtures with fake data matching the schema.""" - random.seed(42) # For reproducible tests - np.random.seed(42) - self.fake_data = self.create_fake_dataset(50) - self.df = pd.DataFrame(self.fake_data) - - def create_fake_dataset(self, num_entries: int) -> List[Dict[str, Any]]: - """Create a fake dataset with the required schema fields.""" - fake_data = [] - - # Sample code snippets (some duplicates for testing) - code_samples = [ - "def hello_world():\n print('Hello World')", - "import numpy as np\nx = np.array([1, 2, 3])", - "for i in range(10):\n print(i)", - "class MyClass:\n def __init__(self):\n pass", - "def fibonacci(n):\n if n <= 1:\n return n\n return fibonacci(n-1) + fibonacci(n-2)", - "import pandas as pd\ndf = pd.DataFrame({'a': [1, 2, 3]})", - "def quicksort(arr):\n if len(arr) <= 1:\n return arr", - "x = [1, 2, 3, 4, 5]\ny = [i**2 for i in x]", - "try:\n result = 10 / 0\nexcept ZeroDivisionError:\n print('Error')", - "def hello_world():\n print('Hello World')", # Exact duplicate - ] - - run_modes = ['leaderboard', 'benchmark', 'test'] - file_names = ['solution.py', 'main.py', 'algorithm.py', 'test.py'] - - for i in range(num_entries): - # Create base timestamp - base_time = datetime(2024, 1, 1, tzinfo=timezone.utc) - submission_time = base_time.replace( - day=random.randint(1, 28), - hour=random.randint(0, 23), - minute=random.randint(0, 59) - ) - - # Select code (with some duplicates) - code = random.choice(code_samples) - if i < 5: # First 5 entries use the same code for exact duplicate testing - code = code_samples[0] - elif i < 10: # Next 5 use slightly modified versions for fuzzy testing - code = code_samples[0] + f"\n# Comment {i}" - - run_mode = random.choice(run_modes) - run_passed = random.choice([True, False]) - - # Generate run score based on mode and success - if run_mode == 'leaderboard' and run_passed: - run_score = round(random.uniform(0.1, 1.0), 4) - else: - run_score = 0.0 if not run_passed else round(random.uniform(0.1, 0.8), 4) - - # Create the entry matching the database schema - entry = { - 'submission_id': i + 1000, - 'leaderboard_id': random.randint(1, 10), - 'user_id': random.randint(100, 999), - 'submission_time': submission_time, - 'file_name': random.choice(file_names), - 'code': code, - 'code_id': i + 2000, - 'run_id': i + 3000, - 'run_start_time': submission_time, - 'run_end_time': submission_time.replace( - second=random.randint(1, 59) - ), - 'run_mode': run_mode, - 'run_score': run_score, - 'run_passed': run_passed, - 'run_result': { - 'benchmark-count': random.randint(1, 10), - 'benchmark.0.best': f'benchmark_{random.randint(1, 100)}.txt', - 'benchmark.0.err': '', - 'benchmark.0.mean': round(random.uniform(0.1, 2.0), 6), - 'benchmark.0.report': f'report_{i}.json' - }, - 'run_compilation': { - 'command': 'python', - 'exit_code': 0 if run_passed else random.randint(1, 255), - 'nvcc_found': random.choice([True, False]), - 'nvcc_version': f'11.{random.randint(0, 
8)}', - 'stderr': '' if run_passed else f'Error message {i}', - 'stdout': f'Output {i}', - 'success': run_passed - }, - 'run_meta': { - 'command': 'python solution.py', - 'duration': round(random.uniform(0.1, 10.0), 3), - 'exit_code': 0 if run_passed else random.randint(1, 255), - 'stderr': '' if run_passed else f'Runtime error {i}', - 'stdout': f'Runtime output {i}', - 'success': run_passed - }, - 'run_system_info': { - 'cpu': f'Intel Core i{random.randint(5, 9)}', - 'gpu': random.choice(['NVIDIA RTX 3080', 'NVIDIA RTX 4090', 'None']), - 'platform': random.choice(['linux', 'darwin', 'win32']), - 'torch': f'2.{random.randint(0, 3)}.{random.randint(0, 9)}' - } - } - fake_data.append(entry) - - return fake_data - - def test_dataframe_creation(self): - """Test that the fake dataset creates a valid DataFrame.""" - self.assertEqual(len(self.df), 50) - - # Check required columns exist (matching the schema in the image) - required_columns = [ - 'submission_id', 'leaderboard_id', 'user_id', 'submission_time', - 'file_name', 'code', 'code_id', 'run_id', 'run_start_time', - 'run_end_time', 'run_mode', 'run_score', 'run_passed', - 'run_result', 'run_compilation', 'run_meta', 'run_system_info' - ] - - for col in required_columns: - self.assertIn(col, self.df.columns, f"Missing required column: {col}") - - # Check data types - self.assertTrue(self.df['submission_id'].dtype in ['int64', 'int32']) - self.assertTrue(self.df['run_passed'].dtype == 'bool') - self.assertTrue(self.df['run_score'].dtype in ['float64', 'float32']) - - # Verify struct fields exist - sample_row = self.df.iloc[0] - self.assertIsInstance(sample_row['run_result'], dict) - self.assertIsInstance(sample_row['run_compilation'], dict) - self.assertIsInstance(sample_row['run_meta'], dict) - self.assertIsInstance(sample_row['run_system_info'], dict) - - def test_convert_df_to_dict(self): - """Test conversion from DataFrame to nested dictionary structure.""" - try: - data_dict = convert_df_to_dict(self.df) - - # Check structure - self.assertIsInstance(data_dict, dict) - - # Should have run_mode keys - run_modes = set(self.df['run_mode'].unique()) - self.assertEqual(set(data_dict.keys()), run_modes) - - # Check nested structure - for run_mode in data_dict: - self.assertIsInstance(data_dict[run_mode], dict) - for run_success in data_dict[run_mode]: - self.assertIsInstance(data_dict[run_mode][run_success], dict) - for score_duration in data_dict[run_mode][run_success]: - self.assertIsInstance( - data_dict[run_mode][run_success][score_duration], - list - ) - except NameError: - self.skipTest("convert_df_to_dict function not available") - - def test_exact_deduplication(self): - """Test exact duplicate removal.""" - try: - data_dict = convert_df_to_dict(self.df) - original_count = count_items(data_dict) - - deduplicated_data = remove_duplicates(data_dict) - deduplicated_count = count_items(deduplicated_data) - - # Should have fewer or equal items after deduplication - self.assertLessEqual(deduplicated_count, original_count) - - # Structure should be preserved - self.assertEqual(set(data_dict.keys()), set(deduplicated_data.keys())) - - except NameError as e: - self.skipTest(f"Required functions not available: {e}") - - def test_fuzzy_deduplication_small(self): - """Test fuzzy duplicate removal with small threshold for faster testing.""" - try: - data_dict = convert_df_to_dict(self.df) - original_count = count_items(data_dict) - - # Use small parameters for faster testing - fuzzy_deduplicated_data = fuzzy_filter( - data_dict, - threshold=0.5, # 
Lower threshold for faster testing - ngram_size=3, # Smaller ngram size - bands=4, # Fewer bands - rows_per_band=32 # Fewer rows per band - ) - - fuzzy_count = count_items(fuzzy_deduplicated_data) - - # Should have fewer or equal items after fuzzy deduplication - self.assertLessEqual(fuzzy_count, original_count) - - # Structure should be preserved - self.assertEqual(set(data_dict.keys()), set(fuzzy_deduplicated_data.keys())) - - except NameError as e: - self.skipTest(f"Required functions not available: {e}") - - def test_flatten_and_reconstruct(self): - """Test flattening and reconstruction of data.""" - try: - data_dict = convert_df_to_dict(self.df) - original_count = count_items(data_dict) - - # Flatten - flattened_data = flatten_data(data_dict) - self.assertEqual(len(flattened_data), original_count) - - # Check metadata fields were added - if flattened_data: - sample_row = flattened_data[0] - self.assertIn('_run_mode', sample_row) - self.assertIn('_run_success', sample_row) - self.assertIn('_score_duration', sample_row) - - except NameError as e: - self.skipTest(f"Required functions not available: {e}") - - def test_dedup_df_end_to_end(self): - """Test the complete deduplication pipeline.""" - try: - original_length = len(self.df) - - # Run the complete deduplication pipeline - deduplicated_df = dedup_df(self.df) - - # Should return a DataFrame - self.assertIsInstance(deduplicated_df, pd.DataFrame) - - # Should have fewer or equal rows - self.assertLessEqual(len(deduplicated_df), original_length) - - # Should preserve required columns - required_columns = ['submission_id', 'code', 'run_mode', 'run_passed'] - for col in required_columns: - self.assertIn(col, deduplicated_df.columns) - - # Check data integrity - self.assertFalse(deduplicated_df.empty, "Deduplicated DataFrame should not be empty") - - except NameError as e: - self.skipTest(f"dedup_df function not available: {e}") - - def test_parquet_creation(self): - """Test Parquet file creation.""" - try: - data_dict = convert_df_to_dict(self.df) - - with tempfile.NamedTemporaryFile(suffix='.parquet', delete=False) as tmp_file: - try: - create_parquet_file(data_dict, tmp_file.name) - - # Check file was created - self.assertTrue(os.path.exists(tmp_file.name)) - - # Check file is not empty - self.assertGreater(os.path.getsize(tmp_file.name), 0) - - # Try to read the file back - df_from_parquet = pd.read_parquet(tmp_file.name) - self.assertIsInstance(df_from_parquet, pd.DataFrame) - self.assertGreater(len(df_from_parquet), 0) - - finally: - # Clean up - if os.path.exists(tmp_file.name): - os.unlink(tmp_file.name) - - except NameError as e: - self.skipTest(f"Required functions not available: {e}") - - def test_data_consistency_after_deduplication(self): - """Test that data remains consistent after deduplication.""" - try: - # Create dataset with known duplicates - duplicate_data = [] - - # Add the same code 3 times with different metadata - base_entry = self.fake_data[0].copy() - for i in range(3): - entry = base_entry.copy() - entry['submission_id'] = 9000 + i - entry['run_id'] = 9100 + i - duplicate_data.append(entry) - - # Add to main dataset - test_data = self.fake_data + duplicate_data - test_df = pd.DataFrame(test_data) - - original_length = len(test_df) - deduplicated_df = dedup_df(test_df) - - # Should have removed at least 2 duplicates - self.assertLess(len(deduplicated_df), original_length) - - # Check that essential fields are preserved - self.assertTrue(all(col in deduplicated_df.columns for col in - ['submission_id', 'code', 
'run_mode', 'run_passed'])) - - except NameError as e: - self.skipTest(f"Required functions not available: {e}") - - def test_schema_compliance(self): - """Test that the fake dataset matches the expected schema from the database.""" - # Test all required fields exist and have correct types - - # Test BIGINT fields - bigint_fields = ['submission_id', 'leaderboard_id', 'user_id', 'code_id', 'run_id'] - for field in bigint_fields: - self.assertTrue(self.df[field].dtype in ['int64', 'int32'], - f"{field} should be integer type") - - # Test VARCHAR fields - varchar_fields = ['file_name', 'code', 'run_mode'] - for field in varchar_fields: - self.assertTrue(self.df[field].dtype == 'object', - f"{field} should be string/object type") - - # Test TIMESTAMP fields - timestamp_fields = ['submission_time', 'run_start_time', 'run_end_time'] - for field in timestamp_fields: - # Check that all values are datetime objects with timezone - sample_value = self.df[field].iloc[0] - self.assertIsInstance(sample_value, datetime) - self.assertIsNotNone(sample_value.tzinfo) - - # Test DOUBLE field - self.assertTrue(self.df['run_score'].dtype in ['float64', 'float32']) - - # Test BOOLEAN field - self.assertTrue(self.df['run_passed'].dtype == 'bool') - - # Test STRUCT fields - struct_fields = ['run_result', 'run_compilation', 'run_meta', 'run_system_info'] - for field in struct_fields: - # All values should be dictionaries - self.assertTrue(all(isinstance(val, dict) for val in self.df[field])) - - def test_duplicate_detection(self): - """Test that we can detect exact and near duplicates in the dataset.""" - # Count exact duplicates by code - code_counts = self.df['code'].value_counts() - exact_duplicates = code_counts[code_counts > 1] - - # Should have some exact duplicates (first 5 entries) - self.assertGreater(len(exact_duplicates), 0, "Should have exact duplicates for testing") - - # Check that fuzzy duplicates exist (entries with similar code) - similar_code_count = 0 - base_code = "def hello_world():\n print('Hello World')" - for code in self.df['code']: - if base_code in code and code != base_code: - similar_code_count += 1 - - self.assertGreater(similar_code_count, 0, "Should have fuzzy duplicates for testing") - - -if __name__ == '__main__': - # Add some helpful output - print("Running deduplication pipeline tests...") - print(f"Python version: {sys.version}") - print(f"Pandas version: {pd.__version__}") - - # Run the tests - unittest.main(verbosity=2) \ No newline at end of file From 9867a05da031f169ff0063b2d26706579e05ea6e Mon Sep 17 00:00:00 2001 From: Sahan Paliskara Date: Sun, 30 Nov 2025 00:20:25 -0500 Subject: [PATCH 9/9] update --- dedup.py | 484 ++++++++++-------- export.py | 93 ++-- tests/__init__.py | 0 tests/fixtures/__init__.py | 0 tests/fixtures/create_fixtures.py | 43 ++ tests/fixtures/submissions_fixture.parquet | Bin 0 -> 40107 bytes .../successful_submissions_fixture.parquet | Bin 0 -> 40452 bytes test_dedup.py => tests/test_dedup.py | 128 ++++- 8 files changed, 493 insertions(+), 255 deletions(-) create mode 100644 tests/__init__.py create mode 100644 tests/fixtures/__init__.py create mode 100644 tests/fixtures/create_fixtures.py create mode 100644 tests/fixtures/submissions_fixture.parquet create mode 100644 tests/fixtures/successful_submissions_fixture.parquet rename test_dedup.py => tests/test_dedup.py (81%) diff --git a/dedup.py b/dedup.py index f6f3449..8f69f1c 100644 --- a/dedup.py +++ b/dedup.py @@ -1,13 +1,46 @@ -# script to dedup a huggingface dataset +""" +Deduplication Pipeline 
for Code Submissions +============================================ + +This module removes duplicate code submissions using a two-stage approach: + +1. EXACT DEDUPLICATION (remove_duplicates) + - Computes SHA-256 hash of each code submission + - Groups submissions by run_mode, run_passed, and score/duration + - Within each group, keeps only unique code (by hash) + - When duplicates exist, keeps the one with better metrics (lower score or faster duration) + +2. FUZZY DEDUPLICATION (fuzzy_filter) + - Uses MinHash + Locality Sensitive Hashing (LSH) to find near-duplicates + - Process: + a) Convert each code submission to a set of character n-grams (default: 5-char) + b) Create MinHash signature for each submission (compact fingerprint) + c) Use LSH to efficiently find candidate pairs with high Jaccard similarity + d) Group similar submissions into clusters + e) Keep one representative from each cluster (highest submission ID) + +Usage: + + In practice this should be part of export.py, but if + you need to run things adhoc just do: + + python dedup.py input.parquet output.parquet + +""" from datasets import load_dataset import tqdm from collections import defaultdict import hashlib from typing import Dict, List, Tuple, Union +from concurrent.futures import ProcessPoolExecutor, as_completed +import multiprocessing import datasketch import pandas as pd +import numpy as np +import pyarrow.parquet as pq +import os # ============================================================================= # DEDUPLICATION CONFIGURATION CONSTANTS @@ -88,36 +121,34 @@ def remove_duplicates(data_dict: Dict[str, Dict[bool, Dict[Union[float, int], List[Dict]]]]): """ Remove exact duplicates from the nested data structure returned by get_sorted_hf_data. - + Args: data_dict: Nested dictionary structure from get_sorted_hf_data - + Returns: Dictionary with same structure but duplicates removed """ deduplicated_dict = {} - - for run_mode, score_duration_dict in tqdm.tqdm(data_dict.items(), desc="Processing run modes"): + + for run_mode, score_duration_dict in data_dict.items(): deduplicated_dict[run_mode] = {} - for run_success, run_success_dict in tqdm.tqdm(score_duration_dict.items(), desc=f"Processing {run_mode}", leave=False): + for run_success, run_success_dict in score_duration_dict.items(): deduplicated_dict[run_mode][run_success] = {} - for score_duration, rows in tqdm.tqdm(run_success_dict.items(), desc=f"Processing {run_mode}", leave=False): + for score_duration, rows in run_success_dict.items(): # Use a dictionary to track unique entries by their content hash unique_entries = {} - - for row in tqdm.tqdm(rows, desc=f"Processing {run_mode} {score_duration}", leave=False): - # Create a hash of the relevant content (assuming 'input' or similar field exists) - # If the row has an 'input' field, use that; otherwise use the entire row + + for row in rows: content = row.get('code', "") content_hash = hashlib.sha256(content.encode()).hexdigest() - + if content_hash not in unique_entries: unique_entries[content_hash] = row else: # If duplicate found, keep the one with better metrics existing_row = unique_entries[content_hash] - + # For leaderboard mode with successful runs, prefer lower scores / faster times if run_mode == 'leaderboard' and row.get('run_passed') == True: if row.get('run_score', 0) < existing_row.get('run_score', 0): @@ -128,80 +159,99 @@ def remove_duplicates(data_dict: Dict[str, Dict[bool, Dict[Union[float, int], Li current_duration = row.get('run_meta', {}).get('duration', float('inf')) if 
current_duration < existing_duration: unique_entries[content_hash] = row - + deduplicated_dict[run_mode][run_success][score_duration] = list(unique_entries.values()) - + return deduplicated_dict +def _create_single_minhash(args: Tuple[str, str, int, int]) -> Tuple[str, datasketch.MinHash]: + """Create a MinHash for a single document. Used for parallel processing.""" + submission_id, text, ngram_size, num_permutations = args + minhash = datasketch.MinHash(num_perm=num_permutations) + text_lower = text.lower() + text_bytes = text_lower.encode('utf8') + + # Generate n-grams directly as bytes to avoid repeated encoding + for i in range(len(text_bytes) - ngram_size + 1): + minhash.update(text_bytes[i:i + ngram_size]) + + return submission_id, minhash + + def create_minhashes( documents: List[Dict[str, str]], ngram_size: int = NGRAM_SIZE, bands: int = LSH_BANDS, rows_per_band: int = ROWS_PER_BAND, -) -> Tuple[Dict[str, datasketch.MinHash], int]: + n_jobs: int = None, + position: int = 0, +) -> Dict[str, datasketch.MinHash]: """ Create MinHash signatures for a list of documents with LSH bands configuration. Args: - documents: List of dictionaries, each containing 'submission_id' and 'input' keys - num_permutations: Number of hash functions to use (default: 100) - ngram_size: Size of n-grams to generate from input text (default: 3) - bands: Number of bands for LSH (default: 20) + documents: List of dictionaries, each containing 'submission_id' and 'code' keys + ngram_size: Size of n-grams to generate from input text (default: 5) + bands: Number of bands for LSH (default: 16) + rows_per_band: Rows per band for LSH (default: 128) + n_jobs: Number of parallel workers. Defaults to CPU count. + position: Position for nested tqdm progress bar Returns: - Tuple containing: - - Dictionary mapping document submission_ids to their MinHash signatures - - Rows per band (num_permutations / bands) - - Raises: - ValueError: If num_permutations is not divisible by bands + Dictionary mapping document submission_ids to their MinHash signatures """ - num_permutations = rows_per_band * bands - def generate_ngrams(text: str, n: int) -> List[str]: - """Generate n-grams from input text.""" - return [text[i : i + n] for i in range(len(text) - n + 1)] - - # Initialize result dictionary + if n_jobs is None: + n_jobs = multiprocessing.cpu_count() + + # Prepare arguments for parallel processing + args_list = [ + (doc["submission_id"], doc["code"], ngram_size, num_permutations) + for doc in documents + ] + + # Use parallel processing for large datasets + if len(documents) > 100 and n_jobs > 1: + minhash_dict = {} + with ProcessPoolExecutor(max_workers=n_jobs) as executor: + futures = {executor.submit(_create_single_minhash, args): args[0] for args in args_list} + for future in tqdm.tqdm(as_completed(futures), total=len(futures), + desc="Creating minhashes", position=position, leave=False): + submission_id, minhash = future.result() + minhash_dict[submission_id] = minhash + return minhash_dict + + # Sequential processing for small datasets minhash_dict = {} - # Process each document - for doc in tqdm.tqdm(documents, desc="Creating minhashes"): - minhash = datasketch.MinHash(num_perm=num_permutations) - submission_id = doc["submission_id"] - text = doc["code"].lower() # Convert to lowercase for consistency - - # Generate n-grams - ngrams = generate_ngrams(text, ngram_size) - for ngram in ngrams: - minhash.update(ngram.encode("utf8")) - + for args in tqdm.tqdm(args_list, desc="Creating minhashes", position=position, leave=False): 
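The reworked `create_minhashes` fans the per-document work out over a `ProcessPoolExecutor` and builds n-grams by slicing the encoded bytes once. A self-contained sketch of that pattern; the document contents, `num_perm`, and worker count are placeholders, and the `__main__` guard matters because worker processes re-import the module:

```python
import datasketch
from concurrent.futures import ProcessPoolExecutor, as_completed

def make_minhash(args):
    # Same shape as _create_single_minhash: hash byte-level n-grams of the lowercased code.
    sub_id, text, n, num_perm = args
    m = datasketch.MinHash(num_perm=num_perm)
    data = text.lower().encode("utf8")
    for i in range(len(data) - n + 1):
        m.update(data[i:i + n])
    return sub_id, m

if __name__ == "__main__":
    docs = [("0", "print('hello world')"), ("1", "print('hello there')")]
    args_list = [(sid, code, 5, 256) for sid, code in docs]  # demo-sized num_perm

    minhashes = {}
    with ProcessPoolExecutor(max_workers=2) as pool:
        futures = {pool.submit(make_minhash, a): a[0] for a in args_list}
        for fut in as_completed(futures):
            sid, mh = fut.result()
            minhashes[sid] = mh

    print(minhashes["0"].jaccard(minhashes["1"]))  # estimated n-gram similarity of the two snippets
```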
+ submission_id, minhash = _create_single_minhash(args) minhash_dict[submission_id] = minhash return minhash_dict -# 16 bands with 128 rows def create_similarity_matrix( minhashes: Dict[str, datasketch.MinHash], rows_per_band: int, num_bands: int, threshold: float, ) -> Dict[str, List[str]]: + """Build LSH index and query for similar documents.""" lsh = datasketch.MinHashLSH(threshold=threshold, num_perm=num_bands * rows_per_band) - print(f"num_perm: {num_bands*rows_per_band}") - similarity_matrix = {} - for submission_id, minhash in tqdm.tqdm(minhashes.items(), desc="Inserting minhashes into LSH"): + + # Batch insert for better performance + for submission_id, minhash in minhashes.items(): lsh.insert(submission_id, minhash) - for submission_id, minhash in tqdm.tqdm(minhashes.items(), desc="Querying LSH"): - similar_submission_ids = lsh.query(minhash) - similarity_matrix[submission_id] = similar_submission_ids - for submission_id, similar_submission_ids in tqdm.tqdm( - similarity_matrix.items(), desc="Removing self-similarities" - ): - if submission_id in similar_submission_ids: - similar_submission_ids.remove(submission_id) + + # Query all at once + similarity_matrix = {} + for submission_id, minhash in minhashes.items(): + similar_ids = lsh.query(minhash) + # Remove self from results inline + similarity_matrix[submission_id] = [s for s in similar_ids if s != submission_id] + return similarity_matrix @@ -236,203 +286,138 @@ def fuzzy_filter( bands: int = LSH_BANDS, rows_per_band: int = ROWS_PER_BAND, ) -> Dict[str, Dict[bool, Dict[Union[float, int], List[Dict]]]]: - - total_categories = 0 - for run_mode, run_success_dict in data_dict.items(): - for run_success, score_duration_dict in run_success_dict.items(): - for score_duration, rows in score_duration_dict.items(): - total_categories += 1 - + """Apply fuzzy deduplication to the nested data structure.""" deduped_data = defaultdict(lambda: defaultdict(lambda: defaultdict(list))) - current_category = 0 - for run_mode, run_success_dict in data_dict.items(): - for run_success, score_duration_dict in run_success_dict.items(): - for score_duration, rows in score_duration_dict.items(): - print(f"Processing {run_mode} {run_success} {score_duration} {len(rows)}") - print(f"This is {current_category} of {total_categories}") - current_category += 1 - deduped_data[run_mode][run_success][score_duration] = _fuzzy_filter(rows, threshold, ngram_size, bands, rows_per_band) + + # Count total groups for progress bar + total_groups = sum( + len(score_duration_dict) + for run_success_dict in data_dict.values() + for score_duration_dict in run_success_dict.values() + ) + + with tqdm.tqdm(total=total_groups, desc="Fuzzy dedup groups", position=0) as pbar: + for run_mode, run_success_dict in data_dict.items(): + for run_success, score_duration_dict in run_success_dict.items(): + for score_duration, rows in score_duration_dict.items(): + pbar.set_postfix({"mode": run_mode, "rows": len(rows)}) + deduped_data[run_mode][run_success][score_duration] = _fuzzy_filter( + rows, threshold, ngram_size, bands, rows_per_band, position=1 + ) + pbar.update(1) return deduped_data + def _fuzzy_filter( data_list: List[Dict], threshold: float = FUZZY_SIMILARITY_THRESHOLD, ngram_size: int = NGRAM_SIZE, bands: int = LSH_BANDS, rows_per_band: int = ROWS_PER_BAND, + position: int = 0, ) -> List[Dict]: """ - Apply fuzzy deduplication to the nested data structure returned by get_sorted_hf_data. - + Apply fuzzy deduplication to a list of documents. 
+ Args: - data_dict: Nested dictionary structure from get_sorted_hf_data + data_list: List of row dictionaries threshold: Similarity threshold for LSH ngram_size: Size of n-grams for MinHash bands: Number of bands for LSH rows_per_band: Rows per band for LSH - create_histogram: Whether to create similarity histogram - + position: Position for nested tqdm progress bar + Returns: - Dictionary with same structure but fuzzy duplicates removed + List with fuzzy duplicates removed """ - # Flatten the data for processing - - # Create documents for MinHash processing - if len(data_list) <= 1: return data_list - all_documents = [] - for i, row in tqdm.tqdm(enumerate(data_list), desc="Creating documents for MinHash"): - # Use 'input' field if available, otherwise use a string representation - content = row.get('code', str(row)) - document = { - "submission_id": str(i), - "code": content, - "original_row": row - } - all_documents.append(document) - + # Build documents list without tqdm overhead + all_documents = [ + {"submission_id": str(i), "code": row.get('code', str(row)), "original_row": row} + for i, row in enumerate(data_list) + ] + # Apply fuzzy deduplication minhashes = create_minhashes( - all_documents, ngram_size=ngram_size, bands=bands, rows_per_band=rows_per_band + all_documents, ngram_size=ngram_size, bands=bands, rows_per_band=rows_per_band, + position=position ) similarity_matrix = create_similarity_matrix( minhashes, rows_per_band=rows_per_band, num_bands=bands, threshold=threshold ) - - good_submission_ids = filter_matrix(similarity_matrix) - - # Keep only the documents that passed the filter - good_documents = [all_documents[int(submission_id)]["original_row"] for submission_id in good_submission_ids] - - # Reconstruct the nested structure - return good_documents - -def get_hf_data() -> Dict[str, Dict[Union[float, int], List[Dict]]]: - # Login using e.g. 
`huggingface-cli login` to access this dataset - ds = load_dataset("GPUMODE/kernelbot-data", "submissions") - - # we should divide things up into type - # run_mode - # run_sucess - # if run_mode is leaderboard then use score - # otherwise use run_meta[duration] - - - data = ds['train'] - run_mode_dict = defaultdict(list) - run_success_dict = defaultdict(lambda: defaultdict(list)) - run_duration_dict = defaultdict(lambda: defaultdict(lambda: defaultdict(list))) - - for _, row in tqdm.tqdm(enumerate(data), desc="Processing dataset rows"): - run_mode = row['run_mode'] - run_mode_dict[run_mode].append(row) - - for run_mode, rows in tqdm.tqdm(run_mode_dict.items(), desc="Processing run modes"): - for row in tqdm.tqdm(rows, desc=f"Processing {run_mode} success/failure", leave=False): - run_success_dict[run_mode][row['run_passed']].append(row) + good_submission_ids = filter_matrix(similarity_matrix) - for run_mode, mode_dict in tqdm.tqdm(run_success_dict.items(), desc="Processing success/failure groups"): - for run_success, rows in tqdm.tqdm(mode_dict.items(), desc=f"Processing {run_mode}", leave=False): - for row in tqdm.tqdm(rows, desc=f"Processing {run_mode} {run_success} rows", leave=False): - if run_mode == 'leaderboard' and run_success == True: - rounded_score = round(float(row['run_score']), LEADERBOARD_SCORE_PRECISION) - run_duration_dict[run_mode][run_success][rounded_score].append(row) - else: - rounded_duration = round(float(row['run_meta']['duration']), DURATION_PRECISION) - run_duration_dict[run_mode][run_success][rounded_duration].append(row) + # Keep only the documents that passed the filter + return [all_documents[int(sid)]["original_row"] for sid in good_submission_ids] - return run_duration_dict def convert_df_to_dict(df: pd.DataFrame) -> Dict[str, Dict[bool, Dict[Union[float, int], List[Dict]]]]: """ Convert a pandas DataFrame to a nested dictionary structure. - + Args: df: pandas DataFrame - + Returns: - Nested dictionary structure + Nested dictionary structure grouped by run_mode, run_passed, and duration """ + # Extract duration from run_meta column (vectorized where possible) + if 'run_meta' in df.columns: + durations = df['run_meta'].apply(lambda x: x.get('duration', 0) if isinstance(x, dict) else 0) + else: + durations = pd.Series([0] * len(df)) + + # Add duration as a column for grouping + df = df.copy() + df['_duration'] = durations + data_dict = defaultdict(lambda: defaultdict(lambda: defaultdict(list))) - for _, row in tqdm.tqdm(df.iterrows(), desc="Processing DataFrame rows"): - run_mode = row['run_mode'] - run_success = row['run_passed'] - score_duration = row['run_meta']['duration'] - data_dict[run_mode][run_success][score_duration].append(row) + + # Group by run_mode and run_passed, then iterate groups (much faster than iterrows) + for (run_mode, run_passed), group in df.groupby(['run_mode', 'run_passed'], sort=False): + # Convert group to list of dicts at once + records = group.drop(columns=['_duration']).to_dict('records') + group_durations = group['_duration'].tolist() + + for record, duration in zip(records, group_durations): + data_dict[run_mode][run_passed][duration].append(record) + return data_dict def flatten_data(data_dict: Dict[str, Dict[Union[float, int], List[Dict]]]) -> List[Dict]: """ Flatten the nested data structure to a list of documents with metadata. 
- + Args: data_dict: Nested dictionary structure from get_sorted_hf_data - + Returns: List of documents with additional metadata fields """ flattened = [] - for run_mode, run_success_dict in tqdm.tqdm(data_dict.items(), desc="Flattening data"): + for run_mode, run_success_dict in data_dict.items(): for run_success, score_duration_dict in run_success_dict.items(): for score_duration, rows in score_duration_dict.items(): - for row in tqdm.tqdm(rows, desc=f"Processing {run_mode} {score_duration}", leave=False): - # Add metadata to each row - row_with_metadata = row.copy() - row_with_metadata['_run_mode'] = run_mode - row_with_metadata['_run_success'] = run_success - row_with_metadata['_score_duration'] = score_duration - flattened.append(row_with_metadata) + for row in rows: + # Add metadata directly to dict (avoid copy if possible) + if isinstance(row, dict): + row['_run_mode'] = run_mode + row['_run_success'] = run_success + row['_score_duration'] = score_duration + flattened.append(row) + else: + # Handle pandas Series + row_dict = row.to_dict() if hasattr(row, 'to_dict') else dict(row) + row_dict['_run_mode'] = run_mode + row_dict['_run_success'] = run_success + row_dict['_score_duration'] = score_duration + flattened.append(row_dict) return flattened -def count_items(data_dict: Dict[str, Dict[bool, Dict[Union[float, int], List[Dict]]]]) -> int: - """ - Count total number of items in the nested data structure. - - Args: - data_dict: Nested dictionary structure from get_sorted_hf_data - - Returns: - Total number of items - """ - total = 0 - for run_mode in data_dict.values(): - for run_success_dict in run_mode.values(): - for rows in run_success_dict.values(): - total += len(rows) - return total - - -def example_usage(): - """ - Example of how to use the deduplication functions with get_hf_data output. - """ - # Load the data - data = get_hf_data() - - print(f"Original data has {count_items(data)} total items") - - # Remove exact duplicates - deduplicated_data = remove_duplicates(data) - print(f"After exact deduplication: {count_items(deduplicated_data)} items") - - # Apply fuzzy deduplication - fuzzy_deduplicated_data = fuzzy_filter( - deduplicated_data, - threshold=FUZZY_SIMILARITY_THRESHOLD, - ngram_size=NGRAM_SIZE, - bands=LSH_BANDS, - rows_per_band=ROWS_PER_BAND - ) - # convert to df - flattened_data = flatten_data(fuzzy_deduplicated_data) - df = pd.DataFrame(flattened_data) - - return df - def dedup_df(df: pd.DataFrame) -> pd.DataFrame: """ Deduplicate a pandas DataFrame. @@ -458,23 +443,122 @@ def dedup_df(df: pd.DataFrame) -> pd.DataFrame: def create_parquet_file(data_dict: Dict[str, Dict[Union[float, int], List[Dict]]], filename: str): """ Create a Parquet file from the nested data structure. - + Args: data_dict: Nested dictionary structure from get_sorted_hf_data filename: Name of the output Parquet file """ # Flatten the data flattened_data = flatten_data(data_dict) - + # Create a pandas DataFrame from the flattened data df = pd.DataFrame(flattened_data) # Convert the DataFrame to a Parquet file df.to_parquet(filename, index=False) +def _count_items(data_dict: Dict[str, Dict[bool, Dict[Union[float, int], List[Dict]]]]) -> int: + """ + Count total number of items in the nested data structure. 
(useful for testing) + + Args: + data_dict: Nested dictionary structure from get_sorted_hf_data + + Returns: + Total number of items + """ + total = 0 + for run_mode in data_dict.values(): + for run_success_dict in run_mode.values(): + for rows in run_success_dict.values(): + total += len(rows) + return total + + +# Columns required for deduplication +REQUIRED_COLUMNS = ['code', 'run_mode', 'run_passed', 'run_meta', 'submission_id'] + + +def dedup_file(input_path: str, output_path: str) -> None: + """ + Deduplicate a parquet file and save the result. + + Args: + input_path: Path to input parquet file + output_path: Path to output parquet file + """ + # Show file size + file_size = os.path.getsize(input_path) + print(f"Loading {input_path} ({file_size / 1e9:.2f} GB)...") + + # Use PyArrow for faster loading, only load required columns + pf = pq.ParquetFile(input_path) + available_columns = pf.schema.names + columns_to_load = [c for c in REQUIRED_COLUMNS if c in available_columns] + + print(f"Loading columns: {columns_to_load}") + table = pq.read_table(input_path, columns=columns_to_load) + df = table.to_pandas() + print(f"Loaded {len(df)} rows") + + # Decode bytes to string if needed + if 'code' in df.columns and len(df) > 0: + if isinstance(df['code'].iloc[0], bytes): + print("Decoding code column from bytes...") + df['code'] = df['code'].apply( + lambda x: x.decode('utf-8') if isinstance(x, bytes) else x + ) + + original_count = len(df) + + # Convert to nested dict structure + print("Converting to nested structure...") + data_dict = convert_df_to_dict(df) + + # Apply exact deduplication + print("Applying exact deduplication...") + exact_deduped = remove_duplicates(data_dict) + exact_count = _count_items(exact_deduped) + + + # Apply fuzzy deduplication + print("Applying fuzzy deduplication...") + fuzzy_deduped = fuzzy_filter( + exact_deduped, + threshold=FUZZY_SIMILARITY_THRESHOLD, + ngram_size=NGRAM_SIZE, + bands=LSH_BANDS, + rows_per_band=ROWS_PER_BAND + ) + + # Flatten and save + print("Flattening and saving...") + flattened = flatten_data(fuzzy_deduped) + result_df = pd.DataFrame(flattened) + result_df.to_parquet(output_path, index=False) + + final_count = len(result_df) + + print("Deduplication results Summary:") + print(f"Original rows: {original_count}") + print(f"After hash based dedup dedup: {exact_count} rows") + print(f"Final rows: {final_count}") + print(f"Removed {original_count - final_count} duplicates ({100 * (original_count - final_count) / original_count:.1f}%)") + print(f"Saved to {output_path}") + def main(): - example_usage() + import sys + + if len(sys.argv) == 3: + # File-based deduplication + input_path = sys.argv[1] + output_path = sys.argv[2] + dedup_file(input_path, output_path) + else: + print("Usage: python dedup.py ") + sys.exit(1) + if __name__ == "__main__": main() diff --git a/export.py b/export.py index d7cffdf..5e2687f 100644 --- a/export.py +++ b/export.py @@ -6,7 +6,8 @@ from sqlalchemy import create_engine, text import pyarrow as pa import pyarrow.parquet as pq -from dedup import dedup_df +import glob +from dedup import deduplicate_df load_dotenv() @@ -223,8 +224,8 @@ def consolidate_parquet_files(input_dir, pattern, output_file): input_dir: Directory containing the parquet part files pattern: Glob pattern to match the part files (e.g., "submissions_part_*.parquet") output_file: Path to the output consolidated parquet file + skip_deduplication: Whether to skip deduplication step (default: False) """ - import glob # Find all matching parquet files 
part_files = sorted(glob.glob(os.path.join(input_dir, pattern))) @@ -261,8 +262,21 @@ def consolidate_parquet_files(input_dir, pattern, output_file): print(f" Done! Consolidated {len(part_files)} files ({total_rows} total rows)") - -def main(output_dir): +def deduplicate_parquet_file(input_file, output_file): + """ + Deduplicates a parquet file using the dedup.py script. + """ + + # load the parquet file into a pandas dataframe + df = pd.read_parquet(input_file) + + # deduplicate the dataframe + deduplicated_df = deduplicate_df(df) + + # save the deduplicated dataframe to a new parquet file + deduplicated_df.to_parquet(output_file) + +def main(output_dir, skip_deduplication): """ Orchestrates the data export process. @@ -305,36 +319,40 @@ def main(output_dir): os.path.join(output_dir, "successful_submissions.parquet") ) - # Apply deduplication to submissions - print("Applying deduplication to submissions...") - submissions_parquet_path = os.path.join(output_dir, "submissions.parquet") - try: - submissions_df = pd.read_parquet(submissions_parquet_path) - original_count = len(submissions_df) - - deduplicated_submissions_df = dedup_df(submissions_df.copy()) - deduplicated_submissions_path = os.path.join(output_dir, "deduplicated_submissions.parquet") - deduplicated_submissions_df.to_parquet(deduplicated_submissions_path, index=False) - - print(f"Deduplicated submissions saved to {deduplicated_submissions_path}") - print(f"Original submissions: {original_count}, After deduplication: {len(deduplicated_submissions_df)}") - - # Create deduplicated successful submissions - if 'run_passed' in deduplicated_submissions_df.columns: - print("Creating deduplicated successful submissions...") - deduplicated_successful_df = deduplicated_submissions_df[deduplicated_submissions_df['run_passed'] == True].copy() - deduplicated_successful_path = os.path.join(output_dir, "deduplicated_successful_submissions.parquet") - deduplicated_successful_df.to_parquet(deduplicated_successful_path, index=False) - - successful_parquet_path = os.path.join(output_dir, "successful_submissions.parquet") - successful_df = pd.read_parquet(successful_parquet_path) - print(f"Deduplicated successful submissions saved to {deduplicated_successful_path}") - print(f"Original successful: {len(successful_df)}, After deduplication: {len(deduplicated_successful_df)}") - - except Exception as e: - print(f"Warning: Deduplication failed with error: {e}") - print("Proceeding without deduplication...") - + if not skip_deduplication: + deduplicated_submissions_output_path = os.path.join(output_dir, "deduplicated_submissions") + deduplicated_successful_submissions_output_path = os.path.join(output_dir, "deduplicated_successful_submissions") + os.makedirs(deduplicated_submissions_output_path, exist_ok=True) + # we do this as everything combined can be too much for pandas to handle + # if things get too big I'd multiprocess this + for file in glob.glob(os.path.join(output_dir, "submissions_part_*.parquet")): + deduplicate_parquet_file(file, os.path.join(deduplicated_submissions_output_path, os.path.basename(file))) + for file in glob.glob(os.path.join(output_dir, "successful_submissions_part_*.parquet")): + deduplicate_parquet_file(file, os.path.join(deduplicated_successful_submissions_output_path, os.path.basename(file))) + consolidate_parquet_files( + deduplicated_submissions_output_path, + "submissions_part_*.parquet", + os.path.join(output_dir, "deduplicated_submissions.parquet") + ) + consolidate_parquet_files( + 
deduplicated_successful_submissions_output_path, + "successful_submissions_part_*.parquet", + os.path.join(output_dir, "deduplicated_successful_submissions.parquet") + ) + original_submission_rows = pd.read_parquet(os.path.join(output_dir, "submissions.parquet")).shape[0] + deduplicated_submission_rows = pd.read_parquet(os.path.join(output_dir, "deduplicated_submissions.parquet")).shape[0] + original_successful_submission_rows = pd.read_parquet(os.path.join(output_dir, "successful_submissions.parquet")).shape[0] + deduplicated_successful_submission_rows = pd.read_parquet(os.path.join(output_dir, "deduplicated_successful_submissions.parquet")).shape[0] + + print("Deduplication results Summary:") + print(f"Original submissions rows: {original_submission_rows}") + print(f"Deduplicated submissions rows: {deduplicated_submission_rows}") + print(f"Removed {original_submission_rows - deduplicated_submission_rows} duplicates ({100 * (original_submission_rows - deduplicated_submission_rows) / original_submission_rows:.1f}%)") + print(f"Original successful submissions rows: {original_successful_submission_rows}") + print(f"Deduplicated successful submissions rows: {deduplicated_successful_submission_rows}") + print(f"Removed {original_successful_submission_rows - deduplicated_successful_submission_rows} duplicates ({100 * (original_successful_submission_rows - deduplicated_successful_submission_rows) / original_successful_submission_rows:.1f}%)") + else: + print("Skipping deduplication step") if __name__ == "__main__": parser = argparse.ArgumentParser(description="Export leaderboard data to a Hugging Face dataset.") @@ -344,5 +362,10 @@ def main(output_dir): default="dataset", help="Directory to save the Hugging Face dataset." ) + parser.add_argument( + "--skip_deduplication", + action="store_true", + help="Skip deduplication step" + ) args = parser.parse_args() - main(args.output_dir) \ No newline at end of file + main(args.output_dir, args.skip_deduplication) \ No newline at end of file diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/fixtures/__init__.py b/tests/fixtures/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/fixtures/create_fixtures.py b/tests/fixtures/create_fixtures.py new file mode 100644 index 0000000..2823b52 --- /dev/null +++ b/tests/fixtures/create_fixtures.py @@ -0,0 +1,43 @@ +"""Script to create small test fixtures from the actual parquet data.""" + +import pyarrow.parquet as pq +import pandas as pd +import os + +FIXTURE_DIR = os.path.dirname(__file__) + +def create_fixture(parquet_name: str) -> pd.DataFrame: + """Create a small fixture with 5 rows from the specified parquet file. 
+ + Args: + parquet_name: Name of the parquet file (e.g., 'submissions' or 'successful_submissions') + + Returns: + DataFrame with the fixture data + """ + source_path = os.path.join(FIXTURE_DIR, f'../../data/{parquet_name}.parquet') + output_path = os.path.join(FIXTURE_DIR, f'{parquet_name}_fixture.parquet') + + if not os.path.exists(source_path): + print(f"Error: {source_path} does not exist") + print(f"Please place a recent {parquet_name}.parquet in {source_path} and rerun this script") + exit(1) + + columns = [ + 'submission_id', 'leaderboard_id', 'user_id', 'submission_time', + 'file_name', 'code', 'code_id', 'run_id', 'run_start_time', + 'run_end_time', 'run_mode', 'run_score', 'run_passed', + 'run_compilation', 'run_meta', 'run_system_info' + ] + + pf = pq.ParquetFile(source_path) + table = pf.read_row_group(0, columns=columns) + df = table.to_pandas().head(5) + + df.to_parquet(output_path, index=False) + print(f"Created {output_path} with {len(df)} rows") + return df + +if __name__ == '__main__': + create_fixture('submissions') + create_fixture('successful_submissions') diff --git a/tests/fixtures/submissions_fixture.parquet b/tests/fixtures/submissions_fixture.parquet new file mode 100644 index 0000000000000000000000000000000000000000..67acda46176ce474d831bcba21488e69726cfd0f GIT binary patch literal 40107 zcmeIb33waT)i8XotVtwU8rwn=wqu)-ZP}6+S)1i0u@lLb9mh)?Cvg_XQAX0(maWCn zlGrW*QZP*@Ng-?@4HQ!hr4&jiv=mw(Ev77GYbga z7!0&~7MMbrnG7tElPOA+CBPP@l8zL{fC*AOn5W~089E+))u|`9(im51Y>W<9@gf_k8f|ZE#&k&qPk9DA5K%U|}jCFrS_y$_o?yUGb--P$0sE zBl_XdRJiqU+;BhV_rqfzJcgq#zc(EA1_Msk?{W4GS3CWFXMpwY;-CazHHR!#q+J?s zD>F5zC1_|K5iWl?MR~;~I6;UXzmBhZRfSN2x^r!3!`iM6#>cT9F60WbArAwP%K>t^ z-!I8hU)Vn!4KwX6W`jW@@%o2@p$HQUOZq}VKNDfYLySyO<_!!-BhE+}6O2YEiL78- zFytQ4MpK?BB{Mh!UQmC?1NrJn39GvChQ&KoDrr& zDM77D$#A=l`GSODynzT43hoBHUA~}usKqm)A7x6!GE1Y^4|kqg$yG;Rv3=@p<#pG(1-gn8Ky57 zf`YdoFFwC)IpbwkFs;j((o(PZVT`dwT3W|wDZ5B#>%pGF5~dY^dwZFZj7-Mf<7G;i zcBT}MqF(WKv`5p38f2!793fMO*W-NA9?71j`>`U%h12(7<2}>^N4|m6qhW8LUu=qu z7_ESa)5%66A+IYM;havb*6G~Exg)`lQ}b7aQwt?ZG#?9tedW0#olYlSrMbFei-2K! zs(Z^AS5LL3fVvs`GCk1F7YuvUO3gKt3q|zTV^nRr9*fu>uX?q})}!50BOK9{-;i~l ztshlu?!;R~4C4}4A-lc`DGTvR^=(-z(Fj$gegfaJ&*^kUy}pQ-pG5ucfYaaSXPtd! 
zlhbS(fv?f(RR7=-=$5u5#KBN;q2Ul0ak|-XWQ8vn=r5_)vN}L77v;F%>M3Ik;uNIQ zH0o)L23u}R9|R(s)HE7gcbi^=dY`hIv6f*PdbEQ;P@~mUml8slfj)VF#H9?b(cO<2 zO@`y)aZfs#uv+mIig1Vxc(nK-7(sV15RN$V_U5KT6|+fwndy9_XoY*D)qXxg?J~Vk z!gMtV_TDcnv16-ar3hG0Q&JX=cxr3iFzJ{TE5x$&KJN(UQPMeKE&}~=dygZb)}-6M zucCsn`G5`C2**SN-aan}%+C3Pp;6#eEzFvB7GGd!X9H|M=g}4ct9LlKjqk7dt$+*} zzG=-$!uU+Zv1=5#+6a27%RdNHsW=+JiF#yB&^r!)rdL_SG?Pz)#;cD zrpwmR=xn!b*6nuOTfd7^XXRdu?9&&czKhZEYs-I=$@!tD+6FG9#$DSm{@4N?ZRGku zurU20uSc_O>RHE|>h%hS@p%2_19j=)PA+6W{-;df$}*s>BSV%i3PkJsDH`40h|s9+5=U8C8ygL{2Vf3!%#d6yqMRZi zqBu)cDUvofw`uhb5h}}i6qVKAiZl+bp^O2N0s$M!=<`I<)-t9&HzM7{dLx=?Lv!aw zCK6--X>WLd8f7aQ8@XLxH@AKiAb?l|s%qI*>Vg%}x3$9&kfE&4H_8l$Z1uunmQItP zKI+Gc5EqX6BJ^KGqL6olGMUna!u&chHZV;sUF$b6>o?fCHZatADE~UOlnZ!#?Dzc_ zG+nv`b+mk~eU-Hs{qU&ci3g5iZ2B^cZ@x|>9PGsxit?F!@vb|!K(YZ*vaF-W4}GvjnH82#yG%89ITHT>xu{n9@2Q90lih8j zD`emNKqj-E_(+ynUk9@L!V9TYQn+hZ{ZyoTU0y8Ft&{bM)^D`aeu*PX*j`rivs3q) zN|3O_srhfwdZtdSN3B*pqBn@;@-(`fE@ElbS79F5^;EBI|hSqi269jQVkYR)rAO zj0fe7X{c13ExudY0gH+LdxG|7s2fp*=@~?=6<4bNLS27KwB|@!_KT_O{!6NZ6>D5r z$VO^wH-I7#4EbM2+FZE^rbRA|>?Rt?9_w(W=q<>`ek54yTx!+XC;56gt6YEGp3o{UQU%MECQ{IU!b*55D5SeAXC zgwvhgBP*BcM|5RgISQP*HsgFWI`K8cl$YHt1`_U*)~{HxB3~rZS0ibCeZ98)7v)z> zv?8wPUb#hQ(|gzm>l}`Rw7RnG8W?X7_Oj0M{mSgTNvTj*wr=^FWk|L{cI7&ilqu0Q zD9f=eGf>71l`($Z+WmeZ(_pI>=I7J33K6J5B{hn=_weh;Kiw)g)G9#B#b3_x2R%_A z$JFURQ0R6l0<52_%NJjb^2P5{59S1;z~14y9!e$(xqD^HX$1(-wLr#%9if#^RfNdqAg!*&M#x~%gcj(dv_Z=R_CE(d8{wMe0pyg?(BeMqGM%^(sCjl7`d0&@V&qsxz1bRIj=*nfvRN+urcOhta0cD}kHB`GD$(oTnt6As{bQd5} zKTAV?x2{2=5Ih7?2p890hxR)Q*EiVO9JPRveo`pqMutQC->cn&Wcu`$tqy0qy#fiX zmOec~9rwXc>*ac+Xzp6OvD3Nr3RJ%zAzjO3n(qjQz4T#48MHAR3igLsztc-!bRM-V z2NoFTu-D@}o~2Hu|0&6TQzRa;>d^5W89LOF_qvFJ3p^Ksw$dL8Mu(X?)Atc9@JvPo z1*k7l8swyI#orKCR4ADYJ5>zJd?*}I-leQWC8l0Pm56gr3aFQ*es4gV5fb^Ca(xRz zWu`hLe$*(_4dj;x2`NblI6PDvpg!;bt zioAeW;})OP>k#h8Iuro(P=|PZda$oA%!xZxkBOB|RH~k0SkPhmIauNJzZD=Ieo5?= zt;8M^XbFKn_~`(xwS#(5wvvuWgb`nsU*DA?YSl?vzCJ#Q82VkY2u_g_tb*c z5e#s2L)L*(DeGrSmA$A3=K>eh>KP;)(uqGt1yhgGcV{A(>J3Fs7<3Hx0OygdxdeDA z$bbUUA7H`*>@XK*wC!cBI%XB4Z7u82fij!k{>#ag^n|o_0->Qb6Tl=xSraJ0-vwk7 zcaNSNxN}VZ0iqvO-s#@^kCv@>Pk;`rl-Q11GX$t2?`6sOoAlRGWz7>=CznjTFJ#@v ze^dxVJ;sHCpgQOU0@JM+b8lsXZ8r3K)+{tCad-E2YX(g2qluoh(zg<_(+)iacl|L)Oy**jlWkYLQD{ zi#FD7twqY^TeKqBUS;1-YU~pMJbF48X&n=wwDh4mYN!n6Xb=>XP#x?ObQw@;Z-&`X zN_H47*kLFyPKT)gU9`4Kveu)J=>pU)Xtx3M-Ra_0Ii6rd%a*xxHGS&e!xA7aKqc5) zqlDir8*j=wC>?UTqc#=|)p~L5W`XjN^dVCnl4Ds`Giak1%ZNVyTQStXk9_Q5jf8q$@rD+^h- zTOSE(p=WvTr0Tva_lRjh`hhb1_{5j=d$9jnr9i>9#KCN+taCl87CI4X_DpF?W zHVIcS2YyIqGl;Mdr=<=76S?)*3y^!SLT;9PcUdZZztnaK)5Z01AuizNXuUe!$N9Mc zosyj%3^2jIr|v@K@-8_&B?UgcT0&JyZK3|K>Q)g@Xie(sT4qCZ7{qpD0Cs^OFhJ{_ zxFt&i!^ze%s=L)HPa5hOzgl%CLw|!oy*M)YjDqR$!^)7oR+@p4>-Ks?e^XXlkFqIg zMe>dBy;6F41I!`WD*AXDOwu)L5OUWxqIG*~_aR}RR}(;8@&^=k zJ9R7R=`<Ijs@_-bTajVrIQ@N;}{zs0qj$!qh$Eow^^YV~O zoG$D!9JoW?vus5j<753U56iSWHrQ$zZT82~slA}ifRyej-g8FOUfjF)Yx-i40&9wq zqNBJMR&VcawmwA!`xb>x&RwpDNwWhK48ZMb+}lGw5cCu%E-} z$RutD$;Uhu%}z~IxFwcSgyffKo?KqkX*e-{}vY3nRO?d zFxsdR)>Zd*jeV6Ybk=d?I)do5oUtRQ{vBG0vyLFk?MIN%Xj)T=?MkGsoc=0;ulTF= zYw+|D^fM?oj?~XjTQGbFEm-r=>*vKDDlM}tg^y98|x9;M+Eq>uD=V#CDZQ-?|DyX{}RGS#b2uZ zia@NKK8ns9Mby!b_whkM=UU2oEmeOxQd{k<*mez|_XJ=Vs{0{63MfNi_~st1Z^QVj zlwy}pF#RIF{6*YVNjmVOBghUlU#OQ;fVSR(<w6N_>VFPeW|vd z7OXIqF~z&tU0m2X%!Qm>lx~rrJ=R@IN&;e2jihgE$*yC_elkUU1)oA&10x*xd4*!F-K?ZaLRoyJbes_k8!;R9~O4pi|C@%S{5Vov|KPI7eGS20)5j7 zhLDH8jZklPO&)!RZ0oHlLT_JvA&qgimd4fe8Cg>u;>1$U7v_pi$#X4Ta-jlrIElSg zb!U!6vTmadShdTsz79l-{F^lPNnK!osI}9=0!CFvi=rV`eoaP>iw%QSAQ+IZ)`7qX z?gFjK19l6#yi@@?DOgBU*Ph?;8j{^sbQO)5+M{CXl`L$(R#^W-%)Z%WOcB~!yKkXu 
zS+w_3VK_n`N*C5y>i?We<;ou+x4BhFr^-6dx(6fX%&(C6SN1!F)4xW!zZRMb?68L{ z!2j$zfuxcGW=FOuH4}8f9mSldXjq-YxY+>X0@K2CsGQ!JuJACfQ6?(?rC9{4o3aKo z_=3hAjE8dvLoA5_OS+@eAeyjs}jU3XX-hDCtt#5fZ=&z29f>kfjc1m>!~jM;J1_eJeH zbaGdP!X5MvdwngxLP7a-c~(WGTDU_;Pbx%_&?x1S0@OeyqK_#oB}3e331Aisa||Gt znM+HRDv2>23?)9F?YD~lelEmtSSjg>l- zPWPe$(1$W=qrVUrmO-<1J;m+Cz>^)AQnVKLmPvHVVKkU6%2$9q^GAKzJsQ=c`Ql44 zmKHFbLHQRg^w9aZ6buD!Jzhp1&#YaZFMsxYh$#}OWNK?w!5TH%T3H}lsk^!4mI7G@8tQ08;+))&EW=uk0G)R04Jnlw z`WsTL&*Y;reGaNs7MOBC(P{;oLs6l@|vi<4wTPUAC zQEdBIwLAxKDE}Qs)N0m^hW8H9&lMChUQd|W0sDs?<0pj!oNCvyi7N{m7;gkr-C)#5 zZ_h~ggYlgczr^l_^~QDvd2IJdLEU6khw|z3MbZ$M^)=_AAa-Z^gJ1w`Gs!2j(qZ7D zVP(VW^&ITQT)|x&v!l(#G_asi%lDW?e(%`i1&Z(vCd|3zJ4JLEqk3vZ_KO+x+5-9C z@=C5LOlL>D;kxX$3nsotzmh+FxG?#9=5S&5;leZD7PQ|ZY6x+3rV$Ol=(DBS?v!(y zmT#Z@Fz-P1y4!@CxYJ`&*0YPHcjTdn?S7eq&d~{Xa}yt&x)XP#w}RC!)Us1HLjS5- z0oq^`xa+V{ro8hR6rTJOV#RydGSs90koHCJ8=5bk(17t>b)lLr&pFjd-&ZJmVoka_|JSfP+y0sccj+#!Kd9Mt zC8=JLtk@}$%Vkdsha)s9$WUOK{zg%U4-B~#74JSE)RmYX62T66+OYH= z25Ae3br9E);Gh47IQ@qyd+Sld$&ZN%@uodDE*HTxrc1R#W=CI0cB-^owFp1lr(Gh`LS`;70C#QH`^E>(k4aelDQ91hMjG z3hBDkbl^<)ki1_6lVI}U>~g92=e(S1>7$Q}fbr>@^Kqxv6^sTvVK|iGA~;J5yFx=& zh5epl6wdx#q4(+0t=(x_)6rslF$lEhi)%NaF}JpB1FSyuvl`l1OdZy%g?UeF{MA<< z&_9I4%d*u4q8?+fij`n2%a%I9mc94Nym4*D`BK=phO^4AQ;Ruv0@cavRNK3d3;(F)C-PBUhD!O(tS2h(P(I)F5xVKRtQ8lTk=oF{N_}tl(-;lY zJL+Y(>c(=}rd*`@NnvgQ{iZ1TLlI0&db)vY1}kJ!dWv+|YDTCk_EZcdxETCp?A z1|nVZs|v{OPqm~(#JF_XN44_8Vz9&NN_1<)-;l{P{+ed-cQV%2e;X;k39A&N%P7uT z36^#doC?XV6q-s@22E~W{Z~=eGqOu*=OD&h#$YM3|45Lv4Qb`dQlUosi#Jh4$=zbu_nWT<_2 zN$#lb-rh&>x1U~*awp22vKxq&c@?S~5?`40v~Z{dXf3S(^%S^Vg@`7iZ3YosYJF8pJ1cn z3v%mU+$he^XESyDmohJEv^2z;lUhV&ZUprtW zX!-xq_m6@W)9+T$4=z*fTh=6KneXEO-UM&t#!vWTR@=mRf9y$5j^&fAw@E$#@mk4# z0K7*4>_b+?D}zeU8Q<4mMt5Uxx|BU*J44=>wbmSSFZj7s+l1H4088%b2;f zW=+Ci*p69;!5+V88CL+7idY%f5S;8Mf8yzGFxn(;1B({{G-nIJ!&zE_PRxRXhXqK* z4x(ml(fp|}u||`FU{m!x;PKMDg_<{ACl|L`&R)9XbCquW^wKd~`1-{$nU9FBmM1g+ z16nO~4OXuii@68Rdibn_(VRFsKml^B2R2KVK0$S+QpudglN#jRMMzO%u1~ZGHhVY$2CIIo zp|UxZW9X9;1X3-j7@Xf}wFEcK0@T3DAIBvIZ?+9Wr?XaqPR4o=t0m#_<{emKX9GM2 zF-%I5N!-4mAOS-P27s=~V-ahTKSzl0t!d{r4~{WD{miLCQq6Yo2+%MwK=JB)SCXp- zfFft9-oaPz;PH%C9|9DUODE$AXB07s!3!xU7`p<&6R0Q7E8y@mAs@)01y~K>h$P}1 z>LG_2aCVWLKLm$sM8p&@#0LsoXc$+_QzCJe5pU^J&ojvRMx3pA@d*gVuCoXZErOtH zXcoc1A_%$yaRlRn7n)7n%{J_x?JyC1G0ut$XsD35BfL3xY=F7q<4I22bhFJ6!AOoB z329)T2aUk6#7|a|Ibhzb2EUxdSqtxt1xhd91aXJr4_p#wQ=~awfWT6b)LyGDE{h0U zz9zYWpaUgD@l&Ufb5oT)m?CvemQ zJ~RYB@*#LYp9m8rR#AdqT@1gunG7@drSU~d2%uzcg{Eeha~2zN`)R14xKO89T)bdLe;lt0}g-e&yiUd$=8oOk45;Lrg~4dCla zDv?vZSTp1)25P}kACb?qg-Ag_2yD;b^BQw+>EX#9^B*PdqrJc`v7_nOAs|?3Nv*`k z_1SBK201jZHaN%CY%qtLCDowLCtAmtuWz&ob9c zcV>z}PoVYI*&@&;Z&lIF~u5#GvC;v6ZZpjOTaaLzI*PvauR594+L6#dpXpA&$0M*{-!xol*q{xL0$es0s|JNLe?ij&ae{5+wLcjdjazH z`(v%`h+h{%PUK{YpzaLhg{g}|P6~<`kZm*NLQur&u`919!4R*!wQi>uu7=4y-~KXO ztF7OO-o9v1L?S0slxTzWePQaSVeptp{Yo)lka_yxb$)1hT%Gra>3F=a2>uV z&vn8yel|1&q@EM&0+tc+OlIcnE!3GC z*@BdUHmFP@Ti{noE9J8>ELOsMOEdY{l;m6#3)q39MvMS={yJL{pU(%(SQ40KZUg^F zG&~v^fR$`ET4b@{5Rq1~Sh>taQ=w5wVG8`F2oM%5hysEP{sWzw6M>Q`O61JkU7`|c z7fFS3a{wR`W<;oy)PD?BN&iU{kSHgC4G=)2lqLuuloE*Z@vw@tD3fH&fgw~rE>gZm zC92*ke7imm=)aMKFEAAYK|fakE*SwXRH1j1qpu2n5uXiL30Dk{nvDjhdpJrw7CT2b zU^vI)l#B&}JAx6tUN3?3EjEI?l{tmvxQY*a7=36EWmd$3CUEz*Wj!?}|N5vKJUPRC z(U|i#oSFu4dKnzC!-7NtPuv4|bXy^2AUhJkg&p#lU{P@4z7PcX@fIP zCLXZ^{!$VCD3~g&1}}Z*nzi=zb)eb7SS@1aPpiy0AxI8 z@@5C(bB=L*%*e2wZP3UdCXX5ce70eWWx&Xh;Uxv<3?E5I8b7}9zlLuG&j5P9F@n3D z@WWh+#JM2GlcWsK46`Wws|*o=7@K}R4q`q}@Jut0D?SaW%ySHqG{0v@hacZ_45B9s z=sfxU_E)g-P;NrNzu)|J)87g}41O-r9-sCVe%Nx5a7D7EtoXYfZRiOPd54EN2>%KM 
z!EJ}QI){5DFW-Q`JKn?U-BG6e#P#UyRS^86hd-t#IMgfI5R9v>U5VW^U7?UEzXvXhPI834rf-$ zzSU5;bBn#DqtR$|Hmq&$Y-w|JIX7(SbkuFjclNv8+w%4KNU?rHV_SUZcTi#hM&1xJiOFhq2y|_mLfzuLA19>PgfTv)*W^ z2*VjJxKX)>h=N|>7+@+Ap`6pJk!*F#BTR)G0zE6(VehmP*_}wVVfE_VyDY)sNJS{< z_Lp1r(9W128JuK^s&GbvL0?6uMu{ras!YAHs$#@y$*8L{v@zvRkfB{2KGs0 zI}LV|{y;E9L`!@}#E0j>!vE?w-rfsNbl`#o9LR_76$gSIFPIiWo?>Puz?11I-xLi* zVGS&g#hmdQ`N}HhLfC0(0bV;Wb+9^3m- z%_3&|MOKWaMHp4X4>(+-YU()BHi@v%UZxQW+Zjkc1IabTY8;{-7bpg zbkhPsNFb0_8EdW6l9cNuDZ(lv^#Nj9goW{tK?vx){yo(DPsCVLQcM_qke&kIXnZTg zcUD7?=ORIPweRBG>0JE5pOLVpa{8b6=6~YY4cuS;Pb@MTEYoTDNE&}vmp1(&e)&Tz zG#aNrpl<$vf}8Jupdi6*T3hX}1<(E*G1JcrNcI$U^Aw*wMg1J28-I?r{~QTThUvpg zjvQVB{@@13wI^?ZtOm&13Rzam@o>o(#1^n=dEMX!2DTF7XJgG+!SFHJ#SqV0OfVCY_(Kjosve?^q0jMTeRi;v0*t>DyLd-y3C7u=?O8i z?mU38Ar_w*i;HzO9TdldG}pG(Yx&?+I8>>aiPE$lTY3h3*D$(hXMy4(Nkudq%0EL# zU>brb&=1sBSJfJ;YeAGoz*ZIWnddyZ{7gUV4gt0qM^G_-r>vT=QbEiHue?9w_DTn; z!lmGPZUv_b&idWB&Bcf?r#R~&ne{obPh%9?imE+)iyQ0 zu=WZ@ym14>$isZJS{RTWarH19B%881+dM1T(9KzTKiBr{M=8o5&AfZyB)$Z~i4t2_ z4g?TvUctjRzV~J-BK~14%<+x|$AUF3|Dke}Ghnjp3RhWta9?Qk1!}6J10Ef^;!7yU zX!Y;jIoh|2lrrx$4OfSD1_1dv@4|rHyDq*FUK|>Mm;jgg=|vMTp%Za~jv(>oj80{e zSp{9P@xq)xT2~>|75Lf}M!4R7UhVcuxSn|X7MU5YipZt!m93hOFYKh_HOZbjR)!cg z7N(+w7%z$qdCaLZEV zBe1cKL3$zO?fNHScK#dNyZFZXl9_$o*>+(6{zmKsR2 zRxz+`sm6I&vB6ZsKpg}8SZg5PS`GO%mADD75L&CS0=FU~E=aeSp#W6{I*&KArW)?e z7Q6y$K~co5cq6LBHfj%*hN|(3GKlGynDr(Lv}3{RGR;+tNncfi_bE(}Vlay(Qj;D+ zx+S7Y#w5O%x(gX_0kU99T5VxWcu)c8Ss4q?%CzVWm5fm@z8ycRFjWFfBkojGo1nQG z9EQk5ti?8k!3wQbVN`zTdA+Y z&CnYKB=sPRjDjvmkXaX@sx$|rKp(SOSw0~&7}ec)g+_v^)pk^=ZbLQr=g6vl6j|We zf=8fXEvi!Y2+fs{P=lu+;RG~$0r=ZmX07D5#4njKg8;Db-z5$ul7XUI( zp(^V4xE%Ua4sf-QlBYQeb+3dWrI11M7C`5q*Ox)s6x5ec7okeqUy;eypc!-2I8FnR z%}}?-_Wru>0wx!Ufc*es6@EZ-K=UnRRX?VHVFj+khoJgJ;w*eSpz=kjwUV*o=b*+z z05c8XKLCMkOZYx?Ypwz4Fsib*pf#hd9Wcv7HRWwks(>;9Ht(QHb|*56pQC<{%(i35 zOqHv@MJa?PGd^nD23>wd^B$_CwCWvRjQ=G5JBIK}{FJcL;J6H@a#DbL*dcY4APc-q zz=#L;ug4O{N8T%J^-vXPX}bfiKs6lHJSMF%1Cb#LuEnhR1E5*}-G3e#Y&SuR5~>_J zmk3hpO1GI4a;ysk_zIfDC)5fy@$Nm0_aP76lAVz!*yaXBcki7-7pgsIm!}s4Dga zs$8>A0}$08a@(L>8n+)AsRH`R$Oj`!JT9DkTRGTsYlRy+Q@ z{|a^9sZ~%yl5Vy#l1i9lmG~(46=Y#usB$6&RjQXfyGmMZWUMX6g*8AC;2RUj9IJ`3 z>K;Dzvr}Qft$h5&QyYOAtECnzQ)N?%sz8JsK$XWIoZ52w5oFoxVoz%hNzK)a6>pV- zEUmI#cIQbVR@kph4mgfG#*o1_26I5wlDhXnWL95u`p1N2G#tz!i>(j(cGxk5s`h@J zYSkQd=;T+6o(eg-sVcm2O09Wy@;2a!Zwafdz&BNRo#r6-1)#zKt{3{URJ!Kyl$A;we*+o! z`s7ujQ{M)Z-*H@qs%SLWeo(^;0L2k$uAP7DYUekaO z*1U%=b+l`4LWbk-BCG8MWS;PgVX@JS%Ckg{_Q{_@Kg&<8ntB_WY}M!{SDxB_>Wj$W zScmf*Q<|ewA?WBG!m7%NJLPp^dA;PRwfLo$_t{HMeUZCc{+MKZCpFP>EAG)eEoV}x zMO*a()E_i_JI+cI=O8Gi5ve+ytODcX?ey<2sPn=JB zGj;vcNuccWj?>6;O00fw3b4gh^4mq4LTPn1W88ZjXaG`5{1^G7qVc;O$EXp&cKuTy z+s4Ly+)ctNGxx+^H8O%eRJpgv@q+FIR-9S^%-&61di;J$K^=mQyrwxexsu{I6}0^* zs^K))so8;QPKr4Vva(<2G~5{XTNu+;&7dYC4~v8rv-&N$M#5b(Ud#F94~x0`G(%9g z?$j$>KPPx~IY^vl$_DMhBpAOBRoSroHn16C1$S^t4%`}|HCdWJIIt!Q`bLd^oxWBI z&F_N;kdwA2kWm+%`l4$0g|Qy$H$y0-Dv>IgEjAfe^QB!I~Knqf*wy+rN1`W|_HntV^1+zJbrxEltzid!mw zS$6eT??aU!u}$Kz_^Fcv8oT_zY57>{*Q?RPvO|C{vG1C9pyw$!Oxs65Rdj7k`fH1UTxBEOD4eWkj4+%>uP)Rm6Qs3z47 zVo(+S;?T>}B>*ms2yVw~j%`-Ec&l|~h4>VPi{wpYRO$7-H3!Yd1EKrXMR(&SG|y#H`+qgAk##Se6w`&jq%M6{pn6BGI15M%7;>I zAB|VYKa$EXkZ5|iVMjHWK6NLuf^uR#{!0fe=M(ohmN`oH|Ac-_tjPu{?~wmluDL_^ z>f{I1rFc|xhvTxzpE&Aj1~m};>bQye`{eoK7P&J+vv1thau7jKxk%o39x|TXNDayj z(n%wy8LyS!m`WX$i_;v}Xa>PJ`6b{8DU9w9q*N!jOmpOB11>E-x! 
z>N(X&y8QiA+hO^oa=Dxqf$m)k?^%Kulc}l|!n!(TcH%u_8x_-M|JpYykoow{&pt8z zV#e9NCtl3R`(q~kplJG4)!Du$AnWWYg0|$WlS>*`c3#Ri%O#dRjgiPRT|^ z_Hso>w_^I6+W*(SN4}{Y?g7J2FH>!_miJvq8IPVh zJ}WVw9pbYR_wEr~0Gs$Lv2t@_3*WSgyxX+}UNq^`cIlYSTrhw|mQEiyB7)}yQ*Eg- zFk|@$w4^Pi3(?T#&8bORm{T>A`J{Z(CIFwcFr*Ws-6!}d&GrKmHz2*zlsH~Je=~WL zFr1YEz%Zchj>O&zyLl0u&s(He=3`?gRU5NV_wb|$d3E;pL3 z<-Aef>nW=W8MJbCzGwgU>H>q&uysV^VlaWsW<5Sj$g_S6C zNf=I^{j8D!4$ufPWwVZE5D^tiqm$kkJd?N-P6k6WPiaZj$a`;mSITo@Hfg04S``z= z8F;s`AC#0f2suW4s?Z(ua4`@|$?Z&QOTblY=7A;;{QJtB^I9hON=YufJ!gI%fhr<- zV9}icDj}%w&A}@~05XDS33(nS!Mq++DacGCkFm2O-gg*|&v;w`LBAi~{gl9;Z;Bi7 z!mDKYPVh|ycDdcoJ`$K7t0d(U;LMJ^5IIqkV73H~%D~}jeiC>O@;|B!X9wKlY=T9V(x=VrL{J_psqj=xEco~%>D<|tirU>N#$5x^x z=~?r5R4GtNR3Bp1-*C}Dk?8Zh4n3=Toxy7(_DRGzLpVj^_FT9# zi8osiN}U`v4ACwSTS(sC9govK7w!x%Oyzxfk^_l>jOW6g&xJdm3wMUM>Nz2r>0G!o zdF49)#)#J02;Xzz&ha=xvLc@gcZS!(kof7OP)Xj4ln+F7o(p$&M$Uyh&kMLd7w!yh zzwEhi=h;BwbK%Yl_sJ(7N}01!P8d)U*0#icA513?J%Ni}nx3$c3q&5#${6 zV-mpc{uCEI@eBdDFg}?8T$uVP024Nj8Ji{#>}IUG9T#2p94>nLSxn3vvjNM=vC9iN zW9uBgkR+@#GqzE_^$n`6MF*e9MQ=X`ZA%tMkXXxja~aCgEJ0bx=2X6oPSu8@r=Q0w zAksHE=s7C~Y;IsYpJ#K6_fk9G3Y)OL@=d;pRe!`qXI{WXH@^UgeZJV|g{05|DZuHS z1ZxwsUz`>TP?$BQ@<=>{RX@T-5B>}n?fV%SWCM4w$S z!C4zGm|nOFtNxx_1pkw+=x2W-l4+54|L_%Z;E*Iw&myH02X-?e`pGK=A7KWli?0?1@poCO=#CbctQ=Pk!iVio8E_g^h2`rB0m?q?2m`V_c{ zrELao@|t#in(9GW(XEFCMUNli2jyI-&i^&x<63 z4u`~FBisQlMhlotb}S!NkME?H*yZH`DG%Jh9FW<*ND8)>fAH``VMzxk*5+yF*Mg z3RIV+6n*$xLDBnf@hE&w-=oj%tF#dDpUxBJXB&!yOUBG79Qi%4$?yKvQHc3-@Qpnv zP~9dddioEr*#4d;!aR4S&*;gtuxlSAjfv`^6H8SR_*XDzP77epWiBEAw2cjC9!dW~ zzBl0EM!-+S7xf3i`S4Of^1lAqBY3z#9yc$QfBZr|@m+%A`AL3GUQa&Mg@3W^gO?wl zEjtFI7v}#5 zfyceQ7D7$R_z!|hA~W#~S_pQL0`tHF8Q~wn#Oltab|S}W=?R2;N!y=9_8Hf;g-FcF zng<^K`!?|mBbDYtkNrEUPlKAMJD0r@sqO7ZVfH zV$f#O|HHs$T?ZF-k+*k|e}&4s50dPeXv@FAh18^B)J9q{1x@d0v5k8v%aBChuYlhG3ce zLO!^IhuF|4Szd=n*-$9Bo8S@mx+SS5BluYx@?kMeOz-4Oc-fRWe-Fx(+Pb>dZmNyB zLuXzB|3wQv#{N#5&9)By$fwn2v)JtYwkDgc-`3F2r`hdu@^_O&$4q{cttIvhzkDlv zKBVooK|Q$gjRYGY8=m1O2x<1J7QZjLwYl4}+0?PiwYodv3bcFIdhM>w>v#9J_~U85 znqil}%f~f$kF^X2`!^X4;g*IqgD%sE(Y2{EmSzrnUH+!X*7Yq`$Q$!)S`%S6cZ{){ zYNDG>zM&Q`X>aTB)=eWrn@yDiu1y;Oj?L>fcY9r%d}D5tZ+c@)#P~I`6?hTaFl-_7qZ#<3 zd8DSTfy}?zxscS}LOQ}++zLEIa9#|bEdifl>*n?tUj@1&?$y5JxwC5vyxMDv`E&3c z53l=^_zvcO7tr#5b1tkgCdr6}Gz!egUyQ&0(=ucf_@K{i1{o0O`jmOV<2D9zY1GqH z!}BlCw~NY(jWmubz;t(!XH zI?BR1JvaX|=`eBm;{&mKIn+HH&}MJ1bBUOcC7u~ zqrARI+H7bUZ5wQlwvVo<0j;)~sJ4mzz%8!{`)yl5Q{GMTwt!A;GubwRhP@7SYrDk` zCA!G7o#@()TZnEPB-K_E-P%@7o*T*Y8XKt2b_coT_2;(1%8H6=$aQppvJQoK{hHJv zdOLrC#CCEsOnz2FMR><8e@lLBq^^CjF|qCJ3{_O@ZnVbRZ%S&EzvZhFoNac4%e-cg zB?%pT`C#d;iVBheogyFTIC+@)tR{CvCh$7ex<(5rzmbOr8S6F@B%$l%!{73EWY1t* zZ4bjZ`T<@fmbjs+qN1ic0nZ+TN03T}*GQhXn(aoLV-U)h_9x26@#86#gda~Y*pHY% zPJ?dIu?yB$18Kvyn$)jYm#F_q@U{?mYsN@~7A(y$E;JUk_v!cRF;1SH?b@wRn2AQ`c z+t$Ae;1h8Dwv7aTxI0^ac~*mUG2+?;IyS%~i~^L zp5KBb`xE9DY2V%KBmB$b*xqTbsIZXn_#ETcG*|=t%Z~>@J6nI|jbDSUqOYRDG}E8h z{Q9)<+ez@--3ah_{9^N;pM-qJNpImp2n2gZf5+fv^Nif&b4;ZE*i8L6f6gC&LbZ5( z(^fLi2tzn3fYmnh@Zo7)`{MF|q=Lnir= z3550I)9Ul`Y-Z1AYnxZ!wgE8V`PH$wJS0%;e0>5A;F0p-IR1(GG@pKJNKw#b!dtsz z_89UE^aT)nd+tR2IQ@B1kVJo8bj-6c&5QxaO(6A_#Ci==s$ol_JzH!%=gVha{&+Tb z_}s)+1u8+jd_rEf%&R}YKWhp8uI6sA1%S--*9^E}55|v2+aOdW^ox%NbOIET7=F8L zJRik7&6kPeH^09fBy(PW0Z;Pj2PPryxrrhZ{{RH|piE|%4eg9_k#cw?e>rd1Wa5TP YCT7xvxOjM(!vE+OY3&Gg+PaMY9|ZVQTmS$7 literal 0 HcmV?d00001 diff --git a/tests/fixtures/successful_submissions_fixture.parquet b/tests/fixtures/successful_submissions_fixture.parquet new file mode 100644 index 0000000000000000000000000000000000000000..7e4c853fb2b1a5791a784f5f6830b6e6f649f61d GIT binary patch literal 40452 
zcmeIb3tU^}`8fW*A*VeF$pK;#fdZZYfdoiMLI{@vZGb>2mr~l&3)1u?IY1y6noC=l zRXfF5XY2jC>colH>8w-dbW^*pbzbIeb57mttghSYcXQ5pn^XKf?@0pTQZIAg@8|#j z{TeYj=e(Ead7t-rpZ9s6%W1d0$E*^ox>d~`Dya%75Yo|2ryvT8f7DAY6{82_blpR} zNcil`iv6mPO0CK+gZk;JC8{MknRKE?CTS~t)X>kur%;rt6T!C>r=*%aJp6`9ooZ~uA_endxHTd=l3`VMys8Ezcavjckoan z4Qjw+gV)0cBHqXtJOW(j5X**iOXJ;Vrp>Dht>=;F%6%!S*Uts?Vtn`XfxN>kQ<(c^7Heh1>S($ z7xnOLoi`Ycgm}*1K%U8!)kEIVa`+zzX|@@M8gh*|yO0pp!$&x;k4ly3QT3%dBz6rn ztVPJHI%}8I=^XS0U7XM9WOoF;9@fuAI-|aJJ`~`6I%mdEwUf8{E1Uy?=r~(qFJpT$ zk*u*y$_6lM9)M9q*pj9)HtZeeoe{Q6B}MHj>1d~({f3lfy@3cD3ho5FUA~}uq|LL- zIL4MpZbFmG&~qg(Ai) zF{-v)iD|CSt64?c`*fRY#Jlu{tFlh9529-AZFmdKvMxy#YBW|MRUuxXxhZP}+C^1q z9>+JF?{vDNUSGs3tgS(J!08|GbIySZi?hPA3;xVDr{*VT!?1KEAs(iR4~>TSh||r5 zBXz!DV6db?$LRsRT$JO2Ur!lplB6KLw%N#FG~9Mm`Y;gLqG8bR+MA48)c>Td0_#|| zsZTcy1U1_%^(i5QSs0TCNL-sh&o3`$?LJ^bF2b|XfOo*l1GDq~U}y|DRR^o4 zlf$Q(I=KKh$a{1}!0KHNe#1wr{~#h$hOb+_f-pYYKB0YS1wu`dGf`*O3N5Ba0W18> z>1_dT#LM|$9k6y5#>6(VJH3%1R&#YB$cP=BFUp6dy&sCHSL_?bUj4+6kex51QZZJB zBV5GmZhL0+`$*iNGinfZHT>e8R-InYma{$fu4ZSaeUpBt^F4n-BH5Gfmm79gisJ*oqXxJ5fCaV;U zZEZ$qOn&fJJ>Bj=51r%kQh8`wyY1*E;{(Z*Jyyc-{QEF@OMK5vZ!; z+Nsm(U~FqfBOpULpKpvE4cQySqa2eaMFZ4-DMNfX>WeUcr|FP)7iF=ei^ci%5^Q2y z+IrTlXVP5Zs$r{G5YBN$K&@Mz}WI_nBROo zEgtU2XVCd9^T{uNn zVJTxL1Ndb=e+(gWe|B3``}B_eqN8rH-#>l62>#waU*tGnxz5qFrmJ~+uPFAnb+1T% zgXLDO;L*p)a+Z1xPJ%w}<+v>`t%+oZVH2^EV4xRZuxxDV!g@v#YIKytv z$Cue)`)rh}$InBOpCA|2&U}!1>4)+=?M%7+_K)Rq+mTP@nGN+IyU#d-S}B98PW3Nn z{cDP1seY||fL^zuk?~6%S>n#JnqMBf+fstWT~6)a=yhzp#E9B$M#QX_C=_Xofhpn` z#c^8hQJGB^l>od+lCGxH(KYivm;;0|zXO)X#&v-!5oQ6sz*^)bDU9g!Le*XJ73XH4u<>4L#SJK~G(R#^3Rqlb%d&e2Fnr!U*8h4}_yFYh+& zRb}T*$;A4ywaZs8L-IQLg=^I^wnX2gGGKdVpo|?UWBvLyd;Maz$zCna&u40tG^jx( zHOl%A@oUIG-7dPcU4)iPzMbO_dZIp_tv7zG)NfY?I6q&XFS!KeOFp6=$O%S)y~Fi= zl$;K^`{m0Na+NBFVpO@Z0T4jU9jP=FW~`N0p&iU$Rmw15zdjW8E0mybg*Yc=Pj?Ua zxIt{y8P-(fm$CTz(gT1kg<+kqxmXR>OSK&HJYj{duiH!Ojq;u6G zrC$0vgHZORY9(j}kzgS2UbTclMy5lpi^49$a$z>&lV(q;LrwNQ8WONEJbc=DfJ^2rhP+UN-btXa*6FUV9&ho$!({iLQn!hM+OzTOqD7gDeEl+ z4KJW76uU-hw<38fa%Z)Q-GSZ$MA~O*$@iAkhz`L`5QT7Y?G})JYVw-iqh)~x(@YQ~v5h+`H)@_tf5_NexU z;!!ViwXzKQ7!3snL!95~WzIZ>T9yMFjC0iMaUROjq%!}K=D$HpMr?X?Xj_IJb>+QA zQ}6@#g`lkrhJw*iw%+n%1RFe?5kUd!o0JwgsT=XF;_`A8o6$%W!!{oZM^tyHDp850 zA5kTeoTDP@C7ItF&}D>ZKWi|yAyj6mN0LX(a^onH!J^mU14!HW9YpI{tz{HtHEVRP zN(6mcY;abf9#EV`E1j?%ha%LEB^TrcBwDxRs8NsbAl9P*poh97>(YY*17Tj$rG8YR za-verVU`0OW{`&+&iDrr(&HB;Uik{_v4EBk7=W)X&|15w7vw9Lh*TW$W%-RgDRjGD z+V;JPDa11GOK4bIn(gv>*56kPT1POzGfi3hN@bj%EmifSKAa0&P-|q7ctkJx3>6%H zgt;>lxzw*KbHbowxQBR;eD&GDOFT* ztzaf)wUY>qteylW8OfSN0pThjpS*MI=+JHB#*Y#6i0U@?o`1G&xpNY9V3pKFfnCfvp6a>}5C=yw2#Hfq$XXZr4M)`UuIyy*4 zDlC!N5o(08Uw zR_1tu5gk|N($@@V{sdcqqyUv*Z;cARH*dHuYrky7>5ketG*auuwVOn$hto%#fn5=< zlpZNH+x`8kT~@@sCU%bC4ZxG{$GD=a&+Q*OnQ>8aBJ$iKXUn;%!5nOEf zE>dkkTvmzp{sGtqT_gHZZbc#Ib{iu>9gHmR-BkVk3Xg;lrSB`lk4=8dxEuQ~NABx> z(N!u&?&DfV2!~f^Qi`AEp(0gwZi~2%-S<0gb`6vw=U4n zdc&YA@&=CiTaK8EKcehhsY(>+Q9yF8edt;XR}JhL|&Y-&I_hp^`I~XNcJXl@_>Oi&XMj*ni!LD}pF=otq{$>$9#vAzKk#8;cOr zkSg2Bg+?duM%9Y*GMHAWc#y~G$RgwDVPujz7j_q27bciK0s zU$cHgS6f$WF>?z`7n_SSY;CDF*!PSLXQVL|>0)nKF`kZe*-y~&KS)-bZVW}in&an< zo)DOMLPpLVA?BVz<^_!}8UjuP8wwX1gfora=;GWXpsBO7alkuRJBm=>hGO|RoXd*PH*9|tq1ru0aT#k=q0Fx0TGmo&v#Ai>UR=vO z!id>o<3=VO9#`)+zKO;475bws7;R9C>#O^F#=k>0I@<(tU5l8sobhW>!@G+96VKtt<7X1?HO(4y4(^d?B!&a<)@U>H7Hvt5QRC4R%>{%~-|jkym~GM<4u$sprPgv7RZJ^H;|dWxXi%ctonQ!g@COL>XV>I0x5&5Lm?HKLG!!xzXX_YT&Acsd zsYkp-#{0s2(J@7?wMQXVf(|EbY**iwW0kJmUIDunGhNidA|L7{MK&Re8W}!5B)Fpp$}yM1A?GU9TefO+^#Yrc&82b`50lH>Dx_CuonqUK z5&QOUk>t0Hw~42JhjM=>wiGnN8L|NXtLF%kNekF**_PBy&;_>@^PZwnO%Chk0;~&6 
z3(q10vprquVO?WvRPmb%8g@5T4QBCa&D&TH?+%7I5cg&5&LOXRi1~i%DmZlT;acdD zgS~yuhfXQ>6s}d9SMz5 zE*U@#R3he}(poaYkCgys!7$GPa+$e|Or@5Z)4@>U^V$EP9313BERR)kKN?_;D>YVX z1IE!V^{SrW5jVE*~gbAs%*@Fi0+T z2&|p{D6_}J20`0#gLN~^6lRKfmeJ+uV9It5@d4H!+@X0)26yJ@A3*cr)%!9vPatm3 z%b8#Yiu$-v*3~&i)&XQ@x2dmHfhOeN#=NfX2_dwN`a9Z|yA`<<@2hE$k($p`J{lB8 z#Zx6FPjDxk@$Kvo94k7(bP(1TJ;kU-^VJ1RZ2@A7Xti8pt14KnL0c*dom)uYw z&p;zx?MRZ78bMYE-W<+9bop=zWE) z5aHF?KUL9SkXHZDq8LiwQ!9=}>~(}8Ig}1GXPcNNy^P((HtO%uA68tN$)vMlww;mc zw6DmO!~)c{cQ0x(o{H>$IsPWfXO0xxKT|Ky0UQi}#E4qOxzXsJ5$4%~Le}dEv)kbO zux;X~c!*c;ST=cKVH4|(fT|me`k1X5>3%T2^O6_2ov`27-$ow$Ju*-?IrXLa%&D|2 z1ZI8hDJY2Dnf@Rc06Q#-sjPIExM*0_v}zp>XE9fB2hVQnu&_-WXw-_`6|~ZpC(*DPz@7)@8qt!K^7zyq8yUSz$W2%NwrG?l^7oN6gFl(^nTJ|Gj;6VfNL9 zZ~s8lc^BOj;+ae{8hycMPqW{q;I(Z(IQmK6zUs9%i8u1c$7P&n2ghv7LlOJEatD*6 z7w_aJKR$LF?n-Y5t6ivVyL=b(+iE3fgHhnFtIcxNZBL`{)L#%M+0B)qKI6~CwyVXc zKO?&LZ$Q3nhyM|3LN;*$>S}C4yX*gYj*hj%33LdSvDe*GlR`bhw6kI-$ElhABIIPW zg~kU^?KN7MXT`PJqF1zHr%Um&mgB_%)-q6}D6QUdP}}gj_M1nvV0>4fpwAbdev-1M0W}@{jF=Fw+kMS) z8kR9rsuQ!@28zUVSN}HSH}Rfzilz$2C=>JHiGL`Eyn}GSpf<`voO>)|7;RGx(ZSHR zz3+-$Jd-b@LC0a-8m1}_|ClLkRPz&iCAZ4u zT7ONeNAS7R)D3Q2CtC(Lb0VpZPMoEHGBtUJuN?nK@Q12HXAGC3#-E9@PDeV0 zs#L7i{rU}5UUH`d&bW-E5SePQ+^fEGA~B;Ba$<+o)V9g z0Ig-^pq^}*q?xIb`WrjX&HJt{`$uJOuao7im!)rn^~?V?$M_y_dG<3pSuRD7#+@@7iHERm{@mm@d+wxqwL zT8c{ZUdVvEVaaJlzmUq;;v-yCa$0V~3mYU^S=ET$l2^uDuHW}husrpG(lV8ky-MlG zLOSm+DH$Uvmq=t_`*LRwWPrG7I{JI&Fb#f;5FEksp42G0aF`E87_4Wun+fZ1;=I@O zE6w2R%5Mu0Z2=q}uyGdMaf&#cPyHDcs8%wU7sC>dcn71wXqY*@2IXh(P%7=PnT8|T zjS@L4sqflaqTXI2-y?CSv*u>jr+&X#ea#BSqs$EPEDu6^Z!7b`DQproU6N|2rjy{gO%j6k`g<#<)`E;+0UF4u6%VHbUUL?LJ}h|R zrP89C^K&>h=D%mI*sOSVCG-4giuKDfM}woFga#}H+fXr6xGkd<4qKpoZU!rEwsac` zGqr5yKoA^pJHyPwI=X>f&unDX&bAylqDMo#Z;a|={?(B;nVox9CY(CKOJQVwT!cc| zlWZm^z$JlTiF$u73ghjC4J>rr0ROcy;ccjvso$#DzD=>LtLW@)R2tIDOU0}~{hk$+ zx>HfDq)stbkd?3gtx{Ehx-I+amEiw~jIojxjAl2Qv2Oxm2Ed--QP3@7j>Wr~oKjSf z-|te?xs-L_QVTcaH)@%?8xU`3l5f#p*(3ab>o(~F_K;X1 zShg1_?gWh!9Jpod+*Y&JbMOJg4C&yyShS8S0K00ej%x&hAd)}v=WeiXB`yPtR{=C< zE5XePdV)dBf`gj{NX2}1vu1n2fldtf#Aj})o(DW$TQKGeR{rGbHtUINcYUSW?O$9w z<~CTjI41KE(bo!O7QWyB07HX~e#W`r0k@b?l<+T*m=LHyJfL9yW|Id!w_+fvB5*}g z#jd#(XFU>ZGUDKxNWcsT7zl0)r+^{B6TG;I@WgFAcjdvGKo7ER3uB?PX)`9SD`M2-o*se~JkbSDT9 z*ea6xYuCqR5rHeTBp2YhiUr#c-&J_#F}WiitVQNG?$Dac=KK% z6V4|Wk<3YaH8Wztc?b1%;JqWvmOS$(YC{&UbA${l)*2>Zq$0~_*30M;R)*OYn>9hlg7Vn!jI5e* zQzywPcakhyOjNjIo20;T1mq%ev5tkX(W04>q+THSGlmdwWXzix)BadDgqX}QvYW_+ z*{}`2b0ikTO2Sua!Hgp)m?VIM1n3GkG1Y>E)Dr%iafGs={C6*(VmH4oXG2H-~zaXV4p#7;MB90SZClKk8L{fkV&@`vIyqq zILt|1sZJxQD|ap@sVjFu>dL|8&;mZ45Wz=2^AF9LbfQKk4Q+)F$*x%V6pB61@hI^|;ZG8cUcok|N+;5$Wx zu!y8qrK^+)pfs4zV;>mQoSdCZI#Dunb+($;E%FrV%>jT2079KPQa%gXNJA8<+K& z{L*F!P;f_}ylF$TU6KXiUZWX$lg~qrMnR-r|if` zb>g3(M%;~^H45;M3emV(uczCIofa2Tw%5tV2Id2~mJR$)9(CN0^Q0H$ik+L~xu_h1HPL;9R|?aa}zqci{Ot^NPi0 z=z8(jpghsAATTYSb_IU=6d$-)VWT0L0)|67BJ2U+#xf z2TATEr^@mV8qxZ`@Q8PGl!s)wKoEkQNMLxlU;5Hj2yzKLoY5U+4M(m-@2rGWDI@&P z_60}!rR#$c&X*)mqk(XAl;oJkg*i(S9fUIo9Nc{2S{9N}8!OQ}JrH>014p=rjU9F2 zBtcTLAl{2LOZ&{_75!{HiuT!Q(Clz#m7Ko{DtB*gZ0l+^o1IN-I=kCC96iqUo4XzLr{_Bd z-R{%#jrmBqZhdn{TO(cuL1r$C27(ALrHv^Tcl!b111U7X8nHkoZ$o8yhS z_gLG#_p!vk;zH6sB&i{81$8QYgMRRqs}y%tIBuT ztQqz7rViFH(B0O=8sa%1<|@|Ut{udW-;+WIfnpC|90UqDovYfqo!uSw^(||9I-T}T zcv5eMf(`2&&Ym?*os_%~YBYA(*ExGFl8b3+pT+(_!zYM#c>~>??PweREH>0JErUy-<`a{6ER`hVfr1zex^ zFHD2+bh#JxGm{cz+rAHDQ!JY`mPPDr*)NHww65uB&3Ud`eAhD_5$UZ((-6Hl>aspfn^Ayz&KP}T~%wYt_4vV0drL>!+`hb^E3UNI|SHf zT#JhFyJgjcl?q~Jc-6fbH&;4P6$a-&>rnT?OBS~N`r9AxT_UzsPb)?6*Ps;b{h0inJKmKR= zl{|+eiW;ST70lat5X*G4hB;$d_QT8aT2B*)hagx(Z0L9uYlygJ4L-NoBw4~7%mBgT 
zQ{-x7|eQ;f9^95?EqeC7&y5L)gt=MiEtqyGu6t);pH#waP%}EzFZrBwH4DE!W%z51i z9ODb;!CtgorhZa=4DwAqI|xPtigQPIP>qW7C*9w4>Q80RnVL zKC@5ZCT{8wVJR^W+^|&jFr2Jo?_PO6<(-BnV0nIx^IiO8efG@3?nFnhuYYbXOFnnu zmIoh0ISCywc7A4Oj#1dDdN~JzyoE!w#Doj~w~x^YYsZ{v10H@N#zzFv8o22ue87uQ-wq-vlt-*UP@Q87RlYz9ms?WkQGz1YAb8O!%9HU##(V!rqyVw zWX(p&&G-?er4nG8aksMC0^QZ%FeGwf9kwe?Ht4k)e*nE)1^)-iYAWG@1#p1+8Y3Wo zm}*28oQ5o#Y^d-iwNDA7tu$8QRv3*E9`zxsoPr@pQH4H2RcZIhfIbx(m0?nBGHZHq zomPsfHI1lJ(}8O6uaHgi2(rSx74L$Mb*M_yC$6Z32Q~OGJU9Z~9)aJ(r~noM$;&^RN)AI6Uy=cT&uXMTGNMHv@hzHzyphB2{b5xYy%tiKy5-73RP(<@CW)1 zM=IU|-F_3A2E9`(GMHzJ<}AREhw^>273i@IFvc+WPvA)!V0ajM z-3KsKfaEYdhj9aOsx()j3e9PN%#)~!`V%(5m<#|{2T$^}2cYeRFr^eSY2O6sJdFBW zcy<`tOQ|zarTrhsVsFxpJ8B%qfyfomw#NR^+WP^MGihKyfLMj^)9%xLAK5gIDq&iI z>mbO~jL(#0;hO=KZ^~?ytPMX4Egl4zX#oE|$OBv=jG{F$^zKDiz>P8s6z5A^(RzeKZq(QgXa5`Qf#Tf2kfWAkYCn*h$<2I_*A=92|AhT3lWtuFt zM*%|%Fvb$V8KxTsM%c0*nruWCs)~D_GHB1&0z}Ox{OM3Hjo*vR)Fe#er2wP?S*czP z2V_4Ba2*32-i6jG$@!Z7)R|M?T??s~xPX5fRd6Es{~p=`Y>po2Q;f4f8Be5Chyovs3A?Su(8rgSY(y>0Dlp(axPRk znSv@cOP*ONt2VQ?wnO3?pa}4d1!RuR!rJr?9sA|6FyLmGc;VOvpvEeh)y7uYRdf}I zkbS7~&;y4zAAcBG_qe#@+Dl~>)vOI~k%26&vY&g~Q6g5ji>8JghaBU`WFLn$pl(au z^8l*QTz34w2+L@BSVLC(0F3Qw#|Wz0^F69vd%&SrTtYt?a`aMFc*9|h_LZrdfG55$ zuC@W+RN=MS{roq83j6qe7|XUN|C_VIwB2#|cZ3dJttJ(i&dQxV_4?uehQ1$yqj&{O zQI+N(P=rD@Ja%j=e76Y95{870gM1Ia>dHQYZZ9%$Q- zFQp!Q(oY%ew@`L%<5Z>Nh+|ZdMl0^1fqwPKcIbWWBTxQ}TZSsRW!eX|9b$`Fka|ZB zFW0LyM#uN47m*cTMJWN%waA1&q*|%RwEL%CqSk8lRDt3?8gREz`zYNp@f6MXII>Z@iwO1duQE3yeBl8}gqKZED13>v*$GNE5UZXVvLGGUDg8uFo zxM&y#%u}x&JBrNQgcZkKqbeDttReqjeepTS?rhb*H8SGdKowxRvW6pl=c8+Q1DWq zszvc9ntytNl}Ncyk!9+(iMO-^$Yg(23qn}?AwI{^sl5)F4!w_T_UBQ>q+bG?jdo0t zMLRmDegWe&99wz#9q6)MtDjnNZ0oUaB9mh+&T|~r9ylC=f!-pns+_z{Q7=(6NS|DT zUu^q`JNwu-`8ySlN+-5clWjNRKJ8NqHkEp0NAf+UKS3E*b-03(-?E-AqJ^7h^e8R_HC$6gCAK#-vX3&Q!_Y^sv*B`;k zV|Bpnz0^5}?xmE}r7)0JwFjqGP&}`Oz8^s~ycRpP+fdC>39m&q?t8qJALsu7bK0&Q z)~=fe%iN&Dl-tdAc4rhXiNoG0gH+7}fs z$e{Zy(xJAGQv=m1rc6^W@gGfH>)@tJ9qEei$q)aI@@QYtcTB9(M39*S39UFN1w=m5 zj#4V>Me6)Z?o#BZDR!qRY_NezdSQaA_@#=Mf;q7+6qsUOkY z53 z9{9p!e+5i`oc2jGyy2k+H=C*t#GH^W||nJ-fbJluS}Q}qtYoe4Et~~ve^$QHqcb7 zLq)xdER#8kO|q%iCpI~Z$GfS>P zQ>RW?70wLp`4g_T{RndKX~n=P$b57IHLNhnrp&x{qE>NDDs@01NpoDL9R};%}UM_$Otdn=Q9fS!IueWJf3P;_D_ zUs#j}ZBL8<6se)~#L(YC^S-*-d;ffN5*D4}uXP^n$InR+-rGd<0t3=F){|=YBt6V&n#q1%Iq4IC&srF0 zVzm35KqaUg4^zU#&}URx68_awH=P27;4A}xVL;s-iM>X5(;_&ZvPiYe#U3W(CRi3d zs-Xlx%VY3*GSmS?+GT)~2^_5r<_eoZuV!uo7i134;&XHo z1+6GkJ{xid=~1y~OwyZ{XC5vEA7N0K@85$eP$TB~hxcC(1QWnhz}=LxOb7%(M8GlUPdp@TmjcCwiJfUi2~m3SI%;WFPSz)E5y}CMtwc-Gv*z)rN~D&m zKf&tvaM4f^{Z-yJpH&tUlw@N_SBX%aeo0mlB0(t0(B^T!0PYE_ejXS7L{#(%27niV zV}yLZ@TK!#4q#$s36S2x>ObP58^lGwNC8L-Un@_XISZenF9&J;$Pr z`hN7u6!rc?xaht8xab`T>HcK?GsJMjK8d7f$gfCTp3HwH31|y)tCQ1+A^ioC4au9& znGud5ecA<;gY@_RD>Ii6TZ zcI1=!&+wiYl2DzLJ1NAJ3Ym${lljli$jSWYc^TLz^PeFIm^+#OJeyg3GXHtuIr;4i zQ|26$6GoJT#Vv8>2lL61r*YBrQ^caNFu$3c`WHU^f>VD2R@|PK1nQU1;G%bbK|n1` zXeOW*KK+uQ=2;69Ai<*X(sQ`zkIxd23p1Dr$c0b8B*-}?$RvQTzJQC~dY%AWn6OL$ zE`0h0024Nj8M~$c>?W-KJ}!FTSGefyUt(hBSdhfLkSV(7u!J;WoS89=3LURkZz+1< zMO<|4i_oofaomWpOt6-rEbS7Mm26EF`sh}#FM97qTnDMo$=T0YIbd=FBlgfxTdIJ>gIN7D zT=d3oaM3-#Au~+KKt3~9CrXlDfB@JWlfcKwxB$9z*0L;s?@{*_z4!7ASrbXsbI6+1 z>xt+($387|Jgg2Cz55C-dg>K{v?S$v0mf=Hmt-gEJ#OR{`n@?@{phJh_rDHC(^rL_ zPiBIi%mhu!=w6V&d@@xy`BeZXQ-v2W$Rcv-C-j#WOyFu01k>{uWAz8QMR#8WzMQ`k z$+Sqnf4+$L8j|Gc38ZwwUpFJ7-@Qnb*!jM!h@SED&DrxFq||>%DSG@;QPC3@i^v}T zAJ1@FDCI9EdL#L#wqRRJ-Wg{wobfga;{^{KC|L)t5EXs@auL}PziejH0?1@XoCO=# zA$2og=k3G~VD+Q8==Cc_MR#8*z!mo4q@7s!_n6}kXZPnt_O-ci6T8|B+~n=-$~5%@ z@}g(26BWI4l`tPCvvvNjc}K+}oT0vW>D;xjD03u4$_<}k_5BS+*S;w#x^;?h2Sgby 
zU~=_c(}GZ$W%RgEnc=rN(uC;IN8X%~CW(NgxeDKGUkUu?`|pY0URbSf369rWkBN%z z`Gdf~Uq1qA;Slc@&83-@ru||5;S@vp)$VT`WXt@wlahBYJq=oSg_dod|S@iAIt7+?1kQ-x3ww zbX-8;D~2C^<#45ii2q`qFh9{$B!V$!X5sO_02lw|-<^e6B!|%12O{jMs-^Ic=Y0QfB3N^oOUOTS$AvSmP5(x|H{juSL4=Ag z>JNnT;oXAdUpR=}LTn3^@$;Svx8KMoQA&b_<@Q+ga;2p>(Dvp76lDF8B z=8%AIvyx^I4>&qD^B4mAVvor`+C*MS2z3MS5~@4WH@mC9BFD(BL0*}XeErgm> z@E-)1L}n5dv=Hnf73P5lGQvNd6Kgw{+KCdIwJ#9vCw+e&*=K^*79ugHXdZa@_sU63 z7-=*Ydh8$Y`XZ=_wsYAl@pK+6*BIose;r5=X18#31X#Ra9=xx^#l*U_7_`~v|6yRW zL4ym2D44p)KOQRt4wCYDz!R*pkhqDGFULd#P$E8YaWs-k{|AvEfsl(MkSLi4{A=>D zKw%PRxj4{pqG%pe;pMuENy?;x&s($vY%H8}ak#T(^8mZxeM6+l+%=nc`b8jrybhC zuh2=b35ww!zJl?QpH?jkp4x-ZkDv*QRwl2iyGdXTF+Im%qoy zxAu;=4F?A|noZ%hrq#nP%PzBPV`c1FMcC`|w?wwAYqLSwxM$<)2)C(goZDCv-DL5N zw0TK?TSm8R+%>YvQaR+>xB=kUz3z%$uWO@k+->pgZ~>exO>Ly_otrmSj%?o4HQ-t0 z+Zpe3)D>83>57#{JsZtC`Q926zp2qjaB3UzMLo@%O))rTpQp72+V>2RGFPBC>{IV|-Z0!pH*Yk@`u6YgZQj`9bNg5C*t}^q;2$4P zpm7N3GUPTV;Q{oV#iP>f+SFLNsiLRS-5MM3x{Z~luHm)bjwU;dyVnH$4tZJ~gyt}& zUEu+sJ3qF17IvU_9B>@qHpbd)@%wzP)}DbRTASUL-m(8(oVv`L{pL}3Mc1gyQaRvh z^^tie^R|$7ffLqUY>vsC!Q9RCw{V_;-k#Q869;qCHQeKJ1-d+6Va@@+UA_eWkvUj| zo(t-@TU%g_w2UXuqi18~=w?gJNPOIj;92A4{Jq1T7{5lg053uxrp;u1v;tqW?yBi% zBI|E1QAv}3Px(0fq+{W2;1pC6*Uneo18;7+!c48SAoBO0Xi_y^UYONXiyuD+Ke;4dO zGkf5dk|}N&^QMZ%A&}L+_}+RV+Uc6>{GpIKDK_-l5!PPG5R-srOI{Hi<%GQ_+S zWHQKf*#EsJ+Cv2TEmn6iI?k1K|G7C9=p9>+i|8?g&K9C~I8G>|U6$3`w`}Z+>nID? z^xX0x zZ$VQ0IqQq`?{4)G{uOZS?5-#;x03nz3iH=8Tm$?o%m+X_(Rk*~Uz5FjpuF5NGoIM` z`l9*UPVn2=4DbZ}V(VX6gu=kdXyHQ$1ZPHn*YKu_8M!HxSV;Tvnf7u1oIn4BYVr1# zEo7Y$hH#Vvt8EhC!`<4>#pMBc3N{07Q_lds`ZKSejdItc?oARIP(I&O>G2U>kPcl!FN@X4o2`@RRyd~=-Xy@5cmSW zeV+Ui+6($6G_u>*tS9ZO34FL~Y#I`t!s>$?;Zpb#!SX4>tTi9CS_$Qwqpr3UhP6+&Fd%J)STsq?GT|gyiy%SLr+i$>@0K*=SkKazV zK-fRNsJ$T1X3l)}j(P3v>j4viUmc6fLjt8yXivZaJW@X#$3L;2=F@LAsS3JGcuQ~0 z9z*Vdz5qh#&z)!=r@tTylISmpj(IkwnK=Nt38dbf*sozpHEmAxXOGS2eEBTMAJ3*P zpPSgKKqZLRPsq!*dF|(qXAQyM)!GZT0Fasfnjtrw!G!tf7>1^Ve)0K$L4ZOM!*8cu z;G_7Ug*tKk=8w0F6wVtj;7LA%z$BzUH&JBbAAkTKl*x{Aq3uyVVu0838wA598#i3C YF_R`_#KUV8{zv~%ej`HN_8#;90m3+yy#N3J literal 0 HcmV?d00001 diff --git a/test_dedup.py b/tests/test_dedup.py similarity index 81% rename from test_dedup.py rename to tests/test_dedup.py index f5ea811..721fd71 100644 --- a/test_dedup.py +++ b/tests/test_dedup.py @@ -14,20 +14,16 @@ import os import sys -# Import the functions we want to test -try: - from dedup import ( - remove_duplicates, - fuzzy_filter, - convert_df_to_dict, - flatten_data, - dedup_df, - count_items, - create_parquet_file - ) -except ImportError as e: - print(f"Import error: {e}") - print("Some functions may not be available for testing") + +from dedup import ( + remove_duplicates, + fuzzy_filter, + convert_df_to_dict, + flatten_data, + dedup_df, + _count_items, + create_parquet_file +) class TestDedupEndToEnd(unittest.TestCase): @@ -193,10 +189,10 @@ def test_exact_deduplication(self): """Test exact duplicate removal.""" try: data_dict = convert_df_to_dict(self.df) - original_count = count_items(data_dict) + original_count = _count_items(data_dict) deduplicated_data = remove_duplicates(data_dict) - deduplicated_count = count_items(deduplicated_data) + deduplicated_count = _count_items(deduplicated_data) # Should have fewer or equal items after deduplication self.assertLessEqual(deduplicated_count, original_count) @@ -211,7 +207,7 @@ def test_fuzzy_deduplication_small(self): """Test fuzzy duplicate removal with small threshold for faster testing.""" try: data_dict = convert_df_to_dict(self.df) - original_count = count_items(data_dict) + original_count = _count_items(data_dict) # 
Use small parameters for faster testing fuzzy_deduplicated_data = fuzzy_filter( @@ -222,7 +218,7 @@ def test_fuzzy_deduplication_small(self): rows_per_band=32 # Fewer rows per band ) - fuzzy_count = count_items(fuzzy_deduplicated_data) + fuzzy_count = _count_items(fuzzy_deduplicated_data) # Should have fewer or equal items after fuzzy deduplication self.assertLessEqual(fuzzy_count, original_count) @@ -237,7 +233,7 @@ def test_flatten_and_reconstruct(self): """Test flattening and reconstruction of data.""" try: data_dict = convert_df_to_dict(self.df) - original_count = count_items(data_dict) + original_count = _count_items(data_dict) # Flatten flattened_data = flatten_data(data_dict) @@ -392,11 +388,103 @@ def test_duplicate_detection(self): self.assertGreater(similar_code_count, 0, "Should have fuzzy duplicates for testing") +class TestIntegrationWithParquetFixtures(unittest.TestCase): + """Integration tests using real parquet fixtures.""" + + FIXTURES_DIR = os.path.join(os.path.dirname(__file__), 'fixtures') + + @classmethod + def setUpClass(cls): + """Load parquet fixtures once for all tests.""" + submissions_path = os.path.join(cls.FIXTURES_DIR, 'submissions_fixture.parquet') + if os.path.exists(submissions_path): + cls.submissions_df = pd.read_parquet(submissions_path) + # Decode bytes to string if needed + if cls.submissions_df['code'].dtype == object and len(cls.submissions_df) > 0: + if isinstance(cls.submissions_df['code'].iloc[0], bytes): + cls.submissions_df['code'] = cls.submissions_df['code'].apply( + lambda x: x.decode('utf-8') if isinstance(x, bytes) else x + ) + else: + cls.submissions_df = None + + def test_exact_dedup_on_fixture(self): + """Test exact deduplication on real fixture data.""" + if self.submissions_df is None: + self.skipTest("Fixture not available") + + data_dict = convert_df_to_dict(self.submissions_df) + original_count = _count_items(data_dict) + + deduplicated = remove_duplicates(data_dict) + dedup_count = _count_items(deduplicated) + + # Should have same or fewer items + self.assertLessEqual(dedup_count, original_count) + # Structure should be preserved + self.assertEqual(set(data_dict.keys()), set(deduplicated.keys())) + + def test_fuzzy_dedup_on_fixture(self): + """Test fuzzy deduplication on real fixture data.""" + if self.submissions_df is None: + self.skipTest("Fixture not available") + + data_dict = convert_df_to_dict(self.submissions_df) + original_count = _count_items(data_dict) + + # Use smaller parameters for faster testing + fuzzy_deduped = fuzzy_filter( + data_dict, + threshold=0.5, + ngram_size=3, + bands=4, + rows_per_band=32 + ) + + fuzzy_count = _count_items(fuzzy_deduped) + + # Should have same or fewer items + self.assertLessEqual(fuzzy_count, original_count) + + def test_full_pipeline_on_fixture(self): + """Test the full dedup pipeline on real fixture data.""" + if self.submissions_df is None: + self.skipTest("Fixture not available") + + data_dict = convert_df_to_dict(self.submissions_df) + original_count = _count_items(data_dict) + + # Run exact dedup first + exact_deduped = remove_duplicates(data_dict) + + # Then fuzzy dedup with smaller params + fuzzy_deduped = fuzzy_filter( + exact_deduped, + threshold=0.5, + ngram_size=3, + bands=4, + rows_per_band=32 + ) + + # Flatten to DataFrame + flattened = flatten_data(fuzzy_deduped) + result_df = pd.DataFrame(flattened) + + # Verify output + self.assertIsInstance(result_df, pd.DataFrame) + self.assertLessEqual(len(result_df), original_count) + + # Should preserve key columns + if 
len(result_df) > 0: + self.assertIn('code', result_df.columns) + self.assertIn('run_mode', result_df.columns) + + if __name__ == '__main__': # Add some helpful output print("Running deduplication pipeline tests...") print(f"Python version: {sys.version}") print(f"Pandas version: {pd.__version__}") - + # Run the tests unittest.main(verbosity=2) \ No newline at end of file
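
The fuzzy path in this patch rests on datasketch's MinHash/LSH. The standalone sketch below is not part of the diff: the strings, threshold, and permutation count are invented for illustration, and it deliberately avoids the repo's own helpers. It only demonstrates the behaviour that `create_minhashes` and `create_similarity_matrix` rely on: byte-level n-gram MinHashes of near-identical code land in the same LSH buckets, while unrelated code does not.

```python
# Standalone illustration of the MinHash/LSH behaviour the dedup code relies on.
# Strings and parameters below are invented for the demo; only the datasketch API is real.
from datasketch import MinHash, MinHashLSH

def minhash_of(text: str, ngram_size: int = 5, num_perm: int = 128) -> MinHash:
    """Byte-level n-gram MinHash, in the spirit of _create_single_minhash."""
    m = MinHash(num_perm=num_perm)
    data = text.lower().encode("utf8")
    for i in range(len(data) - ngram_size + 1):
        m.update(data[i:i + ngram_size])
    return m

a = minhash_of("for i in range(10):\n    total += values[i] * weights[i]\n")
b = minhash_of("for i in range(10):\n    total += values[i] * weights[i]  # fused\n")
c = minhash_of("def unrelated():\n    return 'a completely different submission'\n")

lsh = MinHashLSH(threshold=0.5, num_perm=128)  # num_perm must match the MinHash objects
lsh.insert("a", a)
lsh.insert("c", c)

print(lsh.query(b))   # very likely ['a']; LSH is probabilistic, but 'c' should not appear
print(a.jaccard(b))   # MinHash estimate of n-gram Jaccard similarity, well above 0.5
```

The index and every signature must share the same permutation count, which is what the `num_bands * rows_per_band` expression in `create_similarity_matrix` keeps in sync.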
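
For the in-memory path that `dedup_file()` and the tests exercise, the composition is: `convert_df_to_dict` groups rows by `run_mode`, `run_passed`, and duration; `remove_duplicates` collapses exact sha256-of-code duplicates inside each group; `fuzzy_filter` then removes near-duplicates; `flatten_data` turns the nested dict back into rows. Below is a minimal sketch on invented toy rows, assuming `dedup.py` from this patch series imports cleanly and reusing the small LSH parameters from `tests/test_dedup.py`.

```python
# Toy end-to-end run of the in-memory pipeline; the three rows are invented for
# illustration (real rows carry many more columns, e.g. submission_id and timestamps).
import pandas as pd

from dedup import (
    convert_df_to_dict, remove_duplicates, fuzzy_filter, flatten_data, _count_items
)

kernel = "__global__ void add(float* a, float* b, float* c) { c[0] = a[0] + b[0]; }"
rows = [
    {"code": kernel, "run_mode": "leaderboard", "run_passed": True,
     "run_score": 1.0, "run_meta": {"duration": 0.5}},
    {"code": kernel, "run_mode": "leaderboard", "run_passed": True,        # exact duplicate
     "run_score": 1.0, "run_meta": {"duration": 0.5}},
    {"code": kernel + "  // minor tweak", "run_mode": "leaderboard",       # near duplicate
     "run_passed": True, "run_score": 1.0, "run_meta": {"duration": 0.5}},
]
df = pd.DataFrame(rows)

data_dict = convert_df_to_dict(df)           # run_mode -> run_passed -> duration -> [rows]
exact = remove_duplicates(data_dict)         # hash-based dedup inside each group
fuzzy = fuzzy_filter(exact, threshold=0.5,   # same small parameters as tests/test_dedup.py
                     ngram_size=3, bands=4, rows_per_band=32)

print(_count_items(data_dict), "->", _count_items(exact), "->", _count_items(fuzzy))
result_df = pd.DataFrame(flatten_data(fuzzy))  # flat frame, ready for to_parquet(...)
```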
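
The fixture generator and the test suite added here can also be driven from Python instead of the shell. The snippet below is a hypothetical convenience wrapper, not part of the patch; it assumes the repository root is the working directory and that `data/submissions.parquet` and `data/successful_submissions.parquet` exist where `create_fixtures.py` expects them.

```python
# Regenerate the 5-row parquet fixtures, then run the dedup test suite.
# create_fixture() exits with an error message if the source parquet files are missing.
import unittest

from tests.fixtures.create_fixtures import create_fixture

create_fixture("submissions")
create_fixture("successful_submissions")

suite = unittest.defaultTestLoader.discover("tests", pattern="test_dedup.py")
unittest.TextTestRunner(verbosity=2).run(suite)
```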