From 15c53ebe903bedec04b3fe4971188e0535a00c96 Mon Sep 17 00:00:00 2001 From: Sahan Paliskara Date: Mon, 23 Jun 2025 16:45:07 -0400 Subject: [PATCH 1/9] Add deduplication logic --- README.md | 15 ++ dedup.py | 398 +++++++++++++++++++++++++++++++++++++++++++++++++ export.py | 28 +++- test_dedup.py | 402 ++++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 842 insertions(+), 1 deletion(-) create mode 100644 dedup.py create mode 100644 test_dedup.py diff --git a/README.md b/README.md index 54fe6b8..0113e64 100644 --- a/README.md +++ b/README.md @@ -34,3 +34,18 @@ python export.py The script will create a directory at the specified output path containing the dataset in Parquet format. If `--output_dir` is not provided, it will save to `dataset` in the current working directory. +## Tests +The deduplication scripts can be tested by running +```bash +python test_dedup.py +# if you have pytest you can run +python -m pytest test_dedup.py -v +``` +To test things we actually create a fake dataset. Here are the features of it +The test creates a 50-entry dataset with: +- **Exact duplicates**: First 5 entries use identical code +- **Fuzzy duplicates**: Next 5 entries use similar code with small variations +- **Multiple run modes**: `leaderboard`, `test`, `benchmark` +- **Mixed success states**: Both `True` and `False` values for `run_passed` +- **Realistic struct data**: Complex nested structures for `run_result`, `run_compilation`, `run_meta`, and `run_system_info` +- **Proper timestamps**: All timestamp fields include timezone information diff --git a/dedup.py b/dedup.py new file mode 100644 index 0000000..b316e38 --- /dev/null +++ b/dedup.py @@ -0,0 +1,398 @@ +# script to dedup a huggingface dataset + +from datasets import load_dataset +import tqdm +from collections import defaultdict +import hashlib +from typing import Dict, List, Tuple, Union + +import datasketch +import pandas as pd + +def remove_duplicates(data_dict: Dict[str, Dict[bool, Dict[Union[float, int], List[Dict]]]]): + """ + Remove exact duplicates from the nested data structure returned by get_sorted_hf_data. 
+ + Args: + data_dict: Nested dictionary structure from get_sorted_hf_data + + Returns: + Dictionary with same structure but duplicates removed + """ + deduplicated_dict = {} + + for run_mode, score_duration_dict in tqdm.tqdm(data_dict.items(), desc="Processing run modes"): + deduplicated_dict[run_mode] = {} + + for run_success, run_success_dict in tqdm.tqdm(score_duration_dict.items(), desc=f"Processing {run_mode}", leave=False): + deduplicated_dict[run_mode][run_success] = {} + for score_duration, rows in tqdm.tqdm(run_success_dict.items(), desc=f"Processing {run_mode}", leave=False): + # Use a dictionary to track unique entries by their content hash + unique_entries = {} + + for row in tqdm.tqdm(rows, desc=f"Processing {run_mode} {score_duration}", leave=False): + # Create a hash of the relevant content (assuming 'input' or similar field exists) + # If the row has an 'input' field, use that; otherwise use the entire row + content = row.get('code', "") + content_hash = hashlib.sha256(content.encode()).hexdigest() + + if content_hash not in unique_entries: + unique_entries[content_hash] = row + else: + # If duplicate found, keep the one with better metrics + existing_row = unique_entries[content_hash] + + # For leaderboard mode with successful runs, prefer higher scores + if run_mode == 'leaderboard' and row.get('run_passed') == True: + if row.get('run_score', 0) > existing_row.get('run_score', 0): + unique_entries[content_hash] = row + # For other cases, prefer shorter duration (faster execution) + else: + existing_duration = existing_row.get('run_meta', {}).get('duration', float('inf')) + current_duration = row.get('run_meta', {}).get('duration', float('inf')) + if current_duration < existing_duration: + unique_entries[content_hash] = row + + deduplicated_dict[run_mode][run_success][score_duration] = list(unique_entries.values()) + + return deduplicated_dict + + +def create_minhashes( + documents: List[Dict[str, str]], + ngram_size: int = 5, + bands: int = 20, + rows_per_band: int = 128, +) -> Tuple[Dict[str, datasketch.MinHash], int]: + """ + Create MinHash signatures for a list of documents with LSH bands configuration. 
+ + Args: + documents: List of dictionaries, each containing 'submission_id' and 'input' keys + num_permutations: Number of hash functions to use (default: 100) + ngram_size: Size of n-grams to generate from input text (default: 3) + bands: Number of bands for LSH (default: 20) + + Returns: + Tuple containing: + - Dictionary mapping document submission_ids to their MinHash signatures + - Rows per band (num_permutations / bands) + + Raises: + ValueError: If num_permutations is not divisible by bands + """ + + num_permutations = rows_per_band * bands + + def generate_ngrams(text: str, n: int) -> List[str]: + """Generate n-grams from input text.""" + return [text[i : i + n] for i in range(len(text) - n + 1)] + + # Initialize result dictionary + minhash_dict = {} + # Process each document + for doc in tqdm.tqdm(documents, desc="Creating minhashes"): + minhash = datasketch.MinHash(num_perm=num_permutations) + submission_id = doc["submission_id"] + text = doc["code"].lower() # Convert to lowercase for consistency + + # Generate n-grams + ngrams = generate_ngrams(text, ngram_size) + for ngram in ngrams: + minhash.update(ngram.encode("utf8")) + + minhash_dict[submission_id] = minhash + + return minhash_dict + + +# 16 bands with 128 rows +def create_similarity_matrix( + minhashes: Dict[str, datasketch.MinHash], + rows_per_band: int, + num_bands: int, + threshold: float, +) -> Dict[str, List[str]]: + lsh = datasketch.MinHashLSH(threshold=threshold, num_perm=num_bands * rows_per_band) + print(f"num_perm: {num_bands*rows_per_band}") + similarity_matrix = {} + for submission_id, minhash in tqdm.tqdm(minhashes.items(), desc="Inserting minhashes into LSH"): + lsh.insert(submission_id, minhash) + for submission_id, minhash in tqdm.tqdm(minhashes.items(), desc="Querying LSH"): + similar_submission_ids = lsh.query(minhash) + similarity_matrix[submission_id] = similar_submission_ids + for submission_id, similar_submission_ids in tqdm.tqdm( + similarity_matrix.items(), desc="Removing self-similarities" + ): + if submission_id in similar_submission_ids: + similar_submission_ids.remove(submission_id) + return similarity_matrix + + +def filter_matrix( + similarity_matrix: Dict[str, List[str]] +) -> set: + good_submission_ids = set() + processed = set() + + for submission_id, similar_submission_ids in similarity_matrix.items(): + if submission_id in processed: + continue + + # Find all submissions in the similarity cluster + cluster = {submission_id} + cluster.update(similar_submission_ids) + + # Keep the one with the largest ID (tiebreaker) + keeper = max(cluster) + good_submission_ids.add(keeper) + + # Mark all in cluster as processed + processed.update(cluster) + + return good_submission_ids + + +def fuzzy_filter( + data_dict: Dict[str, Dict[bool, Dict[Union[float, int], List[Dict]]]], + threshold: float = 0.7, + ngram_size: int = 5, + bands: int = 16, + rows_per_band: int = 128, +) -> Dict[str, Dict[bool, Dict[Union[float, int], List[Dict]]]]: + + total_categories = 0 + for run_mode, run_success_dict in data_dict.items(): + for run_success, score_duration_dict in run_success_dict.items(): + for score_duration, rows in score_duration_dict.items(): + total_categories += 1 + + deduped_data = defaultdict(lambda: defaultdict(lambda: defaultdict(list))) + current_category = 0 + for run_mode, run_success_dict in data_dict.items(): + for run_success, score_duration_dict in run_success_dict.items(): + for score_duration, rows in score_duration_dict.items(): + print(f"Processing {run_mode} {run_success} 
{score_duration} {len(rows)}") + print(f"This is {current_category} of {total_categories}") + current_category += 1 + deduped_data[run_mode][run_success][score_duration] = _fuzzy_filter(rows, threshold, ngram_size, bands, rows_per_band) + + return deduped_data + +def _fuzzy_filter( + data_list: List[Dict], + threshold: float = 0.7, + ngram_size: int = 5, + bands: int = 16, + rows_per_band: int = 128, +) -> List[Dict]: + """ + Apply fuzzy deduplication to the nested data structure returned by get_sorted_hf_data. + + Args: + data_dict: Nested dictionary structure from get_sorted_hf_data + threshold: Similarity threshold for LSH + ngram_size: Size of n-grams for MinHash + bands: Number of bands for LSH + rows_per_band: Rows per band for LSH + create_histogram: Whether to create similarity histogram + + Returns: + Dictionary with same structure but fuzzy duplicates removed + """ + # Flatten the data for processing + + # Create documents for MinHash processing + + if len(data_list) <= 1: + return data_list + + all_documents = [] + for i, row in tqdm.tqdm(enumerate(data_list), desc="Creating documents for MinHash"): + # Use 'input' field if available, otherwise use a string representation + content = row.get('code', str(row)) + document = { + "submission_id": str(i), + "code": content, + "original_row": row + } + all_documents.append(document) + + # Apply fuzzy deduplication + minhashes = create_minhashes( + all_documents, ngram_size=ngram_size, bands=bands, rows_per_band=rows_per_band + ) + similarity_matrix = create_similarity_matrix( + minhashes, rows_per_band=rows_per_band, num_bands=bands, threshold=threshold + ) + + good_submission_ids = filter_matrix(similarity_matrix) + + # Keep only the documents that passed the filter + good_documents = [all_documents[int(submission_id)]["original_row"] for submission_id in good_submission_ids] + + # Reconstruct the nested structure + return good_documents + +def get_hf_data() -> Dict[str, Dict[Union[float, int], List[Dict]]]: + # Login using e.g. 
`huggingface-cli login` to access this dataset + ds = load_dataset("GPUMODE/kernelbot-data", "submissions") + + # we should divide things up into type + # run_mode + # run_sucess + # if run_mode is leaderboard then use score + # otherwise use run_meta[duration] + + + data = ds['train'] + + run_mode_dict = defaultdict(list) + run_success_dict = defaultdict(lambda: defaultdict(list)) + run_duration_dict = defaultdict(lambda: defaultdict(lambda: defaultdict(list))) + + for _, row in tqdm.tqdm(enumerate(data), desc="Processing dataset rows"): + run_mode = row['run_mode'] + run_mode_dict[run_mode].append(row) + + for run_mode, rows in tqdm.tqdm(run_mode_dict.items(), desc="Processing run modes"): + for row in tqdm.tqdm(rows, desc=f"Processing {run_mode} success/failure", leave=False): + run_success_dict[run_mode][row['run_passed']].append(row) + + for run_mode, mode_dict in tqdm.tqdm(run_success_dict.items(), desc="Processing success/failure groups"): + for run_success, rows in tqdm.tqdm(mode_dict.items(), desc=f"Processing {run_mode}", leave=False): + for row in tqdm.tqdm(rows, desc=f"Processing {run_mode} {run_success} rows", leave=False): + if run_mode == 'leaderboard' and run_success == True: + rounded_score = round(float(row['run_score']), 4) + run_duration_dict[run_mode][run_success][rounded_score].append(row) + else: + rounded_duration = round(float(row['run_meta']['duration']), 0) + run_duration_dict[run_mode][run_success][rounded_duration].append(row) + + return run_duration_dict + +def convert_df_to_dict(df: pd.DataFrame) -> Dict[str, Dict[bool, Dict[Union[float, int], List[Dict]]]]: + """ + Convert a pandas DataFrame to a nested dictionary structure. + + Args: + df: pandas DataFrame + + Returns: + Nested dictionary structure + """ + data_dict = defaultdict(lambda: defaultdict(lambda: defaultdict(list))) + for _, row in tqdm.tqdm(df.iterrows(), desc="Processing DataFrame rows"): + run_mode = row['run_mode'] + run_success = row['run_passed'] + score_duration = row['run_meta']['duration'] + data_dict[run_mode][run_success][score_duration].append(row) + return data_dict + +def flatten_data(data_dict: Dict[str, Dict[Union[float, int], List[Dict]]]) -> List[Dict]: + """ + Flatten the nested data structure to a list of documents with metadata. + + Args: + data_dict: Nested dictionary structure from get_sorted_hf_data + + Returns: + List of documents with additional metadata fields + """ + flattened = [] + for run_mode, run_success_dict in tqdm.tqdm(data_dict.items(), desc="Flattening data"): + for run_success, score_duration_dict in run_success_dict.items(): + for score_duration, rows in score_duration_dict.items(): + for row in tqdm.tqdm(rows, desc=f"Processing {run_mode} {score_duration}", leave=False): + # Add metadata to each row + row_with_metadata = row.copy() + row_with_metadata['_run_mode'] = run_mode + row_with_metadata['_run_success'] = run_success + row_with_metadata['_score_duration'] = score_duration + flattened.append(row_with_metadata) + return flattened + +def count_items(data_dict: Dict[str, Dict[bool, Dict[Union[float, int], List[Dict]]]]) -> int: + """ + Count total number of items in the nested data structure. 
+ + Args: + data_dict: Nested dictionary structure from get_sorted_hf_data + + Returns: + Total number of items + """ + total = 0 + for run_mode in data_dict.values(): + for run_success_dict in run_mode.values(): + for rows in run_success_dict.values(): + total += len(rows) + return total + + +def example_usage(): + """ + Example of how to use the deduplication functions with get_hf_data output. + """ + # Load the data + data = get_hf_data() + + print(f"Original data has {count_items(data)} total items") + + # Remove exact duplicates + deduplicated_data = remove_duplicates(data) + print(f"After exact deduplication: {count_items(deduplicated_data)} items") + + # Apply fuzzy deduplication + fuzzy_deduplicated_data = fuzzy_filter( + deduplicated_data, + threshold=0.8, # High threshold for more strict deduplication + ngram_size=5, + bands=16, + rows_per_band=128 + ) + # convert to df + flattened_data = flatten_data(fuzzy_deduplicated_data) + df = pd.DataFrame(flattened_data) + + return df + +def dedup_df(df: pd.DataFrame) -> pd.DataFrame: + """ + Deduplicate a pandas DataFrame. + + Args: + df: pandas DataFrame + """ + # convert to dict + data_dict = convert_df_to_dict(df) + # deduplicate + deduplicated_data = fuzzy_filter(data_dict, threshold=0.8, ngram_size=5, bands=16, rows_per_band=128) + # convert to df + flattened_data = flatten_data(deduplicated_data) + df = pd.DataFrame(flattened_data) + return df + +def create_parquet_file(data_dict: Dict[str, Dict[Union[float, int], List[Dict]]], filename: str): + """ + Create a Parquet file from the nested data structure. + + Args: + data_dict: Nested dictionary structure from get_sorted_hf_data + filename: Name of the output Parquet file + """ + # Flatten the data + flattened_data = flatten_data(data_dict) + + # Create a pandas DataFrame from the flattened data + df = pd.DataFrame(flattened_data) + # Convert the DataFrame to a Parquet file + df.to_parquet(filename, index=False) + + + +def main(): + example_usage() + +if __name__ == "__main__": + main() diff --git a/export.py b/export.py index b4e3cdb..8c534ae 100644 --- a/export.py +++ b/export.py @@ -5,6 +5,7 @@ from datasets import Dataset from dotenv import load_dotenv from sqlalchemy import create_engine, text +from dedup import dedup_df load_dotenv() @@ -199,11 +200,27 @@ def main(output_dir): submissions_dataset.to_parquet(submissions_output_path) print(f"Submissions dataset successfully saved to {submissions_output_path}") + # Deduplicate submissions + print("Applying deduplication to submissions...") + try: + deduplicated_submissions_df = dedup_df(submissions_df.copy()) + deduplicated_submissions_path = os.path.join(output_dir, "deduplicated_submissions.parquet") + + # Convert to dataset and save + deduplicated_submissions_dataset = Dataset.from_pandas(deduplicated_submissions_df) + deduplicated_submissions_dataset.to_parquet(deduplicated_submissions_path) + print(f"Deduplicated submissions dataset successfully saved to {deduplicated_submissions_path}") + print(f"Original submissions: {len(submissions_df)}, After deduplication: {len(deduplicated_submissions_df)}") + + except Exception as e: + print(f"Warning: Deduplication failed with error: {e}") + print("Proceeding without deduplication...") + deduplicated_submissions_df = submissions_df.copy() + # Filter for and save successful submissions from the anonymized data if 'run_passed' in submissions_df.columns: print("Creating successful submissions dataset...") successful_submissions_df = submissions_df[submissions_df['run_passed'] == 
True].copy() - # Convert to dataset and save successful_submissions_dataset = Dataset.from_pandas(successful_submissions_df) successful_output_path = os.path.join( @@ -215,6 +232,15 @@ def main(output_dir): f"{successful_output_path}" ) + # Create deduplicated successful submissions + print("Creating deduplicated successful submissions dataset...") + deduplicated_successful_submissions_df = deduplicated_submissions_df[deduplicated_submissions_df['run_passed'] == True].copy() + deduplicated_successful_submissions_dataset = Dataset.from_pandas(deduplicated_successful_submissions_df) + deduplicated_successful_submissions_path = os.path.join(output_dir, "deduplicated_successful_submissions.parquet") + deduplicated_successful_submissions_dataset.to_parquet(deduplicated_successful_submissions_path) + print(f"Deduplicated successful submissions dataset successfully saved to {deduplicated_successful_submissions_path}") + print(f"Original successful submissions: {len(successful_submissions_df)}, After deduplication: {len(deduplicated_successful_submissions_df)}") + if __name__ == "__main__": parser = argparse.ArgumentParser(description="Export leaderboard data to a Hugging Face dataset.") diff --git a/test_dedup.py b/test_dedup.py new file mode 100644 index 0000000..f5ea811 --- /dev/null +++ b/test_dedup.py @@ -0,0 +1,402 @@ +#!/usr/bin/env python3 +""" +Unit tests for the deduplication pipeline. +Tests the end-to-end flow with fake data matching the database schema. +""" + +import unittest +import pandas as pd +import numpy as np +from datetime import datetime, timezone +import random +from typing import Dict, List, Any +import tempfile +import os +import sys + +# Import the functions we want to test +try: + from dedup import ( + remove_duplicates, + fuzzy_filter, + convert_df_to_dict, + flatten_data, + dedup_df, + count_items, + create_parquet_file + ) +except ImportError as e: + print(f"Import error: {e}") + print("Some functions may not be available for testing") + + +class TestDedupEndToEnd(unittest.TestCase): + + def setUp(self): + """Set up test fixtures with fake data matching the schema.""" + random.seed(42) # For reproducible tests + np.random.seed(42) + self.fake_data = self.create_fake_dataset(50) + self.df = pd.DataFrame(self.fake_data) + + def create_fake_dataset(self, num_entries: int) -> List[Dict[str, Any]]: + """Create a fake dataset with the required schema fields.""" + fake_data = [] + + # Sample code snippets (some duplicates for testing) + code_samples = [ + "def hello_world():\n print('Hello World')", + "import numpy as np\nx = np.array([1, 2, 3])", + "for i in range(10):\n print(i)", + "class MyClass:\n def __init__(self):\n pass", + "def fibonacci(n):\n if n <= 1:\n return n\n return fibonacci(n-1) + fibonacci(n-2)", + "import pandas as pd\ndf = pd.DataFrame({'a': [1, 2, 3]})", + "def quicksort(arr):\n if len(arr) <= 1:\n return arr", + "x = [1, 2, 3, 4, 5]\ny = [i**2 for i in x]", + "try:\n result = 10 / 0\nexcept ZeroDivisionError:\n print('Error')", + "def hello_world():\n print('Hello World')", # Exact duplicate + ] + + run_modes = ['leaderboard', 'benchmark', 'test'] + file_names = ['solution.py', 'main.py', 'algorithm.py', 'test.py'] + + for i in range(num_entries): + # Create base timestamp + base_time = datetime(2024, 1, 1, tzinfo=timezone.utc) + submission_time = base_time.replace( + day=random.randint(1, 28), + hour=random.randint(0, 23), + minute=random.randint(0, 59) + ) + + # Select code (with some duplicates) + code = random.choice(code_samples) + if i < 5: # 
First 5 entries use the same code for exact duplicate testing + code = code_samples[0] + elif i < 10: # Next 5 use slightly modified versions for fuzzy testing + code = code_samples[0] + f"\n# Comment {i}" + + run_mode = random.choice(run_modes) + run_passed = random.choice([True, False]) + + # Generate run score based on mode and success + if run_mode == 'leaderboard' and run_passed: + run_score = round(random.uniform(0.1, 1.0), 4) + else: + run_score = 0.0 if not run_passed else round(random.uniform(0.1, 0.8), 4) + + # Create the entry matching the database schema + entry = { + 'submission_id': i + 1000, + 'leaderboard_id': random.randint(1, 10), + 'user_id': random.randint(100, 999), + 'submission_time': submission_time, + 'file_name': random.choice(file_names), + 'code': code, + 'code_id': i + 2000, + 'run_id': i + 3000, + 'run_start_time': submission_time, + 'run_end_time': submission_time.replace( + second=random.randint(1, 59) + ), + 'run_mode': run_mode, + 'run_score': run_score, + 'run_passed': run_passed, + 'run_result': { + 'benchmark-count': random.randint(1, 10), + 'benchmark.0.best': f'benchmark_{random.randint(1, 100)}.txt', + 'benchmark.0.err': '', + 'benchmark.0.mean': round(random.uniform(0.1, 2.0), 6), + 'benchmark.0.report': f'report_{i}.json' + }, + 'run_compilation': { + 'command': 'python', + 'exit_code': 0 if run_passed else random.randint(1, 255), + 'nvcc_found': random.choice([True, False]), + 'nvcc_version': f'11.{random.randint(0, 8)}', + 'stderr': '' if run_passed else f'Error message {i}', + 'stdout': f'Output {i}', + 'success': run_passed + }, + 'run_meta': { + 'command': 'python solution.py', + 'duration': round(random.uniform(0.1, 10.0), 3), + 'exit_code': 0 if run_passed else random.randint(1, 255), + 'stderr': '' if run_passed else f'Runtime error {i}', + 'stdout': f'Runtime output {i}', + 'success': run_passed + }, + 'run_system_info': { + 'cpu': f'Intel Core i{random.randint(5, 9)}', + 'gpu': random.choice(['NVIDIA RTX 3080', 'NVIDIA RTX 4090', 'None']), + 'platform': random.choice(['linux', 'darwin', 'win32']), + 'torch': f'2.{random.randint(0, 3)}.{random.randint(0, 9)}' + } + } + fake_data.append(entry) + + return fake_data + + def test_dataframe_creation(self): + """Test that the fake dataset creates a valid DataFrame.""" + self.assertEqual(len(self.df), 50) + + # Check required columns exist (matching the schema in the image) + required_columns = [ + 'submission_id', 'leaderboard_id', 'user_id', 'submission_time', + 'file_name', 'code', 'code_id', 'run_id', 'run_start_time', + 'run_end_time', 'run_mode', 'run_score', 'run_passed', + 'run_result', 'run_compilation', 'run_meta', 'run_system_info' + ] + + for col in required_columns: + self.assertIn(col, self.df.columns, f"Missing required column: {col}") + + # Check data types + self.assertTrue(self.df['submission_id'].dtype in ['int64', 'int32']) + self.assertTrue(self.df['run_passed'].dtype == 'bool') + self.assertTrue(self.df['run_score'].dtype in ['float64', 'float32']) + + # Verify struct fields exist + sample_row = self.df.iloc[0] + self.assertIsInstance(sample_row['run_result'], dict) + self.assertIsInstance(sample_row['run_compilation'], dict) + self.assertIsInstance(sample_row['run_meta'], dict) + self.assertIsInstance(sample_row['run_system_info'], dict) + + def test_convert_df_to_dict(self): + """Test conversion from DataFrame to nested dictionary structure.""" + try: + data_dict = convert_df_to_dict(self.df) + + # Check structure + self.assertIsInstance(data_dict, dict) + + # Should have 
run_mode keys + run_modes = set(self.df['run_mode'].unique()) + self.assertEqual(set(data_dict.keys()), run_modes) + + # Check nested structure + for run_mode in data_dict: + self.assertIsInstance(data_dict[run_mode], dict) + for run_success in data_dict[run_mode]: + self.assertIsInstance(data_dict[run_mode][run_success], dict) + for score_duration in data_dict[run_mode][run_success]: + self.assertIsInstance( + data_dict[run_mode][run_success][score_duration], + list + ) + except NameError: + self.skipTest("convert_df_to_dict function not available") + + def test_exact_deduplication(self): + """Test exact duplicate removal.""" + try: + data_dict = convert_df_to_dict(self.df) + original_count = count_items(data_dict) + + deduplicated_data = remove_duplicates(data_dict) + deduplicated_count = count_items(deduplicated_data) + + # Should have fewer or equal items after deduplication + self.assertLessEqual(deduplicated_count, original_count) + + # Structure should be preserved + self.assertEqual(set(data_dict.keys()), set(deduplicated_data.keys())) + + except NameError as e: + self.skipTest(f"Required functions not available: {e}") + + def test_fuzzy_deduplication_small(self): + """Test fuzzy duplicate removal with small threshold for faster testing.""" + try: + data_dict = convert_df_to_dict(self.df) + original_count = count_items(data_dict) + + # Use small parameters for faster testing + fuzzy_deduplicated_data = fuzzy_filter( + data_dict, + threshold=0.5, # Lower threshold for faster testing + ngram_size=3, # Smaller ngram size + bands=4, # Fewer bands + rows_per_band=32 # Fewer rows per band + ) + + fuzzy_count = count_items(fuzzy_deduplicated_data) + + # Should have fewer or equal items after fuzzy deduplication + self.assertLessEqual(fuzzy_count, original_count) + + # Structure should be preserved + self.assertEqual(set(data_dict.keys()), set(fuzzy_deduplicated_data.keys())) + + except NameError as e: + self.skipTest(f"Required functions not available: {e}") + + def test_flatten_and_reconstruct(self): + """Test flattening and reconstruction of data.""" + try: + data_dict = convert_df_to_dict(self.df) + original_count = count_items(data_dict) + + # Flatten + flattened_data = flatten_data(data_dict) + self.assertEqual(len(flattened_data), original_count) + + # Check metadata fields were added + if flattened_data: + sample_row = flattened_data[0] + self.assertIn('_run_mode', sample_row) + self.assertIn('_run_success', sample_row) + self.assertIn('_score_duration', sample_row) + + except NameError as e: + self.skipTest(f"Required functions not available: {e}") + + def test_dedup_df_end_to_end(self): + """Test the complete deduplication pipeline.""" + try: + original_length = len(self.df) + + # Run the complete deduplication pipeline + deduplicated_df = dedup_df(self.df) + + # Should return a DataFrame + self.assertIsInstance(deduplicated_df, pd.DataFrame) + + # Should have fewer or equal rows + self.assertLessEqual(len(deduplicated_df), original_length) + + # Should preserve required columns + required_columns = ['submission_id', 'code', 'run_mode', 'run_passed'] + for col in required_columns: + self.assertIn(col, deduplicated_df.columns) + + # Check data integrity + self.assertFalse(deduplicated_df.empty, "Deduplicated DataFrame should not be empty") + + except NameError as e: + self.skipTest(f"dedup_df function not available: {e}") + + def test_parquet_creation(self): + """Test Parquet file creation.""" + try: + data_dict = convert_df_to_dict(self.df) + + with 
tempfile.NamedTemporaryFile(suffix='.parquet', delete=False) as tmp_file: + try: + create_parquet_file(data_dict, tmp_file.name) + + # Check file was created + self.assertTrue(os.path.exists(tmp_file.name)) + + # Check file is not empty + self.assertGreater(os.path.getsize(tmp_file.name), 0) + + # Try to read the file back + df_from_parquet = pd.read_parquet(tmp_file.name) + self.assertIsInstance(df_from_parquet, pd.DataFrame) + self.assertGreater(len(df_from_parquet), 0) + + finally: + # Clean up + if os.path.exists(tmp_file.name): + os.unlink(tmp_file.name) + + except NameError as e: + self.skipTest(f"Required functions not available: {e}") + + def test_data_consistency_after_deduplication(self): + """Test that data remains consistent after deduplication.""" + try: + # Create dataset with known duplicates + duplicate_data = [] + + # Add the same code 3 times with different metadata + base_entry = self.fake_data[0].copy() + for i in range(3): + entry = base_entry.copy() + entry['submission_id'] = 9000 + i + entry['run_id'] = 9100 + i + duplicate_data.append(entry) + + # Add to main dataset + test_data = self.fake_data + duplicate_data + test_df = pd.DataFrame(test_data) + + original_length = len(test_df) + deduplicated_df = dedup_df(test_df) + + # Should have removed at least 2 duplicates + self.assertLess(len(deduplicated_df), original_length) + + # Check that essential fields are preserved + self.assertTrue(all(col in deduplicated_df.columns for col in + ['submission_id', 'code', 'run_mode', 'run_passed'])) + + except NameError as e: + self.skipTest(f"Required functions not available: {e}") + + def test_schema_compliance(self): + """Test that the fake dataset matches the expected schema from the database.""" + # Test all required fields exist and have correct types + + # Test BIGINT fields + bigint_fields = ['submission_id', 'leaderboard_id', 'user_id', 'code_id', 'run_id'] + for field in bigint_fields: + self.assertTrue(self.df[field].dtype in ['int64', 'int32'], + f"{field} should be integer type") + + # Test VARCHAR fields + varchar_fields = ['file_name', 'code', 'run_mode'] + for field in varchar_fields: + self.assertTrue(self.df[field].dtype == 'object', + f"{field} should be string/object type") + + # Test TIMESTAMP fields + timestamp_fields = ['submission_time', 'run_start_time', 'run_end_time'] + for field in timestamp_fields: + # Check that all values are datetime objects with timezone + sample_value = self.df[field].iloc[0] + self.assertIsInstance(sample_value, datetime) + self.assertIsNotNone(sample_value.tzinfo) + + # Test DOUBLE field + self.assertTrue(self.df['run_score'].dtype in ['float64', 'float32']) + + # Test BOOLEAN field + self.assertTrue(self.df['run_passed'].dtype == 'bool') + + # Test STRUCT fields + struct_fields = ['run_result', 'run_compilation', 'run_meta', 'run_system_info'] + for field in struct_fields: + # All values should be dictionaries + self.assertTrue(all(isinstance(val, dict) for val in self.df[field])) + + def test_duplicate_detection(self): + """Test that we can detect exact and near duplicates in the dataset.""" + # Count exact duplicates by code + code_counts = self.df['code'].value_counts() + exact_duplicates = code_counts[code_counts > 1] + + # Should have some exact duplicates (first 5 entries) + self.assertGreater(len(exact_duplicates), 0, "Should have exact duplicates for testing") + + # Check that fuzzy duplicates exist (entries with similar code) + similar_code_count = 0 + base_code = "def hello_world():\n print('Hello World')" + for 
code in self.df['code']: + if base_code in code and code != base_code: + similar_code_count += 1 + + self.assertGreater(similar_code_count, 0, "Should have fuzzy duplicates for testing") + + +if __name__ == '__main__': + # Add some helpful output + print("Running deduplication pipeline tests...") + print(f"Python version: {sys.version}") + print(f"Pandas version: {pd.__version__}") + + # Run the tests + unittest.main(verbosity=2) \ No newline at end of file From 2ad5e808d56243da49aa25a44dbac861628263d7 Mon Sep 17 00:00:00 2001 From: Sahan Paliskara Date: Mon, 23 Jun 2025 17:10:24 -0400 Subject: [PATCH 2/9] magic number removal --- dedup.py | 118 ++++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 100 insertions(+), 18 deletions(-) diff --git a/dedup.py b/dedup.py index b316e38..8f209a6 100644 --- a/dedup.py +++ b/dedup.py @@ -9,6 +9,82 @@ import datasketch import pandas as pd +# ============================================================================= +# DEDUPLICATION CONFIGURATION CONSTANTS +# ============================================================================= + +# Fuzzy Deduplication Parameters +FUZZY_SIMILARITY_THRESHOLD = 0.8 +""" +Jaccard similarity threshold for considering two documents as duplicates. +Range: 0.0 to 1.0 +- 0.8 = High threshold, only very similar documents are considered duplicates +- 0.7 = Medium threshold, moderately similar documents are duplicates +- 0.5 = Low threshold, loosely similar documents are duplicates +Higher values = more strict deduplication, fewer items removed +""" + +NGRAM_SIZE = 5 +""" +Size of character n-grams used for MinHash fingerprinting. +- Smaller values (3-4): More sensitive to small changes, better for short text +- Larger values (5-7): Less sensitive to minor variations, better for longer text +- Too small: May create false positives (different texts seem similar) +- Too large: May miss actual duplicates with small variations +""" + +LSH_BANDS = 16 +""" +Number of bands for Locality Sensitive Hashing (LSH). +Used to speed up similarity detection by grouping similar hashes. +- More bands = faster but less accurate similarity detection +- Fewer bands = slower but more accurate similarity detection +Must divide evenly into ROWS_PER_BAND * LSH_BANDS = total permutations +""" + +ROWS_PER_BAND = 128 +""" +Number of rows per band in LSH configuration. +Total MinHash permutations = ROWS_PER_BAND * LSH_BANDS +- More rows per band = higher precision, may miss some similar pairs +- Fewer rows per band = higher recall, may include more false positives +Default: 128 rows × 16 bands = 2048 total permutations +""" + +# Score Processing Parameters +LEADERBOARD_SCORE_PRECISION = 4 +""" +Number of decimal places to round leaderboard scores when grouping submissions. +Used to group submissions with very similar scores together. +- Higher precision (more decimal places): More granular grouping +- Lower precision (fewer decimal places): Broader grouping of similar scores +""" + +DURATION_PRECISION = 0 +""" +Number of decimal places to round execution duration (in seconds). +Used to group submissions with similar execution times. 
+- 0: Round to nearest second (1.7s → 2s) +- 1: Round to nearest 0.1s (1.73s → 1.7s) +""" + +# ============================================================================= +# CONFIGURATION SUMMARY +# ============================================================================= +""" +Current deduplication configuration: +├─ Similarity Detection: 0.8 threshold (strict) +├─ Text Fingerprinting: 5-character n-grams +├─ LSH Performance: 16 bands × 128 rows = 2048 permutations +├─ Score Grouping: 4 decimal places for leaderboard scores +└─ Duration Grouping: 0 decimal places for execution times + +To adjust deduplication sensitivity: +- Increase FUZZY_SIMILARITY_THRESHOLD (0.8→0.9) for stricter deduplication +- Decrease FUZZY_SIMILARITY_THRESHOLD (0.8→0.7) for more aggressive deduplication +- Adjust NGRAM_SIZE for different text lengths (3-4 for short, 5-7 for long) +""" + def remove_duplicates(data_dict: Dict[str, Dict[bool, Dict[Union[float, int], List[Dict]]]]): """ Remove exact duplicates from the nested data structure returned by get_sorted_hf_data. @@ -60,9 +136,9 @@ def remove_duplicates(data_dict: Dict[str, Dict[bool, Dict[Union[float, int], Li def create_minhashes( documents: List[Dict[str, str]], - ngram_size: int = 5, - bands: int = 20, - rows_per_band: int = 128, + ngram_size: int = NGRAM_SIZE, + bands: int = LSH_BANDS, + rows_per_band: int = ROWS_PER_BAND, ) -> Tuple[Dict[str, datasketch.MinHash], int]: """ Create MinHash signatures for a list of documents with LSH bands configuration. @@ -155,10 +231,10 @@ def filter_matrix( def fuzzy_filter( data_dict: Dict[str, Dict[bool, Dict[Union[float, int], List[Dict]]]], - threshold: float = 0.7, - ngram_size: int = 5, - bands: int = 16, - rows_per_band: int = 128, + threshold: float = FUZZY_SIMILARITY_THRESHOLD, + ngram_size: int = NGRAM_SIZE, + bands: int = LSH_BANDS, + rows_per_band: int = ROWS_PER_BAND, ) -> Dict[str, Dict[bool, Dict[Union[float, int], List[Dict]]]]: total_categories = 0 @@ -181,10 +257,10 @@ def fuzzy_filter( def _fuzzy_filter( data_list: List[Dict], - threshold: float = 0.7, - ngram_size: int = 5, - bands: int = 16, - rows_per_band: int = 128, + threshold: float = FUZZY_SIMILARITY_THRESHOLD, + ngram_size: int = NGRAM_SIZE, + bands: int = LSH_BANDS, + rows_per_band: int = ROWS_PER_BAND, ) -> List[Dict]: """ Apply fuzzy deduplication to the nested data structure returned by get_sorted_hf_data. 
@@ -263,10 +339,10 @@ def get_hf_data() -> Dict[str, Dict[Union[float, int], List[Dict]]]: for run_success, rows in tqdm.tqdm(mode_dict.items(), desc=f"Processing {run_mode}", leave=False): for row in tqdm.tqdm(rows, desc=f"Processing {run_mode} {run_success} rows", leave=False): if run_mode == 'leaderboard' and run_success == True: - rounded_score = round(float(row['run_score']), 4) + rounded_score = round(float(row['run_score']), LEADERBOARD_SCORE_PRECISION) run_duration_dict[run_mode][run_success][rounded_score].append(row) else: - rounded_duration = round(float(row['run_meta']['duration']), 0) + rounded_duration = round(float(row['run_meta']['duration']), DURATION_PRECISION) run_duration_dict[run_mode][run_success][rounded_duration].append(row) return run_duration_dict @@ -346,10 +422,10 @@ def example_usage(): # Apply fuzzy deduplication fuzzy_deduplicated_data = fuzzy_filter( deduplicated_data, - threshold=0.8, # High threshold for more strict deduplication - ngram_size=5, - bands=16, - rows_per_band=128 + threshold=FUZZY_SIMILARITY_THRESHOLD, + ngram_size=NGRAM_SIZE, + bands=LSH_BANDS, + rows_per_band=ROWS_PER_BAND ) # convert to df flattened_data = flatten_data(fuzzy_deduplicated_data) @@ -367,7 +443,13 @@ def dedup_df(df: pd.DataFrame) -> pd.DataFrame: # convert to dict data_dict = convert_df_to_dict(df) # deduplicate - deduplicated_data = fuzzy_filter(data_dict, threshold=0.8, ngram_size=5, bands=16, rows_per_band=128) + deduplicated_data = fuzzy_filter( + data_dict, + threshold=FUZZY_SIMILARITY_THRESHOLD, + ngram_size=NGRAM_SIZE, + bands=LSH_BANDS, + rows_per_band=ROWS_PER_BAND + ) # convert to df flattened_data = flatten_data(deduplicated_data) df = pd.DataFrame(flattened_data) From a30f85d0353b6347aef785e827edc7f25f97bae9 Mon Sep 17 00:00:00 2001 From: Sahan Paliskara Date: Tue, 24 Jun 2025 08:43:42 -0400 Subject: [PATCH 3/9] Add deduplicated datasets --- dedup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dedup.py b/dedup.py index 8f209a6..f6f3449 100644 --- a/dedup.py +++ b/dedup.py @@ -118,9 +118,9 @@ def remove_duplicates(data_dict: Dict[str, Dict[bool, Dict[Union[float, int], Li # If duplicate found, keep the one with better metrics existing_row = unique_entries[content_hash] - # For leaderboard mode with successful runs, prefer higher scores + # For leaderboard mode with successful runs, prefer lower scores / faster times if run_mode == 'leaderboard' and row.get('run_passed') == True: - if row.get('run_score', 0) > existing_row.get('run_score', 0): + if row.get('run_score', 0) < existing_row.get('run_score', 0): unique_entries[content_hash] = row # For other cases, prefer shorter duration (faster execution) else: From 16567f949cecc27b946ff352d90f2323681e97a4 Mon Sep 17 00:00:00 2001 From: Benjamin Horowitz Date: Mon, 24 Nov 2025 15:58:03 -0800 Subject: [PATCH 4/9] Updates for 2nd comptetition This change modifies the extraction processes so that it uses a lot less memory. In particular, the process no longer loads the whole dataset into memory before exporting to parquet files. Instead, it processes the dataset into small, incremental parquet files, and then consolidates these files into a single file as the final step. 
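[Editor's note] In rough terms, the new exporter follows the streaming pattern below — a minimal sketch assuming pandas and pyarrow, with illustrative file names rather than the exporter's real ones; the actual code in export.py additionally handles hex-encoded code columns, nullable integer types, and empty structs before writing each part.

```python
# Sketch only: stream chunks to small parquet part files, then merge them under a
# unified schema so no single step holds the full dataset in memory.
import glob
import os

import pyarrow as pa
import pyarrow.parquet as pq


def write_parts(chunks, out_dir):
    """Write each pandas DataFrame chunk to its own small parquet part file."""
    os.makedirs(out_dir, exist_ok=True)
    for i, chunk in enumerate(chunks):
        part_path = os.path.join(out_dir, f"part_{i:05d}.parquet")
        pq.write_table(pa.Table.from_pandas(chunk), part_path)


def consolidate(out_dir, final_path):
    """Merge the part files into one parquet file, reading one part at a time."""
    parts = sorted(glob.glob(os.path.join(out_dir, "part_*.parquet")))
    # Unify schemas first so parts with missing struct fields can still be merged.
    schema = pa.unify_schemas([pq.ParquetFile(p).schema_arrow for p in parts])
    with pq.ParquetWriter(final_path, schema) as writer:
        for p in parts:
            # Casting fills any fields missing from a given part with nulls.
            writer.write_table(pq.read_table(p).cast(schema))
```

The diff below implements this same idea, with the chunks supplied by a streaming database read (pd.read_sql_query(..., chunksize=...)) rather than an in-memory list.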
--- .gitignore | 2 + dataset/README.md | 8 +- export.py | 259 +++++++++++++++++++++++++++++++--------------- requirements.txt | 2 - 4 files changed, 183 insertions(+), 88 deletions(-) diff --git a/.gitignore b/.gitignore index 4c49bd7..76ffd48 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,3 @@ .env +dataset/*.parquet +dataset/submissions/* \ No newline at end of file diff --git a/dataset/README.md b/dataset/README.md index 6084549..52550e2 100644 --- a/dataset/README.md +++ b/dataset/README.md @@ -11,7 +11,13 @@ tags: license: mit --- -If you use GPUMODE/amd-kernels-2025 in your work, please cite: +This is the dataset that was created from the first and second AMD $100K kernel competitions, containing roughly 110K kernels for fp8-gemm, moe, mla, all2all, gemm+reducescatter, and allgather+gemm optimized to run on MI300. Learn more at gpumode.com/v2/news + +To see the full list of kernel competitions we've ran and are running you can checkout https://github.com/gpu-mode/reference-kernels which also contains details on reference kernels and their input shapes and distributions + +We are planning on adding kernels optimized for NVFP4 on Blackwell next + +If you use this dataset in your work, please cite: ```bibtex @inproceedings{ diff --git a/export.py b/export.py index b4e3cdb..6d3078f 100644 --- a/export.py +++ b/export.py @@ -1,10 +1,11 @@ import argparse +import gc import os -import numpy as np import pandas as pd -from datasets import Dataset from dotenv import load_dotenv from sqlalchemy import create_engine, text +import pyarrow as pa +import pyarrow.parquet as pq load_dotenv() @@ -24,32 +25,32 @@ DATABASE_URL = f"postgresql://{DB_USER}:{DB_PASSWORD}@{DB_HOST}:{DB_PORT}/{DB_NAME}" # The leaderboard IDs to export -LEADERBOARD_IDS = [463, 430, 399, 398] +LEADERBOARD_IDS = [463, 430, 399, 398, 563, 564, 565] -def fetch_leaderboards(engine, leaderboard_ids) -> Dataset: +def fetch_and_save_leaderboards(engine, leaderboard_ids, output_path): """ - Fetches and processes leaderboard data from the database. + Fetches leaderboard data from the database and saves it directly to parquet. This function queries the database for specific leaderboards, selecting key fields and fetching all associated GPU types for each leaderboard - using a subquery. + using a subquery. It saves the leaderboards directly to parquet. Args: engine: The SQLAlchemy engine instance for database connection. leaderboard_ids: A list of integer IDs for the leaderboards to fetch. Returns: - A Hugging Face `Dataset` object containing the leaderboard data. + The number of leaderboards. 
""" - print("Fetching leaderboards...") + print("Fetching and saving leaderboards...") query = text(""" SELECT id, name, - deadline, + deadline AT TIME ZONE 'UTC' AS deadline, task->>'lang' AS lang, - task->>'description' AS description, + description, task->'files'->>'reference.py' AS reference, ( SELECT array_agg(gpu_type) @@ -60,10 +61,45 @@ def fetch_leaderboards(engine, leaderboard_ids) -> Dataset: WHERE id = ANY(:leaderboard_ids) """) df = pd.read_sql_query(query, engine, params={'leaderboard_ids': leaderboard_ids}) - return Dataset.from_pandas(df) + df.to_parquet(output_path, index=False) + print(f"Leaderboards saved to {output_path}") + + +def anonymize_users_in_db(engine, leaderboard_ids): + """Create a temporary mapping table in the database.""" + with engine.begin() as conn: + # Create temporary table with anonymized IDs + conn.execute(text(""" + CREATE TEMP TABLE user_mapping AS + SELECT + user_id as original_user_id, + ROW_NUMBER() OVER (ORDER BY RANDOM()) as anonymized_user_id + FROM ( + SELECT DISTINCT user_id + FROM leaderboard.submission + WHERE leaderboard_id = ANY(:leaderboard_ids) + ) AS distinct_users + """), {'leaderboard_ids': leaderboard_ids}) + + +def handle_empty_structs(df): + """ + Replace empty struct/dict values with None to avoid PyArrow serialization errors. + + PyArrow cannot write empty struct types to Parquet. This function checks + columns that contain dict/struct values and replaces empty ones with None. + """ + for col in df.columns: + if df[col].dtype == 'object': + # Check if column contains dict-like objects + sample = df[col].dropna().head(1) + if len(sample) > 0 and isinstance(sample.iloc[0], dict): + # Replace empty dicts with None + df[col] = df[col].apply(lambda x: None if isinstance(x, dict) and len(x) == 0 else x) + return df -def fetch_submissions(engine, leaderboard_ids) -> Dataset: +def fetch_and_save_submissions(engine, leaderboard_ids, output_path, chunksize=8192): """ Fetches and processes submission data from the database. @@ -76,23 +112,20 @@ def fetch_submissions(engine, leaderboard_ids) -> Dataset: engine: The SQLAlchemy engine instance for database connection. leaderboard_ids: A list of integer IDs for the leaderboards whose submissions are to be fetched. - - Returns: - A Hugging Face `Dataset` object containing the submissions data. 
""" print("Fetching submissions...") - query = text(""" + query = """ SELECT s.id AS submission_id, s.leaderboard_id, - s.user_id, - s.submission_time, + um.anonymized_user_id AS user_id, + s.submission_time AT TIME ZONE 'UTC' AS submission_time, s.file_name, c.code, c.id AS code_id, r.id AS run_id, - r.start_time AS run_start_time, - r.end_time AS run_end_time, + r.start_time AT TIME ZONE 'UTC' AS run_start_time, + r.end_time AT TIME ZONE 'UTC' AS run_end_time, r.mode AS run_mode, r.score AS run_score, r.passed AS run_passed, @@ -101,12 +134,61 @@ def fetch_submissions(engine, leaderboard_ids) -> Dataset: r.meta as run_meta, r.system_info AS run_system_info FROM leaderboard.submission AS s - JOIN leaderboard.runs AS r ON s.id = r.submission_id + LEFT JOIN leaderboard.runs AS r ON s.id = r.submission_id JOIN leaderboard.code_files AS c ON s.code_id = c.id + LEFT JOIN user_mapping um ON s.user_id = um.original_user_id WHERE s.leaderboard_id = ANY(:leaderboard_ids) - """) - df = pd.read_sql_query(query, engine, params={'leaderboard_ids': leaderboard_ids}) - return Dataset.from_pandas(df) + """ + + part = 0 + + with engine.connect().execution_options(stream_results=True) as conn: + for chunk_df in pd.read_sql_query( + text(query), + conn, + params={'leaderboard_ids': leaderboard_ids}, + chunksize=chunksize + ): + # Decode hex values code column + if 'code' in chunk_df.columns: + chunk_df['code'] = chunk_df['code'].apply(decode_hex_if_needed) + + # Convert nullable integer columns to consistent types + # This prevents type mismatches when some chunks have all NULLs + nullable_int_cols = ['run_id', 'code_id', 'submission_id', 'leaderboard_id', 'user_id'] + for col in nullable_int_cols: + if col in chunk_df.columns: + chunk_df[col] = chunk_df[col].astype('Int64') + + # Handle empty structs that PyArrow can't serialize + chunk_df = handle_empty_structs(chunk_df) + + # Convert to arrow table + table = pa.Table.from_pandas(chunk_df) + + # Write chunk as separate parquet file + filename = os.path.join(output_path, f"submissions_part_{part:05d}.parquet") + pq.write_table(table, filename) + + print(f" Wrote {len(chunk_df)} submissions to part {part}") + + # Filter for and save successful submissions + if 'run_passed' in chunk_df.columns: + success_mask = chunk_df['run_passed'] == True + if success_mask.any(): + success_df = chunk_df[success_mask] + success_table = pa.Table.from_pandas(success_df) + success_filename = os.path.join(output_path, f"successful_submissions_part_{part:05d}.parquet") + pq.write_table(success_table, success_filename) + print(f" Wrote {len(success_df)} successful submissions to part {part}") + del success_df, success_table, success_mask + + del chunk_df, table + gc.collect() + + part += 1 + + print(f"Submissions saved to {part} parquet files in {output_path}") def decode_hex_if_needed(code_val: str) -> str: @@ -132,6 +214,53 @@ def decode_hex_if_needed(code_val: str) -> str: return code_val +def consolidate_parquet_files(input_dir, pattern, output_file): + """ + Consolidates multiple parquet part files into a single parquet file. 
+ + Args: + input_dir: Directory containing the parquet part files + pattern: Glob pattern to match the part files (e.g., "submissions_part_*.parquet") + output_file: Path to the output consolidated parquet file + """ + import glob + + # Find all matching parquet files + part_files = sorted(glob.glob(os.path.join(input_dir, pattern))) + + if not part_files: + print(f" No files found matching pattern {pattern}") + return + + print(f" Consolidating {len(part_files)} {pattern} files into {output_file}...") + + # First pass: Read only schemas (not data) from all files to unify them + schemas = [] + for part_file in part_files: + parquet_file = pq.ParquetFile(part_file) + schemas.append(parquet_file.schema_arrow) + + # Unify schemas across all tables to handle struct field variations + unified_schema = pa.unify_schemas(schemas) + + # Second pass: Read each file, cast to unified schema, and write incrementally + total_rows = 0 + with pq.ParquetWriter(output_file, unified_schema) as writer: + for part_file in part_files: + # Read one file at a time + table = pq.read_table(part_file) + + # Cast to unified schema (fills missing fields with nulls) + unified_table = table.cast(unified_schema) + + # Write to output file + writer.write_table(unified_table) + + total_rows += len(unified_table) + + print(f" Done! Consolidated {len(part_files)} files ({total_rows} total rows)") + + def main(output_dir): """ Orchestrates the data export process. @@ -140,80 +269,40 @@ def main(output_dir): and submission data, anonymizes user IDs, and saves the results to separate Parquet files: `leaderboards.parquet`, `submissions.parquet`, and `successful_submissions.parquet`. The user ID mapping is not saved. + Temporary files are not deleted and should be manually removed if + desired. Args: output_dir (str): The local directory path to save the Parquet files. 
""" engine = create_engine(DATABASE_URL) - rng = np.random.default_rng() # Ensure the output directory exists os.makedirs(output_dir, exist_ok=True) # Fetch and save leaderboards - leaderboards_dataset = fetch_leaderboards(engine, LEADERBOARD_IDS) leaderboards_output_path = os.path.join(output_dir, "leaderboards.parquet") - leaderboards_dataset.to_parquet(leaderboards_output_path) - print(f"Leaderboards dataset successfully saved to {leaderboards_output_path}") + fetch_and_save_leaderboards(engine, LEADERBOARD_IDS, leaderboards_output_path) + + anonymize_users_in_db(engine, LEADERBOARD_IDS) # Fetch submissions - submissions_dataset = fetch_submissions(engine, LEADERBOARD_IDS) - submissions_df = submissions_dataset.to_pandas() - - # Decode hexadecimal 'code' values - if 'code' in submissions_df.columns: - print("Decoding 'code' column from hexadecimal where necessary...") - submissions_df['code'] = submissions_df['code'].apply(decode_hex_if_needed) - - # Anonymize user IDs if submissions exist - if not submissions_df.empty and 'user_id' in submissions_df.columns: - print("Anonymizing user IDs...") - unique_user_ids = submissions_df['user_id'].unique() - num_unique_users = len(unique_user_ids) - - # Create a randomly permuted mapping in memory - permuted_ids = rng.permutation(range(1, num_unique_users + 1)) - user_map_df = pd.DataFrame({ - 'original_user_id': unique_user_ids, - 'anonymized_user_id': permuted_ids - }) - - # Replace original user IDs with anonymized IDs - original_cols = list(submissions_df.columns) - user_id_index = original_cols.index('user_id') - - submissions_df = submissions_df.merge(user_map_df, left_on='user_id', right_on='original_user_id') - submissions_df = submissions_df.drop(columns=['user_id', 'original_user_id']) - submissions_df = submissions_df.rename(columns={'anonymized_user_id': 'user_id'}) - - # Restore original column order - new_order = [col for col in original_cols if col != 'user_id'] - new_order.insert(user_id_index, 'user_id') - submissions_df = submissions_df[new_order] - - # Convert back to a dataset - submissions_dataset = Dataset.from_pandas(submissions_df) - - # Save the submissions dataset (anonymized or original if empty) - submissions_output_path = os.path.join(output_dir, "submissions.parquet") - submissions_dataset.to_parquet(submissions_output_path) - print(f"Submissions dataset successfully saved to {submissions_output_path}") - - # Filter for and save successful submissions from the anonymized data - if 'run_passed' in submissions_df.columns: - print("Creating successful submissions dataset...") - successful_submissions_df = submissions_df[submissions_df['run_passed'] == True].copy() - - # Convert to dataset and save - successful_submissions_dataset = Dataset.from_pandas(successful_submissions_df) - successful_output_path = os.path.join( - output_dir, "successful_submissions.parquet" - ) - successful_submissions_dataset.to_parquet(successful_output_path) - print( - "Successful submissions dataset successfully saved to " - f"{successful_output_path}" - ) + submissions_output_path = os.path.join(output_dir, "submissions") + os.makedirs(submissions_output_path, exist_ok=True) + fetch_and_save_submissions(engine, LEADERBOARD_IDS, submissions_output_path) + + # Consolidate part files into single parquet files + consolidate_parquet_files( + submissions_output_path, + "submissions_part_*.parquet", + os.path.join(output_dir, "submissions.parquet") + ) + + consolidate_parquet_files( + submissions_output_path, + 
"successful_submissions_part_*.parquet", + os.path.join(output_dir, "successful_submissions.parquet") + ) if __name__ == "__main__": diff --git a/requirements.txt b/requirements.txt index 0698625..3b5b938 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,5 @@ -datasets pandas psycopg2-binary SQLAlchemy pyarrow python-dotenv -numpy \ No newline at end of file From b39b2928ccedb2bf765e4f18c0ae154c0653ed66 Mon Sep 17 00:00:00 2001 From: Sahan Paliskara Date: Mon, 23 Jun 2025 16:45:07 -0400 Subject: [PATCH 5/9] Add deduplication logic --- README.md | 15 ++ dedup.py | 398 +++++++++++++++++++++++++++++++++++++++++++++++++ export.py | 31 ++++ test_dedup.py | 402 ++++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 846 insertions(+) create mode 100644 dedup.py create mode 100644 test_dedup.py diff --git a/README.md b/README.md index 54fe6b8..0113e64 100644 --- a/README.md +++ b/README.md @@ -34,3 +34,18 @@ python export.py The script will create a directory at the specified output path containing the dataset in Parquet format. If `--output_dir` is not provided, it will save to `dataset` in the current working directory. +## Tests +The deduplication scripts can be tested by running +```bash +python test_dedup.py +# if you have pytest you can run +python -m pytest test_dedup.py -v +``` +To test things we actually create a fake dataset. Here are the features of it +The test creates a 50-entry dataset with: +- **Exact duplicates**: First 5 entries use identical code +- **Fuzzy duplicates**: Next 5 entries use similar code with small variations +- **Multiple run modes**: `leaderboard`, `test`, `benchmark` +- **Mixed success states**: Both `True` and `False` values for `run_passed` +- **Realistic struct data**: Complex nested structures for `run_result`, `run_compilation`, `run_meta`, and `run_system_info` +- **Proper timestamps**: All timestamp fields include timezone information diff --git a/dedup.py b/dedup.py new file mode 100644 index 0000000..b316e38 --- /dev/null +++ b/dedup.py @@ -0,0 +1,398 @@ +# script to dedup a huggingface dataset + +from datasets import load_dataset +import tqdm +from collections import defaultdict +import hashlib +from typing import Dict, List, Tuple, Union + +import datasketch +import pandas as pd + +def remove_duplicates(data_dict: Dict[str, Dict[bool, Dict[Union[float, int], List[Dict]]]]): + """ + Remove exact duplicates from the nested data structure returned by get_sorted_hf_data. 
+ + Args: + data_dict: Nested dictionary structure from get_sorted_hf_data + + Returns: + Dictionary with same structure but duplicates removed + """ + deduplicated_dict = {} + + for run_mode, score_duration_dict in tqdm.tqdm(data_dict.items(), desc="Processing run modes"): + deduplicated_dict[run_mode] = {} + + for run_success, run_success_dict in tqdm.tqdm(score_duration_dict.items(), desc=f"Processing {run_mode}", leave=False): + deduplicated_dict[run_mode][run_success] = {} + for score_duration, rows in tqdm.tqdm(run_success_dict.items(), desc=f"Processing {run_mode}", leave=False): + # Use a dictionary to track unique entries by their content hash + unique_entries = {} + + for row in tqdm.tqdm(rows, desc=f"Processing {run_mode} {score_duration}", leave=False): + # Create a hash of the relevant content (assuming 'input' or similar field exists) + # If the row has an 'input' field, use that; otherwise use the entire row + content = row.get('code', "") + content_hash = hashlib.sha256(content.encode()).hexdigest() + + if content_hash not in unique_entries: + unique_entries[content_hash] = row + else: + # If duplicate found, keep the one with better metrics + existing_row = unique_entries[content_hash] + + # For leaderboard mode with successful runs, prefer higher scores + if run_mode == 'leaderboard' and row.get('run_passed') == True: + if row.get('run_score', 0) > existing_row.get('run_score', 0): + unique_entries[content_hash] = row + # For other cases, prefer shorter duration (faster execution) + else: + existing_duration = existing_row.get('run_meta', {}).get('duration', float('inf')) + current_duration = row.get('run_meta', {}).get('duration', float('inf')) + if current_duration < existing_duration: + unique_entries[content_hash] = row + + deduplicated_dict[run_mode][run_success][score_duration] = list(unique_entries.values()) + + return deduplicated_dict + + +def create_minhashes( + documents: List[Dict[str, str]], + ngram_size: int = 5, + bands: int = 20, + rows_per_band: int = 128, +) -> Tuple[Dict[str, datasketch.MinHash], int]: + """ + Create MinHash signatures for a list of documents with LSH bands configuration. 
+ + Args: + documents: List of dictionaries, each containing 'submission_id' and 'input' keys + num_permutations: Number of hash functions to use (default: 100) + ngram_size: Size of n-grams to generate from input text (default: 3) + bands: Number of bands for LSH (default: 20) + + Returns: + Tuple containing: + - Dictionary mapping document submission_ids to their MinHash signatures + - Rows per band (num_permutations / bands) + + Raises: + ValueError: If num_permutations is not divisible by bands + """ + + num_permutations = rows_per_band * bands + + def generate_ngrams(text: str, n: int) -> List[str]: + """Generate n-grams from input text.""" + return [text[i : i + n] for i in range(len(text) - n + 1)] + + # Initialize result dictionary + minhash_dict = {} + # Process each document + for doc in tqdm.tqdm(documents, desc="Creating minhashes"): + minhash = datasketch.MinHash(num_perm=num_permutations) + submission_id = doc["submission_id"] + text = doc["code"].lower() # Convert to lowercase for consistency + + # Generate n-grams + ngrams = generate_ngrams(text, ngram_size) + for ngram in ngrams: + minhash.update(ngram.encode("utf8")) + + minhash_dict[submission_id] = minhash + + return minhash_dict + + +# 16 bands with 128 rows +def create_similarity_matrix( + minhashes: Dict[str, datasketch.MinHash], + rows_per_band: int, + num_bands: int, + threshold: float, +) -> Dict[str, List[str]]: + lsh = datasketch.MinHashLSH(threshold=threshold, num_perm=num_bands * rows_per_band) + print(f"num_perm: {num_bands*rows_per_band}") + similarity_matrix = {} + for submission_id, minhash in tqdm.tqdm(minhashes.items(), desc="Inserting minhashes into LSH"): + lsh.insert(submission_id, minhash) + for submission_id, minhash in tqdm.tqdm(minhashes.items(), desc="Querying LSH"): + similar_submission_ids = lsh.query(minhash) + similarity_matrix[submission_id] = similar_submission_ids + for submission_id, similar_submission_ids in tqdm.tqdm( + similarity_matrix.items(), desc="Removing self-similarities" + ): + if submission_id in similar_submission_ids: + similar_submission_ids.remove(submission_id) + return similarity_matrix + + +def filter_matrix( + similarity_matrix: Dict[str, List[str]] +) -> set: + good_submission_ids = set() + processed = set() + + for submission_id, similar_submission_ids in similarity_matrix.items(): + if submission_id in processed: + continue + + # Find all submissions in the similarity cluster + cluster = {submission_id} + cluster.update(similar_submission_ids) + + # Keep the one with the largest ID (tiebreaker) + keeper = max(cluster) + good_submission_ids.add(keeper) + + # Mark all in cluster as processed + processed.update(cluster) + + return good_submission_ids + + +def fuzzy_filter( + data_dict: Dict[str, Dict[bool, Dict[Union[float, int], List[Dict]]]], + threshold: float = 0.7, + ngram_size: int = 5, + bands: int = 16, + rows_per_band: int = 128, +) -> Dict[str, Dict[bool, Dict[Union[float, int], List[Dict]]]]: + + total_categories = 0 + for run_mode, run_success_dict in data_dict.items(): + for run_success, score_duration_dict in run_success_dict.items(): + for score_duration, rows in score_duration_dict.items(): + total_categories += 1 + + deduped_data = defaultdict(lambda: defaultdict(lambda: defaultdict(list))) + current_category = 0 + for run_mode, run_success_dict in data_dict.items(): + for run_success, score_duration_dict in run_success_dict.items(): + for score_duration, rows in score_duration_dict.items(): + print(f"Processing {run_mode} {run_success} 
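The MinHash/LSH machinery above (`create_minhashes`, `create_similarity_matrix`) is built on the `datasketch` library. A compact sketch of the same flow on three hypothetical snippets, with `num_perm` shrunk from the pipeline's 16 bands × 128 rows = 2048 to keep the demo cheap; LSH bucketing is probabilistic, so treat the printed result as indicative rather than guaranteed:

```python
import datasketch

def char_ngrams(text, n=5):
    return [text[i:i + n] for i in range(len(text) - n + 1)]

docs = {
    "a": "def hello_world():\n    message = 'Hello World'\n    print(message)\n    return message\n",
    "b": "def hello_world():\n    message = 'Hello World'\n    print(message)\n    return message\n# extra comment\n",
    "c": "import numpy as np\nx = np.arange(10)\nprint(x.sum())\n",
}

num_perm = 256  # demo-sized; the real pipeline uses far more permutations
minhashes = {}
for key, text in docs.items():
    m = datasketch.MinHash(num_perm=num_perm)
    for gram in char_ngrams(text.lower()):
        m.update(gram.encode("utf8"))
    minhashes[key] = m

lsh = datasketch.MinHashLSH(threshold=0.7, num_perm=num_perm)
for key, m in minhashes.items():
    lsh.insert(key, m)

# "a" should typically land in the same bucket as "b" but not "c";
# drop the self-match, as the pipeline does.
hits = [k for k in lsh.query(minhashes["a"]) if k != "a"]
print(hits)
```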
{score_duration} {len(rows)}") + print(f"This is {current_category} of {total_categories}") + current_category += 1 + deduped_data[run_mode][run_success][score_duration] = _fuzzy_filter(rows, threshold, ngram_size, bands, rows_per_band) + + return deduped_data + +def _fuzzy_filter( + data_list: List[Dict], + threshold: float = 0.7, + ngram_size: int = 5, + bands: int = 16, + rows_per_band: int = 128, +) -> List[Dict]: + """ + Apply fuzzy deduplication to the nested data structure returned by get_sorted_hf_data. + + Args: + data_dict: Nested dictionary structure from get_sorted_hf_data + threshold: Similarity threshold for LSH + ngram_size: Size of n-grams for MinHash + bands: Number of bands for LSH + rows_per_band: Rows per band for LSH + create_histogram: Whether to create similarity histogram + + Returns: + Dictionary with same structure but fuzzy duplicates removed + """ + # Flatten the data for processing + + # Create documents for MinHash processing + + if len(data_list) <= 1: + return data_list + + all_documents = [] + for i, row in tqdm.tqdm(enumerate(data_list), desc="Creating documents for MinHash"): + # Use 'input' field if available, otherwise use a string representation + content = row.get('code', str(row)) + document = { + "submission_id": str(i), + "code": content, + "original_row": row + } + all_documents.append(document) + + # Apply fuzzy deduplication + minhashes = create_minhashes( + all_documents, ngram_size=ngram_size, bands=bands, rows_per_band=rows_per_band + ) + similarity_matrix = create_similarity_matrix( + minhashes, rows_per_band=rows_per_band, num_bands=bands, threshold=threshold + ) + + good_submission_ids = filter_matrix(similarity_matrix) + + # Keep only the documents that passed the filter + good_documents = [all_documents[int(submission_id)]["original_row"] for submission_id in good_submission_ids] + + # Reconstruct the nested structure + return good_documents + +def get_hf_data() -> Dict[str, Dict[Union[float, int], List[Dict]]]: + # Login using e.g. 
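`filter_matrix`, shown a couple of hunks up, collapses each LSH cluster down to a single representative, using the largest submission id as the tiebreaker. A condensed restatement of that step on a hypothetical similarity matrix:

```python
# Hypothetical LSH query results: each id maps to the ids it collides with.
similarity_matrix = {
    "3": ["7"],
    "7": ["3"],
    "5": [],
}

good, processed = set(), set()
for sub_id, similar in similarity_matrix.items():
    if sub_id in processed:
        continue
    cluster = {sub_id, *similar}
    good.add(max(cluster))      # keep one representative per cluster (largest id wins)
    processed.update(cluster)

print(sorted(good))  # ['5', '7']
```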
`huggingface-cli login` to access this dataset + ds = load_dataset("GPUMODE/kernelbot-data", "submissions") + + # we should divide things up into type + # run_mode + # run_sucess + # if run_mode is leaderboard then use score + # otherwise use run_meta[duration] + + + data = ds['train'] + + run_mode_dict = defaultdict(list) + run_success_dict = defaultdict(lambda: defaultdict(list)) + run_duration_dict = defaultdict(lambda: defaultdict(lambda: defaultdict(list))) + + for _, row in tqdm.tqdm(enumerate(data), desc="Processing dataset rows"): + run_mode = row['run_mode'] + run_mode_dict[run_mode].append(row) + + for run_mode, rows in tqdm.tqdm(run_mode_dict.items(), desc="Processing run modes"): + for row in tqdm.tqdm(rows, desc=f"Processing {run_mode} success/failure", leave=False): + run_success_dict[run_mode][row['run_passed']].append(row) + + for run_mode, mode_dict in tqdm.tqdm(run_success_dict.items(), desc="Processing success/failure groups"): + for run_success, rows in tqdm.tqdm(mode_dict.items(), desc=f"Processing {run_mode}", leave=False): + for row in tqdm.tqdm(rows, desc=f"Processing {run_mode} {run_success} rows", leave=False): + if run_mode == 'leaderboard' and run_success == True: + rounded_score = round(float(row['run_score']), 4) + run_duration_dict[run_mode][run_success][rounded_score].append(row) + else: + rounded_duration = round(float(row['run_meta']['duration']), 0) + run_duration_dict[run_mode][run_success][rounded_duration].append(row) + + return run_duration_dict + +def convert_df_to_dict(df: pd.DataFrame) -> Dict[str, Dict[bool, Dict[Union[float, int], List[Dict]]]]: + """ + Convert a pandas DataFrame to a nested dictionary structure. + + Args: + df: pandas DataFrame + + Returns: + Nested dictionary structure + """ + data_dict = defaultdict(lambda: defaultdict(lambda: defaultdict(list))) + for _, row in tqdm.tqdm(df.iterrows(), desc="Processing DataFrame rows"): + run_mode = row['run_mode'] + run_success = row['run_passed'] + score_duration = row['run_meta']['duration'] + data_dict[run_mode][run_success][score_duration].append(row) + return data_dict + +def flatten_data(data_dict: Dict[str, Dict[Union[float, int], List[Dict]]]) -> List[Dict]: + """ + Flatten the nested data structure to a list of documents with metadata. + + Args: + data_dict: Nested dictionary structure from get_sorted_hf_data + + Returns: + List of documents with additional metadata fields + """ + flattened = [] + for run_mode, run_success_dict in tqdm.tqdm(data_dict.items(), desc="Flattening data"): + for run_success, score_duration_dict in run_success_dict.items(): + for score_duration, rows in score_duration_dict.items(): + for row in tqdm.tqdm(rows, desc=f"Processing {run_mode} {score_duration}", leave=False): + # Add metadata to each row + row_with_metadata = row.copy() + row_with_metadata['_run_mode'] = run_mode + row_with_metadata['_run_success'] = run_success + row_with_metadata['_score_duration'] = score_duration + flattened.append(row_with_metadata) + return flattened + +def count_items(data_dict: Dict[str, Dict[bool, Dict[Union[float, int], List[Dict]]]]) -> int: + """ + Count total number of items in the nested data structure. 
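`get_hf_data` buckets rows by `run_mode`, then `run_passed`, then a rounded score (passing leaderboard runs) or a rounded duration (everything else), so deduplication only ever compares rows that share a bucket. A small sketch of that grouping with made-up rows:

```python
from collections import defaultdict

rows = [
    {"run_mode": "leaderboard", "run_passed": True,  "run_score": 0.91237, "run_meta": {"duration": 1.7}},
    {"run_mode": "leaderboard", "run_passed": True,  "run_score": 0.91241, "run_meta": {"duration": 2.4}},
    {"run_mode": "test",        "run_passed": False, "run_score": 0.0,     "run_meta": {"duration": 3.2}},
]

grouped = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
for row in rows:
    if row["run_mode"] == "leaderboard" and row["run_passed"]:
        key = round(float(row["run_score"]), 4)              # score bucket
    else:
        key = round(float(row["run_meta"]["duration"]), 0)   # duration bucket
    grouped[row["run_mode"]][row["run_passed"]][key].append(row)

# Both leaderboard rows round to the same score bucket, so they compete during dedup.
print(list(grouped["leaderboard"][True].keys()))  # [0.9124]
```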
+ + Args: + data_dict: Nested dictionary structure from get_sorted_hf_data + + Returns: + Total number of items + """ + total = 0 + for run_mode in data_dict.values(): + for run_success_dict in run_mode.values(): + for rows in run_success_dict.values(): + total += len(rows) + return total + + +def example_usage(): + """ + Example of how to use the deduplication functions with get_hf_data output. + """ + # Load the data + data = get_hf_data() + + print(f"Original data has {count_items(data)} total items") + + # Remove exact duplicates + deduplicated_data = remove_duplicates(data) + print(f"After exact deduplication: {count_items(deduplicated_data)} items") + + # Apply fuzzy deduplication + fuzzy_deduplicated_data = fuzzy_filter( + deduplicated_data, + threshold=0.8, # High threshold for more strict deduplication + ngram_size=5, + bands=16, + rows_per_band=128 + ) + # convert to df + flattened_data = flatten_data(fuzzy_deduplicated_data) + df = pd.DataFrame(flattened_data) + + return df + +def dedup_df(df: pd.DataFrame) -> pd.DataFrame: + """ + Deduplicate a pandas DataFrame. + + Args: + df: pandas DataFrame + """ + # convert to dict + data_dict = convert_df_to_dict(df) + # deduplicate + deduplicated_data = fuzzy_filter(data_dict, threshold=0.8, ngram_size=5, bands=16, rows_per_band=128) + # convert to df + flattened_data = flatten_data(deduplicated_data) + df = pd.DataFrame(flattened_data) + return df + +def create_parquet_file(data_dict: Dict[str, Dict[Union[float, int], List[Dict]]], filename: str): + """ + Create a Parquet file from the nested data structure. + + Args: + data_dict: Nested dictionary structure from get_sorted_hf_data + filename: Name of the output Parquet file + """ + # Flatten the data + flattened_data = flatten_data(data_dict) + + # Create a pandas DataFrame from the flattened data + df = pd.DataFrame(flattened_data) + # Convert the DataFrame to a Parquet file + df.to_parquet(filename, index=False) + + + +def main(): + example_usage() + +if __name__ == "__main__": + main() diff --git a/export.py b/export.py index 6d3078f..d7cffdf 100644 --- a/export.py +++ b/export.py @@ -6,6 +6,7 @@ from sqlalchemy import create_engine, text import pyarrow as pa import pyarrow.parquet as pq +from dedup import dedup_df load_dotenv() @@ -304,6 +305,36 @@ def main(output_dir): os.path.join(output_dir, "successful_submissions.parquet") ) + # Apply deduplication to submissions + print("Applying deduplication to submissions...") + submissions_parquet_path = os.path.join(output_dir, "submissions.parquet") + try: + submissions_df = pd.read_parquet(submissions_parquet_path) + original_count = len(submissions_df) + + deduplicated_submissions_df = dedup_df(submissions_df.copy()) + deduplicated_submissions_path = os.path.join(output_dir, "deduplicated_submissions.parquet") + deduplicated_submissions_df.to_parquet(deduplicated_submissions_path, index=False) + + print(f"Deduplicated submissions saved to {deduplicated_submissions_path}") + print(f"Original submissions: {original_count}, After deduplication: {len(deduplicated_submissions_df)}") + + # Create deduplicated successful submissions + if 'run_passed' in deduplicated_submissions_df.columns: + print("Creating deduplicated successful submissions...") + deduplicated_successful_df = deduplicated_submissions_df[deduplicated_submissions_df['run_passed'] == True].copy() + deduplicated_successful_path = os.path.join(output_dir, "deduplicated_successful_submissions.parquet") + deduplicated_successful_df.to_parquet(deduplicated_successful_path, 
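`dedup_df` is the entry point export.py calls. A hedged usage sketch on a two-row frame containing only the columns the function touches; note that the returned frame also carries the `_run_mode`, `_run_success`, and `_score_duration` metadata columns added by `flatten_data`:

```python
import pandas as pd
from dedup import dedup_df

# Two rows with identical code in the same (run_mode, run_passed, duration) bucket.
df = pd.DataFrame([
    {"submission_id": 1, "code": "print('hi')", "run_mode": "test",
     "run_passed": True, "run_score": 0.0, "run_meta": {"duration": 1.0}},
    {"submission_id": 2, "code": "print('hi')", "run_mode": "test",
     "run_passed": True, "run_score": 0.0, "run_meta": {"duration": 1.0}},
])

deduped = dedup_df(df)
print(len(df), "->", len(deduped))  # expect 2 -> 1
```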
index=False) + + successful_parquet_path = os.path.join(output_dir, "successful_submissions.parquet") + successful_df = pd.read_parquet(successful_parquet_path) + print(f"Deduplicated successful submissions saved to {deduplicated_successful_path}") + print(f"Original successful: {len(successful_df)}, After deduplication: {len(deduplicated_successful_df)}") + + except Exception as e: + print(f"Warning: Deduplication failed with error: {e}") + print("Proceeding without deduplication...") + if __name__ == "__main__": parser = argparse.ArgumentParser(description="Export leaderboard data to a Hugging Face dataset.") diff --git a/test_dedup.py b/test_dedup.py new file mode 100644 index 0000000..f5ea811 --- /dev/null +++ b/test_dedup.py @@ -0,0 +1,402 @@ +#!/usr/bin/env python3 +""" +Unit tests for the deduplication pipeline. +Tests the end-to-end flow with fake data matching the database schema. +""" + +import unittest +import pandas as pd +import numpy as np +from datetime import datetime, timezone +import random +from typing import Dict, List, Any +import tempfile +import os +import sys + +# Import the functions we want to test +try: + from dedup import ( + remove_duplicates, + fuzzy_filter, + convert_df_to_dict, + flatten_data, + dedup_df, + count_items, + create_parquet_file + ) +except ImportError as e: + print(f"Import error: {e}") + print("Some functions may not be available for testing") + + +class TestDedupEndToEnd(unittest.TestCase): + + def setUp(self): + """Set up test fixtures with fake data matching the schema.""" + random.seed(42) # For reproducible tests + np.random.seed(42) + self.fake_data = self.create_fake_dataset(50) + self.df = pd.DataFrame(self.fake_data) + + def create_fake_dataset(self, num_entries: int) -> List[Dict[str, Any]]: + """Create a fake dataset with the required schema fields.""" + fake_data = [] + + # Sample code snippets (some duplicates for testing) + code_samples = [ + "def hello_world():\n print('Hello World')", + "import numpy as np\nx = np.array([1, 2, 3])", + "for i in range(10):\n print(i)", + "class MyClass:\n def __init__(self):\n pass", + "def fibonacci(n):\n if n <= 1:\n return n\n return fibonacci(n-1) + fibonacci(n-2)", + "import pandas as pd\ndf = pd.DataFrame({'a': [1, 2, 3]})", + "def quicksort(arr):\n if len(arr) <= 1:\n return arr", + "x = [1, 2, 3, 4, 5]\ny = [i**2 for i in x]", + "try:\n result = 10 / 0\nexcept ZeroDivisionError:\n print('Error')", + "def hello_world():\n print('Hello World')", # Exact duplicate + ] + + run_modes = ['leaderboard', 'benchmark', 'test'] + file_names = ['solution.py', 'main.py', 'algorithm.py', 'test.py'] + + for i in range(num_entries): + # Create base timestamp + base_time = datetime(2024, 1, 1, tzinfo=timezone.utc) + submission_time = base_time.replace( + day=random.randint(1, 28), + hour=random.randint(0, 23), + minute=random.randint(0, 59) + ) + + # Select code (with some duplicates) + code = random.choice(code_samples) + if i < 5: # First 5 entries use the same code for exact duplicate testing + code = code_samples[0] + elif i < 10: # Next 5 use slightly modified versions for fuzzy testing + code = code_samples[0] + f"\n# Comment {i}" + + run_mode = random.choice(run_modes) + run_passed = random.choice([True, False]) + + # Generate run score based on mode and success + if run_mode == 'leaderboard' and run_passed: + run_score = round(random.uniform(0.1, 1.0), 4) + else: + run_score = 0.0 if not run_passed else round(random.uniform(0.1, 0.8), 4) + + # Create the entry matching the database schema + entry 
= { + 'submission_id': i + 1000, + 'leaderboard_id': random.randint(1, 10), + 'user_id': random.randint(100, 999), + 'submission_time': submission_time, + 'file_name': random.choice(file_names), + 'code': code, + 'code_id': i + 2000, + 'run_id': i + 3000, + 'run_start_time': submission_time, + 'run_end_time': submission_time.replace( + second=random.randint(1, 59) + ), + 'run_mode': run_mode, + 'run_score': run_score, + 'run_passed': run_passed, + 'run_result': { + 'benchmark-count': random.randint(1, 10), + 'benchmark.0.best': f'benchmark_{random.randint(1, 100)}.txt', + 'benchmark.0.err': '', + 'benchmark.0.mean': round(random.uniform(0.1, 2.0), 6), + 'benchmark.0.report': f'report_{i}.json' + }, + 'run_compilation': { + 'command': 'python', + 'exit_code': 0 if run_passed else random.randint(1, 255), + 'nvcc_found': random.choice([True, False]), + 'nvcc_version': f'11.{random.randint(0, 8)}', + 'stderr': '' if run_passed else f'Error message {i}', + 'stdout': f'Output {i}', + 'success': run_passed + }, + 'run_meta': { + 'command': 'python solution.py', + 'duration': round(random.uniform(0.1, 10.0), 3), + 'exit_code': 0 if run_passed else random.randint(1, 255), + 'stderr': '' if run_passed else f'Runtime error {i}', + 'stdout': f'Runtime output {i}', + 'success': run_passed + }, + 'run_system_info': { + 'cpu': f'Intel Core i{random.randint(5, 9)}', + 'gpu': random.choice(['NVIDIA RTX 3080', 'NVIDIA RTX 4090', 'None']), + 'platform': random.choice(['linux', 'darwin', 'win32']), + 'torch': f'2.{random.randint(0, 3)}.{random.randint(0, 9)}' + } + } + fake_data.append(entry) + + return fake_data + + def test_dataframe_creation(self): + """Test that the fake dataset creates a valid DataFrame.""" + self.assertEqual(len(self.df), 50) + + # Check required columns exist (matching the schema in the image) + required_columns = [ + 'submission_id', 'leaderboard_id', 'user_id', 'submission_time', + 'file_name', 'code', 'code_id', 'run_id', 'run_start_time', + 'run_end_time', 'run_mode', 'run_score', 'run_passed', + 'run_result', 'run_compilation', 'run_meta', 'run_system_info' + ] + + for col in required_columns: + self.assertIn(col, self.df.columns, f"Missing required column: {col}") + + # Check data types + self.assertTrue(self.df['submission_id'].dtype in ['int64', 'int32']) + self.assertTrue(self.df['run_passed'].dtype == 'bool') + self.assertTrue(self.df['run_score'].dtype in ['float64', 'float32']) + + # Verify struct fields exist + sample_row = self.df.iloc[0] + self.assertIsInstance(sample_row['run_result'], dict) + self.assertIsInstance(sample_row['run_compilation'], dict) + self.assertIsInstance(sample_row['run_meta'], dict) + self.assertIsInstance(sample_row['run_system_info'], dict) + + def test_convert_df_to_dict(self): + """Test conversion from DataFrame to nested dictionary structure.""" + try: + data_dict = convert_df_to_dict(self.df) + + # Check structure + self.assertIsInstance(data_dict, dict) + + # Should have run_mode keys + run_modes = set(self.df['run_mode'].unique()) + self.assertEqual(set(data_dict.keys()), run_modes) + + # Check nested structure + for run_mode in data_dict: + self.assertIsInstance(data_dict[run_mode], dict) + for run_success in data_dict[run_mode]: + self.assertIsInstance(data_dict[run_mode][run_success], dict) + for score_duration in data_dict[run_mode][run_success]: + self.assertIsInstance( + data_dict[run_mode][run_success][score_duration], + list + ) + except NameError: + self.skipTest("convert_df_to_dict function not available") + + def 
test_exact_deduplication(self): + """Test exact duplicate removal.""" + try: + data_dict = convert_df_to_dict(self.df) + original_count = count_items(data_dict) + + deduplicated_data = remove_duplicates(data_dict) + deduplicated_count = count_items(deduplicated_data) + + # Should have fewer or equal items after deduplication + self.assertLessEqual(deduplicated_count, original_count) + + # Structure should be preserved + self.assertEqual(set(data_dict.keys()), set(deduplicated_data.keys())) + + except NameError as e: + self.skipTest(f"Required functions not available: {e}") + + def test_fuzzy_deduplication_small(self): + """Test fuzzy duplicate removal with small threshold for faster testing.""" + try: + data_dict = convert_df_to_dict(self.df) + original_count = count_items(data_dict) + + # Use small parameters for faster testing + fuzzy_deduplicated_data = fuzzy_filter( + data_dict, + threshold=0.5, # Lower threshold for faster testing + ngram_size=3, # Smaller ngram size + bands=4, # Fewer bands + rows_per_band=32 # Fewer rows per band + ) + + fuzzy_count = count_items(fuzzy_deduplicated_data) + + # Should have fewer or equal items after fuzzy deduplication + self.assertLessEqual(fuzzy_count, original_count) + + # Structure should be preserved + self.assertEqual(set(data_dict.keys()), set(fuzzy_deduplicated_data.keys())) + + except NameError as e: + self.skipTest(f"Required functions not available: {e}") + + def test_flatten_and_reconstruct(self): + """Test flattening and reconstruction of data.""" + try: + data_dict = convert_df_to_dict(self.df) + original_count = count_items(data_dict) + + # Flatten + flattened_data = flatten_data(data_dict) + self.assertEqual(len(flattened_data), original_count) + + # Check metadata fields were added + if flattened_data: + sample_row = flattened_data[0] + self.assertIn('_run_mode', sample_row) + self.assertIn('_run_success', sample_row) + self.assertIn('_score_duration', sample_row) + + except NameError as e: + self.skipTest(f"Required functions not available: {e}") + + def test_dedup_df_end_to_end(self): + """Test the complete deduplication pipeline.""" + try: + original_length = len(self.df) + + # Run the complete deduplication pipeline + deduplicated_df = dedup_df(self.df) + + # Should return a DataFrame + self.assertIsInstance(deduplicated_df, pd.DataFrame) + + # Should have fewer or equal rows + self.assertLessEqual(len(deduplicated_df), original_length) + + # Should preserve required columns + required_columns = ['submission_id', 'code', 'run_mode', 'run_passed'] + for col in required_columns: + self.assertIn(col, deduplicated_df.columns) + + # Check data integrity + self.assertFalse(deduplicated_df.empty, "Deduplicated DataFrame should not be empty") + + except NameError as e: + self.skipTest(f"dedup_df function not available: {e}") + + def test_parquet_creation(self): + """Test Parquet file creation.""" + try: + data_dict = convert_df_to_dict(self.df) + + with tempfile.NamedTemporaryFile(suffix='.parquet', delete=False) as tmp_file: + try: + create_parquet_file(data_dict, tmp_file.name) + + # Check file was created + self.assertTrue(os.path.exists(tmp_file.name)) + + # Check file is not empty + self.assertGreater(os.path.getsize(tmp_file.name), 0) + + # Try to read the file back + df_from_parquet = pd.read_parquet(tmp_file.name) + self.assertIsInstance(df_from_parquet, pd.DataFrame) + self.assertGreater(len(df_from_parquet), 0) + + finally: + # Clean up + if os.path.exists(tmp_file.name): + os.unlink(tmp_file.name) + + except NameError as e: + 
self.skipTest(f"Required functions not available: {e}") + + def test_data_consistency_after_deduplication(self): + """Test that data remains consistent after deduplication.""" + try: + # Create dataset with known duplicates + duplicate_data = [] + + # Add the same code 3 times with different metadata + base_entry = self.fake_data[0].copy() + for i in range(3): + entry = base_entry.copy() + entry['submission_id'] = 9000 + i + entry['run_id'] = 9100 + i + duplicate_data.append(entry) + + # Add to main dataset + test_data = self.fake_data + duplicate_data + test_df = pd.DataFrame(test_data) + + original_length = len(test_df) + deduplicated_df = dedup_df(test_df) + + # Should have removed at least 2 duplicates + self.assertLess(len(deduplicated_df), original_length) + + # Check that essential fields are preserved + self.assertTrue(all(col in deduplicated_df.columns for col in + ['submission_id', 'code', 'run_mode', 'run_passed'])) + + except NameError as e: + self.skipTest(f"Required functions not available: {e}") + + def test_schema_compliance(self): + """Test that the fake dataset matches the expected schema from the database.""" + # Test all required fields exist and have correct types + + # Test BIGINT fields + bigint_fields = ['submission_id', 'leaderboard_id', 'user_id', 'code_id', 'run_id'] + for field in bigint_fields: + self.assertTrue(self.df[field].dtype in ['int64', 'int32'], + f"{field} should be integer type") + + # Test VARCHAR fields + varchar_fields = ['file_name', 'code', 'run_mode'] + for field in varchar_fields: + self.assertTrue(self.df[field].dtype == 'object', + f"{field} should be string/object type") + + # Test TIMESTAMP fields + timestamp_fields = ['submission_time', 'run_start_time', 'run_end_time'] + for field in timestamp_fields: + # Check that all values are datetime objects with timezone + sample_value = self.df[field].iloc[0] + self.assertIsInstance(sample_value, datetime) + self.assertIsNotNone(sample_value.tzinfo) + + # Test DOUBLE field + self.assertTrue(self.df['run_score'].dtype in ['float64', 'float32']) + + # Test BOOLEAN field + self.assertTrue(self.df['run_passed'].dtype == 'bool') + + # Test STRUCT fields + struct_fields = ['run_result', 'run_compilation', 'run_meta', 'run_system_info'] + for field in struct_fields: + # All values should be dictionaries + self.assertTrue(all(isinstance(val, dict) for val in self.df[field])) + + def test_duplicate_detection(self): + """Test that we can detect exact and near duplicates in the dataset.""" + # Count exact duplicates by code + code_counts = self.df['code'].value_counts() + exact_duplicates = code_counts[code_counts > 1] + + # Should have some exact duplicates (first 5 entries) + self.assertGreater(len(exact_duplicates), 0, "Should have exact duplicates for testing") + + # Check that fuzzy duplicates exist (entries with similar code) + similar_code_count = 0 + base_code = "def hello_world():\n print('Hello World')" + for code in self.df['code']: + if base_code in code and code != base_code: + similar_code_count += 1 + + self.assertGreater(similar_code_count, 0, "Should have fuzzy duplicates for testing") + + +if __name__ == '__main__': + # Add some helpful output + print("Running deduplication pipeline tests...") + print(f"Python version: {sys.version}") + print(f"Pandas version: {pd.__version__}") + + # Run the tests + unittest.main(verbosity=2) \ No newline at end of file From 216d47a05f1a23eec88dfd8394d617dd0d838818 Mon Sep 17 00:00:00 2001 From: Sahan Paliskara Date: Mon, 23 Jun 2025 17:10:24 -0400 
Subject: [PATCH 6/9] magic number removal --- dedup.py | 118 ++++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 100 insertions(+), 18 deletions(-) diff --git a/dedup.py b/dedup.py index b316e38..8f209a6 100644 --- a/dedup.py +++ b/dedup.py @@ -9,6 +9,82 @@ import datasketch import pandas as pd +# ============================================================================= +# DEDUPLICATION CONFIGURATION CONSTANTS +# ============================================================================= + +# Fuzzy Deduplication Parameters +FUZZY_SIMILARITY_THRESHOLD = 0.8 +""" +Jaccard similarity threshold for considering two documents as duplicates. +Range: 0.0 to 1.0 +- 0.8 = High threshold, only very similar documents are considered duplicates +- 0.7 = Medium threshold, moderately similar documents are duplicates +- 0.5 = Low threshold, loosely similar documents are duplicates +Higher values = more strict deduplication, fewer items removed +""" + +NGRAM_SIZE = 5 +""" +Size of character n-grams used for MinHash fingerprinting. +- Smaller values (3-4): More sensitive to small changes, better for short text +- Larger values (5-7): Less sensitive to minor variations, better for longer text +- Too small: May create false positives (different texts seem similar) +- Too large: May miss actual duplicates with small variations +""" + +LSH_BANDS = 16 +""" +Number of bands for Locality Sensitive Hashing (LSH). +Used to speed up similarity detection by grouping similar hashes. +- More bands = faster but less accurate similarity detection +- Fewer bands = slower but more accurate similarity detection +Must divide evenly into ROWS_PER_BAND * LSH_BANDS = total permutations +""" + +ROWS_PER_BAND = 128 +""" +Number of rows per band in LSH configuration. +Total MinHash permutations = ROWS_PER_BAND * LSH_BANDS +- More rows per band = higher precision, may miss some similar pairs +- Fewer rows per band = higher recall, may include more false positives +Default: 128 rows × 16 bands = 2048 total permutations +""" + +# Score Processing Parameters +LEADERBOARD_SCORE_PRECISION = 4 +""" +Number of decimal places to round leaderboard scores when grouping submissions. +Used to group submissions with very similar scores together. +- Higher precision (more decimal places): More granular grouping +- Lower precision (fewer decimal places): Broader grouping of similar scores +""" + +DURATION_PRECISION = 0 +""" +Number of decimal places to round execution duration (in seconds). +Used to group submissions with similar execution times. 
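`FUZZY_SIMILARITY_THRESHOLD` is a Jaccard-similarity cutoff over character n-gram sets, and MinHash/LSH only estimates that value. A quick way to see what the number means for two hypothetical snippets:

```python
def ngram_set(text, n=5):
    text = text.lower()
    return {text[i:i + n] for i in range(len(text) - n + 1)}

a = "def hello_world():\n    print('Hello World')"
b = a + "\n# trailing comment"

sa, sb = ngram_set(a), ngram_set(b)
jaccard = len(sa & sb) / len(sa | sb)
print(round(jaccard, 3))  # compare against FUZZY_SIMILARITY_THRESHOLD; MinHash + LSH approximate this
```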
+- 0: Round to nearest second (1.7s → 2s) +- 1: Round to nearest 0.1s (1.73s → 1.7s) +""" + +# ============================================================================= +# CONFIGURATION SUMMARY +# ============================================================================= +""" +Current deduplication configuration: +├─ Similarity Detection: 0.8 threshold (strict) +├─ Text Fingerprinting: 5-character n-grams +├─ LSH Performance: 16 bands × 128 rows = 2048 permutations +├─ Score Grouping: 4 decimal places for leaderboard scores +└─ Duration Grouping: 0 decimal places for execution times + +To adjust deduplication sensitivity: +- Increase FUZZY_SIMILARITY_THRESHOLD (0.8→0.9) for stricter deduplication +- Decrease FUZZY_SIMILARITY_THRESHOLD (0.8→0.7) for more aggressive deduplication +- Adjust NGRAM_SIZE for different text lengths (3-4 for short, 5-7 for long) +""" + def remove_duplicates(data_dict: Dict[str, Dict[bool, Dict[Union[float, int], List[Dict]]]]): """ Remove exact duplicates from the nested data structure returned by get_sorted_hf_data. @@ -60,9 +136,9 @@ def remove_duplicates(data_dict: Dict[str, Dict[bool, Dict[Union[float, int], Li def create_minhashes( documents: List[Dict[str, str]], - ngram_size: int = 5, - bands: int = 20, - rows_per_band: int = 128, + ngram_size: int = NGRAM_SIZE, + bands: int = LSH_BANDS, + rows_per_band: int = ROWS_PER_BAND, ) -> Tuple[Dict[str, datasketch.MinHash], int]: """ Create MinHash signatures for a list of documents with LSH bands configuration. @@ -155,10 +231,10 @@ def filter_matrix( def fuzzy_filter( data_dict: Dict[str, Dict[bool, Dict[Union[float, int], List[Dict]]]], - threshold: float = 0.7, - ngram_size: int = 5, - bands: int = 16, - rows_per_band: int = 128, + threshold: float = FUZZY_SIMILARITY_THRESHOLD, + ngram_size: int = NGRAM_SIZE, + bands: int = LSH_BANDS, + rows_per_band: int = ROWS_PER_BAND, ) -> Dict[str, Dict[bool, Dict[Union[float, int], List[Dict]]]]: total_categories = 0 @@ -181,10 +257,10 @@ def fuzzy_filter( def _fuzzy_filter( data_list: List[Dict], - threshold: float = 0.7, - ngram_size: int = 5, - bands: int = 16, - rows_per_band: int = 128, + threshold: float = FUZZY_SIMILARITY_THRESHOLD, + ngram_size: int = NGRAM_SIZE, + bands: int = LSH_BANDS, + rows_per_band: int = ROWS_PER_BAND, ) -> List[Dict]: """ Apply fuzzy deduplication to the nested data structure returned by get_sorted_hf_data. 
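The two precision constants translate directly into `round()` calls when bucket keys are built; a short worked example:

```python
LEADERBOARD_SCORE_PRECISION = 4
DURATION_PRECISION = 0

print(round(0.91237, LEADERBOARD_SCORE_PRECISION))  # 0.9124 -> score bucket for passing leaderboard runs
print(round(1.73, DURATION_PRECISION))              # 2.0    -> duration bucket for everything else
```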
@@ -263,10 +339,10 @@ def get_hf_data() -> Dict[str, Dict[Union[float, int], List[Dict]]]: for run_success, rows in tqdm.tqdm(mode_dict.items(), desc=f"Processing {run_mode}", leave=False): for row in tqdm.tqdm(rows, desc=f"Processing {run_mode} {run_success} rows", leave=False): if run_mode == 'leaderboard' and run_success == True: - rounded_score = round(float(row['run_score']), 4) + rounded_score = round(float(row['run_score']), LEADERBOARD_SCORE_PRECISION) run_duration_dict[run_mode][run_success][rounded_score].append(row) else: - rounded_duration = round(float(row['run_meta']['duration']), 0) + rounded_duration = round(float(row['run_meta']['duration']), DURATION_PRECISION) run_duration_dict[run_mode][run_success][rounded_duration].append(row) return run_duration_dict @@ -346,10 +422,10 @@ def example_usage(): # Apply fuzzy deduplication fuzzy_deduplicated_data = fuzzy_filter( deduplicated_data, - threshold=0.8, # High threshold for more strict deduplication - ngram_size=5, - bands=16, - rows_per_band=128 + threshold=FUZZY_SIMILARITY_THRESHOLD, + ngram_size=NGRAM_SIZE, + bands=LSH_BANDS, + rows_per_band=ROWS_PER_BAND ) # convert to df flattened_data = flatten_data(fuzzy_deduplicated_data) @@ -367,7 +443,13 @@ def dedup_df(df: pd.DataFrame) -> pd.DataFrame: # convert to dict data_dict = convert_df_to_dict(df) # deduplicate - deduplicated_data = fuzzy_filter(data_dict, threshold=0.8, ngram_size=5, bands=16, rows_per_band=128) + deduplicated_data = fuzzy_filter( + data_dict, + threshold=FUZZY_SIMILARITY_THRESHOLD, + ngram_size=NGRAM_SIZE, + bands=LSH_BANDS, + rows_per_band=ROWS_PER_BAND + ) # convert to df flattened_data = flatten_data(deduplicated_data) df = pd.DataFrame(flattened_data) From 89573b4b80107071d752612242951b02d8e571f2 Mon Sep 17 00:00:00 2001 From: Sahan Paliskara Date: Tue, 24 Jun 2025 08:43:42 -0400 Subject: [PATCH 7/9] Add deduplicated datasets --- dedup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dedup.py b/dedup.py index 8f209a6..f6f3449 100644 --- a/dedup.py +++ b/dedup.py @@ -118,9 +118,9 @@ def remove_duplicates(data_dict: Dict[str, Dict[bool, Dict[Union[float, int], Li # If duplicate found, keep the one with better metrics existing_row = unique_entries[content_hash] - # For leaderboard mode with successful runs, prefer higher scores + # For leaderboard mode with successful runs, prefer lower scores / faster times if run_mode == 'leaderboard' and row.get('run_passed') == True: - if row.get('run_score', 0) > existing_row.get('run_score', 0): + if row.get('run_score', 0) < existing_row.get('run_score', 0): unique_entries[content_hash] = row # For other cases, prefer shorter duration (faster execution) else: From d32c79ca2855ea2d27af77e80f326db33a2d69e6 Mon Sep 17 00:00:00 2001 From: Sahan Paliskara Date: Fri, 28 Nov 2025 13:55:46 -0500 Subject: [PATCH 8/9] remove test --- test_dedup.py | 402 -------------------------------------------------- 1 file changed, 402 deletions(-) delete mode 100644 test_dedup.py diff --git a/test_dedup.py b/test_dedup.py deleted file mode 100644 index f5ea811..0000000 --- a/test_dedup.py +++ /dev/null @@ -1,402 +0,0 @@ -#!/usr/bin/env python3 -""" -Unit tests for the deduplication pipeline. -Tests the end-to-end flow with fake data matching the database schema. 
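The PATCH 7/9 hunk above flips the leaderboard comparison so that, among exact duplicates, the lower score (i.e. the faster time) survives. A condensed restatement of the tie-break with made-up rows:

```python
def keep_better(existing, candidate, run_mode):
    # Condensed tie-break: lower score wins for passing leaderboard runs,
    # shorter duration wins in every other bucket.
    if run_mode == "leaderboard" and candidate.get("run_passed"):
        better = candidate.get("run_score", 0) < existing.get("run_score", 0)
    else:
        better = (candidate.get("run_meta", {}).get("duration", float("inf"))
                  < existing.get("run_meta", {}).get("duration", float("inf")))
    return candidate if better else existing

winner = keep_better({"run_passed": True, "run_score": 1.84},
                     {"run_passed": True, "run_score": 1.52},
                     "leaderboard")
print(winner["run_score"])  # 1.52 -- the faster submission survives
```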
-""" - -import unittest -import pandas as pd -import numpy as np -from datetime import datetime, timezone -import random -from typing import Dict, List, Any -import tempfile -import os -import sys - -# Import the functions we want to test -try: - from dedup import ( - remove_duplicates, - fuzzy_filter, - convert_df_to_dict, - flatten_data, - dedup_df, - count_items, - create_parquet_file - ) -except ImportError as e: - print(f"Import error: {e}") - print("Some functions may not be available for testing") - - -class TestDedupEndToEnd(unittest.TestCase): - - def setUp(self): - """Set up test fixtures with fake data matching the schema.""" - random.seed(42) # For reproducible tests - np.random.seed(42) - self.fake_data = self.create_fake_dataset(50) - self.df = pd.DataFrame(self.fake_data) - - def create_fake_dataset(self, num_entries: int) -> List[Dict[str, Any]]: - """Create a fake dataset with the required schema fields.""" - fake_data = [] - - # Sample code snippets (some duplicates for testing) - code_samples = [ - "def hello_world():\n print('Hello World')", - "import numpy as np\nx = np.array([1, 2, 3])", - "for i in range(10):\n print(i)", - "class MyClass:\n def __init__(self):\n pass", - "def fibonacci(n):\n if n <= 1:\n return n\n return fibonacci(n-1) + fibonacci(n-2)", - "import pandas as pd\ndf = pd.DataFrame({'a': [1, 2, 3]})", - "def quicksort(arr):\n if len(arr) <= 1:\n return arr", - "x = [1, 2, 3, 4, 5]\ny = [i**2 for i in x]", - "try:\n result = 10 / 0\nexcept ZeroDivisionError:\n print('Error')", - "def hello_world():\n print('Hello World')", # Exact duplicate - ] - - run_modes = ['leaderboard', 'benchmark', 'test'] - file_names = ['solution.py', 'main.py', 'algorithm.py', 'test.py'] - - for i in range(num_entries): - # Create base timestamp - base_time = datetime(2024, 1, 1, tzinfo=timezone.utc) - submission_time = base_time.replace( - day=random.randint(1, 28), - hour=random.randint(0, 23), - minute=random.randint(0, 59) - ) - - # Select code (with some duplicates) - code = random.choice(code_samples) - if i < 5: # First 5 entries use the same code for exact duplicate testing - code = code_samples[0] - elif i < 10: # Next 5 use slightly modified versions for fuzzy testing - code = code_samples[0] + f"\n# Comment {i}" - - run_mode = random.choice(run_modes) - run_passed = random.choice([True, False]) - - # Generate run score based on mode and success - if run_mode == 'leaderboard' and run_passed: - run_score = round(random.uniform(0.1, 1.0), 4) - else: - run_score = 0.0 if not run_passed else round(random.uniform(0.1, 0.8), 4) - - # Create the entry matching the database schema - entry = { - 'submission_id': i + 1000, - 'leaderboard_id': random.randint(1, 10), - 'user_id': random.randint(100, 999), - 'submission_time': submission_time, - 'file_name': random.choice(file_names), - 'code': code, - 'code_id': i + 2000, - 'run_id': i + 3000, - 'run_start_time': submission_time, - 'run_end_time': submission_time.replace( - second=random.randint(1, 59) - ), - 'run_mode': run_mode, - 'run_score': run_score, - 'run_passed': run_passed, - 'run_result': { - 'benchmark-count': random.randint(1, 10), - 'benchmark.0.best': f'benchmark_{random.randint(1, 100)}.txt', - 'benchmark.0.err': '', - 'benchmark.0.mean': round(random.uniform(0.1, 2.0), 6), - 'benchmark.0.report': f'report_{i}.json' - }, - 'run_compilation': { - 'command': 'python', - 'exit_code': 0 if run_passed else random.randint(1, 255), - 'nvcc_found': random.choice([True, False]), - 'nvcc_version': f'11.{random.randint(0, 
8)}', - 'stderr': '' if run_passed else f'Error message {i}', - 'stdout': f'Output {i}', - 'success': run_passed - }, - 'run_meta': { - 'command': 'python solution.py', - 'duration': round(random.uniform(0.1, 10.0), 3), - 'exit_code': 0 if run_passed else random.randint(1, 255), - 'stderr': '' if run_passed else f'Runtime error {i}', - 'stdout': f'Runtime output {i}', - 'success': run_passed - }, - 'run_system_info': { - 'cpu': f'Intel Core i{random.randint(5, 9)}', - 'gpu': random.choice(['NVIDIA RTX 3080', 'NVIDIA RTX 4090', 'None']), - 'platform': random.choice(['linux', 'darwin', 'win32']), - 'torch': f'2.{random.randint(0, 3)}.{random.randint(0, 9)}' - } - } - fake_data.append(entry) - - return fake_data - - def test_dataframe_creation(self): - """Test that the fake dataset creates a valid DataFrame.""" - self.assertEqual(len(self.df), 50) - - # Check required columns exist (matching the schema in the image) - required_columns = [ - 'submission_id', 'leaderboard_id', 'user_id', 'submission_time', - 'file_name', 'code', 'code_id', 'run_id', 'run_start_time', - 'run_end_time', 'run_mode', 'run_score', 'run_passed', - 'run_result', 'run_compilation', 'run_meta', 'run_system_info' - ] - - for col in required_columns: - self.assertIn(col, self.df.columns, f"Missing required column: {col}") - - # Check data types - self.assertTrue(self.df['submission_id'].dtype in ['int64', 'int32']) - self.assertTrue(self.df['run_passed'].dtype == 'bool') - self.assertTrue(self.df['run_score'].dtype in ['float64', 'float32']) - - # Verify struct fields exist - sample_row = self.df.iloc[0] - self.assertIsInstance(sample_row['run_result'], dict) - self.assertIsInstance(sample_row['run_compilation'], dict) - self.assertIsInstance(sample_row['run_meta'], dict) - self.assertIsInstance(sample_row['run_system_info'], dict) - - def test_convert_df_to_dict(self): - """Test conversion from DataFrame to nested dictionary structure.""" - try: - data_dict = convert_df_to_dict(self.df) - - # Check structure - self.assertIsInstance(data_dict, dict) - - # Should have run_mode keys - run_modes = set(self.df['run_mode'].unique()) - self.assertEqual(set(data_dict.keys()), run_modes) - - # Check nested structure - for run_mode in data_dict: - self.assertIsInstance(data_dict[run_mode], dict) - for run_success in data_dict[run_mode]: - self.assertIsInstance(data_dict[run_mode][run_success], dict) - for score_duration in data_dict[run_mode][run_success]: - self.assertIsInstance( - data_dict[run_mode][run_success][score_duration], - list - ) - except NameError: - self.skipTest("convert_df_to_dict function not available") - - def test_exact_deduplication(self): - """Test exact duplicate removal.""" - try: - data_dict = convert_df_to_dict(self.df) - original_count = count_items(data_dict) - - deduplicated_data = remove_duplicates(data_dict) - deduplicated_count = count_items(deduplicated_data) - - # Should have fewer or equal items after deduplication - self.assertLessEqual(deduplicated_count, original_count) - - # Structure should be preserved - self.assertEqual(set(data_dict.keys()), set(deduplicated_data.keys())) - - except NameError as e: - self.skipTest(f"Required functions not available: {e}") - - def test_fuzzy_deduplication_small(self): - """Test fuzzy duplicate removal with small threshold for faster testing.""" - try: - data_dict = convert_df_to_dict(self.df) - original_count = count_items(data_dict) - - # Use small parameters for faster testing - fuzzy_deduplicated_data = fuzzy_filter( - data_dict, - threshold=0.5, # 
Lower threshold for faster testing - ngram_size=3, # Smaller ngram size - bands=4, # Fewer bands - rows_per_band=32 # Fewer rows per band - ) - - fuzzy_count = count_items(fuzzy_deduplicated_data) - - # Should have fewer or equal items after fuzzy deduplication - self.assertLessEqual(fuzzy_count, original_count) - - # Structure should be preserved - self.assertEqual(set(data_dict.keys()), set(fuzzy_deduplicated_data.keys())) - - except NameError as e: - self.skipTest(f"Required functions not available: {e}") - - def test_flatten_and_reconstruct(self): - """Test flattening and reconstruction of data.""" - try: - data_dict = convert_df_to_dict(self.df) - original_count = count_items(data_dict) - - # Flatten - flattened_data = flatten_data(data_dict) - self.assertEqual(len(flattened_data), original_count) - - # Check metadata fields were added - if flattened_data: - sample_row = flattened_data[0] - self.assertIn('_run_mode', sample_row) - self.assertIn('_run_success', sample_row) - self.assertIn('_score_duration', sample_row) - - except NameError as e: - self.skipTest(f"Required functions not available: {e}") - - def test_dedup_df_end_to_end(self): - """Test the complete deduplication pipeline.""" - try: - original_length = len(self.df) - - # Run the complete deduplication pipeline - deduplicated_df = dedup_df(self.df) - - # Should return a DataFrame - self.assertIsInstance(deduplicated_df, pd.DataFrame) - - # Should have fewer or equal rows - self.assertLessEqual(len(deduplicated_df), original_length) - - # Should preserve required columns - required_columns = ['submission_id', 'code', 'run_mode', 'run_passed'] - for col in required_columns: - self.assertIn(col, deduplicated_df.columns) - - # Check data integrity - self.assertFalse(deduplicated_df.empty, "Deduplicated DataFrame should not be empty") - - except NameError as e: - self.skipTest(f"dedup_df function not available: {e}") - - def test_parquet_creation(self): - """Test Parquet file creation.""" - try: - data_dict = convert_df_to_dict(self.df) - - with tempfile.NamedTemporaryFile(suffix='.parquet', delete=False) as tmp_file: - try: - create_parquet_file(data_dict, tmp_file.name) - - # Check file was created - self.assertTrue(os.path.exists(tmp_file.name)) - - # Check file is not empty - self.assertGreater(os.path.getsize(tmp_file.name), 0) - - # Try to read the file back - df_from_parquet = pd.read_parquet(tmp_file.name) - self.assertIsInstance(df_from_parquet, pd.DataFrame) - self.assertGreater(len(df_from_parquet), 0) - - finally: - # Clean up - if os.path.exists(tmp_file.name): - os.unlink(tmp_file.name) - - except NameError as e: - self.skipTest(f"Required functions not available: {e}") - - def test_data_consistency_after_deduplication(self): - """Test that data remains consistent after deduplication.""" - try: - # Create dataset with known duplicates - duplicate_data = [] - - # Add the same code 3 times with different metadata - base_entry = self.fake_data[0].copy() - for i in range(3): - entry = base_entry.copy() - entry['submission_id'] = 9000 + i - entry['run_id'] = 9100 + i - duplicate_data.append(entry) - - # Add to main dataset - test_data = self.fake_data + duplicate_data - test_df = pd.DataFrame(test_data) - - original_length = len(test_df) - deduplicated_df = dedup_df(test_df) - - # Should have removed at least 2 duplicates - self.assertLess(len(deduplicated_df), original_length) - - # Check that essential fields are preserved - self.assertTrue(all(col in deduplicated_df.columns for col in - ['submission_id', 'code', 
'run_mode', 'run_passed'])) - - except NameError as e: - self.skipTest(f"Required functions not available: {e}") - - def test_schema_compliance(self): - """Test that the fake dataset matches the expected schema from the database.""" - # Test all required fields exist and have correct types - - # Test BIGINT fields - bigint_fields = ['submission_id', 'leaderboard_id', 'user_id', 'code_id', 'run_id'] - for field in bigint_fields: - self.assertTrue(self.df[field].dtype in ['int64', 'int32'], - f"{field} should be integer type") - - # Test VARCHAR fields - varchar_fields = ['file_name', 'code', 'run_mode'] - for field in varchar_fields: - self.assertTrue(self.df[field].dtype == 'object', - f"{field} should be string/object type") - - # Test TIMESTAMP fields - timestamp_fields = ['submission_time', 'run_start_time', 'run_end_time'] - for field in timestamp_fields: - # Check that all values are datetime objects with timezone - sample_value = self.df[field].iloc[0] - self.assertIsInstance(sample_value, datetime) - self.assertIsNotNone(sample_value.tzinfo) - - # Test DOUBLE field - self.assertTrue(self.df['run_score'].dtype in ['float64', 'float32']) - - # Test BOOLEAN field - self.assertTrue(self.df['run_passed'].dtype == 'bool') - - # Test STRUCT fields - struct_fields = ['run_result', 'run_compilation', 'run_meta', 'run_system_info'] - for field in struct_fields: - # All values should be dictionaries - self.assertTrue(all(isinstance(val, dict) for val in self.df[field])) - - def test_duplicate_detection(self): - """Test that we can detect exact and near duplicates in the dataset.""" - # Count exact duplicates by code - code_counts = self.df['code'].value_counts() - exact_duplicates = code_counts[code_counts > 1] - - # Should have some exact duplicates (first 5 entries) - self.assertGreater(len(exact_duplicates), 0, "Should have exact duplicates for testing") - - # Check that fuzzy duplicates exist (entries with similar code) - similar_code_count = 0 - base_code = "def hello_world():\n print('Hello World')" - for code in self.df['code']: - if base_code in code and code != base_code: - similar_code_count += 1 - - self.assertGreater(similar_code_count, 0, "Should have fuzzy duplicates for testing") - - -if __name__ == '__main__': - # Add some helpful output - print("Running deduplication pipeline tests...") - print(f"Python version: {sys.version}") - print(f"Pandas version: {pd.__version__}") - - # Run the tests - unittest.main(verbosity=2) \ No newline at end of file From 9867a05da031f169ff0063b2d26706579e05ea6e Mon Sep 17 00:00:00 2001 From: Sahan Paliskara Date: Sun, 30 Nov 2025 00:20:25 -0500 Subject: [PATCH 9/9] update --- dedup.py | 484 ++++++++++-------- export.py | 93 ++-- tests/__init__.py | 0 tests/fixtures/__init__.py | 0 tests/fixtures/create_fixtures.py | 43 ++ tests/fixtures/submissions_fixture.parquet | Bin 0 -> 40107 bytes .../successful_submissions_fixture.parquet | Bin 0 -> 40452 bytes test_dedup.py => tests/test_dedup.py | 128 ++++- 8 files changed, 493 insertions(+), 255 deletions(-) create mode 100644 tests/__init__.py create mode 100644 tests/fixtures/__init__.py create mode 100644 tests/fixtures/create_fixtures.py create mode 100644 tests/fixtures/submissions_fixture.parquet create mode 100644 tests/fixtures/successful_submissions_fixture.parquet rename test_dedup.py => tests/test_dedup.py (81%) diff --git a/dedup.py b/dedup.py index f6f3449..8f69f1c 100644 --- a/dedup.py +++ b/dedup.py @@ -1,13 +1,46 @@ -# script to dedup a huggingface dataset +""" +Deduplication Pipeline 
for Code Submissions +============================================ + +This module removes duplicate code submissions using a two-stage approach: + +1. EXACT DEDUPLICATION (remove_duplicates) + - Computes SHA-256 hash of each code submission + - Groups submissions by run_mode, run_passed, and score/duration + - Within each group, keeps only unique code (by hash) + - When duplicates exist, keeps the one with better metrics (lower score or faster duration) + +2. FUZZY DEDUPLICATION (fuzzy_filter) + - Uses MinHash + Locality Sensitive Hashing (LSH) to find near-duplicates + - Process: + a) Convert each code submission to a set of character n-grams (default: 5-char) + b) Create MinHash signature for each submission (compact fingerprint) + c) Use LSH to efficiently find candidate pairs with high Jaccard similarity + d) Group similar submissions into clusters + e) Keep one representative from each cluster (highest submission ID) + +Usage: + + In practice this should be part of export.py, but if + you need to run things adhoc just do: + + python dedup.py input.parquet output.parquet + +""" from datasets import load_dataset import tqdm from collections import defaultdict import hashlib from typing import Dict, List, Tuple, Union +from concurrent.futures import ProcessPoolExecutor, as_completed +import multiprocessing import datasketch import pandas as pd +import numpy as np +import pyarrow.parquet as pq +import os # ============================================================================= # DEDUPLICATION CONFIGURATION CONSTANTS @@ -88,36 +121,34 @@ def remove_duplicates(data_dict: Dict[str, Dict[bool, Dict[Union[float, int], List[Dict]]]]): """ Remove exact duplicates from the nested data structure returned by get_sorted_hf_data. - + Args: data_dict: Nested dictionary structure from get_sorted_hf_data - + Returns: Dictionary with same structure but duplicates removed """ deduplicated_dict = {} - - for run_mode, score_duration_dict in tqdm.tqdm(data_dict.items(), desc="Processing run modes"): + + for run_mode, score_duration_dict in data_dict.items(): deduplicated_dict[run_mode] = {} - for run_success, run_success_dict in tqdm.tqdm(score_duration_dict.items(), desc=f"Processing {run_mode}", leave=False): + for run_success, run_success_dict in score_duration_dict.items(): deduplicated_dict[run_mode][run_success] = {} - for score_duration, rows in tqdm.tqdm(run_success_dict.items(), desc=f"Processing {run_mode}", leave=False): + for score_duration, rows in run_success_dict.items(): # Use a dictionary to track unique entries by their content hash unique_entries = {} - - for row in tqdm.tqdm(rows, desc=f"Processing {run_mode} {score_duration}", leave=False): - # Create a hash of the relevant content (assuming 'input' or similar field exists) - # If the row has an 'input' field, use that; otherwise use the entire row + + for row in rows: content = row.get('code', "") content_hash = hashlib.sha256(content.encode()).hexdigest() - + if content_hash not in unique_entries: unique_entries[content_hash] = row else: # If duplicate found, keep the one with better metrics existing_row = unique_entries[content_hash] - + # For leaderboard mode with successful runs, prefer lower scores / faster times if run_mode == 'leaderboard' and row.get('run_passed') == True: if row.get('run_score', 0) < existing_row.get('run_score', 0): @@ -128,80 +159,99 @@ def remove_duplicates(data_dict: Dict[str, Dict[bool, Dict[Union[float, int], Li current_duration = row.get('run_meta', {}).get('duration', float('inf')) if 
current_duration < existing_duration: unique_entries[content_hash] = row - + deduplicated_dict[run_mode][run_success][score_duration] = list(unique_entries.values()) - + return deduplicated_dict +def _create_single_minhash(args: Tuple[str, str, int, int]) -> Tuple[str, datasketch.MinHash]: + """Create a MinHash for a single document. Used for parallel processing.""" + submission_id, text, ngram_size, num_permutations = args + minhash = datasketch.MinHash(num_perm=num_permutations) + text_lower = text.lower() + text_bytes = text_lower.encode('utf8') + + # Generate n-grams directly as bytes to avoid repeated encoding + for i in range(len(text_bytes) - ngram_size + 1): + minhash.update(text_bytes[i:i + ngram_size]) + + return submission_id, minhash + + def create_minhashes( documents: List[Dict[str, str]], ngram_size: int = NGRAM_SIZE, bands: int = LSH_BANDS, rows_per_band: int = ROWS_PER_BAND, -) -> Tuple[Dict[str, datasketch.MinHash], int]: + n_jobs: int = None, + position: int = 0, +) -> Dict[str, datasketch.MinHash]: """ Create MinHash signatures for a list of documents with LSH bands configuration. Args: - documents: List of dictionaries, each containing 'submission_id' and 'input' keys - num_permutations: Number of hash functions to use (default: 100) - ngram_size: Size of n-grams to generate from input text (default: 3) - bands: Number of bands for LSH (default: 20) + documents: List of dictionaries, each containing 'submission_id' and 'code' keys + ngram_size: Size of n-grams to generate from input text (default: 5) + bands: Number of bands for LSH (default: 16) + rows_per_band: Rows per band for LSH (default: 128) + n_jobs: Number of parallel workers. Defaults to CPU count. + position: Position for nested tqdm progress bar Returns: - Tuple containing: - - Dictionary mapping document submission_ids to their MinHash signatures - - Rows per band (num_permutations / bands) - - Raises: - ValueError: If num_permutations is not divisible by bands + Dictionary mapping document submission_ids to their MinHash signatures """ - num_permutations = rows_per_band * bands - def generate_ngrams(text: str, n: int) -> List[str]: - """Generate n-grams from input text.""" - return [text[i : i + n] for i in range(len(text) - n + 1)] - - # Initialize result dictionary + if n_jobs is None: + n_jobs = multiprocessing.cpu_count() + + # Prepare arguments for parallel processing + args_list = [ + (doc["submission_id"], doc["code"], ngram_size, num_permutations) + for doc in documents + ] + + # Use parallel processing for large datasets + if len(documents) > 100 and n_jobs > 1: + minhash_dict = {} + with ProcessPoolExecutor(max_workers=n_jobs) as executor: + futures = {executor.submit(_create_single_minhash, args): args[0] for args in args_list} + for future in tqdm.tqdm(as_completed(futures), total=len(futures), + desc="Creating minhashes", position=position, leave=False): + submission_id, minhash = future.result() + minhash_dict[submission_id] = minhash + return minhash_dict + + # Sequential processing for small datasets minhash_dict = {} - # Process each document - for doc in tqdm.tqdm(documents, desc="Creating minhashes"): - minhash = datasketch.MinHash(num_perm=num_permutations) - submission_id = doc["submission_id"] - text = doc["code"].lower() # Convert to lowercase for consistency - - # Generate n-grams - ngrams = generate_ngrams(text, ngram_size) - for ngram in ngrams: - minhash.update(ngram.encode("utf8")) - + for args in tqdm.tqdm(args_list, desc="Creating minhashes", position=position, leave=False): 
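The reworked `create_minhashes` fans the per-document work out over a `ProcessPoolExecutor` and builds n-grams by slicing the encoded bytes once. A self-contained sketch of that pattern; the document contents, `num_perm`, and worker count are placeholders, and the `__main__` guard matters because worker processes re-import the module:

```python
import datasketch
from concurrent.futures import ProcessPoolExecutor, as_completed

def make_minhash(args):
    # Same shape as _create_single_minhash: hash byte-level n-grams of the lowercased code.
    sub_id, text, n, num_perm = args
    m = datasketch.MinHash(num_perm=num_perm)
    data = text.lower().encode("utf8")
    for i in range(len(data) - n + 1):
        m.update(data[i:i + n])
    return sub_id, m

if __name__ == "__main__":
    docs = [("0", "print('hello world')"), ("1", "print('hello there')")]
    args_list = [(sid, code, 5, 256) for sid, code in docs]  # demo-sized num_perm

    minhashes = {}
    with ProcessPoolExecutor(max_workers=2) as pool:
        futures = {pool.submit(make_minhash, a): a[0] for a in args_list}
        for fut in as_completed(futures):
            sid, mh = fut.result()
            minhashes[sid] = mh

    print(minhashes["0"].jaccard(minhashes["1"]))  # estimated n-gram similarity of the two snippets
```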
+ submission_id, minhash = _create_single_minhash(args) minhash_dict[submission_id] = minhash return minhash_dict -# 16 bands with 128 rows def create_similarity_matrix( minhashes: Dict[str, datasketch.MinHash], rows_per_band: int, num_bands: int, threshold: float, ) -> Dict[str, List[str]]: + """Build LSH index and query for similar documents.""" lsh = datasketch.MinHashLSH(threshold=threshold, num_perm=num_bands * rows_per_band) - print(f"num_perm: {num_bands*rows_per_band}") - similarity_matrix = {} - for submission_id, minhash in tqdm.tqdm(minhashes.items(), desc="Inserting minhashes into LSH"): + + # Batch insert for better performance + for submission_id, minhash in minhashes.items(): lsh.insert(submission_id, minhash) - for submission_id, minhash in tqdm.tqdm(minhashes.items(), desc="Querying LSH"): - similar_submission_ids = lsh.query(minhash) - similarity_matrix[submission_id] = similar_submission_ids - for submission_id, similar_submission_ids in tqdm.tqdm( - similarity_matrix.items(), desc="Removing self-similarities" - ): - if submission_id in similar_submission_ids: - similar_submission_ids.remove(submission_id) + + # Query all at once + similarity_matrix = {} + for submission_id, minhash in minhashes.items(): + similar_ids = lsh.query(minhash) + # Remove self from results inline + similarity_matrix[submission_id] = [s for s in similar_ids if s != submission_id] + return similarity_matrix @@ -236,203 +286,138 @@ def fuzzy_filter( bands: int = LSH_BANDS, rows_per_band: int = ROWS_PER_BAND, ) -> Dict[str, Dict[bool, Dict[Union[float, int], List[Dict]]]]: - - total_categories = 0 - for run_mode, run_success_dict in data_dict.items(): - for run_success, score_duration_dict in run_success_dict.items(): - for score_duration, rows in score_duration_dict.items(): - total_categories += 1 - + """Apply fuzzy deduplication to the nested data structure.""" deduped_data = defaultdict(lambda: defaultdict(lambda: defaultdict(list))) - current_category = 0 - for run_mode, run_success_dict in data_dict.items(): - for run_success, score_duration_dict in run_success_dict.items(): - for score_duration, rows in score_duration_dict.items(): - print(f"Processing {run_mode} {run_success} {score_duration} {len(rows)}") - print(f"This is {current_category} of {total_categories}") - current_category += 1 - deduped_data[run_mode][run_success][score_duration] = _fuzzy_filter(rows, threshold, ngram_size, bands, rows_per_band) + + # Count total groups for progress bar + total_groups = sum( + len(score_duration_dict) + for run_success_dict in data_dict.values() + for score_duration_dict in run_success_dict.values() + ) + + with tqdm.tqdm(total=total_groups, desc="Fuzzy dedup groups", position=0) as pbar: + for run_mode, run_success_dict in data_dict.items(): + for run_success, score_duration_dict in run_success_dict.items(): + for score_duration, rows in score_duration_dict.items(): + pbar.set_postfix({"mode": run_mode, "rows": len(rows)}) + deduped_data[run_mode][run_success][score_duration] = _fuzzy_filter( + rows, threshold, ngram_size, bands, rows_per_band, position=1 + ) + pbar.update(1) return deduped_data + def _fuzzy_filter( data_list: List[Dict], threshold: float = FUZZY_SIMILARITY_THRESHOLD, ngram_size: int = NGRAM_SIZE, bands: int = LSH_BANDS, rows_per_band: int = ROWS_PER_BAND, + position: int = 0, ) -> List[Dict]: """ - Apply fuzzy deduplication to the nested data structure returned by get_sorted_hf_data. - + Apply fuzzy deduplication to a list of documents. 
+ Args: - data_dict: Nested dictionary structure from get_sorted_hf_data + data_list: List of row dictionaries threshold: Similarity threshold for LSH ngram_size: Size of n-grams for MinHash bands: Number of bands for LSH rows_per_band: Rows per band for LSH - create_histogram: Whether to create similarity histogram - + position: Position for nested tqdm progress bar + Returns: - Dictionary with same structure but fuzzy duplicates removed + List with fuzzy duplicates removed """ - # Flatten the data for processing - - # Create documents for MinHash processing - if len(data_list) <= 1: return data_list - all_documents = [] - for i, row in tqdm.tqdm(enumerate(data_list), desc="Creating documents for MinHash"): - # Use 'input' field if available, otherwise use a string representation - content = row.get('code', str(row)) - document = { - "submission_id": str(i), - "code": content, - "original_row": row - } - all_documents.append(document) - + # Build documents list without tqdm overhead + all_documents = [ + {"submission_id": str(i), "code": row.get('code', str(row)), "original_row": row} + for i, row in enumerate(data_list) + ] + # Apply fuzzy deduplication minhashes = create_minhashes( - all_documents, ngram_size=ngram_size, bands=bands, rows_per_band=rows_per_band + all_documents, ngram_size=ngram_size, bands=bands, rows_per_band=rows_per_band, + position=position ) similarity_matrix = create_similarity_matrix( minhashes, rows_per_band=rows_per_band, num_bands=bands, threshold=threshold ) - - good_submission_ids = filter_matrix(similarity_matrix) - - # Keep only the documents that passed the filter - good_documents = [all_documents[int(submission_id)]["original_row"] for submission_id in good_submission_ids] - - # Reconstruct the nested structure - return good_documents - -def get_hf_data() -> Dict[str, Dict[Union[float, int], List[Dict]]]: - # Login using e.g. 
`huggingface-cli login` to access this dataset - ds = load_dataset("GPUMODE/kernelbot-data", "submissions") - - # we should divide things up into type - # run_mode - # run_sucess - # if run_mode is leaderboard then use score - # otherwise use run_meta[duration] - - - data = ds['train'] - run_mode_dict = defaultdict(list) - run_success_dict = defaultdict(lambda: defaultdict(list)) - run_duration_dict = defaultdict(lambda: defaultdict(lambda: defaultdict(list))) - - for _, row in tqdm.tqdm(enumerate(data), desc="Processing dataset rows"): - run_mode = row['run_mode'] - run_mode_dict[run_mode].append(row) - - for run_mode, rows in tqdm.tqdm(run_mode_dict.items(), desc="Processing run modes"): - for row in tqdm.tqdm(rows, desc=f"Processing {run_mode} success/failure", leave=False): - run_success_dict[run_mode][row['run_passed']].append(row) + good_submission_ids = filter_matrix(similarity_matrix) - for run_mode, mode_dict in tqdm.tqdm(run_success_dict.items(), desc="Processing success/failure groups"): - for run_success, rows in tqdm.tqdm(mode_dict.items(), desc=f"Processing {run_mode}", leave=False): - for row in tqdm.tqdm(rows, desc=f"Processing {run_mode} {run_success} rows", leave=False): - if run_mode == 'leaderboard' and run_success == True: - rounded_score = round(float(row['run_score']), LEADERBOARD_SCORE_PRECISION) - run_duration_dict[run_mode][run_success][rounded_score].append(row) - else: - rounded_duration = round(float(row['run_meta']['duration']), DURATION_PRECISION) - run_duration_dict[run_mode][run_success][rounded_duration].append(row) + # Keep only the documents that passed the filter + return [all_documents[int(sid)]["original_row"] for sid in good_submission_ids] - return run_duration_dict def convert_df_to_dict(df: pd.DataFrame) -> Dict[str, Dict[bool, Dict[Union[float, int], List[Dict]]]]: """ Convert a pandas DataFrame to a nested dictionary structure. - + Args: df: pandas DataFrame - + Returns: - Nested dictionary structure + Nested dictionary structure grouped by run_mode, run_passed, and duration """ + # Extract duration from run_meta column (vectorized where possible) + if 'run_meta' in df.columns: + durations = df['run_meta'].apply(lambda x: x.get('duration', 0) if isinstance(x, dict) else 0) + else: + durations = pd.Series([0] * len(df)) + + # Add duration as a column for grouping + df = df.copy() + df['_duration'] = durations + data_dict = defaultdict(lambda: defaultdict(lambda: defaultdict(list))) - for _, row in tqdm.tqdm(df.iterrows(), desc="Processing DataFrame rows"): - run_mode = row['run_mode'] - run_success = row['run_passed'] - score_duration = row['run_meta']['duration'] - data_dict[run_mode][run_success][score_duration].append(row) + + # Group by run_mode and run_passed, then iterate groups (much faster than iterrows) + for (run_mode, run_passed), group in df.groupby(['run_mode', 'run_passed'], sort=False): + # Convert group to list of dicts at once + records = group.drop(columns=['_duration']).to_dict('records') + group_durations = group['_duration'].tolist() + + for record, duration in zip(records, group_durations): + data_dict[run_mode][run_passed][duration].append(record) + return data_dict def flatten_data(data_dict: Dict[str, Dict[Union[float, int], List[Dict]]]) -> List[Dict]: """ Flatten the nested data structure to a list of documents with metadata. 
- + Args: data_dict: Nested dictionary structure from get_sorted_hf_data - + Returns: List of documents with additional metadata fields """ flattened = [] - for run_mode, run_success_dict in tqdm.tqdm(data_dict.items(), desc="Flattening data"): + for run_mode, run_success_dict in data_dict.items(): for run_success, score_duration_dict in run_success_dict.items(): for score_duration, rows in score_duration_dict.items(): - for row in tqdm.tqdm(rows, desc=f"Processing {run_mode} {score_duration}", leave=False): - # Add metadata to each row - row_with_metadata = row.copy() - row_with_metadata['_run_mode'] = run_mode - row_with_metadata['_run_success'] = run_success - row_with_metadata['_score_duration'] = score_duration - flattened.append(row_with_metadata) + for row in rows: + # Add metadata directly to dict (avoid copy if possible) + if isinstance(row, dict): + row['_run_mode'] = run_mode + row['_run_success'] = run_success + row['_score_duration'] = score_duration + flattened.append(row) + else: + # Handle pandas Series + row_dict = row.to_dict() if hasattr(row, 'to_dict') else dict(row) + row_dict['_run_mode'] = run_mode + row_dict['_run_success'] = run_success + row_dict['_score_duration'] = score_duration + flattened.append(row_dict) return flattened -def count_items(data_dict: Dict[str, Dict[bool, Dict[Union[float, int], List[Dict]]]]) -> int: - """ - Count total number of items in the nested data structure. - - Args: - data_dict: Nested dictionary structure from get_sorted_hf_data - - Returns: - Total number of items - """ - total = 0 - for run_mode in data_dict.values(): - for run_success_dict in run_mode.values(): - for rows in run_success_dict.values(): - total += len(rows) - return total - - -def example_usage(): - """ - Example of how to use the deduplication functions with get_hf_data output. - """ - # Load the data - data = get_hf_data() - - print(f"Original data has {count_items(data)} total items") - - # Remove exact duplicates - deduplicated_data = remove_duplicates(data) - print(f"After exact deduplication: {count_items(deduplicated_data)} items") - - # Apply fuzzy deduplication - fuzzy_deduplicated_data = fuzzy_filter( - deduplicated_data, - threshold=FUZZY_SIMILARITY_THRESHOLD, - ngram_size=NGRAM_SIZE, - bands=LSH_BANDS, - rows_per_band=ROWS_PER_BAND - ) - # convert to df - flattened_data = flatten_data(fuzzy_deduplicated_data) - df = pd.DataFrame(flattened_data) - - return df - def dedup_df(df: pd.DataFrame) -> pd.DataFrame: """ Deduplicate a pandas DataFrame. @@ -458,23 +443,122 @@ def dedup_df(df: pd.DataFrame) -> pd.DataFrame: def create_parquet_file(data_dict: Dict[str, Dict[Union[float, int], List[Dict]]], filename: str): """ Create a Parquet file from the nested data structure. - + Args: data_dict: Nested dictionary structure from get_sorted_hf_data filename: Name of the output Parquet file """ # Flatten the data flattened_data = flatten_data(data_dict) - + # Create a pandas DataFrame from the flattened data df = pd.DataFrame(flattened_data) # Convert the DataFrame to a Parquet file df.to_parquet(filename, index=False) +def _count_items(data_dict: Dict[str, Dict[bool, Dict[Union[float, int], List[Dict]]]]) -> int: + """ + Count total number of items in the nested data structure. 
(useful for testing) + + Args: + data_dict: Nested dictionary structure from get_sorted_hf_data + + Returns: + Total number of items + """ + total = 0 + for run_mode in data_dict.values(): + for run_success_dict in run_mode.values(): + for rows in run_success_dict.values(): + total += len(rows) + return total + + +# Columns required for deduplication +REQUIRED_COLUMNS = ['code', 'run_mode', 'run_passed', 'run_meta', 'submission_id'] + + +def dedup_file(input_path: str, output_path: str) -> None: + """ + Deduplicate a parquet file and save the result. + + Args: + input_path: Path to input parquet file + output_path: Path to output parquet file + """ + # Show file size + file_size = os.path.getsize(input_path) + print(f"Loading {input_path} ({file_size / 1e9:.2f} GB)...") + + # Use PyArrow for faster loading, only load required columns + pf = pq.ParquetFile(input_path) + available_columns = pf.schema.names + columns_to_load = [c for c in REQUIRED_COLUMNS if c in available_columns] + + print(f"Loading columns: {columns_to_load}") + table = pq.read_table(input_path, columns=columns_to_load) + df = table.to_pandas() + print(f"Loaded {len(df)} rows") + + # Decode bytes to string if needed + if 'code' in df.columns and len(df) > 0: + if isinstance(df['code'].iloc[0], bytes): + print("Decoding code column from bytes...") + df['code'] = df['code'].apply( + lambda x: x.decode('utf-8') if isinstance(x, bytes) else x + ) + + original_count = len(df) + + # Convert to nested dict structure + print("Converting to nested structure...") + data_dict = convert_df_to_dict(df) + + # Apply exact deduplication + print("Applying exact deduplication...") + exact_deduped = remove_duplicates(data_dict) + exact_count = _count_items(exact_deduped) + + + # Apply fuzzy deduplication + print("Applying fuzzy deduplication...") + fuzzy_deduped = fuzzy_filter( + exact_deduped, + threshold=FUZZY_SIMILARITY_THRESHOLD, + ngram_size=NGRAM_SIZE, + bands=LSH_BANDS, + rows_per_band=ROWS_PER_BAND + ) + + # Flatten and save + print("Flattening and saving...") + flattened = flatten_data(fuzzy_deduped) + result_df = pd.DataFrame(flattened) + result_df.to_parquet(output_path, index=False) + + final_count = len(result_df) + + print("Deduplication results Summary:") + print(f"Original rows: {original_count}") + print(f"After hash based dedup dedup: {exact_count} rows") + print(f"Final rows: {final_count}") + print(f"Removed {original_count - final_count} duplicates ({100 * (original_count - final_count) / original_count:.1f}%)") + print(f"Saved to {output_path}") + def main(): - example_usage() + import sys + + if len(sys.argv) == 3: + # File-based deduplication + input_path = sys.argv[1] + output_path = sys.argv[2] + dedup_file(input_path, output_path) + else: + print("Usage: python dedup.py ") + sys.exit(1) + if __name__ == "__main__": main() diff --git a/export.py b/export.py index d7cffdf..5e2687f 100644 --- a/export.py +++ b/export.py @@ -6,7 +6,8 @@ from sqlalchemy import create_engine, text import pyarrow as pa import pyarrow.parquet as pq -from dedup import dedup_df +import glob +from dedup import deduplicate_df load_dotenv() @@ -223,8 +224,8 @@ def consolidate_parquet_files(input_dir, pattern, output_file): input_dir: Directory containing the parquet part files pattern: Glob pattern to match the part files (e.g., "submissions_part_*.parquet") output_file: Path to the output consolidated parquet file + skip_deduplication: Whether to skip deduplication step (default: False) """ - import glob # Find all matching parquet files 
part_files = sorted(glob.glob(os.path.join(input_dir, pattern))) @@ -261,8 +262,21 @@ def consolidate_parquet_files(input_dir, pattern, output_file): print(f" Done! Consolidated {len(part_files)} files ({total_rows} total rows)") - -def main(output_dir): +def deduplicate_parquet_file(input_file, output_file): + """ + Deduplicates a parquet file using the dedup.py script. + """ + + # load the parquet file into a pandas dataframe + df = pd.read_parquet(input_file) + + # deduplicate the dataframe + deduplicated_df = deduplicate_df(df) + + # save the deduplicated dataframe to a new parquet file + deduplicated_df.to_parquet(output_file) + +def main(output_dir, skip_deduplication): """ Orchestrates the data export process. @@ -305,36 +319,40 @@ def main(output_dir): os.path.join(output_dir, "successful_submissions.parquet") ) - # Apply deduplication to submissions - print("Applying deduplication to submissions...") - submissions_parquet_path = os.path.join(output_dir, "submissions.parquet") - try: - submissions_df = pd.read_parquet(submissions_parquet_path) - original_count = len(submissions_df) - - deduplicated_submissions_df = dedup_df(submissions_df.copy()) - deduplicated_submissions_path = os.path.join(output_dir, "deduplicated_submissions.parquet") - deduplicated_submissions_df.to_parquet(deduplicated_submissions_path, index=False) - - print(f"Deduplicated submissions saved to {deduplicated_submissions_path}") - print(f"Original submissions: {original_count}, After deduplication: {len(deduplicated_submissions_df)}") - - # Create deduplicated successful submissions - if 'run_passed' in deduplicated_submissions_df.columns: - print("Creating deduplicated successful submissions...") - deduplicated_successful_df = deduplicated_submissions_df[deduplicated_submissions_df['run_passed'] == True].copy() - deduplicated_successful_path = os.path.join(output_dir, "deduplicated_successful_submissions.parquet") - deduplicated_successful_df.to_parquet(deduplicated_successful_path, index=False) - - successful_parquet_path = os.path.join(output_dir, "successful_submissions.parquet") - successful_df = pd.read_parquet(successful_parquet_path) - print(f"Deduplicated successful submissions saved to {deduplicated_successful_path}") - print(f"Original successful: {len(successful_df)}, After deduplication: {len(deduplicated_successful_df)}") - - except Exception as e: - print(f"Warning: Deduplication failed with error: {e}") - print("Proceeding without deduplication...") - + if not skip_deduplication: + deduplicated_submissions_output_path = os.path.join(output_dir, "deduplicated_submissions") + deduplicated_successful_submissions_output_path = os.path.join(output_dir, "deduplicated_successful_submissions") + os.makedirs(deduplicated_submissions_output_path, exist_ok=True) + # we do this as everything combined can be too much for pandas to handle + # if things get too big I'd multiprocess this + for file in glob.glob(os.path.join(output_dir, "submissions_part_*.parquet")): + deduplicate_parquet_file(file, os.path.join(deduplicated_submissions_output_path, os.path.basename(file))) + for file in glob.glob(os.path.join(output_dir, "successful_submissions_part_*.parquet")): + deduplicate_parquet_file(file, os.path.join(deduplicated_successful_submissions_output_path, os.path.basename(file))) + consolidate_parquet_files( + deduplicated_submissions_output_path, + "submissions_part_*.parquet", + os.path.join(output_dir, "deduplicated_submissions.parquet") + ) + consolidate_parquet_files( + 
deduplicated_successful_submissions_output_path, + "successful_submissions_part_*.parquet", + os.path.join(output_dir, "deduplicated_successful_submissions.parquet") + ) + original_submission_rows = pd.read_parquet(os.path.join(output_dir, "submissions.parquet")).shape[0] + deduplicated_submission_rows = pd.read_parquet(os.path.join(output_dir, "deduplicated_submissions.parquet")).shape[0] + original_successful_submission_rows = pd.read_parquet(os.path.join(output_dir, "successful_submissions.parquet")).shape[0] + deduplicated_successful_submission_rows = pd.read_parquet(os.path.join(output_dir, "deduplicated_successful_submissions.parquet")).shape[0] + + print("Deduplication results Summary:") + print(f"Original submissions rows: {original_submission_rows}") + print(f"Deduplicated submissions rows: {deduplicated_submission_rows}") + print(f"Removed {original_submission_rows - deduplicated_submission_rows} duplicates ({100 * (original_submission_rows - deduplicated_submission_rows) / original_submission_rows:.1f}%)") + print(f"Original successful submissions rows: {original_successful_submission_rows}") + print(f"Deduplicated successful submissions rows: {deduplicated_successful_submission_rows}") + print(f"Removed {original_successful_submission_rows - deduplicated_successful_submission_rows} duplicates ({100 * (original_successful_submission_rows - deduplicated_successful_submission_rows) / original_successful_submission_rows:.1f}%)") + else: + print("Skipping deduplication step") if __name__ == "__main__": parser = argparse.ArgumentParser(description="Export leaderboard data to a Hugging Face dataset.") @@ -344,5 +362,10 @@ def main(output_dir): default="dataset", help="Directory to save the Hugging Face dataset." ) + parser.add_argument( + "--skip_deduplication", + action="store_true", + help="Skip deduplication step" + ) args = parser.parse_args() - main(args.output_dir) \ No newline at end of file + main(args.output_dir, args.skip_deduplication) \ No newline at end of file diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/fixtures/__init__.py b/tests/fixtures/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/fixtures/create_fixtures.py b/tests/fixtures/create_fixtures.py new file mode 100644 index 0000000..2823b52 --- /dev/null +++ b/tests/fixtures/create_fixtures.py @@ -0,0 +1,43 @@ +"""Script to create small test fixtures from the actual parquet data.""" + +import pyarrow.parquet as pq +import pandas as pd +import os + +FIXTURE_DIR = os.path.dirname(__file__) + +def create_fixture(parquet_name: str) -> pd.DataFrame: + """Create a small fixture with 5 rows from the specified parquet file. 
+ + Args: + parquet_name: Name of the parquet file (e.g., 'submissions' or 'successful_submissions') + + Returns: + DataFrame with the fixture data + """ + source_path = os.path.join(FIXTURE_DIR, f'../../data/{parquet_name}.parquet') + output_path = os.path.join(FIXTURE_DIR, f'{parquet_name}_fixture.parquet') + + if not os.path.exists(source_path): + print(f"Error: {source_path} does not exist") + print(f"Please place a recent {parquet_name}.parquet in {source_path} and rerun this script") + exit(1) + + columns = [ + 'submission_id', 'leaderboard_id', 'user_id', 'submission_time', + 'file_name', 'code', 'code_id', 'run_id', 'run_start_time', + 'run_end_time', 'run_mode', 'run_score', 'run_passed', + 'run_compilation', 'run_meta', 'run_system_info' + ] + + pf = pq.ParquetFile(source_path) + table = pf.read_row_group(0, columns=columns) + df = table.to_pandas().head(5) + + df.to_parquet(output_path, index=False) + print(f"Created {output_path} with {len(df)} rows") + return df + +if __name__ == '__main__': + create_fixture('submissions') + create_fixture('successful_submissions') diff --git a/tests/fixtures/submissions_fixture.parquet b/tests/fixtures/submissions_fixture.parquet new file mode 100644 index 0000000000000000000000000000000000000000..67acda46176ce474d831bcba21488e69726cfd0f GIT binary patch literal 40107 zcmeIb33waT)i8XotVtwU8rwn=wqu)-ZP}6+S)1i0u@lLb9mh)?Cvg_XQAX0(maWCn zlGrW*QZP*@Ng-?@4HQ!hr4&jiv=mw(Ev77GYbga z7!0&~7MMbrnG7tElPOA+CBPP@l8zL{fC*AOn5W~089E+))u|`9(im51Y>W<9@gf_k8f|ZE#&k&qPk9DA5K%U|}jCFrS_y$_o?yUGb--P$0sE zBl_XdRJiqU+;BhV_rqfzJcgq#zc(EA1_Msk?{W4GS3CWFXMpwY;-CazHHR!#q+J?s zD>F5zC1_|K5iWl?MR~;~I6;UXzmBhZRfSN2x^r!3!`iM6#>cT9F60WbArAwP%K>t^ z-!I8hU)Vn!4KwX6W`jW@@%o2@p$HQUOZq}VKNDfYLySyO<_!!-BhE+}6O2YEiL78- zFytQ4MpK?BB{Mh!UQmC?1NrJn39GvChQ&KoDrr& zDM77D$#A=l`GSODynzT43hoBHUA~}usKqm)A7x6!GE1Y^4|kqg$yG;Rv3=@p<#pG(1-gn8Ky57 zf`YdoFFwC)IpbwkFs;j((o(PZVT`dwT3W|wDZ5B#>%pGF5~dY^dwZFZj7-Mf<7G;i zcBT}MqF(WKv`5p38f2!793fMO*W-NA9?71j`>`U%h12(7<2}>^N4|m6qhW8LUu=qu z7_ESa)5%66A+IYM;havb*6G~Exg)`lQ}b7aQwt?ZG#?9tedW0#olYlSrMbFei-2K! zs(Z^AS5LL3fVvs`GCk1F7YuvUO3gKt3q|zTV^nRr9*fu>uX?q})}!50BOK9{-;i~l ztshlu?!;R~4C4}4A-lc`DGTvR^=(-z(Fj$gegfaJ&*^kUy}pQ-pG5ucfYaaSXPtd! 
zlhbS(fv?f(RR7=-=$5u5#KBN;q2Ul0ak|-XWQ8vn=r5_)vN}L77v;F%>M3Ik;uNIQ zH0o)L23u}R9|R(s)HE7gcbi^=dY`hIv6f*PdbEQ;P@~mUml8slfj)VF#H9?b(cO<2 zO@`y)aZfs#uv+mIig1Vxc(nK-7(sV15RN$V_U5KT6|+fwndy9_XoY*D)qXxg?J~Vk z!gMtV_TDcnv16-ar3hG0Q&JX=cxr3iFzJ{TE5x$&KJN(UQPMeKE&}~=dygZb)}-6M zucCsn`G5`C2**SN-aan}%+C3Pp;6#eEzFvB7GGd!X9H|M=g}4ct9LlKjqk7dt$+*} zzG=-$!uU+Zv1=5#+6a27%RdNHsW=+JiF#yB&^r!)rdL_SG?Pz)#;cD zrpwmR=xn!b*6nuOTfd7^XXRdu?9&&czKhZEYs-I=$@!tD+6FG9#$DSm{@4N?ZRGku zurU20uSc_O>RHE|>h%hS@p%2_19j=)PA+6W{-;df$}*s>BSV%i3PkJsDH`40h|s9+5=U8C8ygL{2Vf3!%#d6yqMRZi zqBu)cDUvofw`uhb5h}}i6qVKAiZl+bp^O2N0s$M!=<`I<)-t9&HzM7{dLx=?Lv!aw zCK6--X>WLd8f7aQ8@XLxH@AKiAb?l|s%qI*>Vg%}x3$9&kfE&4H_8l$Z1uunmQItP zKI+Gc5EqX6BJ^KGqL6olGMUna!u&chHZV;sUF$b6>o?fCHZatADE~UOlnZ!#?Dzc_ zG+nv`b+mk~eU-Hs{qU&ci3g5iZ2B^cZ@x|>9PGsxit?F!@vb|!K(YZ*vaF-W4}GvjnH82#yG%89ITHT>xu{n9@2Q90lih8j zD`emNKqj-E_(+ynUk9@L!V9TYQn+hZ{ZyoTU0y8Ft&{bM)^D`aeu*PX*j`rivs3q) zN|3O_srhfwdZtdSN3B*pqBn@;@-(`fE@ElbS79F5^;EBI|hSqi269jQVkYR)rAO zj0fe7X{c13ExudY0gH+LdxG|7s2fp*=@~?=6<4bNLS27KwB|@!_KT_O{!6NZ6>D5r z$VO^wH-I7#4EbM2+FZE^rbRA|>?Rt?9_w(W=q<>`ek54yTx!+XC;56gt6YEGp3o{UQU%MECQ{IU!b*55D5SeAXC zgwvhgBP*BcM|5RgISQP*HsgFWI`K8cl$YHt1`_U*)~{HxB3~rZS0ibCeZ98)7v)z> zv?8wPUb#hQ(|gzm>l}`Rw7RnG8W?X7_Oj0M{mSgTNvTj*wr=^FWk|L{cI7&ilqu0Q zD9f=eGf>71l`($Z+WmeZ(_pI>=I7J33K6J5B{hn=_weh;Kiw)g)G9#B#b3_x2R%_A z$JFURQ0R6l0<52_%NJjb^2P5{59S1;z~14y9!e$(xqD^HX$1(-wLr#%9if#^RfNdqAg!*&M#x~%gcj(dv_Z=R_CE(d8{wMe0pyg?(BeMqGM%^(sCjl7`d0&@V&qsxz1bRIj=*nfvRN+urcOhta0cD}kHB`GD$(oTnt6As{bQd5} zKTAV?x2{2=5Ih7?2p890hxR)Q*EiVO9JPRveo`pqMutQC->cn&Wcu`$tqy0qy#fiX zmOec~9rwXc>*ac+Xzp6OvD3Nr3RJ%zAzjO3n(qjQz4T#48MHAR3igLsztc-!bRM-V z2NoFTu-D@}o~2Hu|0&6TQzRa;>d^5W89LOF_qvFJ3p^Ksw$dL8Mu(X?)Atc9@JvPo z1*k7l8swyI#orKCR4ADYJ5>zJd?*}I-leQWC8l0Pm56gr3aFQ*es4gV5fb^Ca(xRz zWu`hLe$*(_4dj;x2`NblI6PDvpg!;bt zioAeW;})OP>k#h8Iuro(P=|PZda$oA%!xZxkBOB|RH~k0SkPhmIauNJzZD=Ieo5?= zt;8M^XbFKn_~`(xwS#(5wvvuWgb`nsU*DA?YSl?vzCJ#Q82VkY2u_g_tb*c z5e#s2L)L*(DeGrSmA$A3=K>eh>KP;)(uqGt1yhgGcV{A(>J3Fs7<3Hx0OygdxdeDA z$bbUUA7H`*>@XK*wC!cBI%XB4Z7u82fij!k{>#ag^n|o_0->Qb6Tl=xSraJ0-vwk7 zcaNSNxN}VZ0iqvO-s#@^kCv@>Pk;`rl-Q11GX$t2?`6sOoAlRGWz7>=CznjTFJ#@v ze^dxVJ;sHCpgQOU0@JM+b8lsXZ8r3K)+{tCad-E2YX(g2qluoh(zg<_(+)iacl|L)Oy**jlWkYLQD{ zi#FD7twqY^TeKqBUS;1-YU~pMJbF48X&n=wwDh4mYN!n6Xb=>XP#x?ObQw@;Z-&`X zN_H47*kLFyPKT)gU9`4Kveu)J=>pU)Xtx3M-Ra_0Ii6rd%a*xxHGS&e!xA7aKqc5) zqlDir8*j=wC>?UTqc#=|)p~L5W`XjN^dVCnl4Ds`Giak1%ZNVyTQStXk9_Q5jf8q$@rD+^h- zTOSE(p=WvTr0Tva_lRjh`hhb1_{5j=d$9jnr9i>9#KCN+taCl87CI4X_DpF?W zHVIcS2YyIqGl;Mdr=<=76S?)*3y^!SLT;9PcUdZZztnaK)5Z01AuizNXuUe!$N9Mc zosyj%3^2jIr|v@K@-8_&B?UgcT0&JyZK3|K>Q)g@Xie(sT4qCZ7{qpD0Cs^OFhJ{_ zxFt&i!^ze%s=L)HPa5hOzgl%CLw|!oy*M)YjDqR$!^)7oR+@p4>-Ks?e^XXlkFqIg zMe>dBy;6F41I!`WD*AXDOwu)L5OUWxqIG*~_aR}RR}(;8@&^=k zJ9R7R=`<Ijs@_-bTajVrIQ@N;}{zs0qj$!qh$Eow^^YV~O zoG$D!9JoW?vus5j<753U56iSWHrQ$zZT82~slA}ifRyej-g8FOUfjF)Yx-i40&9wq zqNBJMR&VcawmwA!`xb>x&RwpDNwWhK48ZMb+}lGw5cCu%E-} z$RutD$;Uhu%}z~IxFwcSgyffKo?KqkX*e-{}vY3nRO?d zFxsdR)>Zd*jeV6Ybk=d?I)do5oUtRQ{vBG0vyLFk?MIN%Xj)T=?MkGsoc=0;ulTF= zYw+|D^fM?oj?~XjTQGbFEm-r=>*vKDDlM}tg^y98|x9;M+Eq>uD=V#CDZQ-?|DyX{}RGS#b2uZ zia@NKK8ns9Mby!b_whkM=UU2oEmeOxQd{k<*mez|_XJ=Vs{0{63MfNi_~st1Z^QVj zlwy}pF#RIF{6*YVNjmVOBghUlU#OQ;fVSR(<w6N_>VFPeW|vd z7OXIqF~z&tU0m2X%!Qm>lx~rrJ=R@IN&;e2jihgE$*yC_elkUU1)oA&10x*xd4*!F-K?ZaLRoyJbes_k8!;R9~O4pi|C@%S{5Vov|KPI7eGS20)5j7 zhLDH8jZklPO&)!RZ0oHlLT_JvA&qgimd4fe8Cg>u;>1$U7v_pi$#X4Ta-jlrIElSg zb!U!6vTmadShdTsz79l-{F^lPNnK!osI}9=0!CFvi=rV`eoaP>iw%QSAQ+IZ)`7qX z?gFjK19l6#yi@@?DOgBU*Ph?;8j{^sbQO)5+M{CXl`L$(R#^W-%)Z%WOcB~!yKkXu 
zS+w_3VK_n`N*C5y>i?We<;ou+x4BhFr^-6dx(6fX%&(C6SN1!F)4xW!zZRMb?68L{ z!2j$zfuxcGW=FOuH4}8f9mSldXjq-YxY+>X0@K2CsGQ!JuJACfQ6?(?rC9{4o3aKo z_=3hAjE8dvLoA5_OS+@eAeyjs}jU3XX-hDCtt#5fZ=&z29f>kfjc1m>!~jM;J1_eJeH zbaGdP!X5MvdwngxLP7a-c~(WGTDU_;Pbx%_&?x1S0@OeyqK_#oB}3e331Aisa||Gt znM+HRDv2>23?)9F?YD~lelEmtSSjg>l- zPWPe$(1$W=qrVUrmO-<1J;m+Cz>^)AQnVKLmPvHVVKkU6%2$9q^GAKzJsQ=c`Ql44 zmKHFbLHQRg^w9aZ6buD!Jzhp1&#YaZFMsxYh$#}OWNK?w!5TH%T3H}lsk^!4mI7G@8tQ08;+))&EW=uk0G)R04Jnlw z`WsTL&*Y;reGaNs7MOBC(P{;oLs6l@|vi<4wTPUAC zQEdBIwLAxKDE}Qs)N0m^hW8H9&lMChUQd|W0sDs?<0pj!oNCvyi7N{m7;gkr-C)#5 zZ_h~ggYlgczr^l_^~QDvd2IJdLEU6khw|z3MbZ$M^)=_AAa-Z^gJ1w`Gs!2j(qZ7D zVP(VW^&ITQT)|x&v!l(#G_asi%lDW?e(%`i1&Z(vCd|3zJ4JLEqk3vZ_KO+x+5-9C z@=C5LOlL>D;kxX$3nsotzmh+FxG?#9=5S&5;leZD7PQ|ZY6x+3rV$Ol=(DBS?v!(y zmT#Z@Fz-P1y4!@CxYJ`&*0YPHcjTdn?S7eq&d~{Xa}yt&x)XP#w}RC!)Us1HLjS5- z0oq^`xa+V{ro8hR6rTJOV#RydGSs90koHCJ8=5bk(17t>b)lLr&pFjd-&ZJmVoka_|JSfP+y0sccj+#!Kd9Mt zC8=JLtk@}$%Vkdsha)s9$WUOK{zg%U4-B~#74JSE)RmYX62T66+OYH= z25Ae3br9E);Gh47IQ@qyd+Sld$&ZN%@uodDE*HTxrc1R#W=CI0cB-^owFp1lr(Gh`LS`;70C#QH`^E>(k4aelDQ91hMjG z3hBDkbl^<)ki1_6lVI}U>~g92=e(S1>7$Q}fbr>@^Kqxv6^sTvVK|iGA~;J5yFx=& zh5epl6wdx#q4(+0t=(x_)6rslF$lEhi)%NaF}JpB1FSyuvl`l1OdZy%g?UeF{MA<< z&_9I4%d*u4q8?+fij`n2%a%I9mc94Nym4*D`BK=phO^4AQ;Ruv0@cavRNK3d3;(F)C-PBUhD!O(tS2h(P(I)F5xVKRtQ8lTk=oF{N_}tl(-;lY zJL+Y(>c(=}rd*`@NnvgQ{iZ1TLlI0&db)vY1}kJ!dWv+|YDTCk_EZcdxETCp?A z1|nVZs|v{OPqm~(#JF_XN44_8Vz9&NN_1<)-;l{P{+ed-cQV%2e;X;k39A&N%P7uT z36^#doC?XV6q-s@22E~W{Z~=eGqOu*=OD&h#$YM3|45Lv4Qb`dQlUosi#Jh4$=zbu_nWT<_2 zN$#lb-rh&>x1U~*awp22vKxq&c@?S~5?`40v~Z{dXf3S(^%S^Vg@`7iZ3YosYJF8pJ1cn z3v%mU+$he^XESyDmohJEv^2z;lUhV&ZUprtW zX!-xq_m6@W)9+T$4=z*fTh=6KneXEO-UM&t#!vWTR@=mRf9y$5j^&fAw@E$#@mk4# z0K7*4>_b+?D}zeU8Q<4mMt5Uxx|BU*J44=>wbmSSFZj7s+l1H4088%b2;f zW=+Ci*p69;!5+V88CL+7idY%f5S;8Mf8yzGFxn(;1B({{G-nIJ!&zE_PRxRXhXqK* z4x(ml(fp|}u||`FU{m!x;PKMDg_<{ACl|L`&R)9XbCquW^wKd~`1-{$nU9FBmM1g+ z16nO~4OXuii@68Rdibn_(VRFsKml^B2R2KVK0$S+QpudglN#jRMMzO%u1~ZGHhVY$2CIIo zp|UxZW9X9;1X3-j7@Xf}wFEcK0@T3DAIBvIZ?+9Wr?XaqPR4o=t0m#_<{emKX9GM2 zF-%I5N!-4mAOS-P27s=~V-ahTKSzl0t!d{r4~{WD{miLCQq6Yo2+%MwK=JB)SCXp- zfFft9-oaPz;PH%C9|9DUODE$AXB07s!3!xU7`p<&6R0Q7E8y@mAs@)01y~K>h$P}1 z>LG_2aCVWLKLm$sM8p&@#0LsoXc$+_QzCJe5pU^J&ojvRMx3pA@d*gVuCoXZErOtH zXcoc1A_%$yaRlRn7n)7n%{J_x?JyC1G0ut$XsD35BfL3xY=F7q<4I22bhFJ6!AOoB z329)T2aUk6#7|a|Ibhzb2EUxdSqtxt1xhd91aXJr4_p#wQ=~awfWT6b)LyGDE{h0U zz9zYWpaUgD@l&Ufb5oT)m?CvemQ zJ~RYB@*#LYp9m8rR#AdqT@1gunG7@drSU~d2%uzcg{Eeha~2zN`)R14xKO89T)bdLe;lt0}g-e&yiUd$=8oOk45;Lrg~4dCla zDv?vZSTp1)25P}kACb?qg-Ag_2yD;b^BQw+>EX#9^B*PdqrJc`v7_nOAs|?3Nv*`k z_1SBK201jZHaN%CY%qtLCDowLCtAmtuWz&ob9c zcV>z}PoVYI*&@&;Z&lIF~u5#GvC;v6ZZpjOTaaLzI*PvauR594+L6#dpXpA&$0M*{-!xol*q{xL0$es0s|JNLe?ij&ae{5+wLcjdjazH z`(v%`h+h{%PUK{YpzaLhg{g}|P6~<`kZm*NLQur&u`919!4R*!wQi>uu7=4y-~KXO ztF7OO-o9v1L?S0slxTzWePQaSVeptp{Yo)lka_yxb$)1hT%Gra>3F=a2>uV z&vn8yel|1&q@EM&0+tc+OlIcnE!3GC z*@BdUHmFP@Ti{noE9J8>ELOsMOEdY{l;m6#3)q39MvMS={yJL{pU(%(SQ40KZUg^F zG&~v^fR$`ET4b@{5Rq1~Sh>taQ=w5wVG8`F2oM%5hysEP{sWzw6M>Q`O61JkU7`|c z7fFS3a{wR`W<;oy)PD?BN&iU{kSHgC4G=)2lqLuuloE*Z@vw@tD3fH&fgw~rE>gZm zC92*ke7imm=)aMKFEAAYK|fakE*SwXRH1j1qpu2n5uXiL30Dk{nvDjhdpJrw7CT2b zU^vI)l#B&}JAx6tUN3?3EjEI?l{tmvxQY*a7=36EWmd$3CUEz*Wj!?}|N5vKJUPRC z(U|i#oSFu4dKnzC!-7NtPuv4|bXy^2AUhJkg&p#lU{P@4z7PcX@fIP zCLXZ^{!$VCD3~g&1}}Z*nzi=zb)eb7SS@1aPpiy0AxI8 z@@5C(bB=L*%*e2wZP3UdCXX5ce70eWWx&Xh;Uxv<3?E5I8b7}9zlLuG&j5P9F@n3D z@WWh+#JM2GlcWsK46`Wws|*o=7@K}R4q`q}@Jut0D?SaW%ySHqG{0v@hacZ_45B9s z=sfxU_E)g-P;NrNzu)|J)87g}41O-r9-sCVe%Nx5a7D7EtoXYfZRiOPd54EN2>%KM 
z!EJ}QI){5DFW-Q`JKn?U-BG6e#P#UyRS^86hd-t#IMgfI5R9v>U5VW^U7?UEzXvXhPI834rf-$ zzSU5;bBn#DqtR$|Hmq&$Y-w|JIX7(SbkuFjclNv8+w%4KNU?rHV_SUZcTi#hM&1xJiOFhq2y|_mLfzuLA19>PgfTv)*W^ z2*VjJxKX)>h=N|>7+@+Ap`6pJk!*F#BTR)G0zE6(VehmP*_}wVVfE_VyDY)sNJS{< z_Lp1r(9W128JuK^s&GbvL0?6uMu{ras!YAHs$#@y$*8L{v@zvRkfB{2KGs0 zI}LV|{y;E9L`!@}#E0j>!vE?w-rfsNbl`#o9LR_76$gSIFPIiWo?>Puz?11I-xLi* zVGS&g#hmdQ`N}HhLfC0(0bV;Wb+9^3m- z%_3&|MOKWaMHp4X4>(+-YU()BHi@v%UZxQW+Zjkc1IabTY8;{-7bpg zbkhPsNFb0_8EdW6l9cNuDZ(lv^#Nj9goW{tK?vx){yo(DPsCVLQcM_qke&kIXnZTg zcUD7?=ORIPweRBG>0JE5pOLVpa{8b6=6~YY4cuS;Pb@MTEYoTDNE&}vmp1(&e)&Tz zG#aNrpl<$vf}8Jupdi6*T3hX}1<(E*G1JcrNcI$U^Aw*wMg1J28-I?r{~QTThUvpg zjvQVB{@@13wI^?ZtOm&13Rzam@o>o(#1^n=dEMX!2DTF7XJgG+!SFHJ#SqV0OfVCY_(Kjosve?^q0jMTeRi;v0*t>DyLd-y3C7u=?O8i z?mU38Ar_w*i;HzO9TdldG}pG(Yx&?+I8>>aiPE$lTY3h3*D$(hXMy4(Nkudq%0EL# zU>brb&=1sBSJfJ;YeAGoz*ZIWnddyZ{7gUV4gt0qM^G_-r>vT=QbEiHue?9w_DTn; z!lmGPZUv_b&idWB&Bcf?r#R~&ne{obPh%9?imE+)iyQ0 zu=WZ@ym14>$isZJS{RTWarH19B%881+dM1T(9KzTKiBr{M=8o5&AfZyB)$Z~i4t2_ z4g?TvUctjRzV~J-BK~14%<+x|$AUF3|Dke}Ghnjp3RhWta9?Qk1!}6J10Ef^;!7yU zX!Y;jIoh|2lrrx$4OfSD1_1dv@4|rHyDq*FUK|>Mm;jgg=|vMTp%Za~jv(>oj80{e zSp{9P@xq)xT2~>|75Lf}M!4R7UhVcuxSn|X7MU5YipZt!m93hOFYKh_HOZbjR)!cg z7N(+w7%z$qdCaLZEV zBe1cKL3$zO?fNHScK#dNyZFZXl9_$o*>+(6{zmKsR2 zRxz+`sm6I&vB6ZsKpg}8SZg5PS`GO%mADD75L&CS0=FU~E=aeSp#W6{I*&KArW)?e z7Q6y$K~co5cq6LBHfj%*hN|(3GKlGynDr(Lv}3{RGR;+tNncfi_bE(}Vlay(Qj;D+ zx+S7Y#w5O%x(gX_0kU99T5VxWcu)c8Ss4q?%CzVWm5fm@z8ycRFjWFfBkojGo1nQG z9EQk5ti?8k!3wQbVN`zTdA+Y z&CnYKB=sPRjDjvmkXaX@sx$|rKp(SOSw0~&7}ec)g+_v^)pk^=ZbLQr=g6vl6j|We zf=8fXEvi!Y2+fs{P=lu+;RG~$0r=ZmX07D5#4njKg8;Db-z5$ul7XUI( zp(^V4xE%Ua4sf-QlBYQeb+3dWrI11M7C`5q*Ox)s6x5ec7okeqUy;eypc!-2I8FnR z%}}?-_Wru>0wx!Ufc*es6@EZ-K=UnRRX?VHVFj+khoJgJ;w*eSpz=kjwUV*o=b*+z z05c8XKLCMkOZYx?Ypwz4Fsib*pf#hd9Wcv7HRWwks(>;9Ht(QHb|*56pQC<{%(i35 zOqHv@MJa?PGd^nD23>wd^B$_CwCWvRjQ=G5JBIK}{FJcL;J6H@a#DbL*dcY4APc-q zz=#L;ug4O{N8T%J^-vXPX}bfiKs6lHJSMF%1Cb#LuEnhR1E5*}-G3e#Y&SuR5~>_J zmk3hpO1GI4a;ysk_zIfDC)5fy@$Nm0_aP76lAVz!*yaXBcki7-7pgsIm!}s4Dga zs$8>A0}$08a@(L>8n+)AsRH`R$Oj`!JT9DkTRGTsYlRy+Q@ z{|a^9sZ~%yl5Vy#l1i9lmG~(46=Y#usB$6&RjQXfyGmMZWUMX6g*8AC;2RUj9IJ`3 z>K;Dzvr}Qft$h5&QyYOAtECnzQ)N?%sz8JsK$XWIoZ52w5oFoxVoz%hNzK)a6>pV- zEUmI#cIQbVR@kph4mgfG#*o1_26I5wlDhXnWL95u`p1N2G#tz!i>(j(cGxk5s`h@J zYSkQd=;T+6o(eg-sVcm2O09Wy@;2a!Zwafdz&BNRo#r6-1)#zKt{3{URJ!Kyl$A;we*+o! z`s7ujQ{M)Z-*H@qs%SLWeo(^;0L2k$uAP7DYUekaO z*1U%=b+l`4LWbk-BCG8MWS;PgVX@JS%Ckg{_Q{_@Kg&<8ntB_WY}M!{SDxB_>Wj$W zScmf*Q<|ewA?WBG!m7%NJLPp^dA;PRwfLo$_t{HMeUZCc{+MKZCpFP>EAG)eEoV}x zMO*a()E_i_JI+cI=O8Gi5ve+ytODcX?ey<2sPn=JB zGj;vcNuccWj?>6;O00fw3b4gh^4mq4LTPn1W88ZjXaG`5{1^G7qVc;O$EXp&cKuTy z+s4Ly+)ctNGxx+^H8O%eRJpgv@q+FIR-9S^%-&61di;J$K^=mQyrwxexsu{I6}0^* zs^K))so8;QPKr4Vva(<2G~5{XTNu+;&7dYC4~v8rv-&N$M#5b(Ud#F94~x0`G(%9g z?$j$>KPPx~IY^vl$_DMhBpAOBRoSroHn16C1$S^t4%`}|HCdWJIIt!Q`bLd^oxWBI z&F_N;kdwA2kWm+%`l4$0g|Qy$H$y0-Dv>IgEjAfe^QB!I~Knqf*wy+rN1`W|_HntV^1+zJbrxEltzid!mw zS$6eT??aU!u}$Kz_^Fcv8oT_zY57>{*Q?RPvO|C{vG1C9pyw$!Oxs65Rdj7k`fH1UTxBEOD4eWkj4+%>uP)Rm6Qs3z47 zVo(+S;?T>}B>*ms2yVw~j%`-Ec&l|~h4>VPi{wpYRO$7-H3!Yd1EKrXMR(&SG|y#H`+qgAk##Se6w`&jq%M6{pn6BGI15M%7;>I zAB|VYKa$EXkZ5|iVMjHWK6NLuf^uR#{!0fe=M(ohmN`oH|Ac-_tjPu{?~wmluDL_^ z>f{I1rFc|xhvTxzpE&Aj1~m};>bQye`{eoK7P&J+vv1thau7jKxk%o39x|TXNDayj z(n%wy8LyS!m`WX$i_;v}Xa>PJ`6b{8DU9w9q*N!jOmpOB11>E-x! 
z>N(X&y8QiA+hO^oa=Dxqf$m)k?^%Kulc}l|!n!(TcH%u_8x_-M|JpYykoow{&pt8z zV#e9NCtl3R`(q~kplJG4)!Du$AnWWYg0|$WlS>*`c3#Ri%O#dRjgiPRT|^ z_Hso>w_^I6+W*(SN4}{Y?g7J2FH>!_miJvq8IPVh zJ}WVw9pbYR_wEr~0Gs$Lv2t@_3*WSgyxX+}UNq^`cIlYSTrhw|mQEiyB7)}yQ*Eg- zFk|@$w4^Pi3(?T#&8bORm{T>A`J{Z(CIFwcFr*Ws-6!}d&GrKmHz2*zlsH~Je=~WL zFr1YEz%Zchj>O&zyLl0u&s(He=3`?gRU5NV_wb|$d3E;pL3 z<-Aef>nW=W8MJbCzGwgU>H>q&uysV^VlaWsW<5Sj$g_S6C zNf=I^{j8D!4$ufPWwVZE5D^tiqm$kkJd?N-P6k6WPiaZj$a`;mSITo@Hfg04S``z= z8F;s`AC#0f2suW4s?Z(ua4`@|$?Z&QOTblY=7A;;{QJtB^I9hON=YufJ!gI%fhr<- zV9}icDj}%w&A}@~05XDS33(nS!Mq++DacGCkFm2O-gg*|&v;w`LBAi~{gl9;Z;Bi7 z!mDKYPVh|ycDdcoJ`$K7t0d(U;LMJ^5IIqkV73H~%D~}jeiC>O@;|B!X9wKlY=T9V(x=VrL{J_psqj=xEco~%>D<|tirU>N#$5x^x z=~?r5R4GtNR3Bp1-*C}Dk?8Zh4n3=Toxy7(_DRGzLpVj^_FT9# zi8osiN}U`v4ACwSTS(sC9govK7w!x%Oyzxfk^_l>jOW6g&xJdm3wMUM>Nz2r>0G!o zdF49)#)#J02;Xzz&ha=xvLc@gcZS!(kof7OP)Xj4ln+F7o(p$&M$Uyh&kMLd7w!yh zzwEhi=h;BwbK%Yl_sJ(7N}01!P8d)U*0#icA513?J%Ni}nx3$c3q&5#${6 zV-mpc{uCEI@eBdDFg}?8T$uVP024Nj8Ji{#>}IUG9T#2p94>nLSxn3vvjNM=vC9iN zW9uBgkR+@#GqzE_^$n`6MF*e9MQ=X`ZA%tMkXXxja~aCgEJ0bx=2X6oPSu8@r=Q0w zAksHE=s7C~Y;IsYpJ#K6_fk9G3Y)OL@=d;pRe!`qXI{WXH@^UgeZJV|g{05|DZuHS z1ZxwsUz`>TP?$BQ@<=>{RX@T-5B>}n?fV%SWCM4w$S z!C4zGm|nOFtNxx_1pkw+=x2W-l4+54|L_%Z;E*Iw&myH02X-?e`pGK=A7KWli?0?1@poCO=#CbctQ=Pk!iVio8E_g^h2`rB0m?q?2m`V_c{ zrELao@|t#in(9GW(XEFCMUNli2jyI-&i^&x<63 z4u`~FBisQlMhlotb}S!NkME?H*yZH`DG%Jh9FW<*ND8)>fAH``VMzxk*5+yF*Mg z3RIV+6n*$xLDBnf@hE&w-=oj%tF#dDpUxBJXB&!yOUBG79Qi%4$?yKvQHc3-@Qpnv zP~9dddioEr*#4d;!aR4S&*;gtuxlSAjfv`^6H8SR_*XDzP77epWiBEAw2cjC9!dW~ zzBl0EM!-+S7xf3i`S4Of^1lAqBY3z#9yc$QfBZr|@m+%A`AL3GUQa&Mg@3W^gO?wl zEjtFI7v}#5 zfyceQ7D7$R_z!|hA~W#~S_pQL0`tHF8Q~wn#Oltab|S}W=?R2;N!y=9_8Hf;g-FcF zng<^K`!?|mBbDYtkNrEUPlKAMJD0r@sqO7ZVfH zV$f#O|HHs$T?ZF-k+*k|e}&4s50dPeXv@FAh18^B)J9q{1x@d0v5k8v%aBChuYlhG3ce zLO!^IhuF|4Szd=n*-$9Bo8S@mx+SS5BluYx@?kMeOz-4Oc-fRWe-Fx(+Pb>dZmNyB zLuXzB|3wQv#{N#5&9)By$fwn2v)JtYwkDgc-`3F2r`hdu@^_O&$4q{cttIvhzkDlv zKBVooK|Q$gjRYGY8=m1O2x<1J7QZjLwYl4}+0?PiwYodv3bcFIdhM>w>v#9J_~U85 znqil}%f~f$kF^X2`!^X4;g*IqgD%sE(Y2{EmSzrnUH+!X*7Yq`$Q$!)S`%S6cZ{){ zYNDG>zM&Q`X>aTB)=eWrn@yDiu1y;Oj?L>fcY9r%d}D5tZ+c@)#P~I`6?hTaFl-_7qZ#<3 zd8DSTfy}?zxscS}LOQ}++zLEIa9#|bEdifl>*n?tUj@1&?$y5JxwC5vyxMDv`E&3c z53l=^_zvcO7tr#5b1tkgCdr6}Gz!egUyQ&0(=ucf_@K{i1{o0O`jmOV<2D9zY1GqH z!}BlCw~NY(jWmubz;t(!XH zI?BR1JvaX|=`eBm;{&mKIn+HH&}MJ1bBUOcC7u~ zqrARI+H7bUZ5wQlwvVo<0j;)~sJ4mzz%8!{`)yl5Q{GMTwt!A;GubwRhP@7SYrDk` zCA!G7o#@()TZnEPB-K_E-P%@7o*T*Y8XKt2b_coT_2;(1%8H6=$aQppvJQoK{hHJv zdOLrC#CCEsOnz2FMR><8e@lLBq^^CjF|qCJ3{_O@ZnVbRZ%S&EzvZhFoNac4%e-cg zB?%pT`C#d;iVBheogyFTIC+@)tR{CvCh$7ex<(5rzmbOr8S6F@B%$l%!{73EWY1t* zZ4bjZ`T<@fmbjs+qN1ic0nZ+TN03T}*GQhXn(aoLV-U)h_9x26@#86#gda~Y*pHY% zPJ?dIu?yB$18Kvyn$)jYm#F_q@U{?mYsN@~7A(y$E;JUk_v!cRF;1SH?b@wRn2AQ`c z+t$Ae;1h8Dwv7aTxI0^ac~*mUG2+?;IyS%~i~^L zp5KBb`xE9DY2V%KBmB$b*xqTbsIZXn_#ETcG*|=t%Z~>@J6nI|jbDSUqOYRDG}E8h z{Q9)<+ez@--3ah_{9^N;pM-qJNpImp2n2gZf5+fv^Nif&b4;ZE*i8L6f6gC&LbZ5( z(^fLi2tzn3fYmnh@Zo7)`{MF|q=Lnir= z3550I)9Ul`Y-Z1AYnxZ!wgE8V`PH$wJS0%;e0>5A;F0p-IR1(GG@pKJNKw#b!dtsz z_89UE^aT)nd+tR2IQ@B1kVJo8bj-6c&5QxaO(6A_#Ci==s$ol_JzH!%=gVha{&+Tb z_}s)+1u8+jd_rEf%&R}YKWhp8uI6sA1%S--*9^E}55|v2+aOdW^ox%NbOIET7=F8L zJRik7&6kPeH^09fBy(PW0Z;Pj2PPryxrrhZ{{RH|piE|%4eg9_k#cw?e>rd1Wa5TP YCT7xvxOjM(!vE+OY3&Gg+PaMY9|ZVQTmS$7 literal 0 HcmV?d00001 diff --git a/tests/fixtures/successful_submissions_fixture.parquet b/tests/fixtures/successful_submissions_fixture.parquet new file mode 100644 index 0000000000000000000000000000000000000000..7e4c853fb2b1a5791a784f5f6830b6e6f649f61d GIT binary patch literal 40452 
zcmeIb3tU^}`8fW*A*VeF$pK;#fdZZYfdoiMLI{@vZGb>2mr~l&3)1u?IY1y6noC=l zRXfF5XY2jC>colH>8w-dbW^*pbzbIeb57mttghSYcXQ5pn^XKf?@0pTQZIAg@8|#j z{TeYj=e(Ead7t-rpZ9s6%W1d0$E*^ox>d~`Dya%75Yo|2ryvT8f7DAY6{82_blpR} zNcil`iv6mPO0CK+gZk;JC8{MknRKE?CTS~t)X>kur%;rt6T!C>r=*%aJp6`9ooZ~uA_endxHTd=l3`VMys8Ezcavjckoan z4Qjw+gV)0cBHqXtJOW(j5X**iOXJ;Vrp>Dht>=;F%6%!S*Uts?Vtn`XfxN>kQ<(c^7Heh1>S($ z7xnOLoi`Ycgm}*1K%U8!)kEIVa`+zzX|@@M8gh*|yO0pp!$&x;k4ly3QT3%dBz6rn ztVPJHI%}8I=^XS0U7XM9WOoF;9@fuAI-|aJJ`~`6I%mdEwUf8{E1Uy?=r~(qFJpT$ zk*u*y$_6lM9)M9q*pj9)HtZeeoe{Q6B}MHj>1d~({f3lfy@3cD3ho5FUA~}uq|LL- zIL4MpZbFmG&~qg(Ai) zF{-v)iD|CSt64?c`*fRY#Jlu{tFlh9529-AZFmdKvMxy#YBW|MRUuxXxhZP}+C^1q z9>+JF?{vDNUSGs3tgS(J!08|GbIySZi?hPA3;xVDr{*VT!?1KEAs(iR4~>TSh||r5 zBXz!DV6db?$LRsRT$JO2Ur!lplB6KLw%N#FG~9Mm`Y;gLqG8bR+MA48)c>Td0_#|| zsZTcy1U1_%^(i5QSs0TCNL-sh&o3`$?LJ^bF2b|XfOo*l1GDq~U}y|DRR^o4 zlf$Q(I=KKh$a{1}!0KHNe#1wr{~#h$hOb+_f-pYYKB0YS1wu`dGf`*O3N5Ba0W18> z>1_dT#LM|$9k6y5#>6(VJH3%1R&#YB$cP=BFUp6dy&sCHSL_?bUj4+6kex51QZZJB zBV5GmZhL0+`$*iNGinfZHT>e8R-InYma{$fu4ZSaeUpBt^F4n-BH5Gfmm79gisJ*oqXxJ5fCaV;U zZEZ$qOn&fJJ>Bj=51r%kQh8`wyY1*E;{(Z*Jyyc-{QEF@OMK5vZ!; z+Nsm(U~FqfBOpULpKpvE4cQySqa2eaMFZ4-DMNfX>WeUcr|FP)7iF=ei^ci%5^Q2y z+IrTlXVP5Zs$r{G5YBN$K&@Mz}WI_nBROo zEgtU2XVCd9^T{uNn zVJTxL1Ndb=e+(gWe|B3``}B_eqN8rH-#>l62>#waU*tGnxz5qFrmJ~+uPFAnb+1T% zgXLDO;L*p)a+Z1xPJ%w}<+v>`t%+oZVH2^EV4xRZuxxDV!g@v#YIKytv z$Cue)`)rh}$InBOpCA|2&U}!1>4)+=?M%7+_K)Rq+mTP@nGN+IyU#d-S}B98PW3Nn z{cDP1seY||fL^zuk?~6%S>n#JnqMBf+fstWT~6)a=yhzp#E9B$M#QX_C=_Xofhpn` z#c^8hQJGB^l>od+lCGxH(KYivm;;0|zXO)X#&v-!5oQ6sz*^)bDU9g!Le*XJ73XH4u<>4L#SJK~G(R#^3Rqlb%d&e2Fnr!U*8h4}_yFYh+& zRb}T*$;A4ywaZs8L-IQLg=^I^wnX2gGGKdVpo|?UWBvLyd;Maz$zCna&u40tG^jx( zHOl%A@oUIG-7dPcU4)iPzMbO_dZIp_tv7zG)NfY?I6q&XFS!KeOFp6=$O%S)y~Fi= zl$;K^`{m0Na+NBFVpO@Z0T4jU9jP=FW~`N0p&iU$Rmw15zdjW8E0mybg*Yc=Pj?Ua zxIt{y8P-(fm$CTz(gT1kg<+kqxmXR>OSK&HJYj{duiH!Ojq;u6G zrC$0vgHZORY9(j}kzgS2UbTclMy5lpi^49$a$z>&lV(q;LrwNQ8WONEJbc=DfJ^2rhP+UN-btXa*6FUV9&ho$!({iLQn!hM+OzTOqD7gDeEl+ z4KJW76uU-hw<38fa%Z)Q-GSZ$MA~O*$@iAkhz`L`5QT7Y?G})JYVw-iqh)~x(@YQ~v5h+`H)@_tf5_NexU z;!!ViwXzKQ7!3snL!95~WzIZ>T9yMFjC0iMaUROjq%!}K=D$HpMr?X?Xj_IJb>+QA zQ}6@#g`lkrhJw*iw%+n%1RFe?5kUd!o0JwgsT=XF;_`A8o6$%W!!{oZM^tyHDp850 zA5kTeoTDP@C7ItF&}D>ZKWi|yAyj6mN0LX(a^onH!J^mU14!HW9YpI{tz{HtHEVRP zN(6mcY;abf9#EV`E1j?%ha%LEB^TrcBwDxRs8NsbAl9P*poh97>(YY*17Tj$rG8YR za-verVU`0OW{`&+&iDrr(&HB;Uik{_v4EBk7=W)X&|15w7vw9Lh*TW$W%-RgDRjGD z+V;JPDa11GOK4bIn(gv>*56kPT1POzGfi3hN@bj%EmifSKAa0&P-|q7ctkJx3>6%H zgt;>lxzw*KbHbowxQBR;eD&GDOFT* ztzaf)wUY>qteylW8OfSN0pThjpS*MI=+JHB#*Y#6i0U@?o`1G&xpNY9V3pKFfnCfvp6a>}5C=yw2#Hfq$XXZr4M)`UuIyy*4 zDlC!N5o(08Uw zR_1tu5gk|N($@@V{sdcqqyUv*Z;cARH*dHuYrky7>5ketG*auuwVOn$hto%#fn5=< zlpZNH+x`8kT~@@sCU%bC4ZxG{$GD=a&+Q*OnQ>8aBJ$iKXUn;%!5nOEf zE>dkkTvmzp{sGtqT_gHZZbc#Ib{iu>9gHmR-BkVk3Xg;lrSB`lk4=8dxEuQ~NABx> z(N!u&?&DfV2!~f^Qi`AEp(0gwZi~2%-S<0gb`6vw=U4n zdc&YA@&=CiTaK8EKcehhsY(>+Q9yF8edt;XR}JhL|&Y-&I_hp^`I~XNcJXl@_>Oi&XMj*ni!LD}pF=otq{$>$9#vAzKk#8;cOr zkSg2Bg+?duM%9Y*GMHAWc#y~G$RgwDVPujz7j_q27bciK0s zU$cHgS6f$WF>?z`7n_SSY;CDF*!PSLXQVL|>0)nKF`kZe*-y~&KS)-bZVW}in&an< zo)DOMLPpLVA?BVz<^_!}8UjuP8wwX1gfora=;GWXpsBO7alkuRJBm=>hGO|RoXd*PH*9|tq1ru0aT#k=q0Fx0TGmo&v#Ai>UR=vO z!id>o<3=VO9#`)+zKO;475bws7;R9C>#O^F#=k>0I@<(tU5l8sobhW>!@G+96VKtt<7X1?HO(4y4(^d?B!&a<)@U>H7Hvt5QRC4R%>{%~-|jkym~GM<4u$sprPgv7RZJ^H;|dWxXi%ctonQ!g@COL>XV>I0x5&5Lm?HKLG!!xzXX_YT&Acsd zsYkp-#{0s2(J@7?wMQXVf(|EbY**iwW0kJmUIDunGhNidA|L7{MK&Re8W}!5B)Fpp$}yM1A?GU9TefO+^#Yrc&82b`50lH>Dx_CuonqUK z5&QOUk>t0Hw~42JhjM=>wiGnN8L|NXtLF%kNekF**_PBy&;_>@^PZwnO%Chk0;~&6 
z3(q10vprquVO?WvRPmb%8g@5T4QBCa&D&TH?+%7I5cg&5&LOXRi1~i%DmZlT;acdD zgS~yuhfXQ>6s}d9SMz5 zE*U@#R3he}(poaYkCgys!7$GPa+$e|Or@5Z)4@>U^V$EP9313BERR)kKN?_;D>YVX z1IE!V^{SrW5jVE*~gbAs%*@Fi0+T z2&|p{D6_}J20`0#gLN~^6lRKfmeJ+uV9It5@d4H!+@X0)26yJ@A3*cr)%!9vPatm3 z%b8#Yiu$-v*3~&i)&XQ@x2dmHfhOeN#=NfX2_dwN`a9Z|yA`<<@2hE$k($p`J{lB8 z#Zx6FPjDxk@$Kvo94k7(bP(1TJ;kU-^VJ1RZ2@A7Xti8pt14KnL0c*dom)uYw z&p;zx?MRZ78bMYE-W<+9bop=zWE) z5aHF?KUL9SkXHZDq8LiwQ!9=}>~(}8Ig}1GXPcNNy^P((HtO%uA68tN$)vMlww;mc zw6DmO!~)c{cQ0x(o{H>$IsPWfXO0xxKT|Ky0UQi}#E4qOxzXsJ5$4%~Le}dEv)kbO zux;X~c!*c;ST=cKVH4|(fT|me`k1X5>3%T2^O6_2ov`27-$ow$Ju*-?IrXLa%&D|2 z1ZI8hDJY2Dnf@Rc06Q#-sjPIExM*0_v}zp>XE9fB2hVQnu&_-WXw-_`6|~ZpC(*DPz@7)@8qt!K^7zyq8yUSz$W2%NwrG?l^7oN6gFl(^nTJ|Gj;6VfNL9 zZ~s8lc^BOj;+ae{8hycMPqW{q;I(Z(IQmK6zUs9%i8u1c$7P&n2ghv7LlOJEatD*6 z7w_aJKR$LF?n-Y5t6ivVyL=b(+iE3fgHhnFtIcxNZBL`{)L#%M+0B)qKI6~CwyVXc zKO?&LZ$Q3nhyM|3LN;*$>S}C4yX*gYj*hj%33LdSvDe*GlR`bhw6kI-$ElhABIIPW zg~kU^?KN7MXT`PJqF1zHr%Um&mgB_%)-q6}D6QUdP}}gj_M1nvV0>4fpwAbdev-1M0W}@{jF=Fw+kMS) z8kR9rsuQ!@28zUVSN}HSH}Rfzilz$2C=>JHiGL`Eyn}GSpf<`voO>)|7;RGx(ZSHR zz3+-$Jd-b@LC0a-8m1}_|ClLkRPz&iCAZ4u zT7ONeNAS7R)D3Q2CtC(Lb0VpZPMoEHGBtUJuN?nK@Q12HXAGC3#-E9@PDeV0 zs#L7i{rU}5UUH`d&bW-E5SePQ+^fEGA~B;Ba$<+o)V9g z0Ig-^pq^}*q?xIb`WrjX&HJt{`$uJOuao7im!)rn^~?V?$M_y_dG<3pSuRD7#+@@7iHERm{@mm@d+wxqwL zT8c{ZUdVvEVaaJlzmUq;;v-yCa$0V~3mYU^S=ET$l2^uDuHW}husrpG(lV8ky-MlG zLOSm+DH$Uvmq=t_`*LRwWPrG7I{JI&Fb#f;5FEksp42G0aF`E87_4Wun+fZ1;=I@O zE6w2R%5Mu0Z2=q}uyGdMaf&#cPyHDcs8%wU7sC>dcn71wXqY*@2IXh(P%7=PnT8|T zjS@L4sqflaqTXI2-y?CSv*u>jr+&X#ea#BSqs$EPEDu6^Z!7b`DQproU6N|2rjy{gO%j6k`g<#<)`E;+0UF4u6%VHbUUL?LJ}h|R zrP89C^K&>h=D%mI*sOSVCG-4giuKDfM}woFga#}H+fXr6xGkd<4qKpoZU!rEwsac` zGqr5yKoA^pJHyPwI=X>f&unDX&bAylqDMo#Z;a|={?(B;nVox9CY(CKOJQVwT!cc| zlWZm^z$JlTiF$u73ghjC4J>rr0ROcy;ccjvso$#DzD=>LtLW@)R2tIDOU0}~{hk$+ zx>HfDq)stbkd?3gtx{Ehx-I+amEiw~jIojxjAl2Qv2Oxm2Ed--QP3@7j>Wr~oKjSf z-|te?xs-L_QVTcaH)@%?8xU`3l5f#p*(3ab>o(~F_K;X1 zShg1_?gWh!9Jpod+*Y&JbMOJg4C&yyShS8S0K00ej%x&hAd)}v=WeiXB`yPtR{=C< zE5XePdV)dBf`gj{NX2}1vu1n2fldtf#Aj})o(DW$TQKGeR{rGbHtUINcYUSW?O$9w z<~CTjI41KE(bo!O7QWyB07HX~e#W`r0k@b?l<+T*m=LHyJfL9yW|Id!w_+fvB5*}g z#jd#(XFU>ZGUDKxNWcsT7zl0)r+^{B6TG;I@WgFAcjdvGKo7ER3uB?PX)`9SD`M2-o*se~JkbSDT9 z*ea6xYuCqR5rHeTBp2YhiUr#c-&J_#F}WiitVQNG?$Dac=KK% z6V4|Wk<3YaH8Wztc?b1%;JqWvmOS$(YC{&UbA${l)*2>Zq$0~_*30M;R)*OYn>9hlg7Vn!jI5e* zQzywPcakhyOjNjIo20;T1mq%ev5tkX(W04>q+THSGlmdwWXzix)BadDgqX}QvYW_+ z*{}`2b0ikTO2Sua!Hgp)m?VIM1n3GkG1Y>E)Dr%iafGs={C6*(VmH4oXG2H-~zaXV4p#7;MB90SZClKk8L{fkV&@`vIyqq zILt|1sZJxQD|ap@sVjFu>dL|8&;mZ45Wz=2^AF9LbfQKk4Q+)F$*x%V6pB61@hI^|;ZG8cUcok|N+;5$Wx zu!y8qrK^+)pfs4zV;>mQoSdCZI#Dunb+($;E%FrV%>jT2079KPQa%gXNJA8<+K& z{L*F!P;f_}ylF$TU6KXiUZWX$lg~qrMnR-r|if` zb>g3(M%;~^H45;M3emV(uczCIofa2Tw%5tV2Id2~mJR$)9(CN0^Q0H$ik+L~xu_h1HPL;9R|?aa}zqci{Ot^NPi0 z=z8(jpghsAATTYSb_IU=6d$-)VWT0L0)|67BJ2U+#xf z2TATEr^@mV8qxZ`@Q8PGl!s)wKoEkQNMLxlU;5Hj2yzKLoY5U+4M(m-@2rGWDI@&P z_60}!rR#$c&X*)mqk(XAl;oJkg*i(S9fUIo9Nc{2S{9N}8!OQ}JrH>014p=rjU9F2 zBtcTLAl{2LOZ&{_75!{HiuT!Q(Clz#m7Ko{DtB*gZ0l+^o1IN-I=kCC96iqUo4XzLr{_Bd z-R{%#jrmBqZhdn{TO(cuL1r$C27(ALrHv^Tcl!b111U7X8nHkoZ$o8yhS z_gLG#_p!vk;zH6sB&i{81$8QYgMRRqs}y%tIBuT ztQqz7rViFH(B0O=8sa%1<|@|Ut{udW-;+WIfnpC|90UqDovYfqo!uSw^(||9I-T}T zcv5eMf(`2&&Ym?*os_%~YBYA(*ExGFl8b3+pT+(_!zYM#c>~>??PweREH>0JErUy-<`a{6ER`hVfr1zex^ zFHD2+bh#JxGm{cz+rAHDQ!JY`mPPDr*)NHww65uB&3Ud`eAhD_5$UZ((-6Hl>aspfn^Ayz&KP}T~%wYt_4vV0drL>!+`hb^E3UNI|SHf zT#JhFyJgjcl?q~Jc-6fbH&;4P6$a-&>rnT?OBS~N`r9AxT_UzsPb)?6*Ps;b{h0inJKmKR= zl{|+eiW;ST70lat5X*G4hB;$d_QT8aT2B*)hagx(Z0L9uYlygJ4L-NoBw4~7%mBgT 
zQ{-x7|eQ;f9^95?EqeC7&y5L)gt=MiEtqyGu6t);pH#waP%}EzFZrBwH4DE!W%z51i z9ODb;!CtgorhZa=4DwAqI|xPtigQPIP>qW7C*9w4>Q80RnVL zKC@5ZCT{8wVJR^W+^|&jFr2Jo?_PO6<(-BnV0nIx^IiO8efG@3?nFnhuYYbXOFnnu zmIoh0ISCywc7A4Oj#1dDdN~JzyoE!w#Doj~w~x^YYsZ{v10H@N#zzFv8o22ue87uQ-wq-vlt-*UP@Q87RlYz9ms?WkQGz1YAb8O!%9HU##(V!rqyVw zWX(p&&G-?er4nG8aksMC0^QZ%FeGwf9kwe?Ht4k)e*nE)1^)-iYAWG@1#p1+8Y3Wo zm}*28oQ5o#Y^d-iwNDA7tu$8QRv3*E9`zxsoPr@pQH4H2RcZIhfIbx(m0?nBGHZHq zomPsfHI1lJ(}8O6uaHgi2(rSx74L$Mb*M_yC$6Z32Q~OGJU9Z~9)aJ(r~noM$;&^RN)AI6Uy=cT&uXMTGNMHv@hzHzyphB2{b5xYy%tiKy5-73RP(<@CW)1 zM=IU|-F_3A2E9`(GMHzJ<}AREhw^>273i@IFvc+WPvA)!V0ajM z-3KsKfaEYdhj9aOsx()j3e9PN%#)~!`V%(5m<#|{2T$^}2cYeRFr^eSY2O6sJdFBW zcy<`tOQ|zarTrhsVsFxpJ8B%qfyfomw#NR^+WP^MGihKyfLMj^)9%xLAK5gIDq&iI z>mbO~jL(#0;hO=KZ^~?ytPMX4Egl4zX#oE|$OBv=jG{F$^zKDiz>P8s6z5A^(RzeKZq(QgXa5`Qf#Tf2kfWAkYCn*h$<2I_*A=92|AhT3lWtuFt zM*%|%Fvb$V8KxTsM%c0*nruWCs)~D_GHB1&0z}Ox{OM3Hjo*vR)Fe#er2wP?S*czP z2V_4Ba2*32-i6jG$@!Z7)R|M?T??s~xPX5fRd6Es{~p=`Y>po2Q;f4f8Be5Chyovs3A?Su(8rgSY(y>0Dlp(axPRk znSv@cOP*ONt2VQ?wnO3?pa}4d1!RuR!rJr?9sA|6FyLmGc;VOvpvEeh)y7uYRdf}I zkbS7~&;y4zAAcBG_qe#@+Dl~>)vOI~k%26&vY&g~Q6g5ji>8JghaBU`WFLn$pl(au z^8l*QTz34w2+L@BSVLC(0F3Qw#|Wz0^F69vd%&SrTtYt?a`aMFc*9|h_LZrdfG55$ zuC@W+RN=MS{roq83j6qe7|XUN|C_VIwB2#|cZ3dJttJ(i&dQxV_4?uehQ1$yqj&{O zQI+N(P=rD@Ja%j=e76Y95{870gM1Ia>dHQYZZ9%$Q- zFQp!Q(oY%ew@`L%<5Z>Nh+|ZdMl0^1fqwPKcIbWWBTxQ}TZSsRW!eX|9b$`Fka|ZB zFW0LyM#uN47m*cTMJWN%waA1&q*|%RwEL%CqSk8lRDt3?8gREz`zYNp@f6MXII>Z@iwO1duQE3yeBl8}gqKZED13>v*$GNE5UZXVvLGGUDg8uFo zxM&y#%u}x&JBrNQgcZkKqbeDttReqjeepTS?rhb*H8SGdKowxRvW6pl=c8+Q1DWq zszvc9ntytNl}Ncyk!9+(iMO-^$Yg(23qn}?AwI{^sl5)F4!w_T_UBQ>q+bG?jdo0t zMLRmDegWe&99wz#9q6)MtDjnNZ0oUaB9mh+&T|~r9ylC=f!-pns+_z{Q7=(6NS|DT zUu^q`JNwu-`8ySlN+-5clWjNRKJ8NqHkEp0NAf+UKS3E*b-03(-?E-AqJ^7h^e8R_HC$6gCAK#-vX3&Q!_Y^sv*B`;k zV|Bpnz0^5}?xmE}r7)0JwFjqGP&}`Oz8^s~ycRpP+fdC>39m&q?t8qJALsu7bK0&Q z)~=fe%iN&Dl-tdAc4rhXiNoG0gH+7}fs z$e{Zy(xJAGQv=m1rc6^W@gGfH>)@tJ9qEei$q)aI@@QYtcTB9(M39*S39UFN1w=m5 zj#4V>Me6)Z?o#BZDR!qRY_NezdSQaA_@#=Mf;q7+6qsUOkY z53 z9{9p!e+5i`oc2jGyy2k+H=C*t#GH^W||nJ-fbJluS}Q}qtYoe4Et~~ve^$QHqcb7 zLq)xdER#8kO|q%iCpI~Z$GfS>P zQ>RW?70wLp`4g_T{RndKX~n=P$b57IHLNhnrp&x{qE>NDDs@01NpoDL9R};%}UM_$Otdn=Q9fS!IueWJf3P;_D_ zUs#j}ZBL8<6se)~#L(YC^S-*-d;ffN5*D4}uXP^n$InR+-rGd<0t3=F){|=YBt6V&n#q1%Iq4IC&srF0 zVzm35KqaUg4^zU#&}URx68_awH=P27;4A}xVL;s-iM>X5(;_&ZvPiYe#U3W(CRi3d zs-Xlx%VY3*GSmS?+GT)~2^_5r<_eoZuV!uo7i134;&XHo z1+6GkJ{xid=~1y~OwyZ{XC5vEA7N0K@85$eP$TB~hxcC(1QWnhz}=LxOb7%(M8GlUPdp@TmjcCwiJfUi2~m3SI%;WFPSz)E5y}CMtwc-Gv*z)rN~D&m zKf&tvaM4f^{Z-yJpH&tUlw@N_SBX%aeo0mlB0(t0(B^T!0PYE_ejXS7L{#(%27niV zV}yLZ@TK!#4q#$s36S2x>ObP58^lGwNC8L-Un@_XISZenF9&J;$Pr z`hN7u6!rc?xaht8xab`T>HcK?GsJMjK8d7f$gfCTp3HwH31|y)tCQ1+A^ioC4au9& znGud5ecA<;gY@_RD>Ii6TZ zcI1=!&+wiYl2DzLJ1NAJ3Ym${lljli$jSWYc^TLz^PeFIm^+#OJeyg3GXHtuIr;4i zQ|26$6GoJT#Vv8>2lL61r*YBrQ^caNFu$3c`WHU^f>VD2R@|PK1nQU1;G%bbK|n1` zXeOW*KK+uQ=2;69Ai<*X(sQ`zkIxd23p1Dr$c0b8B*-}?$RvQTzJQC~dY%AWn6OL$ zE`0h0024Nj8M~$c>?W-KJ}!FTSGefyUt(hBSdhfLkSV(7u!J;WoS89=3LURkZz+1< zMO<|4i_oofaomWpOt6-rEbS7Mm26EF`sh}#FM97qTnDMo$=T0YIbd=FBlgfxTdIJ>gIN7D zT=d3oaM3-#Au~+KKt3~9CrXlDfB@JWlfcKwxB$9z*0L;s?@{*_z4!7ASrbXsbI6+1 z>xt+($387|Jgg2Cz55C-dg>K{v?S$v0mf=Hmt-gEJ#OR{`n@?@{phJh_rDHC(^rL_ zPiBIi%mhu!=w6V&d@@xy`BeZXQ-v2W$Rcv-C-j#WOyFu01k>{uWAz8QMR#8WzMQ`k z$+Sqnf4+$L8j|Gc38ZwwUpFJ7-@Qnb*!jM!h@SED&DrxFq||>%DSG@;QPC3@i^v}T zAJ1@FDCI9EdL#L#wqRRJ-Wg{wobfga;{^{KC|L)t5EXs@auL}PziejH0?1@XoCO=# zA$2og=k3G~VD+Q8==Cc_MR#8*z!mo4q@7s!_n6}kXZPnt_O-ci6T8|B+~n=-$~5%@ z@}g(26BWI4l`tPCvvvNjc}K+}oT0vW>D;xjD03u4$_<}k_5BS+*S;w#x^;?h2Sgby 
zU~=_c(}GZ$W%RgEnc=rN(uC;IN8X%~CW(NgxeDKGUkUu?`|pY0URbSf369rWkBN%z z`Gdf~Uq1qA;Slc@&83-@ru||5;S@vp)$VT`WXt@wlahBYJq=oSg_dod|S@iAIt7+?1kQ-x3ww zbX-8;D~2C^<#45ii2q`qFh9{$B!V$!X5sO_02lw|-<^e6B!|%12O{jMs-^Ic=Y0QfB3N^oOUOTS$AvSmP5(x|H{juSL4=Ag z>JNnT;oXAdUpR=}LTn3^@$;Svx8KMoQA&b_<@Q+ga;2p>(Dvp76lDF8B z=8%AIvyx^I4>&qD^B4mAVvor`+C*MS2z3MS5~@4WH@mC9BFD(BL0*}XeErgm> z@E-)1L}n5dv=Hnf73P5lGQvNd6Kgw{+KCdIwJ#9vCw+e&*=K^*79ugHXdZa@_sU63 z7-=*Ydh8$Y`XZ=_wsYAl@pK+6*BIose;r5=X18#31X#Ra9=xx^#l*U_7_`~v|6yRW zL4ym2D44p)KOQRt4wCYDz!R*pkhqDGFULd#P$E8YaWs-k{|AvEfsl(MkSLi4{A=>D zKw%PRxj4{pqG%pe;pMuENy?;x&s($vY%H8}ak#T(^8mZxeM6+l+%=nc`b8jrybhC zuh2=b35ww!zJl?QpH?jkp4x-ZkDv*QRwl2iyGdXTF+Im%qoy zxAu;=4F?A|noZ%hrq#nP%PzBPV`c1FMcC`|w?wwAYqLSwxM$<)2)C(goZDCv-DL5N zw0TK?TSm8R+%>YvQaR+>xB=kUz3z%$uWO@k+->pgZ~>exO>Ly_otrmSj%?o4HQ-t0 z+Zpe3)D>83>57#{JsZtC`Q926zp2qjaB3UzMLo@%O))rTpQp72+V>2RGFPBC>{IV|-Z0!pH*Yk@`u6YgZQj`9bNg5C*t}^q;2$4P zpm7N3GUPTV;Q{oV#iP>f+SFLNsiLRS-5MM3x{Z~luHm)bjwU;dyVnH$4tZJ~gyt}& zUEu+sJ3qF17IvU_9B>@qHpbd)@%wzP)}DbRTASUL-m(8(oVv`L{pL}3Mc1gyQaRvh z^^tie^R|$7ffLqUY>vsC!Q9RCw{V_;-k#Q869;qCHQeKJ1-d+6Va@@+UA_eWkvUj| zo(t-@TU%g_w2UXuqi18~=w?gJNPOIj;92A4{Jq1T7{5lg053uxrp;u1v;tqW?yBi% zBI|E1QAv}3Px(0fq+{W2;1pC6*Uneo18;7+!c48SAoBO0Xi_y^UYONXiyuD+Ke;4dO zGkf5dk|}N&^QMZ%A&}L+_}+RV+Uc6>{GpIKDK_-l5!PPG5R-srOI{Hi<%GQ_+S zWHQKf*#EsJ+Cv2TEmn6iI?k1K|G7C9=p9>+i|8?g&K9C~I8G>|U6$3`w`}Z+>nID? z^xX0x zZ$VQ0IqQq`?{4)G{uOZS?5-#;x03nz3iH=8Tm$?o%m+X_(Rk*~Uz5FjpuF5NGoIM` z`l9*UPVn2=4DbZ}V(VX6gu=kdXyHQ$1ZPHn*YKu_8M!HxSV;Tvnf7u1oIn4BYVr1# zEo7Y$hH#Vvt8EhC!`<4>#pMBc3N{07Q_lds`ZKSejdItc?oARIP(I&O>G2U>kPcl!FN@X4o2`@RRyd~=-Xy@5cmSW zeV+Ui+6($6G_u>*tS9ZO34FL~Y#I`t!s>$?;Zpb#!SX4>tTi9CS_$Qwqpr3UhP6+&Fd%J)STsq?GT|gyiy%SLr+i$>@0K*=SkKazV zK-fRNsJ$T1X3l)}j(P3v>j4viUmc6fLjt8yXivZaJW@X#$3L;2=F@LAsS3JGcuQ~0 z9z*Vdz5qh#&z)!=r@tTylISmpj(IkwnK=Nt38dbf*sozpHEmAxXOGS2eEBTMAJ3*P zpPSgKKqZLRPsq!*dF|(qXAQyM)!GZT0Fasfnjtrw!G!tf7>1^Ve)0K$L4ZOM!*8cu z;G_7Ug*tKk=8w0F6wVtj;7LA%z$BzUH&JBbAAkTKl*x{Aq3uyVVu0838wA598#i3C YF_R`_#KUV8{zv~%ej`HN_8#;90m3+yy#N3J literal 0 HcmV?d00001 diff --git a/test_dedup.py b/tests/test_dedup.py similarity index 81% rename from test_dedup.py rename to tests/test_dedup.py index f5ea811..721fd71 100644 --- a/test_dedup.py +++ b/tests/test_dedup.py @@ -14,20 +14,16 @@ import os import sys -# Import the functions we want to test -try: - from dedup import ( - remove_duplicates, - fuzzy_filter, - convert_df_to_dict, - flatten_data, - dedup_df, - count_items, - create_parquet_file - ) -except ImportError as e: - print(f"Import error: {e}") - print("Some functions may not be available for testing") + +from dedup import ( + remove_duplicates, + fuzzy_filter, + convert_df_to_dict, + flatten_data, + dedup_df, + _count_items, + create_parquet_file +) class TestDedupEndToEnd(unittest.TestCase): @@ -193,10 +189,10 @@ def test_exact_deduplication(self): """Test exact duplicate removal.""" try: data_dict = convert_df_to_dict(self.df) - original_count = count_items(data_dict) + original_count = _count_items(data_dict) deduplicated_data = remove_duplicates(data_dict) - deduplicated_count = count_items(deduplicated_data) + deduplicated_count = _count_items(deduplicated_data) # Should have fewer or equal items after deduplication self.assertLessEqual(deduplicated_count, original_count) @@ -211,7 +207,7 @@ def test_fuzzy_deduplication_small(self): """Test fuzzy duplicate removal with small threshold for faster testing.""" try: data_dict = convert_df_to_dict(self.df) - original_count = count_items(data_dict) + original_count = _count_items(data_dict) # 
Use small parameters for faster testing fuzzy_deduplicated_data = fuzzy_filter( @@ -222,7 +218,7 @@ def test_fuzzy_deduplication_small(self): rows_per_band=32 # Fewer rows per band ) - fuzzy_count = count_items(fuzzy_deduplicated_data) + fuzzy_count = _count_items(fuzzy_deduplicated_data) # Should have fewer or equal items after fuzzy deduplication self.assertLessEqual(fuzzy_count, original_count) @@ -237,7 +233,7 @@ def test_flatten_and_reconstruct(self): """Test flattening and reconstruction of data.""" try: data_dict = convert_df_to_dict(self.df) - original_count = count_items(data_dict) + original_count = _count_items(data_dict) # Flatten flattened_data = flatten_data(data_dict) @@ -392,11 +388,103 @@ def test_duplicate_detection(self): self.assertGreater(similar_code_count, 0, "Should have fuzzy duplicates for testing") +class TestIntegrationWithParquetFixtures(unittest.TestCase): + """Integration tests using real parquet fixtures.""" + + FIXTURES_DIR = os.path.join(os.path.dirname(__file__), 'fixtures') + + @classmethod + def setUpClass(cls): + """Load parquet fixtures once for all tests.""" + submissions_path = os.path.join(cls.FIXTURES_DIR, 'submissions_fixture.parquet') + if os.path.exists(submissions_path): + cls.submissions_df = pd.read_parquet(submissions_path) + # Decode bytes to string if needed + if cls.submissions_df['code'].dtype == object and len(cls.submissions_df) > 0: + if isinstance(cls.submissions_df['code'].iloc[0], bytes): + cls.submissions_df['code'] = cls.submissions_df['code'].apply( + lambda x: x.decode('utf-8') if isinstance(x, bytes) else x + ) + else: + cls.submissions_df = None + + def test_exact_dedup_on_fixture(self): + """Test exact deduplication on real fixture data.""" + if self.submissions_df is None: + self.skipTest("Fixture not available") + + data_dict = convert_df_to_dict(self.submissions_df) + original_count = _count_items(data_dict) + + deduplicated = remove_duplicates(data_dict) + dedup_count = _count_items(deduplicated) + + # Should have same or fewer items + self.assertLessEqual(dedup_count, original_count) + # Structure should be preserved + self.assertEqual(set(data_dict.keys()), set(deduplicated.keys())) + + def test_fuzzy_dedup_on_fixture(self): + """Test fuzzy deduplication on real fixture data.""" + if self.submissions_df is None: + self.skipTest("Fixture not available") + + data_dict = convert_df_to_dict(self.submissions_df) + original_count = _count_items(data_dict) + + # Use smaller parameters for faster testing + fuzzy_deduped = fuzzy_filter( + data_dict, + threshold=0.5, + ngram_size=3, + bands=4, + rows_per_band=32 + ) + + fuzzy_count = _count_items(fuzzy_deduped) + + # Should have same or fewer items + self.assertLessEqual(fuzzy_count, original_count) + + def test_full_pipeline_on_fixture(self): + """Test the full dedup pipeline on real fixture data.""" + if self.submissions_df is None: + self.skipTest("Fixture not available") + + data_dict = convert_df_to_dict(self.submissions_df) + original_count = _count_items(data_dict) + + # Run exact dedup first + exact_deduped = remove_duplicates(data_dict) + + # Then fuzzy dedup with smaller params + fuzzy_deduped = fuzzy_filter( + exact_deduped, + threshold=0.5, + ngram_size=3, + bands=4, + rows_per_band=32 + ) + + # Flatten to DataFrame + flattened = flatten_data(fuzzy_deduped) + result_df = pd.DataFrame(flattened) + + # Verify output + self.assertIsInstance(result_df, pd.DataFrame) + self.assertLessEqual(len(result_df), original_count) + + # Should preserve key columns + if 
len(result_df) > 0: + self.assertIn('code', result_df.columns) + self.assertIn('run_mode', result_df.columns) + + if __name__ == '__main__': # Add some helpful output print("Running deduplication pipeline tests...") print(f"Python version: {sys.version}") print(f"Pandas version: {pd.__version__}") - + # Run the tests unittest.main(verbosity=2) \ No newline at end of file
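
The fuzzy path in this patch rests on datasketch's MinHash/LSH. The standalone sketch below is not part of the diff: the strings, threshold, and permutation count are invented for illustration, and it deliberately avoids the repo's own helpers. It only demonstrates the behaviour that `create_minhashes` and `create_similarity_matrix` rely on: byte-level n-gram MinHashes of near-identical code land in the same LSH buckets, while unrelated code does not.

```python
# Standalone illustration of the MinHash/LSH behaviour the dedup code relies on.
# Strings and parameters below are invented for the demo; only the datasketch API is real.
from datasketch import MinHash, MinHashLSH

def minhash_of(text: str, ngram_size: int = 5, num_perm: int = 128) -> MinHash:
    """Byte-level n-gram MinHash, in the spirit of _create_single_minhash."""
    m = MinHash(num_perm=num_perm)
    data = text.lower().encode("utf8")
    for i in range(len(data) - ngram_size + 1):
        m.update(data[i:i + ngram_size])
    return m

a = minhash_of("for i in range(10):\n    total += values[i] * weights[i]\n")
b = minhash_of("for i in range(10):\n    total += values[i] * weights[i]  # fused\n")
c = minhash_of("def unrelated():\n    return 'a completely different submission'\n")

lsh = MinHashLSH(threshold=0.5, num_perm=128)  # num_perm must match the MinHash objects
lsh.insert("a", a)
lsh.insert("c", c)

print(lsh.query(b))   # very likely ['a']; LSH is probabilistic, but 'c' should not appear
print(a.jaccard(b))   # MinHash estimate of n-gram Jaccard similarity, well above 0.5
```

The index and every signature must share the same permutation count, which is what the `num_bands * rows_per_band` expression in `create_similarity_matrix` keeps in sync.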
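
For the in-memory path that `dedup_file()` and the tests exercise, the composition is: `convert_df_to_dict` groups rows by `run_mode`, `run_passed`, and duration; `remove_duplicates` collapses exact sha256-of-code duplicates inside each group; `fuzzy_filter` then removes near-duplicates; `flatten_data` turns the nested dict back into rows. Below is a minimal sketch on invented toy rows, assuming `dedup.py` from this patch series imports cleanly and reusing the small LSH parameters from `tests/test_dedup.py`.

```python
# Toy end-to-end run of the in-memory pipeline; the three rows are invented for
# illustration (real rows carry many more columns, e.g. submission_id and timestamps).
import pandas as pd

from dedup import (
    convert_df_to_dict, remove_duplicates, fuzzy_filter, flatten_data, _count_items
)

kernel = "__global__ void add(float* a, float* b, float* c) { c[0] = a[0] + b[0]; }"
rows = [
    {"code": kernel, "run_mode": "leaderboard", "run_passed": True,
     "run_score": 1.0, "run_meta": {"duration": 0.5}},
    {"code": kernel, "run_mode": "leaderboard", "run_passed": True,        # exact duplicate
     "run_score": 1.0, "run_meta": {"duration": 0.5}},
    {"code": kernel + "  // minor tweak", "run_mode": "leaderboard",       # near duplicate
     "run_passed": True, "run_score": 1.0, "run_meta": {"duration": 0.5}},
]
df = pd.DataFrame(rows)

data_dict = convert_df_to_dict(df)           # run_mode -> run_passed -> duration -> [rows]
exact = remove_duplicates(data_dict)         # hash-based dedup inside each group
fuzzy = fuzzy_filter(exact, threshold=0.5,   # same small parameters as tests/test_dedup.py
                     ngram_size=3, bands=4, rows_per_band=32)

print(_count_items(data_dict), "->", _count_items(exact), "->", _count_items(fuzzy))
result_df = pd.DataFrame(flatten_data(fuzzy))  # flat frame, ready for to_parquet(...)
```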
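
The fixture generator and the test suite added here can also be driven from Python instead of the shell. The snippet below is a hypothetical convenience wrapper, not part of the patch; it assumes the repository root is the working directory and that `data/submissions.parquet` and `data/successful_submissions.parquet` exist where `create_fixtures.py` expects them.

```python
# Regenerate the 5-row parquet fixtures, then run the dedup test suite.
# create_fixture() exits with an error message if the source parquet files are missing.
import unittest

from tests.fixtures.create_fixtures import create_fixture

create_fixture("submissions")
create_fixture("successful_submissions")

suite = unittest.defaultTestLoader.discover("tests", pattern="test_dedup.py")
unittest.TextTestRunner(verbosity=2).run(suite)
```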