374 changes: 202 additions & 172 deletions README.md

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion benchmark/AIME2024/__init__.py
@@ -8,4 +8,4 @@
from .calculate_metrics import calculate_metrics
from .aime2024 import AIME2024

__all__ = ["calculate_metrics", "AIME2024"]
__all__ = ["calculate_metrics", "AIME2024"]
34 changes: 19 additions & 15 deletions benchmark/AIME2024/aime2024.py
@@ -29,22 +29,23 @@ class AIME2024(Benchmark):

# AIME2024 has a single dataset
all_datasets: List[str] = ["aime2024"]

benchmark_name: str = "aime2024"
huggingface_dataset_id: str = "xAlg-AI/att-hub-aime2024"

def _load_datasets(self) -> pd.DataFrame:
"""Load AIME2024 dataset.

AIME2024 uses a single dataset with all problems.

Returns:
pandas DataFrame with all AIME2024 problems.
"""
print(f"Loading AIME2024 dataset")

try:
from datasets import load_dataset

dataset = load_dataset(self.huggingface_dataset_id, split="test")
df = dataset.to_pandas()
df["task"] = "aime2024" # Ensure task column exists
@@ -72,7 +73,7 @@ def post_run_evaluate(self, results_df: pd.DataFrame) -> Dict[str, Any]:

# Use the calculate_metrics function from HashAttention evaluation
metrics: Dict[str, Any] = calculate_metrics(results_df)

# Format the results for consistency with other benchmarks
overall_metrics: Dict[str, Any] = {
"overall_score": round(metrics["accuracy"], 4),
@@ -84,16 +85,19 @@ def post_run_evaluate(self, results_df: pd.DataFrame) -> Dict[str, Any]:
"task_scores": {
"aime2024": {
"accuracy": round(metrics["accuracy"], 4),
"extraction_success_rate": round(metrics["extraction_success_rate"], 4)
"extraction_success_rate": round(
metrics["extraction_success_rate"], 4
),
}
},
"summary": {
"total_tasks": 1,
"total_samples": len(results_df)
}
"summary": {"total_tasks": 1, "total_samples": len(results_df)},
}

print(f" ✓ AIME2024 Accuracy: {metrics['accuracy']:.3f} ({metrics['accuracy']*100:.1f}%)")
print(f" ✓ Extraction Success Rate: {metrics['extraction_success_rate']:.3f} ({metrics['extraction_success_rate']*100:.1f}%)")

return overall_metrics

print(
f" ✓ AIME2024 Accuracy: {metrics['accuracy']:.3f} ({metrics['accuracy']*100:.1f}%)"
)
print(
f" ✓ Extraction Success Rate: {metrics['extraction_success_rate']:.3f} ({metrics['extraction_success_rate']*100:.1f}%)"
)

return overall_metrics
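
For reference, a minimal sketch of the data contract this method relies on: the results DataFrame must carry "answer" and "predicted_answer" columns, and the metrics dict comes from the calculate_metrics helper shown in the next file. The import path and the example rows below are editor assumptions, not part of this diff.

import pandas as pd

# Assumed import path, based on the benchmark/AIME2024/__init__.py exports above.
from benchmark.AIME2024 import calculate_metrics

# Hypothetical results frame with the two columns the evaluator expects.
results_df = pd.DataFrame(
    {
        "answer": ["23", "902"],
        "predicted_answer": [
            "Step by step ... so the answer is \\boxed{23}.",
            "The final answer is 45.",  # no \boxed{} -> fallback extraction
        ],
    }
)

metrics = calculate_metrics(results_df)
print(metrics["accuracy"])                 # 0.5  (one of two answers correct)
print(metrics["extraction_success_rate"])  # 1.0  (an answer was extracted for both rows)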
133 changes: 75 additions & 58 deletions benchmark/AIME2024/calculate_metrics.py
@@ -5,51 +5,53 @@
import pandas as pd
from typing import List, Dict, Any


def extract_boxed_answer(text: str) -> str:
"""
Extract the answer from \boxed{...} format in the text.

Args:
text: The model's response text

Returns:
The extracted answer as a string, or empty string if not found
"""
# Look for \boxed{...} pattern
boxed_pattern = r'\\boxed\{([^}]*)\}'
boxed_pattern = r"\\boxed\{([^}]*)\}"
matches = re.findall(boxed_pattern, text)

if matches:
# Take the last boxed answer in case there are multiple
answer = matches[-1].strip()

# Extract just the number if there's additional formatting
# Handle cases like "033", "23", "$23$", etc.
number_match = re.search(r'\d+', answer)
number_match = re.search(r"\d+", answer)
if number_match:
return number_match.group()
else:
return answer

# Fallback: look for numbers at the end of the text
# This handles cases where the model doesn't use \boxed format
lines = text.strip().split('\n')
lines = text.strip().split("\n")
for line in reversed(lines):
if line.strip():
# Look for a number in the last non-empty line
number_match = re.search(r'\b(\d{1,3})\b', line)
number_match = re.search(r"\b(\d{1,3})\b", line)
if number_match:
return number_match.group(1)

return ""


def normalize_answer(answer: str) -> str:
"""
Normalize an answer to a standard format.

Args:
answer: The answer string to normalize

Returns:
Normalized answer string
"""
@@ -59,34 +61,37 @@ def normalize_answer(answer: str) -> str:
return str(int(answer))
return answer


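A few illustrative calls for the two helpers above (an editor's sketch, not part of the diff; behaviour follows the visible regexes, and the leading-zero case assumes the digit-only branch hidden in the collapsed hunk above):

# Last \boxed{...} wins when the model boxes more than one value.
assert extract_boxed_answer("Try \\boxed{7}. Rechecking: \\boxed{812}.") == "812"
# Extra formatting inside the box is reduced to the digits.
assert extract_boxed_answer("So the answer is \\boxed{$23$}.") == "23"
# No \boxed{} at all: fall back to a 1-3 digit number on the last line.
assert extract_boxed_answer("No box here.\nThe final answer is 902.") == "902"
# normalize_answer strips leading zeros from purely numeric answers.
assert normalize_answer("033") == "33"
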
def calculate_metrics(df: pd.DataFrame) -> Dict[str, Any]:
"""
Calculate evaluation metrics for AIME2024 benchmark.

Args:
df: DataFrame with columns 'answer' (ground truth) and 'predicted_answer' (model output)

Returns:
Dictionary containing evaluation metrics
"""
if 'predicted_answer' not in df.columns:
if "predicted_answer" not in df.columns:
raise ValueError("DataFrame must contain 'predicted_answer' column")
if 'answer' not in df.columns:
if "answer" not in df.columns:
raise ValueError("DataFrame must contain 'answer' column")

total_problems = len(df)
correct_answers = 0
extraction_failures = 0

detailed_results = []

for idx, row in df.iterrows():
ground_truth = normalize_answer(str(row['answer']))
predicted_text = str(row['predicted_answer']) if pd.notna(row['predicted_answer']) else ""

ground_truth = normalize_answer(str(row["answer"]))
predicted_text = (
str(row["predicted_answer"]) if pd.notna(row["predicted_answer"]) else ""
)

# Extract the predicted answer
extracted_answer = extract_boxed_answer(predicted_text)

if not extracted_answer:
extraction_failures += 1
is_correct = False
@@ -95,35 +100,42 @@ def calculate_metrics(df: pd.DataFrame) -> Dict[str, Any]:
is_correct = extracted_answer == ground_truth
if is_correct:
correct_answers += 1

detailed_results.append({
'id': row.get('id', f'problem_{idx}'),
'ground_truth': ground_truth,
'predicted_text': predicted_text,
'extracted_answer': extracted_answer,
'is_correct': is_correct,
'extraction_failed': not bool(extracted_answer)
})


detailed_results.append(
{
"id": row.get("id", f"problem_{idx}"),
"ground_truth": ground_truth,
"predicted_text": predicted_text,
"extracted_answer": extracted_answer,
"is_correct": is_correct,
"extraction_failed": not bool(extracted_answer),
}
)

# Calculate metrics
accuracy = correct_answers / total_problems if total_problems > 0 else 0.0
extraction_success_rate = (total_problems - extraction_failures) / total_problems if total_problems > 0 else 0.0

extraction_success_rate = (
(total_problems - extraction_failures) / total_problems
if total_problems > 0
else 0.0
)

metrics = {
'accuracy': accuracy,
'correct_answers': correct_answers,
'total_problems': total_problems,
'extraction_success_rate': extraction_success_rate,
'extraction_failures': extraction_failures,
'detailed_results': detailed_results
"accuracy": accuracy,
"correct_answers": correct_answers,
"total_problems": total_problems,
"extraction_success_rate": extraction_success_rate,
"extraction_failures": extraction_failures,
"detailed_results": detailed_results,
}

return metrics


def print_metrics_summary(metrics: Dict[str, Any]) -> None:
"""
Print a formatted summary of the evaluation metrics.

Args:
metrics: Dictionary containing evaluation metrics
"""
@@ -132,34 +144,39 @@ def print_metrics_summary(metrics: Dict[str, Any]) -> None:
print(f"Total Problems: {metrics['total_problems']}")
print(f"Correct Answers: {metrics['correct_answers']}")
print(f"Accuracy: {metrics['accuracy']:.3f} ({metrics['accuracy']*100:.1f}%)")
print(f"Extraction Success Rate: {metrics['extraction_success_rate']:.3f} ({metrics['extraction_success_rate']*100:.1f}%)")
print(
f"Extraction Success Rate: {metrics['extraction_success_rate']:.3f} ({metrics['extraction_success_rate']*100:.1f}%)"
)
print(f"Extraction Failures: {metrics['extraction_failures']}")

if metrics['extraction_failures'] > 0:
print(f"\nNote: {metrics['extraction_failures']} problems had answer extraction failures.")

if metrics["extraction_failures"] > 0:
print(
f"\nNote: {metrics['extraction_failures']} problems had answer extraction failures."
)
print("These are counted as incorrect answers.")


if __name__ == "__main__":
# Test the metrics calculation
test_data = {
'answer': ['23', '33', '156', '902'],
'predicted_answer': [
'The answer is \\boxed{23}.',
'After solving, we get \\boxed{033}.',
'Therefore, the answer is \\boxed{156}.',
'The final answer is 902.' # Test fallback extraction
"answer": ["23", "33", "156", "902"],
"predicted_answer": [
"The answer is \\boxed{23}.",
"After solving, we get \\boxed{033}.",
"Therefore, the answer is \\boxed{156}.",
"The final answer is 902.", # Test fallback extraction
],
'id': ['2024-I-1', '2024-I-2', '2024-I-3', '2024-I-4']
"id": ["2024-I-1", "2024-I-2", "2024-I-3", "2024-I-4"],
}

test_df = pd.DataFrame(test_data)
metrics = calculate_metrics(test_df)
print_metrics_summary(metrics)

print("\nDetailed Results:")
for result in metrics['detailed_results']:
for result in metrics["detailed_results"]:
print(f"ID: {result['id']}")
print(f" Ground Truth: {result['ground_truth']}")
print(f" Extracted: {result['extracted_answer']}")
print(f" Correct: {result['is_correct']}")
print()
print()
38 changes: 22 additions & 16 deletions benchmark/AIME2024/create_huggingface_dataset.py
@@ -21,17 +21,18 @@
which is standard in mathematical competition contexts.
"""


def create_aime2024_dataset():
"""
Process the AIME2024 dataset and convert it to the standardized benchmark format.
"""
# Load the original dataset
dataset = load_dataset("Maxwell-Jia/AIME_2024")
df = dataset["train"].to_pandas()

# Create the standardized format
processed_data = []

for _, row in df.iterrows():
# Format the problem with clear instructions about the boxed answer format
context = f"""Solve the following AIME (American Invitational Mathematics Examination) problem.
@@ -41,31 +42,36 @@ def create_aime2024_dataset():
Instructions:
- The answer should be an integer between 0 and 999
- Please reason step by step, and put your final answer within \\boxed{{...}} format"""

question = "What is the answer to this problem?"

# The answer prefix encourages the model to show work before the final answer
answer_prefix = ""

processed_data.append({
'context': context,
'question': question,
'answer_prefix': answer_prefix,
'answer': str(row['Answer']), # Convert to string for consistency
'id': row['ID'],
'max_new_tokens': 32000, # Allow comprehensive step-by-step solutions
})


processed_data.append(
{
"context": context,
"question": question,
"answer_prefix": answer_prefix,
"answer": str(row["Answer"]), # Convert to string for consistency
"id": row["ID"],
"max_new_tokens": 32000, # Allow comprehensive step-by-step solutions
}
)

# Convert to Dataset
processed_dataset = Dataset.from_pandas(pd.DataFrame(processed_data))

return processed_dataset


if __name__ == "__main__":
# Test the dataset creation
processed_dataset = create_aime2024_dataset()
print(f"Created dataset with {len(processed_dataset)} examples")
print("\nFirst example:")
print(processed_dataset[0])

processed_dataset.push_to_hub("xAlg-AI/att-hub-aime2024", config_name=f"aime2024", split="test")
processed_dataset.push_to_hub(
"xAlg-AI/att-hub-aime2024", config_name=f"aime2024", split="test"
)
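
As a quick sanity check, a short sketch of reading the pushed dataset back, mirroring the load_dataset call in AIME2024._load_datasets earlier in this diff; the field names come from the processed_data records above, and the printed values are illustrative only:

from datasets import load_dataset

# Pull the published split back down and spot-check the schema.
dataset = load_dataset("xAlg-AI/att-hub-aime2024", split="test")
print(len(dataset))                # number of problems taken from Maxwell-Jia/AIME_2024
print(dataset[0].keys())           # context, question, answer_prefix, answer, id, max_new_tokens
print(dataset[0]["context"][:200]) # problem statement plus the \boxed{...} answer instructions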