37 commits
297b084
Add: Benchmark (LongBench as an example)
apd10 Jul 15, 2025
0bbc23c
Add Cursor log
apd10 Jul 15, 2025
02f5488
Add a mock benchmark for end-to-end tests
apd10 Jul 15, 2025
4294c5c
Add Cursor log
apd10 Jul 15, 2025
722a0df
Add Benchmark Evaluation Matrix
apd10 Jul 15, 2025
cb3f040
Fixes and refactoring in benchmark
apd10 Jul 16, 2025
20d12b1
Add Cursor log
apd10 Jul 16, 2025
55019df
Fix CUDA setting in workers
apd10 Jul 16, 2025
b70f864
Fix escaping \n, \r for csv dumping
apd10 Jul 16, 2025
3ac5cec
Minor refactoring and sample benchmark script
apd10 Jul 16, 2025
86dd8e0
Add all implementations of benchmarks (KVPRESS + AIME)
apd10 Jul 17, 2025
091626d
Add Cursor log
apd10 Jul 17, 2025
b434595
Add default required files to raw_results.csv
apd10 Jul 17, 2025
cc67b27
Add missing benchmark files
apd10 Jul 17, 2025
9dfef97
Add missing files
apd10 Jul 17, 2025
dbfbe7e
Fix signatures for process_request (tests + base class)
apd10 Jul 17, 2025
2a6ff67
Add Cursor log
apd10 Jul 17, 2025
d4ea216
Add OracleTopPMasker
apd10 Jul 17, 2025
ed37ed1
Add Cursor log
apd10 Jul 17, 2025
35d7f5a
Add RandomSamplingMasker
apd10 Jul 17, 2025
88bb04b
Add Cursor log
apd10 Jul 17, 2025
a7b5511
Add missing files
apd10 Jul 17, 2025
ec24894
Add MagicPig implementation
apd10 Jul 18, 2025
77806bf
Add Cursor log
apd10 Jul 18, 2025
7f137e8
Add HashAttention utility to convert weights from USA
apd10 Jul 18, 2025
57ab13d
Fix: apply_mask should happen in sparse mode
apd10 Jul 18, 2025
9c2cc7e
Add AdaptiveSamplingMasker (HashAttention-v2)
apd10 Jul 19, 2025
398c7ab
Add plans
apd10 Jul 19, 2025
b564f59
Add Cursor log
apd10 Jul 19, 2025
c382b05
Fixes OracleTopk and AdaptiveSampling
apd10 Jul 20, 2025
9496b29
Fix linting errors
apd10 Jul 20, 2025
e6ce6fb
Add demo for adaptive sampling
apd10 Jul 20, 2025
255e485
Add missing packages to toml
apd10 Jul 20, 2025
941209b
added recovery mechanism, copied from HAT code
AlexCuadron Jul 20, 2025
1ac2c19
Fix linting issues and test regex patterns
AlexCuadron Jul 20, 2025
0978463
Remove docs/modelhub_hf_registration.md
AlexCuadron Jul 20, 2025
b07ec5f
Remove examples/modelhub_hf_registration_example.py
AlexCuadron Jul 20, 2025
11 changes: 11 additions & 0 deletions benchmark/AIME2024/__init__.py
@@ -0,0 +1,11 @@
# SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

"""
AIME2024 benchmark module for evaluating mathematical reasoning.
"""

from .calculate_metrics import calculate_metrics
from .aime2024 import AIME2024

__all__ = ["calculate_metrics", "AIME2024"]
99 changes: 99 additions & 0 deletions benchmark/AIME2024/aime2024.py
@@ -0,0 +1,99 @@
"""AIME2024 benchmark implementation for mathematical reasoning evaluation."""

from typing import Dict, Any, List
import pandas as pd

from ..base import Benchmark
from ..benchmark_registry import register_benchmark
from .calculate_metrics import calculate_metrics


@register_benchmark("aime2024")
class AIME2024(Benchmark):
"""AIME2024 benchmark for evaluating mathematical reasoning.

AIME2024 is a benchmark for evaluating the ability of large language models to solve
mathematical competition problems. It contains problems from the American Invitational
Mathematics Examination (AIME) 2024.

The benchmark evaluates mathematical reasoning capabilities:
- Problem solving with numerical answers (0-999)
- Step-by-step mathematical reasoning
- Answer extraction from \\boxed{...} format

Example:
>>> aime2024 = AIME2024()
>>> results = aime2024.run_benchmark(adapter, result_dir="/path/to/results")
>>> print(f"Accuracy: {results}")
"""

# AIME2024 has a single dataset
all_datasets: List[str] = ["aime2024"]

benchmark_name: str = "aime2024"
huggingface_dataset_id: str = "xAlg-AI/att-hub-aime2024"

def _load_datasets(self) -> pd.DataFrame:
"""Load AIME2024 dataset.

AIME2024 uses a single dataset with all problems.

Returns:
pandas DataFrame with all AIME2024 problems.
"""
print(f"Loading AIME2024 dataset")

try:
from datasets import load_dataset
dataset = load_dataset(self.huggingface_dataset_id, split="test")
df = dataset.to_pandas()
df["task"] = "aime2024" # Ensure task column exists
print(f" ✓ Loaded {len(df)} AIME2024 problems")
return df
except Exception as e:
raise RuntimeError(f"Failed to load AIME2024 dataset: {e}") from e

def post_run_evaluate(self, results_df: pd.DataFrame) -> Dict[str, Any]:
"""Compute evaluation metrics for AIME2024 results.

Args:
results_df: DataFrame containing benchmark results with columns:
- predicted_answer: Model's predicted answer
- answer: Ground truth answers

Returns:
Dictionary containing computed metrics:
- accuracy: Overall accuracy
- extraction_success_rate: Rate of successful answer extraction
- detailed_results: Individual problem results
"""
if len(results_df) == 0:
return {"error": "No results to evaluate"}

# Use the calculate_metrics function from HashAttention evaluation
metrics: Dict[str, Any] = calculate_metrics(results_df)

# Format the results for consistency with other benchmarks
overall_metrics: Dict[str, Any] = {
"overall_score": round(metrics["accuracy"], 4),
"accuracy": round(metrics["accuracy"], 4),
"extraction_success_rate": round(metrics["extraction_success_rate"], 4),
"correct_answers": metrics["correct_answers"],
"total_problems": metrics["total_problems"],
"extraction_failures": metrics["extraction_failures"],
"task_scores": {
"aime2024": {
"accuracy": round(metrics["accuracy"], 4),
"extraction_success_rate": round(metrics["extraction_success_rate"], 4)
}
},
"summary": {
"total_tasks": 1,
"total_samples": len(results_df)
}
}

print(f" ✓ AIME2024 Accuracy: {metrics['accuracy']:.3f} ({metrics['accuracy']*100:.1f}%)")
print(f" ✓ Extraction Success Rate: {metrics['extraction_success_rate']:.3f} ({metrics['extraction_success_rate']*100:.1f}%)")

return overall_metrics
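
A minimal usage sketch, not part of the diff: it follows the Example in the class docstring and assumes run_benchmark (inherited from the Benchmark base class) returns the dictionary built by post_run_evaluate; the import path and the adapter object are placeholders for whatever the surrounding framework provides.

from benchmark.AIME2024 import AIME2024  # import path assumed from the file layout

aime2024 = AIME2024()
adapter = ...  # placeholder: model adapter constructed by the framework
results = aime2024.run_benchmark(adapter, result_dir="/tmp/aime2024_results")
print(results["accuracy"], results["extraction_success_rate"])  # keys set in post_run_evaluate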
165 changes: 165 additions & 0 deletions benchmark/AIME2024/calculate_metrics.py
@@ -0,0 +1,165 @@
# SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

import re
import pandas as pd
from typing import List, Dict, Any

def extract_boxed_answer(text: str) -> str:
"""
Extract the answer from \\boxed{...} format in the text.

Args:
text: The model's response text

Returns:
The extracted answer as a string, or empty string if not found
"""
# Look for \boxed{...} pattern
boxed_pattern = r'\\boxed\{([^}]*)\}'
matches = re.findall(boxed_pattern, text)

if matches:
# Take the last boxed answer in case there are multiple
answer = matches[-1].strip()

# Extract just the number if there's additional formatting
# Handle cases like "033", "23", "$23$", etc.
number_match = re.search(r'\d+', answer)
if number_match:
return number_match.group()
else:
return answer

# Fallback: look for numbers at the end of the text
# This handles cases where the model doesn't use \boxed format
lines = text.strip().split('\n')
for line in reversed(lines):
if line.strip():
# Look for a number in the last non-empty line
number_match = re.search(r'\b(\d{1,3})\b', line)
if number_match:
return number_match.group(1)

return ""

def normalize_answer(answer: str) -> str:
"""
Normalize an answer to a standard format.

Args:
answer: The answer string to normalize

Returns:
Normalized answer string
"""
# Remove leading zeros but keep at least one digit
answer = answer.strip()
if answer.isdigit():
return str(int(answer))
return answer

def calculate_metrics(df: pd.DataFrame) -> Dict[str, Any]:
"""
Calculate evaluation metrics for AIME2024 benchmark.

Args:
df: DataFrame with columns 'answer' (ground truth) and 'predicted_answer' (model output)

Returns:
Dictionary containing evaluation metrics
"""
if 'predicted_answer' not in df.columns:
raise ValueError("DataFrame must contain 'predicted_answer' column")
if 'answer' not in df.columns:
raise ValueError("DataFrame must contain 'answer' column")

total_problems = len(df)
correct_answers = 0
extraction_failures = 0

detailed_results = []

for idx, row in df.iterrows():
ground_truth = normalize_answer(str(row['answer']))
predicted_text = str(row['predicted_answer']) if pd.notna(row['predicted_answer']) else ""

# Extract the predicted answer
extracted_answer = extract_boxed_answer(predicted_text)

if not extracted_answer:
extraction_failures += 1
is_correct = False
else:
extracted_answer = normalize_answer(extracted_answer)
is_correct = extracted_answer == ground_truth
if is_correct:
correct_answers += 1

detailed_results.append({
'id': row.get('id', f'problem_{idx}'),
'ground_truth': ground_truth,
'predicted_text': predicted_text,
'extracted_answer': extracted_answer,
'is_correct': is_correct,
'extraction_failed': not bool(extracted_answer)
})

# Calculate metrics
accuracy = correct_answers / total_problems if total_problems > 0 else 0.0
extraction_success_rate = (total_problems - extraction_failures) / total_problems if total_problems > 0 else 0.0

metrics = {
'accuracy': accuracy,
'correct_answers': correct_answers,
'total_problems': total_problems,
'extraction_success_rate': extraction_success_rate,
'extraction_failures': extraction_failures,
'detailed_results': detailed_results
}

return metrics

def print_metrics_summary(metrics: Dict[str, Any]) -> None:
"""
Print a formatted summary of the evaluation metrics.

Args:
metrics: Dictionary containing evaluation metrics
"""
print("AIME2024 Evaluation Results")
print("=" * 40)
print(f"Total Problems: {metrics['total_problems']}")
print(f"Correct Answers: {metrics['correct_answers']}")
print(f"Accuracy: {metrics['accuracy']:.3f} ({metrics['accuracy']*100:.1f}%)")
print(f"Extraction Success Rate: {metrics['extraction_success_rate']:.3f} ({metrics['extraction_success_rate']*100:.1f}%)")
print(f"Extraction Failures: {metrics['extraction_failures']}")

if metrics['extraction_failures'] > 0:
print(f"\nNote: {metrics['extraction_failures']} problems had answer extraction failures.")
print("These are counted as incorrect answers.")

if __name__ == "__main__":
# Test the metrics calculation
test_data = {
'answer': ['23', '33', '156', '902'],
'predicted_answer': [
'The answer is \\boxed{23}.',
'After solving, we get \\boxed{033}.',
'Therefore, the answer is \\boxed{156}.',
'The final answer is 902.' # Test fallback extraction
],
'id': ['2024-I-1', '2024-I-2', '2024-I-3', '2024-I-4']
}

test_df = pd.DataFrame(test_data)
metrics = calculate_metrics(test_df)
print_metrics_summary(metrics)

print("\nDetailed Results:")
for result in metrics['detailed_results']:
print(f"ID: {result['id']}")
print(f" Ground Truth: {result['ground_truth']}")
print(f" Extracted: {result['extracted_answer']}")
print(f" Correct: {result['is_correct']}")
print()
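
An illustrative sketch of a case the __main__ test above does not exercise: when a response contains neither \boxed{...} nor a trailing number, it is recorded as an extraction failure and scored as incorrect. The problem id used here is hypothetical.

import pandas as pd
# calculate_metrics as defined above in this file

fail_df = pd.DataFrame({
    'answer': ['23'],
    'predicted_answer': ['I was unable to determine the answer.'],
    'id': ['2024-I-15'],  # hypothetical id
})
m = calculate_metrics(fail_df)
# No \boxed{...} and no trailing number, so extraction fails and the item scores as wrong.
assert m['extraction_failures'] == 1 and m['accuracy'] == 0.0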
71 changes: 71 additions & 0 deletions benchmark/AIME2024/create_huggingface_dataset.py
@@ -0,0 +1,71 @@
# SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0


from datasets import Dataset, load_dataset
import pandas as pd

"""
AIME2024 Dataset Processing

The AIME (American Invitational Mathematics Examination) 2024 dataset contains mathematical competition problems.
Each problem requires a numerical answer between 0 and 999.

Dataset structure:
- ID: Problem identifier (e.g., "2024-I-1", "2024-II-4")
- Problem: The mathematical problem statement
- Solution: The solution explanation (not used for evaluation)
- Answer: The correct numerical answer (integer between 0-999)

For evaluation, we format the problems to instruct the model to wrap its answer in \\boxed{...} format,
which is standard in mathematical competition contexts.
"""

def create_aime2024_dataset():
"""
Process the AIME2024 dataset and convert it to the standardized benchmark format.
"""
# Load the original dataset
dataset = load_dataset("Maxwell-Jia/AIME_2024")
df = dataset["train"].to_pandas()

# Create the standardized format
processed_data = []

for _, row in df.iterrows():
# Format the problem with clear instructions about the boxed answer format
context = f"""Solve the following AIME (American Invitational Mathematics Examination) problem.

Problem: {row['Problem']}
Instructions:
- The answer should be an integer between 0 and 999
- Please reason step by step, and put your final answer within \\boxed{{...}} format"""

question = "What is the answer to this problem?"

# The answer prefix encourages the model to show work before the final answer
answer_prefix = ""

processed_data.append({
'context': context,
'question': question,
'answer_prefix': answer_prefix,
'answer': str(row['Answer']), # Convert to string for consistency
'id': row['ID'],
'max_new_tokens': 32000, # Allow comprehensive step-by-step solutions
})

# Convert to Dataset
processed_dataset = Dataset.from_pandas(pd.DataFrame(processed_data))

return processed_dataset

if __name__ == "__main__":
# Test the dataset creation
processed_dataset = create_aime2024_dataset()
print(f"Created dataset with {len(processed_dataset)} examples")
print("\nFirst example:")
print(processed_dataset[0])

processed_dataset.push_to_hub("xAlg-AI/att-hub-aime2024", config_name="aime2024", split="test")
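
A quick verification sketch, assuming the push above succeeded and the hub dataset is readable; it mirrors the load performed by _load_datasets in aime2024.py.

from datasets import load_dataset

ds = load_dataset("xAlg-AI/att-hub-aime2024", split="test")
example = ds[0]
# Columns produced by create_aime2024_dataset above.
print(example["id"], example["answer"], example["max_new_tokens"])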