37 commits
297b084
Add: Benchmark (LongBench as an example)
apd10 Jul 15, 2025
0bbc23c
Add Cursor log
apd10 Jul 15, 2025
02f5488
Add a mock benchmark for end-to-end tests
apd10 Jul 15, 2025
4294c5c
Add Cursor log
apd10 Jul 15, 2025
722a0df
Add Benchmark Evaluation Matrix
apd10 Jul 15, 2025
cb3f040
Fixes and refactoring in benchmark
apd10 Jul 16, 2025
20d12b1
Add Cursor log
apd10 Jul 16, 2025
55019df
Fix CUDA setting in workers
apd10 Jul 16, 2025
b70f864
Fix escaping \n, \r for csv dumping
apd10 Jul 16, 2025
3ac5cec
Minor refactoring and sample benchmark script
apd10 Jul 16, 2025
86dd8e0
Add all implementations of benchmarks (KVPRESS + AIME)
apd10 Jul 17, 2025
091626d
Add Cursor log
apd10 Jul 17, 2025
b434595
Add default required files to raw_results.csv
apd10 Jul 17, 2025
cc67b27
Add missing benchmark files
apd10 Jul 17, 2025
9dfef97
Add missing files
apd10 Jul 17, 2025
dbfbe7e
Fix signatures for process_request (tests + base class)
apd10 Jul 17, 2025
2a6ff67
Add Cursor log
apd10 Jul 17, 2025
d4ea216
Add OracleTopPMasker
apd10 Jul 17, 2025
ed37ed1
Add Cursor log
apd10 Jul 17, 2025
35d7f5a
Add RandomSamplingMasker
apd10 Jul 17, 2025
88bb04b
Add Cursor log
apd10 Jul 17, 2025
a7b5511
Add missing files
apd10 Jul 17, 2025
ec24894
Add MagicPig implementation
apd10 Jul 18, 2025
77806bf
Add Cursor log
apd10 Jul 18, 2025
7f137e8
Add HashAttention utility to convert weights from USA
apd10 Jul 18, 2025
57ab13d
Fix: apply_mask should happen in sparse mode
apd10 Jul 18, 2025
9c2cc7e
Add AdaptiveSamplingMasker (HashAttention-v2)
apd10 Jul 19, 2025
398c7ab
Add plans
apd10 Jul 19, 2025
b564f59
Add Cursor log
apd10 Jul 19, 2025
c382b05
Fixes OracleTopk and AdaptiveSampling
apd10 Jul 20, 2025
9496b29
Fix linting errors
apd10 Jul 20, 2025
e6ce6fb
Add demo for adaptive sampling
apd10 Jul 20, 2025
255e485
Add missing packages to toml
apd10 Jul 20, 2025
941209b
added recovery mechanism, copied from HAT code
AlexCuadron Jul 20, 2025
1ac2c19
Fix linting issues and test regex patterns
AlexCuadron Jul 20, 2025
0978463
Remove docs/modelhub_hf_registration.md
AlexCuadron Jul 20, 2025
b07ec5f
Remove examples/modelhub_hf_registration_example.py
AlexCuadron Jul 20, 2025
11 changes: 11 additions & 0 deletions benchmark/AIME2024/__init__.py
@@ -0,0 +1,11 @@
# SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

"""
AIME2024 benchmark module for evaluating mathematical reasoning.
"""

from .calculate_metrics import calculate_metrics
from .aime2024 import AIME2024

__all__ = ["calculate_metrics", "AIME2024"]
99 changes: 99 additions & 0 deletions benchmark/AIME2024/aime2024.py
@@ -0,0 +1,99 @@
"""AIME2024 benchmark implementation for mathematical reasoning evaluation."""

from typing import Dict, Any, List
import pandas as pd

from ..base import Benchmark
from ..benchmark_registry import register_benchmark
from .calculate_metrics import calculate_metrics


@register_benchmark("aime2024")
class AIME2024(Benchmark):
"""AIME2024 benchmark for evaluating mathematical reasoning.

AIME2024 is a benchmark for evaluating the ability of large language models to solve
mathematical competition problems. It contains problems from the American Invitational
Mathematics Examination (AIME) 2024.

The benchmark evaluates mathematical reasoning capabilities:
- Problem solving with numerical answers (0-999)
- Step-by-step mathematical reasoning
- Answer extraction from \\boxed{...} format

Example:
>>> aime2024 = AIME2024()
>>> results = aime2024.run_benchmark(adapter, result_dir="/path/to/results")
>>> print(f"Accuracy: {results}")
"""

# AIME2024 has a single dataset
all_datasets: List[str] = ["aime2024"]

benchmark_name: str = "aime2024"
huggingface_dataset_id: str = "xAlg-AI/att-hub-aime2024"

def _load_datasets(self) -> pd.DataFrame:
"""Load AIME2024 dataset.

AIME2024 uses a single dataset with all problems.

Returns:
pandas DataFrame with all AIME2024 problems.
"""
print(f"Loading AIME2024 dataset")

try:
from datasets import load_dataset
dataset = load_dataset(self.huggingface_dataset_id, split="test")
df = dataset.to_pandas()
df["task"] = "aime2024" # Ensure task column exists
print(f" ✓ Loaded {len(df)} AIME2024 problems")
return df
except Exception as e:
raise RuntimeError(f"Failed to load AIME2024 dataset: {e}") from e

def post_run_evaluate(self, results_df: pd.DataFrame) -> Dict[str, Any]:
"""Compute evaluation metrics for AIME2024 results.

Args:
results_df: DataFrame containing benchmark results with columns:
- predicted_answer: Model's predicted answer
- answer: Ground truth answers

Returns:
Dictionary containing computed metrics:
- accuracy: Overall accuracy
- extraction_success_rate: Rate of successful answer extraction
- detailed_results: Individual problem results
"""
if len(results_df) == 0:
return {"error": "No results to evaluate"}

# Use the calculate_metrics function from HashAttention evaluation
metrics: Dict[str, Any] = calculate_metrics(results_df)

# Format the results for consistency with other benchmarks
overall_metrics: Dict[str, Any] = {
"overall_score": round(metrics["accuracy"], 4),
"accuracy": round(metrics["accuracy"], 4),
"extraction_success_rate": round(metrics["extraction_success_rate"], 4),
"correct_answers": metrics["correct_answers"],
"total_problems": metrics["total_problems"],
"extraction_failures": metrics["extraction_failures"],
"task_scores": {
"aime2024": {
"accuracy": round(metrics["accuracy"], 4),
"extraction_success_rate": round(metrics["extraction_success_rate"], 4)
}
},
"summary": {
"total_tasks": 1,
"total_samples": len(results_df)
}
}

print(f" ✓ AIME2024 Accuracy: {metrics['accuracy']:.3f} ({metrics['accuracy']*100:.1f}%)")
print(f" ✓ Extraction Success Rate: {metrics['extraction_success_rate']:.3f} ({metrics['extraction_success_rate']*100:.1f}%)")

return overall_metrics
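
A minimal usage sketch, not part of the diff: it follows the Example in the class docstring and assumes run_benchmark (inherited from the Benchmark base class) returns the dictionary built by post_run_evaluate; the import path and the adapter object are placeholders for whatever the surrounding framework provides.

from benchmark.AIME2024 import AIME2024  # import path assumed from the file layout

aime2024 = AIME2024()
adapter = ...  # placeholder: model adapter constructed by the framework
results = aime2024.run_benchmark(adapter, result_dir="/tmp/aime2024_results")
print(results["accuracy"], results["extraction_success_rate"])  # keys set in post_run_evaluate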
165 changes: 165 additions & 0 deletions benchmark/AIME2024/calculate_metrics.py
@@ -0,0 +1,165 @@
# SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

import re
import pandas as pd
from typing import List, Dict, Any

def extract_boxed_answer(text: str) -> str:
"""
Extract the answer from \\boxed{...} format in the text.

Args:
text: The model's response text

Returns:
The extracted answer as a string, or empty string if not found
"""
# Look for \boxed{...} pattern
boxed_pattern = r'\\boxed\{([^}]*)\}'
matches = re.findall(boxed_pattern, text)

if matches:
# Take the last boxed answer in case there are multiple
answer = matches[-1].strip()

# Extract just the number if there's additional formatting
# Handle cases like "033", "23", "$23$", etc.
number_match = re.search(r'\d+', answer)
if number_match:
return number_match.group()
else:
return answer

# Fallback: look for numbers at the end of the text
# This handles cases where the model doesn't use \boxed format
lines = text.strip().split('\n')
for line in reversed(lines):
if line.strip():
# Look for a number in the last non-empty line
number_match = re.search(r'\b(\d{1,3})\b', line)
if number_match:
return number_match.group(1)

return ""

def normalize_answer(answer: str) -> str:
"""
Normalize an answer to a standard format.

Args:
answer: The answer string to normalize

Returns:
Normalized answer string
"""
# Remove leading zeros but keep at least one digit
answer = answer.strip()
if answer.isdigit():
return str(int(answer))
return answer

def calculate_metrics(df: pd.DataFrame) -> Dict[str, Any]:
"""
Calculate evaluation metrics for AIME2024 benchmark.

Args:
df: DataFrame with columns 'answer' (ground truth) and 'predicted_answer' (model output)

Returns:
Dictionary containing evaluation metrics
"""
if 'predicted_answer' not in df.columns:
raise ValueError("DataFrame must contain 'predicted_answer' column")
if 'answer' not in df.columns:
raise ValueError("DataFrame must contain 'answer' column")

total_problems = len(df)
correct_answers = 0
extraction_failures = 0

detailed_results = []

for idx, row in df.iterrows():
ground_truth = normalize_answer(str(row['answer']))
predicted_text = str(row['predicted_answer']) if pd.notna(row['predicted_answer']) else ""

# Extract the predicted answer
extracted_answer = extract_boxed_answer(predicted_text)

if not extracted_answer:
extraction_failures += 1
is_correct = False
else:
extracted_answer = normalize_answer(extracted_answer)
is_correct = extracted_answer == ground_truth
if is_correct:
correct_answers += 1

detailed_results.append({
'id': row.get('id', f'problem_{idx}'),
'ground_truth': ground_truth,
'predicted_text': predicted_text,
'extracted_answer': extracted_answer,
'is_correct': is_correct,
'extraction_failed': not bool(extracted_answer)
})

# Calculate metrics
accuracy = correct_answers / total_problems if total_problems > 0 else 0.0
extraction_success_rate = (total_problems - extraction_failures) / total_problems if total_problems > 0 else 0.0

metrics = {
'accuracy': accuracy,
'correct_answers': correct_answers,
'total_problems': total_problems,
'extraction_success_rate': extraction_success_rate,
'extraction_failures': extraction_failures,
'detailed_results': detailed_results
}

return metrics

def print_metrics_summary(metrics: Dict[str, Any]) -> None:
"""
Print a formatted summary of the evaluation metrics.

Args:
metrics: Dictionary containing evaluation metrics
"""
print("AIME2024 Evaluation Results")
print("=" * 40)
print(f"Total Problems: {metrics['total_problems']}")
print(f"Correct Answers: {metrics['correct_answers']}")
print(f"Accuracy: {metrics['accuracy']:.3f} ({metrics['accuracy']*100:.1f}%)")
print(f"Extraction Success Rate: {metrics['extraction_success_rate']:.3f} ({metrics['extraction_success_rate']*100:.1f}%)")
print(f"Extraction Failures: {metrics['extraction_failures']}")

if metrics['extraction_failures'] > 0:
print(f"\nNote: {metrics['extraction_failures']} problems had answer extraction failures.")
print("These are counted as incorrect answers.")

if __name__ == "__main__":
# Test the metrics calculation
test_data = {
'answer': ['23', '33', '156', '902'],
'predicted_answer': [
'The answer is \\boxed{23}.',
'After solving, we get \\boxed{033}.',
'Therefore, the answer is \\boxed{156}.',
'The final answer is 902.' # Test fallback extraction
],
'id': ['2024-I-1', '2024-I-2', '2024-I-3', '2024-I-4']
}

test_df = pd.DataFrame(test_data)
metrics = calculate_metrics(test_df)
print_metrics_summary(metrics)

print("\nDetailed Results:")
for result in metrics['detailed_results']:
print(f"ID: {result['id']}")
print(f" Ground Truth: {result['ground_truth']}")
print(f" Extracted: {result['extracted_answer']}")
print(f" Correct: {result['is_correct']}")
print()
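
An illustrative sketch of a case the __main__ test above does not exercise: when a response contains neither \boxed{...} nor a trailing number, it is recorded as an extraction failure and scored as incorrect. The problem id used here is hypothetical.

import pandas as pd
# calculate_metrics as defined above in this file

fail_df = pd.DataFrame({
    'answer': ['23'],
    'predicted_answer': ['I was unable to determine the answer.'],
    'id': ['2024-I-15'],  # hypothetical id
})
m = calculate_metrics(fail_df)
# No \boxed{...} and no trailing number, so extraction fails and the item scores as wrong.
assert m['extraction_failures'] == 1 and m['accuracy'] == 0.0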
71 changes: 71 additions & 0 deletions benchmark/AIME2024/create_huggingface_dataset.py
@@ -0,0 +1,71 @@
# SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0


from datasets import Dataset, load_dataset
import pandas as pd

"""
AIME2024 Dataset Processing

The AIME (American Invitational Mathematics Examination) 2024 dataset contains mathematical competition problems.
Each problem requires a numerical answer between 0 and 999.

Dataset structure:
- ID: Problem identifier (e.g., "2024-I-1", "2024-II-4")
- Problem: The mathematical problem statement
- Solution: The solution explanation (not used for evaluation)
- Answer: The correct numerical answer (integer between 0-999)

For evaluation, we format the problems to instruct the model to wrap its answer in \\boxed{...} format,
which is standard in mathematical competition contexts.
"""

def create_aime2024_dataset():
"""
Process the AIME2024 dataset and convert it to the standardized benchmark format.
"""
# Load the original dataset
dataset = load_dataset("Maxwell-Jia/AIME_2024")
df = dataset["train"].to_pandas()

# Create the standardized format
processed_data = []

for _, row in df.iterrows():
# Format the problem with clear instructions about the boxed answer format
context = f"""Solve the following AIME (American Invitational Mathematics Examination) problem.

Problem: {row['Problem']}
Instructions:
- The answer should be an integer between 0 and 999
- Please reason step by step, and put your final answer within \\boxed{{...}} format"""

question = "What is the answer to this problem?"

# The answer prefix encourages the model to show work before the final answer
answer_prefix = ""

processed_data.append({
'context': context,
'question': question,
'answer_prefix': answer_prefix,
'answer': str(row['Answer']), # Convert to string for consistency
'id': row['ID'],
'max_new_tokens': 32000, # Allow comprehensive step-by-step solutions
})

# Convert to Dataset
processed_dataset = Dataset.from_pandas(pd.DataFrame(processed_data))

return processed_dataset

if __name__ == "__main__":
# Test the dataset creation
processed_dataset = create_aime2024_dataset()
print(f"Created dataset with {len(processed_dataset)} examples")
print("\nFirst example:")
print(processed_dataset[0])

processed_dataset.push_to_hub("xAlg-AI/att-hub-aime2024", config_name="aime2024", split="test")
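
A quick verification sketch, assuming the push above succeeded and the hub dataset is readable; it mirrors the load performed by _load_datasets in aime2024.py.

from datasets import load_dataset

ds = load_dataset("xAlg-AI/att-hub-aime2024", split="test")
example = ds[0]
# Columns produced by create_aime2024_dataset above.
print(example["id"], example["answer"], example["max_new_tokens"])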