374 changes: 202 additions & 172 deletions README.md

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion benchmark/AIME2024/__init__.py
@@ -8,4 +8,4 @@
from .calculate_metrics import calculate_metrics
from .aime2024 import AIME2024

__all__ = ["calculate_metrics", "AIME2024"]
__all__ = ["calculate_metrics", "AIME2024"]
34 changes: 19 additions & 15 deletions benchmark/AIME2024/aime2024.py
@@ -29,22 +29,23 @@ class AIME2024(Benchmark):

# AIME2024 has a single dataset
all_datasets: List[str] = ["aime2024"]

benchmark_name: str = "aime2024"
huggingface_dataset_id: str = "xAlg-AI/att-hub-aime2024"

def _load_datasets(self) -> pd.DataFrame:
"""Load AIME2024 dataset.

AIME2024 uses a single dataset with all problems.

Returns:
pandas DataFrame with all AIME2024 problems.
"""
print(f"Loading AIME2024 dataset")

try:
from datasets import load_dataset

dataset = load_dataset(self.huggingface_dataset_id, split="test")
df = dataset.to_pandas()
df["task"] = "aime2024" # Ensure task column exists
@@ -72,7 +73,7 @@ def post_run_evaluate(self, results_df: pd.DataFrame) -> Dict[str, Any]:

# Use the calculate_metrics function from HashAttention evaluation
metrics: Dict[str, Any] = calculate_metrics(results_df)

# Format the results for consistency with other benchmarks
overall_metrics: Dict[str, Any] = {
"overall_score": round(metrics["accuracy"], 4),
@@ -84,16 +85,19 @@ def post_run_evaluate(self, results_df: pd.DataFrame) -> Dict[str, Any]:
"task_scores": {
"aime2024": {
"accuracy": round(metrics["accuracy"], 4),
"extraction_success_rate": round(metrics["extraction_success_rate"], 4)
"extraction_success_rate": round(
metrics["extraction_success_rate"], 4
),
}
},
"summary": {
"total_tasks": 1,
"total_samples": len(results_df)
}
"summary": {"total_tasks": 1, "total_samples": len(results_df)},
}

print(f" ✓ AIME2024 Accuracy: {metrics['accuracy']:.3f} ({metrics['accuracy']*100:.1f}%)")
print(f" ✓ Extraction Success Rate: {metrics['extraction_success_rate']:.3f} ({metrics['extraction_success_rate']*100:.1f}%)")

return overall_metrics

print(
f" ✓ AIME2024 Accuracy: {metrics['accuracy']:.3f} ({metrics['accuracy']*100:.1f}%)"
)
print(
f" ✓ Extraction Success Rate: {metrics['extraction_success_rate']:.3f} ({metrics['extraction_success_rate']*100:.1f}%)"
)

return overall_metrics
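
For reference, a minimal sketch of the data contract this method relies on: the results DataFrame must carry "answer" and "predicted_answer" columns, and the metrics dict comes from the calculate_metrics helper shown in the next file. The import path and the example rows below are editor assumptions, not part of this diff.

import pandas as pd

# Assumed import path, based on the benchmark/AIME2024/__init__.py exports above.
from benchmark.AIME2024 import calculate_metrics

# Hypothetical results frame with the two columns the evaluator expects.
results_df = pd.DataFrame(
    {
        "answer": ["23", "902"],
        "predicted_answer": [
            "Step by step ... so the answer is \\boxed{23}.",
            "The final answer is 45.",  # no \boxed{} -> fallback extraction
        ],
    }
)

metrics = calculate_metrics(results_df)
print(metrics["accuracy"])                 # 0.5  (one of two answers correct)
print(metrics["extraction_success_rate"])  # 1.0  (an answer was extracted for both rows)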
133 changes: 75 additions & 58 deletions benchmark/AIME2024/calculate_metrics.py
@@ -5,51 +5,53 @@
import pandas as pd
from typing import List, Dict, Any


def extract_boxed_answer(text: str) -> str:
"""
Extract the answer from \boxed{...} format in the text.

Args:
text: The model's response text

Returns:
The extracted answer as a string, or empty string if not found
"""
# Look for \boxed{...} pattern
boxed_pattern = r'\\boxed\{([^}]*)\}'
boxed_pattern = r"\\boxed\{([^}]*)\}"
matches = re.findall(boxed_pattern, text)

if matches:
# Take the last boxed answer in case there are multiple
answer = matches[-1].strip()

# Extract just the number if there's additional formatting
# Handle cases like "033", "23", "$23$", etc.
number_match = re.search(r'\d+', answer)
number_match = re.search(r"\d+", answer)
if number_match:
return number_match.group()
else:
return answer

# Fallback: look for numbers at the end of the text
# This handles cases where the model doesn't use \boxed format
lines = text.strip().split('\n')
lines = text.strip().split("\n")
for line in reversed(lines):
if line.strip():
# Look for a number in the last non-empty line
number_match = re.search(r'\b(\d{1,3})\b', line)
number_match = re.search(r"\b(\d{1,3})\b", line)
if number_match:
return number_match.group(1)

return ""


def normalize_answer(answer: str) -> str:
"""
Normalize an answer to a standard format.

Args:
answer: The answer string to normalize

Returns:
Normalized answer string
"""
@@ -59,34 +61,37 @@ def normalize_answer(answer: str) -> str:
return str(int(answer))
return answer


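A few illustrative calls for the two helpers above (an editor's sketch, not part of the diff; behaviour follows the visible regexes, and the leading-zero case assumes the digit-only branch hidden in the collapsed hunk above):

# Last \boxed{...} wins when the model boxes more than one value.
assert extract_boxed_answer("Try \\boxed{7}. Rechecking: \\boxed{812}.") == "812"
# Extra formatting inside the box is reduced to the digits.
assert extract_boxed_answer("So the answer is \\boxed{$23$}.") == "23"
# No \boxed{} at all: fall back to a 1-3 digit number on the last line.
assert extract_boxed_answer("No box here.\nThe final answer is 902.") == "902"
# normalize_answer strips leading zeros from purely numeric answers.
assert normalize_answer("033") == "33"
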
def calculate_metrics(df: pd.DataFrame) -> Dict[str, Any]:
"""
Calculate evaluation metrics for AIME2024 benchmark.

Args:
df: DataFrame with columns 'answer' (ground truth) and 'predicted_answer' (model output)

Returns:
Dictionary containing evaluation metrics
"""
if 'predicted_answer' not in df.columns:
if "predicted_answer" not in df.columns:
raise ValueError("DataFrame must contain 'predicted_answer' column")
if 'answer' not in df.columns:
if "answer" not in df.columns:
raise ValueError("DataFrame must contain 'answer' column")

total_problems = len(df)
correct_answers = 0
extraction_failures = 0

detailed_results = []

for idx, row in df.iterrows():
ground_truth = normalize_answer(str(row['answer']))
predicted_text = str(row['predicted_answer']) if pd.notna(row['predicted_answer']) else ""

ground_truth = normalize_answer(str(row["answer"]))
predicted_text = (
str(row["predicted_answer"]) if pd.notna(row["predicted_answer"]) else ""
)

# Extract the predicted answer
extracted_answer = extract_boxed_answer(predicted_text)

if not extracted_answer:
extraction_failures += 1
is_correct = False
@@ -95,35 +100,42 @@ def calculate_metrics(df: pd.DataFrame) -> Dict[str, Any]:
is_correct = extracted_answer == ground_truth
if is_correct:
correct_answers += 1

detailed_results.append({
'id': row.get('id', f'problem_{idx}'),
'ground_truth': ground_truth,
'predicted_text': predicted_text,
'extracted_answer': extracted_answer,
'is_correct': is_correct,
'extraction_failed': not bool(extracted_answer)
})


detailed_results.append(
{
"id": row.get("id", f"problem_{idx}"),
"ground_truth": ground_truth,
"predicted_text": predicted_text,
"extracted_answer": extracted_answer,
"is_correct": is_correct,
"extraction_failed": not bool(extracted_answer),
}
)

# Calculate metrics
accuracy = correct_answers / total_problems if total_problems > 0 else 0.0
extraction_success_rate = (total_problems - extraction_failures) / total_problems if total_problems > 0 else 0.0

extraction_success_rate = (
(total_problems - extraction_failures) / total_problems
if total_problems > 0
else 0.0
)

metrics = {
'accuracy': accuracy,
'correct_answers': correct_answers,
'total_problems': total_problems,
'extraction_success_rate': extraction_success_rate,
'extraction_failures': extraction_failures,
'detailed_results': detailed_results
"accuracy": accuracy,
"correct_answers": correct_answers,
"total_problems": total_problems,
"extraction_success_rate": extraction_success_rate,
"extraction_failures": extraction_failures,
"detailed_results": detailed_results,
}

return metrics


def print_metrics_summary(metrics: Dict[str, Any]) -> None:
"""
Print a formatted summary of the evaluation metrics.

Args:
metrics: Dictionary containing evaluation metrics
"""
@@ -132,34 +144,39 @@ def print_metrics_summary(metrics: Dict[str, Any]) -> None:
print(f"Total Problems: {metrics['total_problems']}")
print(f"Correct Answers: {metrics['correct_answers']}")
print(f"Accuracy: {metrics['accuracy']:.3f} ({metrics['accuracy']*100:.1f}%)")
print(f"Extraction Success Rate: {metrics['extraction_success_rate']:.3f} ({metrics['extraction_success_rate']*100:.1f}%)")
print(
f"Extraction Success Rate: {metrics['extraction_success_rate']:.3f} ({metrics['extraction_success_rate']*100:.1f}%)"
)
print(f"Extraction Failures: {metrics['extraction_failures']}")

if metrics['extraction_failures'] > 0:
print(f"\nNote: {metrics['extraction_failures']} problems had answer extraction failures.")

if metrics["extraction_failures"] > 0:
print(
f"\nNote: {metrics['extraction_failures']} problems had answer extraction failures."
)
print("These are counted as incorrect answers.")


if __name__ == "__main__":
# Test the metrics calculation
test_data = {
'answer': ['23', '33', '156', '902'],
'predicted_answer': [
'The answer is \\boxed{23}.',
'After solving, we get \\boxed{033}.',
'Therefore, the answer is \\boxed{156}.',
'The final answer is 902.' # Test fallback extraction
"answer": ["23", "33", "156", "902"],
"predicted_answer": [
"The answer is \\boxed{23}.",
"After solving, we get \\boxed{033}.",
"Therefore, the answer is \\boxed{156}.",
"The final answer is 902.", # Test fallback extraction
],
'id': ['2024-I-1', '2024-I-2', '2024-I-3', '2024-I-4']
"id": ["2024-I-1", "2024-I-2", "2024-I-3", "2024-I-4"],
}

test_df = pd.DataFrame(test_data)
metrics = calculate_metrics(test_df)
print_metrics_summary(metrics)

print("\nDetailed Results:")
for result in metrics['detailed_results']:
for result in metrics["detailed_results"]:
print(f"ID: {result['id']}")
print(f" Ground Truth: {result['ground_truth']}")
print(f" Extracted: {result['extracted_answer']}")
print(f" Correct: {result['is_correct']}")
print()
print()
38 changes: 22 additions & 16 deletions benchmark/AIME2024/create_huggingface_dataset.py
@@ -21,17 +21,18 @@
which is standard in mathematical competition contexts.
"""


def create_aime2024_dataset():
"""
Process the AIME2024 dataset and convert it to the standardized benchmark format.
"""
# Load the original dataset
dataset = load_dataset("Maxwell-Jia/AIME_2024")
df = dataset["train"].to_pandas()

# Create the standardized format
processed_data = []

for _, row in df.iterrows():
# Format the problem with clear instructions about the boxed answer format
context = f"""Solve the following AIME (American Invitational Mathematics Examination) problem.
@@ -41,31 +42,36 @@ def create_aime2024_dataset():
Instructions:
- The answer should be an integer between 0 and 999
- Please reason step by step, and put your final answer within \\boxed{{...}} format"""

question = "What is the answer to this problem?"

# The answer prefix encourages the model to show work before the final answer
answer_prefix = ""

processed_data.append({
'context': context,
'question': question,
'answer_prefix': answer_prefix,
'answer': str(row['Answer']), # Convert to string for consistency
'id': row['ID'],
'max_new_tokens': 32000, # Allow comprehensive step-by-step solutions
})


processed_data.append(
{
"context": context,
"question": question,
"answer_prefix": answer_prefix,
"answer": str(row["Answer"]), # Convert to string for consistency
"id": row["ID"],
"max_new_tokens": 32000, # Allow comprehensive step-by-step solutions
}
)

# Convert to Dataset
processed_dataset = Dataset.from_pandas(pd.DataFrame(processed_data))

return processed_dataset


if __name__ == "__main__":
# Test the dataset creation
processed_dataset = create_aime2024_dataset()
print(f"Created dataset with {len(processed_dataset)} examples")
print("\nFirst example:")
print(processed_dataset[0])

processed_dataset.push_to_hub("xAlg-AI/att-hub-aime2024", config_name=f"aime2024", split="test")
processed_dataset.push_to_hub(
"xAlg-AI/att-hub-aime2024", config_name=f"aime2024", split="test"
)
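
As a quick sanity check, a short sketch of reading the pushed dataset back, mirroring the load_dataset call in AIME2024._load_datasets earlier in this diff; the field names come from the processed_data records above, and the printed values are illustrative only:

from datasets import load_dataset

# Pull the published split back down and spot-check the schema.
dataset = load_dataset("xAlg-AI/att-hub-aime2024", split="test")
print(len(dataset))                # number of problems taken from Maxwell-Jia/AIME_2024
print(dataset[0].keys())           # context, question, answer_prefix, answer, id, max_new_tokens
print(dataset[0]["context"][:200]) # problem statement plus the \boxed{...} answer instructions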