Skip to content

Commit f54372c

Browse files
authored
Merge pull request #14 from andrewginns/add-costs-to-preprocessed-merbench-json
Add costs to output JSON and update build backend
2 parents 6891bff + 5c30203 commit f54372c

File tree

8 files changed

+1208
-8
lines changed

8 files changed

+1208
-8
lines changed

Makefile

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,9 @@ install:
22
uv sync
33
npm install -g @mermaid-js/mermaid-cli
44

5+
upgrade:
6+
uv sync -U
7+
58
lint:
69
uv run ruff check .
710

agents_mcp_usage/evaluations/mermaid_evals/costs.json

Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -195,6 +195,29 @@
195195
]
196196
}
197197
},
198+
"gemini-2.5-flash-preview-05-20": {
199+
"friendly_name": "Gemini 2.5 Flash Preview (May)",
200+
"input": [
201+
{
202+
"up_to": "inf",
203+
"price": 0.15
204+
}
205+
],
206+
"output": {
207+
"non_thinking": [
208+
{
209+
"up_to": "inf",
210+
"price": 0.6
211+
}
212+
],
213+
"thinking": [
214+
{
215+
"up_to": "inf",
216+
"price": 3.5
217+
}
218+
]
219+
}
220+
},
198221
"gemini-2.5-flash-preview": {
199222
"friendly_name": "Gemini 2.5 Flash Preview",
200223
"input": [
@@ -429,6 +452,74 @@
429452
}
430453
]
431454
}
455+
},
456+
"bedrock:us.amazon.nova-micro-v1:0": {
457+
"friendly_name": "Amazon Nova Micro",
458+
"input": [
459+
{
460+
"up_to": "inf",
461+
"price": 0.035
462+
}
463+
],
464+
"output": {
465+
"default": [
466+
{
467+
"up_to": "inf",
468+
"price": 0.14
469+
}
470+
]
471+
}
472+
},
473+
"bedrock:us.amazon.nova-lite-v1:0": {
474+
"friendly_name": "Amazon Nova Lite",
475+
"input": [
476+
{
477+
"up_to": "inf",
478+
"price": 0.06
479+
}
480+
],
481+
"output": {
482+
"default": [
483+
{
484+
"up_to": "inf",
485+
"price": 0.24
486+
}
487+
]
488+
}
489+
},
490+
"bedrock:us.amazon.nova-pro-v1:0": {
491+
"friendly_name": "Amazon Nova Pro",
492+
"input": [
493+
{
494+
"up_to": "inf",
495+
"price": 0.80
496+
}
497+
],
498+
"output": {
499+
"default": [
500+
{
501+
"up_to": "inf",
502+
"price": 3.20
503+
}
504+
]
505+
}
506+
},
507+
"bedrock:us.amazon.nova-premier-v1:0": {
508+
"friendly_name": "Amazon Nova Premier",
509+
"input": [
510+
{
511+
"up_to": "inf",
512+
"price": 2.50
513+
}
514+
],
515+
"output": {
516+
"default": [
517+
{
518+
"up_to": "inf",
519+
"price": 12.50
520+
}
521+
]
522+
}
432523
}
433524
}
434525
}
Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,105 @@
1+
# Merbench Visualization Documentation
2+
3+
This directory contains comprehensive documentation for understanding and using the Merbench evaluation output data to build visualizations.
4+
5+
## Documentation Files
6+
7+
### 1. [output_json_schema.md](./output_json_schema.md)
8+
**Complete schema reference for the preprocessed JSON output**
9+
- Detailed description of all data sections
10+
- Filter system explanation (Model, Provider, Test Group)
11+
- Implementation guidelines for AND logic filtering
12+
- Visualization examples and best practices
13+
14+
### 2. [visualization_example.ts](./visualization_example.ts)
15+
**Practical TypeScript implementation example**
16+
- Complete working example of a cost vs performance scatter plot
17+
- Filter application with real code
18+
- Statistics update implementation
19+
- Event handler setup for interactive filtering
20+
21+
### 3. [data_relationships_quickref.md](./data_relationships_quickref.md)
22+
**Quick reference for data relationships and common operations**
23+
- Key relationships between data sections
24+
- Common query patterns
25+
- Performance optimization tips
26+
- Data validation checks
27+
28+
## Key Concepts
29+
30+
### Data Flow
31+
```
32+
CSV Input → preprocess_merbench_data.py → JSON Output → Visualizations
33+
34+
Cost Calculations
35+
(from costs.json)
36+
```
37+
38+
### Filter Types (AND Logic)
39+
1. **Model Filter**: Select specific models
40+
2. **Provider Filter**: Google, Amazon, OpenAI, Other
41+
3. **Test Group Filter**: easy, medium, hard
42+
43+
### Primary Data Sections
44+
- **raw_data**: Source for all filtering and aggregation
45+
- **leaderboard**: Pre-aggregated model rankings
46+
- **pareto_data**: Performance vs efficiency metrics
47+
- **test_groups_data**: Performance by difficulty
48+
- **cost_breakdown_data**: Detailed cost analysis
49+
- **failure_analysis_data**: Failure reason counts
50+
51+
## Quick Start
52+
53+
1. **Load the JSON data**
54+
```javascript
55+
const data = await fetch('processed_results.json').then(r => r.json());
56+
```
57+
58+
2. **Apply filters to raw_data**
59+
```javascript
60+
const filtered = data.raw_data.filter(row =>
61+
row.provider === "Google" && row.test_group === "easy"
62+
);
63+
```
64+
65+
3. **Recalculate aggregates**
66+
```javascript
67+
const modelStats = {};
68+
filtered.forEach(row => {
69+
if (!modelStats[row.Model]) {
70+
modelStats[row.Model] = {runs: 0, success: 0, cost: 0};
71+
}
72+
modelStats[row.Model].runs++;
73+
modelStats[row.Model].success += row.Score_MermaidDiagramValid;
74+
modelStats[row.Model].cost += row.total_cost;
75+
});
76+
```
77+
78+
4. **Create visualizations**
79+
- Use pre-aggregated data for initial views
80+
- Recalculate from filtered raw_data when filters change
81+
- Update all related visualizations together
82+
83+
## Cost Calculation Notes
84+
85+
- Costs are calculated from token counts using the tiered per-million-token prices in `costs.json`
86+
- Tests that fail the usage-limit check (Score_UsageLimitNotExceeded = 0) have $0 cost; this is separate from diagram validity (Score_MermaidDiagramValid)
87+
- Input and output costs are tracked separately
88+
- Thinking tokens may have different pricing than regular output tokens
89+
90+
## Visualization Types
91+
92+
1. **Leaderboard Table**: Model rankings by success rate
93+
2. **Pareto Scatter Plot**: Performance vs cost/duration/tokens
94+
3. **Grouped Bar Charts**: Performance by test difficulty
95+
4. **Stacked Bar Charts**: Failure reasons, cost breakdown
96+
5. **Heatmaps**: Model × difficulty performance matrix
97+
98+
## Tips for Developers
99+
100+
- Always start filtering from `raw_data`
101+
- Cache filter results for performance
102+
- Use the `provider` field for color coding
103+
- Show active filters in the UI
104+
- Handle empty filter results gracefully
105+
- Consider log scale for cost axes due to wide ranges
Lines changed: 143 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,143 @@
1+
# Data Relationships Quick Reference
2+
3+
## Key Relationships
4+
5+
### Primary Keys and Groupings
6+
7+
1. **Model** - Primary identifier across all data sections
8+
2. **test_group** - Secondary grouping (easy, medium, hard)
9+
3. **provider** - Derived from Model name (Google, Amazon, etc.)
10+
11+
### Data Section Dependencies
12+
13+
```
14+
raw_data (source)
15+
16+
├── leaderboard (group by Model)
17+
├── pareto_data (group by Model)
18+
├── test_groups_data (group by Model + test_group)
19+
├── failure_analysis_data (group by Model, count failures)
20+
└── cost_breakdown_data (group by Model + test_group)
21+
```
22+
23+
## Common Queries and Aggregations
24+
25+
### 1. Get Model Performance Summary
26+
```javascript
27+
// From raw_data
28+
const modelSummary = rawData
29+
.filter(r => r.Model === "gemini-2.5-pro")
30+
.reduce((acc, r) => ({
31+
successRate: acc.successRate + r.Score_MermaidDiagramValid,
32+
totalCost: acc.totalCost + r.total_cost,
33+
count: acc.count + 1
34+
}), {successRate: 0, totalCost: 0, count: 0});
35+
36+
modelSummary.avgSuccessRate = modelSummary.successRate / modelSummary.count * 100;
37+
```
38+
39+
### 2. Filter by Multiple Conditions
40+
```javascript
41+
// Get Amazon models on hard tests that succeeded
42+
const filtered = rawData.filter(r =>
43+
r.provider === "Amazon" &&
44+
r.test_group === "hard" &&
45+
r.Score_MermaidDiagramValid === 1
46+
);
47+
```
48+
49+
### 3. Calculate Cost Breakdown by Test Group
50+
```javascript
51+
// Group costs by difficulty
52+
const costByDifficulty = {};
53+
["easy", "medium", "hard"].forEach(group => {
54+
const groupData = rawData.filter(r => r.test_group === group);
55+
costByDifficulty[group] = {
56+
avgCost: groupData.reduce((sum, r) => sum + r.total_cost, 0) / groupData.length,
57+
totalCost: groupData.reduce((sum, r) => sum + r.total_cost, 0)
58+
};
59+
});
60+
```
61+
62+
## Pre-Aggregated vs. Raw Data Usage
63+
64+
### Use Pre-Aggregated Data When:
65+
- Displaying initial unfiltered views
66+
- Performance is critical
67+
- Standard aggregations are sufficient
68+
69+
### Recalculate from Raw Data When:
70+
- Filters are applied
71+
- Custom aggregations needed
72+
- Combining multiple filter conditions
73+
74+
## Filter Application Order
75+
76+
1. **Start with raw_data**
77+
2. **Apply filters** (Model AND Provider AND TestGroup)
78+
3. **Recalculate aggregations**
79+
4. **Update visualizations**
80+
81+
## Cost Calculation Rules
82+
83+
- **Normal tests**: Cost = (input_tokens/1M × input_price) + (output_tokens/1M × output_price)
84+
- **Usage-limit failures** (Score_UsageLimitNotExceeded = 0): Cost = $0, regardless of tokens consumed
85+
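- **Tiered pricing**: Price depends on total token count; each tier in `costs.json` applies up to its `up_to` threshold (`"inf"` marks the final, unbounded tier)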
86+
87+
## Data Validation Checks
88+
89+
```javascript
90+
// Ensure data consistency
91+
function validateData(jsonData) {
92+
// Check if model counts match
93+
const rawModels = new Set(jsonData.raw_data.map(r => r.Model));
94+
const leaderboardModels = new Set(jsonData.leaderboard.map(l => l.Model));
95+
96+
console.assert(rawModels.size === leaderboardModels.size,
97+
"Model count mismatch between raw and leaderboard");
98+
99+
// Verify cost calculations
100+
jsonData.raw_data.forEach(row => {
101+
if (row.Score_UsageLimitNotExceeded === 0) {
102+
console.assert(row.total_cost === 0,
103+
`Failed test should have 0 cost: ${row.Model}`);
104+
}
105+
});
106+
}
107+
```
108+
109+
## Performance Optimization Tips
110+
111+
1. **Cache Filter Results**
112+
```javascript
113+
const filterCache = new Map();
114+
function getCachedFilter(filterKey, rawData, filters) {
115+
if (!filterCache.has(filterKey)) {
116+
filterCache.set(filterKey, applyFilters(rawData, filters));
117+
}
118+
return filterCache.get(filterKey);
119+
}
120+
```
121+
122+
2. **Use Indexed Lookups**
123+
```javascript
124+
// Pre-index by model for fast lookups
125+
const modelIndex = {};
126+
rawData.forEach(row => {
127+
if (!modelIndex[row.Model]) modelIndex[row.Model] = [];
128+
modelIndex[row.Model].push(row);
129+
});
130+
```
131+
132+
3. **Batch Updates**
133+
```javascript
134+
// Update all visualizations at once
135+
function updateAllVisualizations(filteredData) {
136+
requestAnimationFrame(() => {
137+
updateLeaderboard(filteredData);
138+
updateParetoPlot(filteredData);
139+
updateCostBreakdown(filteredData);
140+
updateFailureAnalysis(filteredData);
141+
});
142+
}
143+
```

0 commit comments

Comments
 (0)