Commit 697c3fc

Merge pull request #16 from andrewginns/october-2025-updates
feat: October 2025 updates
2 parents f54372c + a4ecee7 commit 697c3fc

File tree

5 files changed (+1945, −1056 lines)


agents_mcp_usage/evaluations/mermaid_evals/README.md

Lines changed: 129 additions & 0 deletions
@@ -106,6 +106,16 @@ uv run agents_mcp_usage/evaluations/mermaid_evals/scripts/preprocess_merbench_da
   -o "agents_mcp_usage/evaluations/mermaid_evals/results/<timestamp>_processed.json"
 ```

+### 5. Merge Results from Multiple Runs (Optional)
+```bash
+# Combine results from different machines or benchmark runs
+uv run agents_mcp_usage/evaluations/mermaid_evals/scripts/merge_benchmark_results.py \
+  -i results/run1.json results/run2.json \
+  -o results/merged.json \
+  --dedup keep-all \
+  --report merge_report.json
+```
+
 ## Evaluation Task & Test Cases

 The system challenges LLM agents to:
@@ -239,6 +249,125 @@ The local dashboard (`merbench_ui.py`) automatically detects and loads these CSV
 - Performance metrics and scores
 - Error messages and failure reasons

+## Merging Benchmark Results
+
+The `merge_benchmark_results.py` script enables combining multiple JSON benchmark result files generated by `preprocess_merbench_data.py`. This is particularly useful when:
+- Running benchmarks on different machines
+- Combining results from different time periods
+- Aggregating data from distributed benchmark runs
+
+### Features
+- **Multiple merge strategies** for handling duplicate test runs
+- **Complete recalculation** of all statistics from merged raw data
+- **Detailed merge reports** showing what was combined
+- **Preservation of all data sections** (leaderboard, failure analysis, cost breakdown, etc.)
+
+### Usage Examples
+
+#### Basic Merge
+```bash
+python scripts/merge_benchmark_results.py -i file1.json file2.json -o merged.json
+```
+
+#### With Deduplication Strategy
+```bash
+python scripts/merge_benchmark_results.py \
+  -i file1.json file2.json \
+  -o merged.json \
+  --dedup keep-first
+```
+
+#### Generate Detailed Report
+```bash
+python scripts/merge_benchmark_results.py \
+  -i file1.json file2.json \
+  -o merged.json \
+  --report merge_report.json \
+  --verbose
+```
+
+### Deduplication Strategies
+
+The script offers four strategies for handling duplicate (Model, Case) combinations:
+
+1. **`keep-all`** (default)
+   - Keeps all records, no deduplication
+   - Use when runs were performed under different conditions
+   - Preserves complete data for analysis
+
+2. **`keep-first`**
+   - Keeps the first occurrence from the first file
+   - Use when preferring older/original results
+   - Maintains consistency with initial benchmarks
+
+3. **`keep-last`**
+   - Keeps the last occurrence from the last file
+   - Use when preferring newer/updated results
+   - Good for iterative improvements
+
+4. **`average`**
+   - Averages metrics for duplicate combinations
+   - Use when multiple runs should be combined statistically
+   - Provides a balanced view across multiple executions
+
+### Output Structure
+
+The merged JSON file maintains the same structure as individual result files:
+- `stats`: Aggregate statistics
+- `leaderboard`: Model performance rankings
+- `pareto_data`: Efficiency analysis data
+- `test_groups_data`: Performance by test difficulty
+- `failure_analysis_data`: Failure type counts
+- `cost_breakdown_data`: Cost analysis by model and test group
+- `raw_data`: Individual run records
+- `config`: Dashboard configuration
+
+### Merge Report
+
+When using `--report`, the script generates a detailed report containing:
+- Timestamp of merge operation
+- Deduplication strategy used
+- Details of each input file (runs, models)
+- Summary statistics (total runs, duplicates handled)
+- Complete list of merged models
+
+### Complete Workflow Example
+
+#### 1. Run benchmarks on Machine A:
+```bash
+# Run evaluation
+uv run agents_mcp_usage/evaluations/mermaid_evals/run_multi_evals.py
+
+# Convert to JSON
+uv run agents_mcp_usage/evaluations/mermaid_evals/scripts/preprocess_merbench_data.py \
+  -i mermaid_eval_results/machine_a_results.csv \
+  -o results/machine_a.json
+```
+
+#### 2. Run benchmarks on Machine B:
+```bash
+# Run evaluation
+uv run agents_mcp_usage/evaluations/mermaid_evals/run_multi_evals.py
+
+# Convert to JSON
+uv run agents_mcp_usage/evaluations/mermaid_evals/scripts/preprocess_merbench_data.py \
+  -i mermaid_eval_results/machine_b_results.csv \
+  -o results/machine_b.json
+```
+
+#### 3. Merge results:
+```bash
+uv run agents_mcp_usage/evaluations/mermaid_evals/scripts/merge_benchmark_results.py \
+  -i results/machine_a.json results/machine_b.json \
+  -o results/combined.json \
+  --report results/merge_summary.json
+```
+
+### Notes
+- The script automatically recalculates all statistics from the merged raw data
+- Cost calculations use the same `costs.json` configuration as the preprocessing script
+- Provider detection is based on model name patterns (gemini→Google, nova→Amazon, etc.)
+
 ## Monitoring & Debugging

 All evaluation runs are traced with **Logfire** for comprehensive monitoring:
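
As a reading aid for the deduplication strategies documented in the README diff above, here is a minimal, hypothetical sketch of how the four `--dedup` options could collapse duplicate (Model, Case) records. The record layout (flat dicts with `"Model"`, `"Case"`, and numeric metric fields) and the rule of averaging only numeric fields are assumptions for illustration; this is not the code inside `merge_benchmark_results.py`.

```python
# Illustrative sketch only: assumed record shape, not the actual merge script.
from statistics import mean


def dedup_records(records: list[dict], strategy: str = "keep-all") -> list[dict]:
    """Collapse duplicate (Model, Case) records according to a merge strategy."""
    if strategy == "keep-all":
        # No deduplication: every run from every input file is kept.
        return list(records)

    # Group records by (Model, Case), preserving input-file order.
    groups: dict[tuple, list[dict]] = {}
    for rec in records:
        groups.setdefault((rec["Model"], rec["Case"]), []).append(rec)

    merged = []
    for dupes in groups.values():
        if strategy == "keep-first":
            merged.append(dupes[0])            # earliest occurrence wins
        elif strategy == "keep-last":
            merged.append(dupes[-1])           # latest occurrence wins
        elif strategy == "average":
            base = dict(dupes[0])
            for key, value in base.items():
                # Average only numeric metrics; leave labels and metadata alone.
                if isinstance(value, (int, float)) and not isinstance(value, bool):
                    base[key] = mean(d[key] for d in dupes if key in d)
            merged.append(base)
        else:
            raise ValueError(f"unknown strategy: {strategy}")
    return merged


if __name__ == "__main__":
    runs = [
        {"Model": "gemini-2.5-flash-lite", "Case": "easy_01", "score": 0.8, "cost": 0.002},
        {"Model": "gemini-2.5-flash-lite", "Case": "easy_01", "score": 1.0, "cost": 0.003},
    ]
    print(dedup_records(runs, "average"))  # one record with score 0.9, cost 0.0025
```

Whichever strategy is chosen, the README notes that the script then recalculates all summary statistics from the merged `raw_data`, rather than trying to combine the per-file aggregates.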

agents_mcp_usage/evaluations/mermaid_evals/costs.json

Lines changed: 75 additions & 7 deletions
@@ -105,6 +105,31 @@
       ]
     }
   },
+  "gemini-2.5-pro": {
+    "friendly_name": "Gemini 2.5 Pro Preview",
+    "input": [
+      {
+        "up_to": 200000,
+        "price": 1.25
+      },
+      {
+        "up_to": "inf",
+        "price": 2.5
+      }
+    ],
+    "output": {
+      "default": [
+        {
+          "up_to": 200000,
+          "price": 10.0
+        },
+        {
+          "up_to": "inf",
+          "price": 15.0
+        }
+      ]
+    }
+  },
   "gemini-1.5-pro": {
     "friendly_name": "Gemini 1.5 Pro",
     "input": [
@@ -244,10 +269,6 @@
   "gemini-2.5-flash": {
     "friendly_name": "Gemini 2.5 Flash",
     "input": [
-      {
-        "up_to": 200000,
-        "price": 0.15
-      },
       {
         "up_to": "inf",
         "price": 0.3
@@ -256,9 +277,22 @@
     "output": {
       "default": [
         {
-          "up_to": 200000,
-          "price": 1.25
-        },
+          "up_to": "inf",
+          "price": 2.5
+        }
+      ]
+    }
+  },
+  "gemini-2.5-flash-preview-09-2025": {
+    "friendly_name": "Gemini 2.5 Flash Preview (Sept)",
+    "input": [
+      {
+        "up_to": "inf",
+        "price": 0.3
+      }
+    ],
+    "output": {
+      "default": [
         {
           "up_to": "inf",
           "price": 2.5
@@ -283,6 +317,40 @@
       ]
     }
   },
+  "gemini-2.5-flash-lite": {
+    "friendly_name": "Gemini 2.5 Flash Lite",
+    "input": [
+      {
+        "up_to": "inf",
+        "price": 0.1
+      }
+    ],
+    "output": {
+      "default": [
+        {
+          "up_to": "inf",
+          "price": 0.4
+        }
+      ]
+    }
+  },
+  "gemini-2.5-flash-lite-preview-09-2025": {
+    "friendly_name": "Gemini 2.5 Flash Lite Preview (Sept)",
+    "input": [
+      {
+        "up_to": "inf",
+        "price": 0.1
+      }
+    ],
+    "output": {
+      "default": [
+        {
+          "up_to": "inf",
+          "price": 0.4
+        }
+      ]
+    }
+  },
   "openai:o4-mini": {
     "friendly_name": "OpenAI o4-mini",
     "input": [

agents_mcp_usage/evaluations/mermaid_evals/run_multi_evals.py

Lines changed: 6 additions & 3 deletions
@@ -48,7 +48,10 @@
     # "gemini-2.5-pro-preview-05-06",
     # "gemini-2.5-pro-preview-03-25",
     # "gemini-2.0-flash",
-    "gemini-2.5-flash",
+    # "gemini-2.5-flash",
+    "gemini-2.5-flash-lite",
+    # "gemini-2.5-flash-preview-09-2025",
+    # "gemini-2.5-flash-lite-preview-09-2025"
     # "bedrock:us.amazon.nova-pro-v1:0",
     # "bedrock:us.amazon.nova-lite-v1:0",
     # "bedrock:us.amazon.nova-micro-v1:0",
@@ -511,13 +514,13 @@ async def main() -> None:
     parser.add_argument(
         "--runs",
         type=int,
-        default=5,
+        default=15,
         help="Number of evaluation runs per model",
     )
     parser.add_argument(
         "--judge-model",
         type=str,
-        default="gemini-2.5-pro-preview-06-05",
+        default="gemini-2.5-pro",
        help="Model to use for LLM judging",
     )
     parser.add_argument(
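
The second hunk only changes argparse defaults: `--runs` goes from 5 to 15 and `--judge-model` becomes `gemini-2.5-pro`. As a small standalone sketch (not the actual `run_multi_evals.py` code), the snippet below mirrors those two arguments and shows that callers can still override the new defaults from the command line:

```python
# Minimal argparse sketch mirroring the two defaults changed in this commit.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--runs", type=int, default=15, help="Number of evaluation runs per model")
parser.add_argument("--judge-model", type=str, default="gemini-2.5-pro", help="Model to use for LLM judging")

print(parser.parse_args([]))               # Namespace(runs=15, judge_model='gemini-2.5-pro')
print(parser.parse_args(["--runs", "5"]))  # explicit flags still override the new defaults
```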

pyproject.toml

Lines changed: 3 additions & 3 deletions
@@ -14,12 +14,12 @@ dependencies = [
     "langgraph>=0.3.31",
     "logfire>=3.20.0",
     "loguru>=0.7.3",
-    "mcp==1.9.0",
+    "mcp>=1.12.3",
     "openai-agents>=0.0.12",
     "pandas>=2.3.0",
     "plotly>=6.1.2",
-    "pydantic-ai-slim[bedrock,mcp]>=0.2.15",
-    "pydantic-evals[logfire]>=0.2.15",
+    "pydantic-ai-slim[bedrock,mcp]>=1.0.17",
+    "pydantic-evals[logfire]>=1.0.17",
     "python-dotenv>=1.1.0",
     "ruff>=0.11.10",
     "streamlit>=1.45.1",
