123 changes: 123 additions & 0 deletions benchmark/raytune/OPTIMIZATION_EXPERIMENT.py
@@ -0,0 +1,123 @@
"""Run configuration for hyperparameter search.

All configuration parameters for the hyperparameter search are defined here.
Modify this file to change search behavior without editing the main script.
"""

import os
from typing import Dict, List, Optional

# Model configurations
# Point the directories below at your HashAttention weight files and
# DoubleSparsity config files.

HASHATTENTION_WEIGHTS_DIR: str = "/data/apdesai/code/HashAttention-1.0/artifacts"
DOUBLE_SPARSITY_CONFIG_DIR: str = "/data/apdesai/code/DoubleSparse/config"
hashattention_dir: str = HASHATTENTION_WEIGHTS_DIR
doublesparsity_config_dir: str = DOUBLE_SPARSITY_CONFIG_DIR


MODEL_CONFIGS: Dict[str, Dict[str, str]] = {
"llama3.1-8b": {
"model_name": "meta-llama/Llama-3.1-8B-Instruct",
"hash_attention_weight_file": os.path.join(hashattention_dir, "llama3.1-8b-patch.64K.v1.hat_weights.pkl"),
"double_sparsity_config_file": os.path.join(doublesparsity_config_dir, "meta-llama/Llama-3.1-8B-Instruct.json"),
},
"llama3.2-1b": {
"model_name": "meta-llama/Llama-3.2-1B-Instruct",
"hash_attention_weight_file": os.path.join(hashattention_dir, "DNE.pkl"),
"double_sparsity_config_file": os.path.join(doublesparsity_config_dir, "meta-llama/Llama-3.2-1B-Instruct.json"),
},
"llama3.2-3b": {
"model_name": "meta-llama/Llama-3.2-3B-Instruct",
"hash_attention_weight_file": os.path.join(hashattention_dir, "DNE.pkl"),
"double_sparsity_config_file": os.path.join(doublesparsity_config_dir, "meta-llama/Llama-3.2-3B-Instruct.json"),
},
"deepseek": {
"model_name": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
"hash_attention_weight_file": os.path.join(hashattention_dir, "DeepSeek-R1-Distill-Llama-8B-patch-layers2-dim64-max-context-24K_hat_weights.pkl"),
},
"mistral": {
"model_name": "mistralai/Mistral-7B-Instruct-v0.3",
"hash_attention_weight_file": os.path.join(hashattention_dir, "Mistral-7B-Instruct-v0.3.24K.20.500.hat_weights.pkl"),
},
"qwen3-30b-moe": {
"model_name": "Qwen/Qwen3-30B-A3B-Instruct-2507",
"hash_attention_weight_file": os.path.join(hashattention_dir, "DNE.pkl"),
"double_sparsity_config_file": os.path.join(doublesparsity_config_dir, "Qwen/Qwen3-30B-A3B-Instruct-2507.json"),
},
"qwen3-4b": {
"model_name": "Qwen/Qwen3-4B-Instruct-2507",
"hash_attention_weight_file": os.path.join(hashattention_dir, "DNE.pkl"),
"double_sparsity_config_file": os.path.join(doublesparsity_config_dir, "Qwen/Qwen3-4B-Instruct-2507.json"),
},
}

MODELS: List[str] = [
"llama3.1-8b",
"llama3.2-1b",
"llama3.2-3b",
"qwen3-4b",
"qwen3-30b-moe",
]

TASKS: List[str] = [
"ruler32k/vt",
"ruler32k/qa_1",
"ruler32k/qa_2",
"ruler32k/fwe",
"ruler32k/niah_multikey_2",
"ruler32k/niah_multikey_3",
]

SPARSITY_OBJECTIVES: List[int] = [
2,
5,
10,
20,
]

MEMORY_OBJECTIVES: List[Optional[int]] = [
32,
64,
128,
] # Memory objective parameter (e.g., "memory_32") for configs that need it

BUILDER_NAMES: List[str] = [
# "dense",
# "double_sparsity",
# "hashattention_topk",
# "magicpig",
# "oracle_topk",
# "oracle_topp",
# "quest_topk",
# "vattention_hashattention",
# "vattention_oracle",
# "pqcache",
"vattention_pqcache",
] # Specify which builders to use (e.g., ["magicpig"], ["dense"], ["double_sparsity"])


# SEARCH PARAMS
NUM_SAMPLES: int = 1 # Number of samples per hyperparameter search
SEARCH_MAX_NEW_TOKENS: int = 3 # Max new tokens for search trials
SEARCH_MAX_CONTEXT_LENGTH: int = 40000 # Max context length for search trials
SEARCH_MAX_REQUESTS: int = 3 # Max requests per search trial
OPTIMAL_CONFIGS_DIR: str = "/data/apdesai/code/DO_NOT_DELETE/vattention_pqcache_optimization/" # Directory for storing optimal configurations
RAY_RESULTS_DIR: str = "/tmp/ray_results" # Directory for Ray Tune results
SEARCH_TIMEOUT: int = 900 # Timeout per search trial in seconds
ACTORS_PER_GPU: int = 1 # Number of actors per GPU for resource allocation


""" DRY RUN
if true , it will do everything except the actual running of benchmark helper -- it will just return
randomly generated scores for each trial and choose based on that
"""
DRY_RUN: bool = False


""" If you use Time stamp then by default it will perform entire search again.
"""
USE_TIMESTAMP_FOR_RESULTS_DIR: bool = False
FORCE_SEARCH: bool = False # Force re-run of search even if configs exist

163 changes: 83 additions & 80 deletions benchmark/raytune/README.md
@@ -1,99 +1,102 @@
# Ray Tune Benchmark Suite
## Ray Tune Benchmark Suite

A distributed benchmark suite for sparse attention configurations using Ray for parallel execution.
Distributed benchmark suite for sparse attention configurations using Ray.

## Setup
### 1. Quick Start (Run existing builders on new models / settings / objectives)

### Environment Variables
- **Optimize configs**

For HashAttention configurations, set the weights directory:
1. Edit `benchmark/raytune/OPTIMIZATION_EXPERIMENT.py` to choose (a minimal example edit is shown below):
- **Models**: `MODEL_CONFIGS`, `MODELS`
- **Tasks**: `TASKS`
- **Objectives**: `SPARSITY_OBJECTIVES`, `MEMORY_OBJECTIVES`
- **Builders**: `BUILDER_NAMES`
- **Search/runtime**: samples, timeouts, context limits, output dirs
2. Run the optimization:

```bash
export SPARSE_ATTENTION_WEIGHTS_DIR=/path/to/your/hashattention/weights
python3 benchmark/raytune/run_optimize_configs.py
```

The directory should contain the HashAttention weight files for your models (e.g., `llama3.1-8b-patch.64K.v1.hat_weights.pkl`).
This writes one JSON config per (model, task, builder, objective) into the configured optimal-configs directory.
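
For step 1, a minimal edit might look like the sketch below; the values are illustrative and simply mirror options already defined in `OPTIMIZATION_EXPERIMENT.py`.

```python
# OPTIMIZATION_EXPERIMENT.py (illustrative excerpt)
MODELS = ["llama3.1-8b"]                  # keys into MODEL_CONFIGS
TASKS = ["ruler32k/qa_1", "ruler32k/vt"]  # benchmark tasks to optimize for
SPARSITY_OBJECTIVES = [10]                # target sparsity objectives
MEMORY_OBJECTIVES = [64]                  # memory objectives, where applicable
BUILDER_NAMES = ["vattention_pqcache"]    # builders to include in the search
```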

## Quick Start
- **Run benchmarks with optimized configs**

### 1. Optimize Configurations
Find optimal sparse attention configurations for your models:

```bash
python3 benchmark/raytune/run_optimize_configs.py \
--objective sparsity_10 \
--optimal-configs-dir <base_dir> \
--num-samples 1 \
--search-max-new-tokens 5 \
--search-max-context-length 32678 \
--search-max-requests 2 \
--actors-per-gpu 1
```

### 2. Run Benchmarks
Execute benchmarks using the optimized configurations:
Use the config directory produced above with `run_config_dir.py`:

```bash
python3 benchmark/raytune/run_config_dir.py \
--configs-dir <base_dir/config_dir> \
--configs-dir /path/to/optimal/configs \
--max-new-tokens 100 \
--max-context-length 32678 \
--max-requests 2 \
--actors-per-gpu 1 \
--benchmark-results-dir ./test_bench.1/
--benchmark-results-dir ./bench_results/
```

## Workflow

### Phase 1: Configuration Optimization
Use `run_optimize_configs.py` to search for optimal sparse attention parameters:

**Configuration Sources:**
- **Models**: Defined in `get_run_configuration()` function
- **Tasks**: Specified in the configuration
- **Sparse Configs**: Two types handled:
- `to_optimize_configs`: Configurations that need hyperparameter search
- `optimal_configs`: Pre-optimized configurations (used as-is)
- **Search Spaces**: Each config type can have its own search space defined separately. Example:

```python
# Create a ResearchAttentionConfig with custom search spaces
config = ResearchAttentionConfig(masker_configs=[
SinkMaskerConfig(sink_size=128),
LocalMaskerConfig(window_size=128),
OracleTopKConfig(heavy_size=0.10),
AdaptiveSamplingMaskerConfig(
base_rate_sampling=0.1,
epsilon=0.25,
delta=0.25,
init_offset=128,
local_offset=128
)
])

# Define search spaces for specific maskers
config.masker_configs[2].search_space = {
"heavy_size": tune.grid_search([0.01, 0.05, 0.1, 0.2])
}
config.masker_configs[3].search_space = {
"base_rate_sampling": tune.grid_search([0.01, 0.02, 0.05]),
"epsilon": tune.grid_search([0.05, 0.1, 0.2]),
"delta": tune.grid_search([0.05, 0.1, 0.2])
}
```

**Output**: Optimal configurations are written to the `<base_dir>/run_<timestamp>/` directory, with one JSON file per model-task-config combination.

### Phase 2: Benchmark Execution
Use `run_config_dir.py` to run full benchmarks with the found configurations:

**Input**: Pass the config directory (e.g., `<base_dir>/run_<timestamp>/`) containing all the JSON configuration files generated in Phase 1.

**Output**: Benchmark results saved to the specified `--benchmark-results-dir`.

## Features

- **Distributed Execution**: Ray-based parallel processing across multiple GPUs
- **Automatic Resource Management**: Efficient GPU utilization and task scheduling
- **Sparse Attention Support**: Multiple masker types and configurations
- **Comprehensive Metrics**: Detailed performance and accuracy measurements
### 2. Implementation of optimization

- **Config builders**: For each sparse attention method, a config builder constructs a `ResearchAttentionConfig` (masker stack, defaults, and metadata) for a given model/task/objective.
- **Search spaces**: Builders attach Ray Tune search spaces (e.g. `config.masker_configs[i].search_space`) to selected hyperparameters; `run_optimize_configs.py` passes these to Ray.
- **Validity checker**: Each builder defines a small validity checker that rejects invalid hyperparameter combinations early, so trials can be skipped before the benchmark runs (a minimal builder sketch follows the flow diagram below).

High-level flow:

```text
(model, task, objectives, builder name)
                   │
                   ▼
            Config builder
        ┌──────────┴──────────────────────────┐
        │                                     │
        ▼                                     ▼
ResearchAttentionConfig         Ray Tune search_space attached

Ray Tune iterates over configs ──► validity checker
                                        │
                                        ├─ valid ──► run benchmark trial
                                        └─ invalid ──► skip early (no trial)
```
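
To make these pieces concrete, here is a minimal, hypothetical builder sketch. The class name and `build` signature are assumptions for illustration; the real interface is defined by the builders in `benchmark/raytune/config_builders/`, and the masker config classes are the project's own (imports omitted, as in the example above).

```python
from functools import partial

from ray import tune

# register_builder, ResearchAttentionConfig, and the masker config classes come
# from this repository; their import paths are omitted in this sketch.


@register_builder("my_new_builder")  # hypothetical builder name
class MyNewBuilder:
    def build(self, model_name, task, sparsity_objective, memory_objective=None):
        # 1. Compose the masker stack with default parameters.
        config = ResearchAttentionConfig(masker_configs=[
            SinkMaskerConfig(sink_size=128),
            LocalMaskerConfig(window_size=128),
            OracleTopKConfig(heavy_size=0.10),
        ])

        # 2. Attach Ray Tune search spaces to the hyperparameters being optimized.
        config.masker_configs[2].search_space = {
            "heavy_size": tune.grid_search([0.01, 0.05, 0.1, 0.2]),
        }

        # 3. Attach a validity constraint so bad combinations are skipped early
        #    (a sketch of such a checker appears at the end of section 3).
        config.validity_constraint = partial(
            _validity_check, sparsity_val=sparsity_objective
        )
        return config
```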

### 3. Adding a new builder

- **Create a builder**: Copy an existing builder from `benchmark/raytune/config_builders/`, rename it, and adapt:
- masker composition and default parameters
- Ray Tune search spaces on the relevant hyperparameters
- the validity checker logic for early exit on bad configs
- **Wire it up**:
- Register the new builder name wherever builders are dispatched (e.g. builder registry/factory).
- Add the new name to `BUILDER_NAMES` in `OPTIMIZATION_EXPERIMENT.py` so it is included in optimization and benchmarking.

**Example sketch (`vattention_pqcache`)** in `config_builders/vattention_pqcache.py` (see the file for full details):

- **1. Builder name**:

- Decorator: `@register_builder("vattention_pqcache")`
- Class: `VAttentionPQCacheConfigBuilder`

- **2. Search space**:

- Base search space defined on the PQCache masker:

```python
config.masker_configs[2].search_space = {
"pq_group_factor": tune.grid_search([2, 4]),
"pq_bits": tune.grid_search([4, 8]),
"kmeans_iter": tune.grid_search([10]),
"metric": tune.grid_search(["euclidean"]),
}
```

- Plus sparsity-dependent grids on the PQCache and AdaptiveSampling maskers (e.g. `config.masker_configs[2].search_space["heavy_size"] = ...` and `config.masker_configs[3].search_space = {...}` inside the `if sparsity_objective == ...` blocks); an illustrative sketch follows.
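
- For illustration, one such block might look like the following sketch (the grid values are examples, not the ones in the file):

```python
# Illustrative sketch -- the real sparsity-dependent grids live in
# config_builders/vattention_pqcache.py.
if sparsity_objective == 10:
    config.masker_configs[2].search_space["heavy_size"] = tune.grid_search([0.05, 0.1])
    config.masker_configs[3].search_space = {
        "base_rate_sampling": tune.grid_search([0.01, 0.02, 0.05]),
        "epsilon": tune.grid_search([0.1, 0.2]),
        "delta": tune.grid_search([0.1, 0.2]),
    }
```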

- **3. Validity checker**:

- Function: `_validity_check(config, sparsity_val)` at the top of the file.
- Attached to the config with:

```python
config.validity_constraint = partial(_validity_check, sparsity_val=sparsity_val)
```
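
As an illustration, the checker might look like the sketch below; the conditions shown are assumed examples, not the actual logic in `config_builders/vattention_pqcache.py`.

```python
def _validity_check(config, sparsity_val):
    """Reject hyperparameter combinations that can never meet the objective.

    Illustrative sketch only -- see config_builders/vattention_pqcache.py for
    the real conditions.
    """
    pqcache = config.masker_configs[2]    # PQCache masker config
    sampling = config.masker_configs[3]   # AdaptiveSampling masker config

    # Example: the fixed heavy budget alone must not exceed the sparsity target
    # (objectives such as 2, 5, 10, 20 are interpreted here as percentages).
    if pqcache.heavy_size > sparsity_val / 100.0:
        return False

    # Example: the base sampling rate must leave room under the same budget.
    if sampling.base_rate_sampling >= sparsity_val / 100.0:
        return False

    return True
```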
