123 changes: 123 additions & 0 deletions benchmark/raytune/OPTIMIZATION_EXPERIMENT.py
@@ -0,0 +1,123 @@
"""Run configuration for hyperparameter search.

All configuration parameters for the hyperparameter search are defined here.
Modify this file to change search behavior without editing the main script.
"""

import os
from typing import Dict, List, Optional

# Model configurations
# Point the directories below at your HashAttention weight files and
# DoubleSparsity config files.

HASHATTENTION_WEIGHTS_DIR: str = "/data/apdesai/code/HashAttention-1.0/artifacts"
DOUBLE_SPARSITY_CONFIG_DIR: str = "/data/apdesai/code/DoubleSparse/config"
hashattention_dir: str = HASHATTENTION_WEIGHTS_DIR
doublesparsity_config_dir: str = DOUBLE_SPARSITY_CONFIG_DIR


MODEL_CONFIGS: Dict[str, Dict[str, str]] = {
"llama3.1-8b": {
"model_name": "meta-llama/Llama-3.1-8B-Instruct",
"hash_attention_weight_file": os.path.join(hashattention_dir, "llama3.1-8b-patch.64K.v1.hat_weights.pkl"),
"double_sparsity_config_file": os.path.join(doublesparsity_config_dir, "meta-llama/Llama-3.1-8B-Instruct.json"),
},
"llama3.2-1b": {
"model_name": "meta-llama/Llama-3.2-1B-Instruct",
"hash_attention_weight_file": os.path.join(hashattention_dir, "DNE.pkl"),
"double_sparsity_config_file": os.path.join(doublesparsity_config_dir, "meta-llama/Llama-3.2-1B-Instruct.json"),
},
"llama3.2-3b": {
"model_name": "meta-llama/Llama-3.2-3B-Instruct",
"hash_attention_weight_file": os.path.join(hashattention_dir, "DNE.pkl"),
"double_sparsity_config_file": os.path.join(doublesparsity_config_dir, "meta-llama/Llama-3.2-3B-Instruct.json"),
},
"deepseek": {
"model_name": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
"hash_attention_weight_file": os.path.join(hashattention_dir, "DeepSeek-R1-Distill-Llama-8B-patch-layers2-dim64-max-context-24K_hat_weights.pkl"),
},
"mistral": {
"model_name": "mistralai/Mistral-7B-Instruct-v0.3",
"hash_attention_weight_file": os.path.join(hashattention_dir, "Mistral-7B-Instruct-v0.3.24K.20.500.hat_weights.pkl"),
},
"qwen3-30b-moe": {
"model_name": "Qwen/Qwen3-30B-A3B-Instruct-2507",
"hash_attention_weight_file": os.path.join(hashattention_dir, "DNE.pkl"),
"double_sparsity_config_file": os.path.join(doublesparsity_config_dir, "Qwen/Qwen3-30B-A3B-Instruct-2507.json"),
},
"qwen3-4b": {
"model_name": "Qwen/Qwen3-4B-Instruct-2507",
"hash_attention_weight_file": os.path.join(hashattention_dir, "DNE.pkl"),
"double_sparsity_config_file": os.path.join(doublesparsity_config_dir, "Qwen/Qwen3-4B-Instruct-2507.json"),
},
}

MODELS: List[str] = [
"llama3.1-8b",
"llama3.2-1b",
"llama3.2-3b",
"qwen3-4b",
"qwen3-30b-moe",
]

TASKS: List[str] = [
"ruler32k/vt",
"ruler32k/qa_1",
"ruler32k/qa_2",
"ruler32k/fwe",
"ruler32k/niah_multikey_2",
"ruler32k/niah_multikey_3",
]

SPARSITY_OBJECTIVES: List[int] = [
2,
5,
10,
20,
]

MEMORY_OBJECTIVES: List[Optional[int]] = [
32,
64,
128,
] # Memory objective parameter (e.g., "memory_32") for configs that need it

BUILDER_NAMES: List[str] = [
# "dense",
# "double_sparsity",
# "hashattention_topk",
# "magicpig",
# "oracle_topk",
# "oracle_topp",
# "quest_topk",
# "vattention_hashattention",
# "vattention_oracle",
# "pqcache",
"vattention_pqcache",
] # Specify which builders to use (e.g., ["magicpig"], ["dense"], ["double_sparsity"])


# SEARCH PARAMS
NUM_SAMPLES: int = 1 # Number of samples per hyperparameter search
SEARCH_MAX_NEW_TOKENS: int = 3 # Max new tokens for search trials
SEARCH_MAX_CONTEXT_LENGTH: int = 40000 # Max context length for search trials
SEARCH_MAX_REQUESTS: int = 3 # Max requests per search trial
OPTIMAL_CONFIGS_DIR: str = "/data/apdesai/code/DO_NOT_DELETE/vattention_pqcache_optimization/" # Directory for storing optimal configurations
RAY_RESULTS_DIR: str = "/tmp/ray_results" # Directory for Ray Tune results
SEARCH_TIMEOUT: int = 900 # Timeout per search trial in seconds
ACTORS_PER_GPU: int = 1 # Number of actors per GPU for resource allocation


""" DRY RUN
if true , it will do everything except the actual running of benchmark helper -- it will just return
randomly generated scores for each trial and choose based on that
"""
DRY_RUN: bool = False


""" If you use Time stamp then by default it will perform entire search again.
"""
USE_TIMESTAMP_FOR_RESULTS_DIR: bool = False
FORCE_SEARCH: bool = False # Force re-run of search even if configs exist

163 changes: 83 additions & 80 deletions benchmark/raytune/README.md
@@ -1,99 +1,102 @@
# Ray Tune Benchmark Suite
## Ray Tune Benchmark Suite

A distributed benchmark suite for sparse attention configurations using Ray for parallel execution.
Distributed benchmark suite for sparse attention configurations using Ray.

## Setup
### 1. Quick Start (Run existing builders on new models / settings / objectives)

### Environment Variables
- **Optimize configs**

For HashAttention configurations, set the weights directory:
1. Edit `benchmark/raytune/OPTIMIZATION_EXPERIMENT.py` to choose (a minimal example edit is shown below):
- **Models**: `MODEL_CONFIGS`, `MODELS`
- **Tasks**: `TASKS`
- **Objectives**: `SPARSITY_OBJECTIVES`, `MEMORY_OBJECTIVES`
- **Builders**: `BUILDER_NAMES`
- **Search/runtime**: samples, timeouts, context limits, output dirs
2. Run the optimization:

```bash
export SPARSE_ATTENTION_WEIGHTS_DIR=/path/to/your/hashattention/weights
python3 benchmark/raytune/run_optimize_configs.py
```

The directory should contain the HashAttention weight files for your models (e.g., `llama3.1-8b-patch.64K.v1.hat_weights.pkl`).
This writes one JSON config per (model, task, builder, objective) into the configured optimal-configs directory.
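
For step 1, a minimal edit might look like the sketch below; the values are illustrative and simply mirror options already defined in `OPTIMIZATION_EXPERIMENT.py`.

```python
# OPTIMIZATION_EXPERIMENT.py (illustrative excerpt)
MODELS = ["llama3.1-8b"]                  # keys into MODEL_CONFIGS
TASKS = ["ruler32k/qa_1", "ruler32k/vt"]  # benchmark tasks to optimize for
SPARSITY_OBJECTIVES = [10]                # target sparsity objectives
MEMORY_OBJECTIVES = [64]                  # memory objectives, where applicable
BUILDER_NAMES = ["vattention_pqcache"]    # builders to include in the search
```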

## Quick Start
- **Run benchmarks with optimized configs**

### 1. Optimize Configurations
Find optimal sparse attention configurations for your models:

```bash
python3 benchmark/raytune/run_optimize_configs.py \
--objective sparsity_10 \
--optimal-configs-dir <base_dir> \
--num-samples 1 \
--search-max-new-tokens 5 \
--search-max-context-length 32678 \
--search-max-requests 2 \
--actors-per-gpu 1
```

### 2. Run Benchmarks
Execute benchmarks using the optimized configurations:
Use the config directory produced above with `run_config_dir.py`:

```bash
python3 benchmark/raytune/run_config_dir.py \
--configs-dir <base_dir/config_dir> \
--configs-dir /path/to/optimal/configs \
--max-new-tokens 100 \
--max-context-length 32678 \
--max-requests 2 \
--actors-per-gpu 1 \
--benchmark-results-dir ./test_bench.1/
--benchmark-results-dir ./bench_results/
```

## Workflow

### Phase 1: Configuration Optimization
Use `run_optimize_configs.py` to search for optimal sparse attention parameters:

**Configuration Sources:**
- **Models**: Defined in `get_run_configuration()` function
- **Tasks**: Specified in the configuration
- **Sparse Configs**: Two types handled:
- `to_optimize_configs`: Configurations that need hyperparameter search
- `optimal_configs`: Pre-optimized configurations (used as-is)
- **Search Spaces**: Each config type can have its own search space defined separately. Example:

```python
# Create a ResearchAttentionConfig with custom search spaces
config = ResearchAttentionConfig(masker_configs=[
SinkMaskerConfig(sink_size=128),
LocalMaskerConfig(window_size=128),
OracleTopKConfig(heavy_size=0.10),
AdaptiveSamplingMaskerConfig(
base_rate_sampling=0.1,
epsilon=0.25,
delta=0.25,
init_offset=128,
local_offset=128
)
])

# Define search spaces for specific maskers
config.masker_configs[2].search_space = {
"heavy_size": tune.grid_search([0.01, 0.05, 0.1, 0.2])
}
config.masker_configs[3].search_space = {
"base_rate_sampling": tune.grid_search([0.01, 0.02, 0.05]),
"epsilon": tune.grid_search([0.05, 0.1, 0.2]),
"delta": tune.grid_search([0.05, 0.1, 0.2])
}
```

**Output**: Optimal configurations are written to the `<base_dir>/run_<timestamp>/` directory, with one JSON file per model-task-config combination.

### Phase 2: Benchmark Execution
Use `run_config_dir.py` to run full benchmarks with the found configurations:

**Input**: Pass the config directory (e.g., `<base_dir>/run_<timestamp>/`) containing all the JSON configuration files generated in Phase 1.

**Output**: Benchmark results saved to the specified `--benchmark-results-dir`.

## Features

- **Distributed Execution**: Ray-based parallel processing across multiple GPUs
- **Automatic Resource Management**: Efficient GPU utilization and task scheduling
- **Sparse Attention Support**: Multiple masker types and configurations
- **Comprehensive Metrics**: Detailed performance and accuracy measurements
### 2. Implementation of optimization

- **Config builders**: For each sparse attention method, a config builder constructs a `ResearchAttentionConfig` (masker stack, defaults, and metadata) for a given model/task/objective.
- **Search spaces**: Builders attach Ray Tune search spaces (e.g. `config.masker_configs[i].search_space`) to selected hyperparameters; `run_optimize_configs.py` passes these to Ray.
- **Validity checker**: Each builder defines a small validity checker that rejects invalid hyperparameter combinations early, so trials can be skipped before the benchmark runs (a minimal builder sketch follows the flow diagram below).

High-level flow:

```text
(model, task, objectives, builder name)
                   │
                   ▼
            Config builder
        ┌──────────┴──────────────────────────┐
        │                                     │
        ▼                                     ▼
ResearchAttentionConfig         Ray Tune search_space attached

Ray Tune iterates over configs ──► validity checker
                                        │
                                        ├─ valid ──► run benchmark trial
                                        └─ invalid ──► skip early (no trial)
```
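
To make these pieces concrete, here is a minimal, hypothetical builder sketch. The class name and `build` signature are assumptions for illustration; the real interface is defined by the builders in `benchmark/raytune/config_builders/`, and the masker config classes are the project's own (imports omitted, as in the example above).

```python
from functools import partial

from ray import tune

# register_builder, ResearchAttentionConfig, and the masker config classes come
# from this repository; their import paths are omitted in this sketch.


@register_builder("my_new_builder")  # hypothetical builder name
class MyNewBuilder:
    def build(self, model_name, task, sparsity_objective, memory_objective=None):
        # 1. Compose the masker stack with default parameters.
        config = ResearchAttentionConfig(masker_configs=[
            SinkMaskerConfig(sink_size=128),
            LocalMaskerConfig(window_size=128),
            OracleTopKConfig(heavy_size=0.10),
        ])

        # 2. Attach Ray Tune search spaces to the hyperparameters being optimized.
        config.masker_configs[2].search_space = {
            "heavy_size": tune.grid_search([0.01, 0.05, 0.1, 0.2]),
        }

        # 3. Attach a validity constraint so bad combinations are skipped early
        #    (a sketch of such a checker appears at the end of section 3).
        config.validity_constraint = partial(
            _validity_check, sparsity_val=sparsity_objective
        )
        return config
```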

### 3. Adding a new builder

- **Create a builder**: Copy an existing builder from `benchmark/raytune/config_builders/`, rename it, and adapt:
- masker composition and default parameters
- Ray Tune search spaces on the relevant hyperparameters
- the validity checker logic for early exit on bad configs
- **Wire it up**:
- Register the new builder name wherever builders are dispatched (e.g. builder registry/factory).
- Add the new name to `BUILDER_NAMES` in `OPTIMIZATION_EXPERIMENT.py` so it is included in optimization and benchmarking.

**Example sketch (`vattention_pqcache`)** in `config_builders/vattention_pqcache.py` (see the file for full details):

- **1. Builder name**:

- Decorator: `@register_builder("vattention_pqcache")`
- Class: `VAttentionPQCacheConfigBuilder`

- **2. Search space**:

- Base search space defined on the PQCache masker:

```python
config.masker_configs[2].search_space = {
"pq_group_factor": tune.grid_search([2, 4]),
"pq_bits": tune.grid_search([4, 8]),
"kmeans_iter": tune.grid_search([10]),
"metric": tune.grid_search(["euclidean"]),
}
```

- Plus sparsity-dependent grids on the PQCache and AdaptiveSampling maskers (e.g. `config.masker_configs[2].search_space["heavy_size"] = ...` and `config.masker_configs[3].search_space = {...}` inside the `if sparsity_objective == ...` blocks); an illustrative sketch follows.
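
- For illustration, one such block might look like the following sketch (the grid values are examples, not the ones in the file):

```python
# Illustrative sketch -- the real sparsity-dependent grids live in
# config_builders/vattention_pqcache.py.
if sparsity_objective == 10:
    config.masker_configs[2].search_space["heavy_size"] = tune.grid_search([0.05, 0.1])
    config.masker_configs[3].search_space = {
        "base_rate_sampling": tune.grid_search([0.01, 0.02, 0.05]),
        "epsilon": tune.grid_search([0.1, 0.2]),
        "delta": tune.grid_search([0.1, 0.2]),
    }
```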

- **3. Validity checker**:

- Function: `_validity_check(config, sparsity_val)` at the top of the file.
- Attached to the config with:

```python
config.validity_constraint = partial(_validity_check, sparsity_val=sparsity_val)
```
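
As an illustration, the checker might look like the sketch below; the conditions shown are assumed examples, not the actual logic in `config_builders/vattention_pqcache.py`.

```python
def _validity_check(config, sparsity_val):
    """Reject hyperparameter combinations that can never meet the objective.

    Illustrative sketch only -- see config_builders/vattention_pqcache.py for
    the real conditions.
    """
    pqcache = config.masker_configs[2]    # PQCache masker config
    sampling = config.masker_configs[3]   # AdaptiveSampling masker config

    # Example: the fixed heavy budget alone must not exceed the sparsity target
    # (objectives such as 2, 5, 10, 20 are interpreted here as percentages).
    if pqcache.heavy_size > sparsity_val / 100.0:
        return False

    # Example: the base sampling rate must leave room under the same budget.
    if sampling.base_rate_sampling >= sparsity_val / 100.0:
        return False

    return True
```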
