11 changes: 10 additions & 1 deletion benchmark/executor.py
@@ -217,12 +217,21 @@ def _benchmark_worker(
 
     # Import here to avoid issues with multiprocessing
     from sparse_attention_hub.adapters.huggingface import ModelAdapterHF
+    from sparse_attention_hub.sparse_attention.research_attention import ResearchAttentionConfig
 
+    # Extract recovery settings if available
+    recovery_kwargs = {}
+    if isinstance(stub.sparse_attention_config, ResearchAttentionConfig):
+        recovery_kwargs['recovery_enabled'] = stub.sparse_attention_config.recovery_enabled
+        recovery_kwargs['recovery_interval'] = stub.sparse_attention_config.recovery_interval
+        recovery_kwargs['recovery_dense_attention'] = stub.sparse_attention_config.recovery_dense_attention
+
     adapter = ModelAdapterHF(
         model_name=stub.model_name,
         sparse_attention_config=stub.sparse_attention_config,
         model_kwargs=stub.adapter_config.model_kwargs,
-        tokenizer_kwargs=stub.adapter_config.tokenizer_kwargs
+        tokenizer_kwargs=stub.adapter_config.tokenizer_kwargs,
+        **recovery_kwargs
     )
 
     # Create benchmark instance
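The hunk above forwards recovery settings to `ModelAdapterHF` only when the stub's config is a `ResearchAttentionConfig`; for the dense baseline (`sparse_attention_config=None`) the `isinstance` check fails, `recovery_kwargs` stays empty, and the adapter keeps its defaults. A minimal sketch of this isinstance-gated forwarding, using a hypothetical stand-in class rather than the real config:

```python
from dataclasses import dataclass
from typing import Any, Dict, Optional

@dataclass
class RecoveryConfig:  # hypothetical stand-in for ResearchAttentionConfig
    recovery_enabled: bool = False
    recovery_interval: int = 32000
    recovery_dense_attention: bool = True

def extract_recovery_kwargs(config: Optional[Any]) -> Dict[str, Any]:
    """Collect recovery fields only from configs that actually carry them."""
    kwargs: Dict[str, Any] = {}
    if isinstance(config, RecoveryConfig):
        kwargs["recovery_enabled"] = config.recovery_enabled
        kwargs["recovery_interval"] = config.recovery_interval
        kwargs["recovery_dense_attention"] = config.recovery_dense_attention
    return kwargs

print(extract_recovery_kwargs(None))              # {} -> dense baseline, adapter defaults apply
print(extract_recovery_kwargs(RecoveryConfig()))  # all three recovery settings forwarded
```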
75 changes: 45 additions & 30 deletions benchmark/scripts/benchmark.py
@@ -15,41 +15,63 @@
 from pathlib import Path
 
 # Add the project root to the path
-sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
 
 from benchmark.executor import BenchmarkExecutor
 from benchmark.executor_config import BenchmarkConfig, AdapterConfig
 from sparse_attention_hub.sparse_attention.research_attention import ResearchAttentionConfig
 from sparse_attention_hub.sparse_attention.research_attention.maskers.fixed.implementations import (
     LocalMaskerConfig, SinkMaskerConfig
 )
+from sparse_attention_hub.sparse_attention import (
+    ChannelConfig,
+    HashAttentionTopKMaskerConfig,
+)
+
+from sparse_attention_hub.sparse_attention.research_attention.maskers.sampling.implementations import (
+    AdaptiveSamplingMaskerConfig
+)
+
+from sparse_attention_hub.sparse_attention.research_attention.maskers.fixed.implementations import (
+    OracleTopKConfig
+)
 # ============================================================================
 # CONFIGURATION
 # ============================================================================
 
 # GPU Configuration
-GPUS = [0,2,7]  # Use all available GPUs
-MAX_CONCURRENT_RUNS = 3  # One per GPU
+GPUS = [0]  # GPU device IDs to use
+MAX_CONCURRENT_RUNS = 1  # One per GPU
 
 # Model List
 MODELS = [
-    "microsoft/Phi-4-mini-instruct",
-    "meta-llama/Llama-3.2-1B-Instruct",
     "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
 ]
 
+usa_weight_file = "/workspace/HashAttention-1.0/artifacts/DeepSeek-R1-Distill-Llama-8B-patch-layers2-dim64-max-context-24K.pt"
+weight_file = "/workspace/HashAttention-1.0/artifacts/DeepSeek-R1-Distill-Llama-8B-patch-layers2-dim64-max-context-24K.hat_weights.pkl"
+
+#from sparse_attention_hub.sparse_attention.utils.hashattention_utils import create_hat_weights_file_from_usa
+#create_hat_weights_file_from_usa(usa_weight_file, weight_file, num_layers=32, num_heads=32, device="cpu")
+
 # Sparse Attention Configurations
+
 SPARSE_CONFIGS = [
-    # Dense baseline (no sparse attention)
-    ("dense", None),
-
-    # StreamingLLM configurations
-    ("streaming_conservative", ResearchAttentionConfig(masker_configs=[
-        SinkMaskerConfig(sink_size=4),
-        LocalMaskerConfig(window_size=16)
-    ])),
+    #("dense", None),
+    ("test_oracle_topk_norecovery_10pct_r1", ResearchAttentionConfig(
+        masker_configs=[
+            SinkMaskerConfig(sink_size=128),
+            LocalMaskerConfig(window_size=128),
+            OracleTopKConfig(heavy_size=0.1),
+        ],
+        recovery_enabled=False,
+        recovery_interval=32000,
+    ))
 ]
+
+
+
 
 # Benchmark List
 # 1. InfiniteBench - using passkey task
 infinite_bench_config = BenchmarkConfig(
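For intuition about the sparsity budget above: the sink + local + oracle-top-k stack attends to roughly `sink_size + window_size + heavy_size * context_length` keys per query, assuming `heavy_size=0.1` is a fraction of the context (as the `10pct` in the config name suggests) and ignoring overlap between the three masks. A quick check at the 32K context used below:

```python
# Approximate per-query attention budget for the sink+local+oracle-top-k stack.
# Assumes the three masks are disjoint, which slightly overestimates coverage.
sink_size = 128
window_size = 128
heavy_fraction = 0.1
context_length = 32768  # matches max_context_length below

attended = sink_size + window_size + heavy_fraction * context_length
print(f"~{attended:.0f} of {context_length} keys "
      f"({attended / context_length:.1%} of full attention)")
# -> ~3533 of 32768 keys (10.8% of full attention)
```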
@@ -107,15 +129,7 @@
 
 # List of all sample configurations
 BENCHMARKS = [
-    infinite_bench_config,
-    ruler_config,
-    loogle_config,
-    zero_scrolls_config,
-    longbenchv2_config,
-    aime2024_config,
-    aime2025_config,
-    longbench_config,
-    mock_benchmark_config
+    aime2024_config
 ]
 
 
@@ -124,6 +138,7 @@
     adapter_name="huggingface",
     model_kwargs={
         "torch_dtype": torch.bfloat16,
+        "attn_implementation": "flash_attention_2",
     },
     tokenizer_kwargs={
         "padding_side": "left",
@@ -132,23 +147,23 @@
 
 # Generation Parameters
 GENERATION_KWARGS = {
-    "max_new_tokens": 50,
-    "do_sample": False,
-    "temperature": 1.0,
-    "top_p": 1.0,
+    "max_new_tokens": 32768,
+    "do_sample": True,
+    "temperature": 0.6,
+    "top_p": 0.95,
     "pad_token_id": None,
 }
 
 # Request Parameters
 REQUEST_KWARGS = {
-    "max_context_length": 256,
-    "max_requests": 2,  # Limit for testing
+    "max_context_length": 32768,
+    "max_requests": 30,  # Limit for testing
 }
 
 # Execution Settings
-RESULT_DIR = "./benchmark_results"
+RESULT_DIR = "./benchmark_results_test.1"
 ENABLE_RESUMABILITY = True
-TIMEOUT_PER_BENCHMARK = 3600.0  # 1 hour
+TIMEOUT_PER_BENCHMARK = 60 * 60 * 24  # 1 day
 
 # ============================================================================
 # MAIN EXECUTION
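The `MAIN EXECUTION` block itself is collapsed in the diff view. Purely as a hypothetical illustration of how the settings above typically come together (the `BenchmarkExecutor` constructor and `run` signatures below are assumptions, not the PR's actual code):

```python
# Hypothetical wiring; the real BenchmarkExecutor API may differ.
executor = BenchmarkExecutor(
    gpu_ids=GPUS,
    max_concurrent_runs=MAX_CONCURRENT_RUNS,
    result_dir=RESULT_DIR,
    enable_resumability=ENABLE_RESUMABILITY,
    timeout_per_benchmark=TIMEOUT_PER_BENCHMARK,
)
executor.run(
    models=MODELS,
    sparse_configs=SPARSE_CONFIGS,
    benchmarks=BENCHMARKS,
    adapter_config=ADAPTER_CONFIG,
    generation_kwargs=GENERATION_KWARGS,
    request_kwargs=REQUEST_KWARGS,
)
```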