11 changes: 10 additions & 1 deletion benchmark/executor.py
@@ -217,12 +217,21 @@ def _benchmark_worker(
 
     # Import here to avoid issues with multiprocessing
     from sparse_attention_hub.adapters.huggingface import ModelAdapterHF
+    from sparse_attention_hub.sparse_attention.research_attention import ResearchAttentionConfig
 
+    # Extract recovery settings if available
+    recovery_kwargs = {}
+    if isinstance(stub.sparse_attention_config, ResearchAttentionConfig):
+        recovery_kwargs['recovery_enabled'] = stub.sparse_attention_config.recovery_enabled
+        recovery_kwargs['recovery_interval'] = stub.sparse_attention_config.recovery_interval
+        recovery_kwargs['recovery_dense_attention'] = stub.sparse_attention_config.recovery_dense_attention
+
     adapter = ModelAdapterHF(
         model_name=stub.model_name,
         sparse_attention_config=stub.sparse_attention_config,
         model_kwargs=stub.adapter_config.model_kwargs,
-        tokenizer_kwargs=stub.adapter_config.tokenizer_kwargs
+        tokenizer_kwargs=stub.adapter_config.tokenizer_kwargs,
+        **recovery_kwargs
     )
 
     # Create benchmark instance
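The hunk above forwards recovery settings to `ModelAdapterHF` only when the stub's config is a `ResearchAttentionConfig`; for the dense baseline (`sparse_attention_config=None`) the `isinstance` check fails, `recovery_kwargs` stays empty, and the adapter keeps its defaults. A minimal sketch of this isinstance-gated forwarding, using a hypothetical stand-in class rather than the real config:

```python
from dataclasses import dataclass
from typing import Any, Dict, Optional

@dataclass
class RecoveryConfig:  # hypothetical stand-in for ResearchAttentionConfig
    recovery_enabled: bool = False
    recovery_interval: int = 32000
    recovery_dense_attention: bool = True

def extract_recovery_kwargs(config: Optional[Any]) -> Dict[str, Any]:
    """Collect recovery fields only from configs that actually carry them."""
    kwargs: Dict[str, Any] = {}
    if isinstance(config, RecoveryConfig):
        kwargs["recovery_enabled"] = config.recovery_enabled
        kwargs["recovery_interval"] = config.recovery_interval
        kwargs["recovery_dense_attention"] = config.recovery_dense_attention
    return kwargs

print(extract_recovery_kwargs(None))              # {} -> dense baseline, adapter defaults apply
print(extract_recovery_kwargs(RecoveryConfig()))  # all three recovery settings forwarded
```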
75 changes: 45 additions & 30 deletions benchmark/scripts/benchmark.py
@@ -15,41 +15,63 @@
 from pathlib import Path
 
 # Add the project root to the path
-sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
 
 from benchmark.executor import BenchmarkExecutor
 from benchmark.executor_config import BenchmarkConfig, AdapterConfig
 from sparse_attention_hub.sparse_attention.research_attention import ResearchAttentionConfig
 from sparse_attention_hub.sparse_attention.research_attention.maskers.fixed.implementations import (
     LocalMaskerConfig, SinkMaskerConfig
 )
+from sparse_attention_hub.sparse_attention import (
+    ChannelConfig,
+    HashAttentionTopKMaskerConfig,
+)
+
+from sparse_attention_hub.sparse_attention.research_attention.maskers.sampling.implementations import (
+    AdaptiveSamplingMaskerConfig
+)
+
+from sparse_attention_hub.sparse_attention.research_attention.maskers.fixed.implementations import (
+    OracleTopKConfig
+)
 # ============================================================================
 # CONFIGURATION
 # ============================================================================
 
 # GPU Configuration
-GPUS = [0,2,7]  # Use all available GPUs
-MAX_CONCURRENT_RUNS = 3  # One per GPU
+GPUS = [0]  # GPU device IDs to use
+MAX_CONCURRENT_RUNS = 1  # One per GPU
 
 # Model List
 MODELS = [
-    "microsoft/Phi-4-mini-instruct",
-    "meta-llama/Llama-3.2-1B-Instruct",
     "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
 ]
 
+usa_weight_file = "/workspace/HashAttention-1.0/artifacts/DeepSeek-R1-Distill-Llama-8B-patch-layers2-dim64-max-context-24K.pt"
+weight_file = "/workspace/HashAttention-1.0/artifacts/DeepSeek-R1-Distill-Llama-8B-patch-layers2-dim64-max-context-24K.hat_weights.pkl"
+
+#from sparse_attention_hub.sparse_attention.utils.hashattention_utils import create_hat_weights_file_from_usa
+#create_hat_weights_file_from_usa(usa_weight_file, weight_file, num_layers=32, num_heads=32, device="cpu")
+
 # Sparse Attention Configurations
+
 SPARSE_CONFIGS = [
-    # Dense baseline (no sparse attention)
-    ("dense", None),
-
-    # StreamingLLM configurations
-    ("streaming_conservative", ResearchAttentionConfig(masker_configs=[
-        SinkMaskerConfig(sink_size=4),
-        LocalMaskerConfig(window_size=16)
-    ])),
+    #("dense", None),
+    ("test_oracle_topk_norecovery_10pct_r1", ResearchAttentionConfig(
+        masker_configs=[
+            SinkMaskerConfig(sink_size=128),
+            LocalMaskerConfig(window_size=128),
+            OracleTopKConfig(heavy_size=0.1),
+        ],
+        recovery_enabled=False,
+        recovery_interval=32000,
+    ))
 ]
+
+
+
 
 # Benchmark List
 # 1. InfiniteBench - using passkey task
 infinite_bench_config = BenchmarkConfig(
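For intuition about the sparsity budget above: the sink + local + oracle-top-k stack attends to roughly `sink_size + window_size + heavy_size * context_length` keys per query, assuming `heavy_size=0.1` is a fraction of the context (as the `10pct` in the config name suggests) and ignoring overlap between the three masks. A quick check at the 32K context used below:

```python
# Approximate per-query attention budget for the sink+local+oracle-top-k stack.
# Assumes the three masks are disjoint, which slightly overestimates coverage.
sink_size = 128
window_size = 128
heavy_fraction = 0.1
context_length = 32768  # matches max_context_length below

attended = sink_size + window_size + heavy_fraction * context_length
print(f"~{attended:.0f} of {context_length} keys "
      f"({attended / context_length:.1%} of full attention)")
# -> ~3533 of 32768 keys (10.8% of full attention)
```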
@@ -107,15 +129,7 @@
 
 # List of all sample configurations
 BENCHMARKS = [
-    infinite_bench_config,
-    ruler_config,
-    loogle_config,
-    zero_scrolls_config,
-    longbenchv2_config,
-    aime2024_config,
-    aime2025_config,
-    longbench_config,
-    mock_benchmark_config
+    aime2024_config
 ]
 
 
@@ -124,6 +138,7 @@
     adapter_name="huggingface",
     model_kwargs={
         "torch_dtype": torch.bfloat16,
+        "attn_implementation": "flash_attention_2",
     },
     tokenizer_kwargs={
         "padding_side": "left",
@@ -132,23 +147,23 @@
 
 # Generation Parameters
 GENERATION_KWARGS = {
-    "max_new_tokens": 50,
-    "do_sample": False,
-    "temperature": 1.0,
-    "top_p": 1.0,
+    "max_new_tokens": 32768,
+    "do_sample": True,
+    "temperature": 0.6,
+    "top_p": 0.95,
     "pad_token_id": None,
 }
 
 # Request Parameters
 REQUEST_KWARGS = {
-    "max_context_length": 256,
-    "max_requests": 2,  # Limit for testing
+    "max_context_length": 32768,
+    "max_requests": 30,  # Limit for testing
 }
 
 # Execution Settings
-RESULT_DIR = "./benchmark_results"
+RESULT_DIR = "./benchmark_results_test.1"
 ENABLE_RESUMABILITY = True
-TIMEOUT_PER_BENCHMARK = 3600.0  # 1 hour
+TIMEOUT_PER_BENCHMARK = 60 * 60 * 24  # 1 day
 
 # ============================================================================
 # MAIN EXECUTION
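The `MAIN EXECUTION` block itself is collapsed in the diff view. Purely as a hypothetical illustration of how the settings above typically come together (the `BenchmarkExecutor` constructor and `run` signatures below are assumptions, not the PR's actual code):

```python
# Hypothetical wiring; the real BenchmarkExecutor API may differ.
executor = BenchmarkExecutor(
    gpu_ids=GPUS,
    max_concurrent_runs=MAX_CONCURRENT_RUNS,
    result_dir=RESULT_DIR,
    enable_resumability=ENABLE_RESUMABILITY,
    timeout_per_benchmark=TIMEOUT_PER_BENCHMARK,
)
executor.run(
    models=MODELS,
    sparse_configs=SPARSE_CONFIGS,
    benchmarks=BENCHMARKS,
    adapter_config=ADAPTER_CONFIG,
    generation_kwargs=GENERATION_KWARGS,
    request_kwargs=REQUEST_KWARGS,
)
```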