From e9cfb6f5192b298741d74a52d35e5ccfc69cd0b4 Mon Sep 17 00:00:00 2001 From: Aditya Desai Date: Tue, 4 Nov 2025 07:24:55 -0800 Subject: [PATCH 1/7] Phase 1: refactor remove the logic of config builders to config_builders/ --- benchmark/raytune/config_builders/__init__.py | 34 +++ benchmark/raytune/config_builders/base.py | 39 ++++ benchmark/raytune/config_builders/dense.py | 39 ++++ .../config_builders/double_sparsity.py | 79 +++++++ benchmark/raytune/config_builders/factory.py | 97 ++++++++ .../config_builders/hashattention_topk.py | 62 +++++ benchmark/raytune/config_builders/magicpig.py | 65 ++++++ .../raytune/config_builders/oracle_topk.py | 55 +++++ .../raytune/config_builders/oracle_topp.py | 60 +++++ .../raytune/config_builders/quest_top_k.py | 79 +++++++ .../config_builders/random_sampling.py | 58 +++++ .../raytune/{ => config_builders}/utility.py | 1 + .../vattention_hashattention.py | 136 +++++++++++ .../config_builders/vattention_oracle.py | 129 +++++++++++ benchmark/raytune/optimizer_factory.py | 5 +- benchmark/raytune/run_config_dir.py | 2 +- benchmark/raytune/run_optimize_configs.py | 217 +++--------------- .../scripts/single_benchmark_model_example.py | 27 +-- 18 files changed, 986 insertions(+), 198 deletions(-) create mode 100644 benchmark/raytune/config_builders/__init__.py create mode 100644 benchmark/raytune/config_builders/base.py create mode 100644 benchmark/raytune/config_builders/dense.py create mode 100644 benchmark/raytune/config_builders/double_sparsity.py create mode 100644 benchmark/raytune/config_builders/factory.py create mode 100644 benchmark/raytune/config_builders/hashattention_topk.py create mode 100644 benchmark/raytune/config_builders/magicpig.py create mode 100644 benchmark/raytune/config_builders/oracle_topk.py create mode 100644 benchmark/raytune/config_builders/oracle_topp.py create mode 100644 benchmark/raytune/config_builders/quest_top_k.py create mode 100644 benchmark/raytune/config_builders/random_sampling.py rename 
benchmark/raytune/{ => config_builders}/utility.py (99%) create mode 100644 benchmark/raytune/config_builders/vattention_hashattention.py create mode 100644 benchmark/raytune/config_builders/vattention_oracle.py diff --git a/benchmark/raytune/config_builders/__init__.py b/benchmark/raytune/config_builders/__init__.py new file mode 100644 index 00000000..f1910409 --- /dev/null +++ b/benchmark/raytune/config_builders/__init__.py @@ -0,0 +1,34 @@ +"""Configuration builders for sparse attention configs.""" + +from .base import BaseConfigBuilder +from .factory import get_config_builder, get_all_config_builders, register_builder + +# Import builders to trigger registration via decorators +from .dense import DenseConfigBuilder # noqa: E402, F401 +from .double_sparsity import DoubleSparsityConfigBuilder # noqa: E402, F401 +from .vattention_oracle import VAttentionOracleConfigBuilder # noqa: E402, F401 +from .vattention_hashattention import VAttentionHashAttentionConfigBuilder # noqa: E402, F401 +from .oracle_topk import OracleTopKConfigBuilder # noqa: E402, F401 +from .oracle_topp import OracleTopPConfigBuilder # noqa: E402, F401 +from .hashattention_topk import HashAttentionTopKConfigBuilder # noqa: E402, F401 +from .magicpig import MagicPigConfigBuilder # noqa: E402, F401 +from .quest_top_k import QuestTopKConfigBuilder # noqa: E402, F401 +from .random_sampling import RandomSamplingConfigBuilder # noqa: E402, F401 + +__all__ = [ + "BaseConfigBuilder", + "DenseConfigBuilder", + "DoubleSparsityConfigBuilder", + "VAttentionOracleConfigBuilder", + "VAttentionHashAttentionConfigBuilder", + "OracleTopKConfigBuilder", + "OracleTopPConfigBuilder", + "HashAttentionTopKConfigBuilder", + "MagicPigConfigBuilder", + "QuestTopKConfigBuilder", + "RandomSamplingConfigBuilder", + "get_config_builder", + "get_all_config_builders", + "register_builder", +] + diff --git a/benchmark/raytune/config_builders/base.py b/benchmark/raytune/config_builders/base.py new file mode 100644 index 
00000000..276ca1ac --- /dev/null +++ b/benchmark/raytune/config_builders/base.py @@ -0,0 +1,39 @@ +"""Base class for configuration builders.""" + +from abc import ABC, abstractmethod +from typing import List, Optional, Tuple + +from sparse_attention_hub.sparse_attention.research_attention import ResearchAttentionConfig + + +class BaseConfigBuilder(ABC): + """Abstract base class for building sparse attention configurations. + + Each builder is responsible for creating configurations for a specific + sparse attention method or combination of methods. + """ + + @abstractmethod + def build_configs( + self, + weight_file: Optional[str] = None, + objective: str = "default", + **kwargs + ) -> Tuple[List[Tuple[str, Optional[ResearchAttentionConfig], Optional[List]]], + List[Tuple[str, Optional[ResearchAttentionConfig], Optional[List]]]]: + """Build sparse attention configurations. + + Args: + weight_file: Path to weight file (required for some configs) + objective: Objective function name (e.g., "sparsity_5", "default") + **kwargs: Additional parameters specific to the builder + + Returns: + Tuple of (optimal_configs, to_optimize_configs) where each is a list + of (name, full_config, masker_classes) tuples. 
+ + - optimal_configs: Configs that don't need hyperparameter search + - to_optimize_configs: Configs that need Ray Tune optimization + """ + pass + diff --git a/benchmark/raytune/config_builders/dense.py b/benchmark/raytune/config_builders/dense.py new file mode 100644 index 00000000..88cbf003 --- /dev/null +++ b/benchmark/raytune/config_builders/dense.py @@ -0,0 +1,39 @@ +"""Configuration builder for dense (no sparse attention) model.""" + +from typing import List, Optional, Tuple + +from sparse_attention_hub.sparse_attention.research_attention import ResearchAttentionConfig + +from .base import BaseConfigBuilder +from .factory import register_builder + + +@register_builder("dense") +class DenseConfigBuilder(BaseConfigBuilder): + """Builder for dense (no sparse attention) configurations.""" + + def build_configs( + self, + weight_file: Optional[str] = None, + objective: str = "default", + **kwargs + ) -> Tuple[List[Tuple[str, Optional[ResearchAttentionConfig], Optional[List]]], + List[Tuple[str, Optional[ResearchAttentionConfig], Optional[List]]]]: + """Get dense baseline configuration. + + Returns list of (name, full_config, masker_classes) tuples. + + For dense models, sparse_config and masker_classes are None to indicate + no sparse attention is used. 
+ + Returns: + Tuple of (optimal_configs, to_optimize_configs) + """ + optimal_configs: List[Tuple[str, Optional[ResearchAttentionConfig], Optional[List]]] = [] + to_optimize_configs: List[Tuple[str, Optional[ResearchAttentionConfig], Optional[List]]] = [] + + # Dense baseline: no sparse attention, so sparse_config and masker_classes are None + optimal_configs.append(("dense", None, None)) + + return optimal_configs, to_optimize_configs + diff --git a/benchmark/raytune/config_builders/double_sparsity.py b/benchmark/raytune/config_builders/double_sparsity.py new file mode 100644 index 00000000..c6bb84ae --- /dev/null +++ b/benchmark/raytune/config_builders/double_sparsity.py @@ -0,0 +1,79 @@ +"""Configuration builder for DoubleSparsity attention.""" + +from typing import List, Optional, Tuple + +from ray import tune + +from sparse_attention_hub.sparse_attention.research_attention import ResearchAttentionConfig +from sparse_attention_hub.sparse_attention.research_attention.maskers.fixed.implementations import ( + DoubleSparsityTopKMaskerConfig, + LocalMaskerConfig, + SinkMaskerConfig, +) + +from .base import BaseConfigBuilder +from .factory import register_builder +from .utility import get_masker_list_name + + +@register_builder("double_sparsity") +class DoubleSparsityConfigBuilder(BaseConfigBuilder): + """Builder for DoubleSparsity sparse attention configurations.""" + + def build_configs( + self, + weight_file: Optional[str] = None, + objective: str = "default", + memory_objective: Optional[str] = None, + **kwargs + ) -> Tuple[List[Tuple[str, Optional[ResearchAttentionConfig], Optional[List]]], + List[Tuple[str, Optional[ResearchAttentionConfig], Optional[List]]]]: + """Get all double sparsity attention configurations. + + Returns list of (name, full_config, masker_classes) tuples. + + Note: The configs returned here are only used to determine which masker classes + to use. The actual parameter values will be determined by Ray Tune search. 
+ + Args: + weight_file: Path to weight file (required but not used for DoubleSparsity) + objective: Objective function name (e.g., "sparsity_5") + memory_objective: Memory objective parameter (e.g., "32") - required + **kwargs: Additional parameters + + Returns: + Tuple of (optimal_configs, to_optimize_configs) + """ + assert weight_file is not None, "Weight file is required for HashAttention Masker" + assert memory_objective is not None, "memory_objective is required for get_double_sparsity_configs" + + optimal_configs: List[Tuple[str, Optional[ResearchAttentionConfig], Optional[List]]] = [] + to_optimize_configs: List[Tuple[str, Optional[ResearchAttentionConfig], Optional[List]]] = [] + + heavy_size: float = float(objective.split("_")[1]) / 100.0 - (256.0 / 32768) + aux_mem: int = int(memory_objective) + + classes = [SinkMaskerConfig, LocalMaskerConfig, DoubleSparsityTopKMaskerConfig] + name: str = get_masker_list_name(classes, other_params={"heavy_size": heavy_size, "aux_mem": aux_mem}) + + config = ResearchAttentionConfig(masker_configs=[ + SinkMaskerConfig(sink_size=128), + LocalMaskerConfig(window_size=128), + DoubleSparsityTopKMaskerConfig( + heavy_size=heavy_size, + group_factor=8, + label_bits=2, + sorted_channel_file="/data/apdesai/code/DoubleSparse/config/meta-llama/Llama-3.1-8B-Instruct.json", + channel_selection="q_proj"), + ]) + + config.masker_configs[2].search_space = { + "channel_selection": tune.grid_search(["q_proj", "qk_proj"]), + "group_factor": tune.grid_search([2, 4, 8, 16]), + "label_bits": tune.grid_search([1, 2, 4, 8, 16]), + } + config.validity_constraint = lambda config: ((128 // config.masker_configs[2].group_factor) * config.masker_configs[2].label_bits == aux_mem) + to_optimize_configs.append((name, config, classes)) + + return optimal_configs, to_optimize_configs + diff --git a/benchmark/raytune/config_builders/factory.py b/benchmark/raytune/config_builders/factory.py new file mode 100644 index 00000000..7e97eec4 --- /dev/null +++ 
b/benchmark/raytune/config_builders/factory.py @@ -0,0 +1,97 @@ +"""Factory for creating configuration builders.""" + +from typing import Dict, List, Optional, Tuple + +from sparse_attention_hub.sparse_attention.research_attention import ResearchAttentionConfig + +from .base import BaseConfigBuilder + +# Registry of available config builders +_BUILDER_REGISTRY: Dict[str, type[BaseConfigBuilder]] = {} + + +def register_builder(name: str): + """Decorator to register a configuration builder. + + Usage: + @register_builder("my_builder") + class MyBuilder(BaseConfigBuilder): + ... + + Args: + name: Name to register the builder under + """ + def decorator(builder_class: type[BaseConfigBuilder]) -> type[BaseConfigBuilder]: + if not issubclass(builder_class, BaseConfigBuilder): + raise TypeError(f"Builder class must inherit from BaseConfigBuilder") + _BUILDER_REGISTRY[name] = builder_class + return builder_class + return decorator + + +def get_config_builder(builder_name: str) -> BaseConfigBuilder: + """Get a configuration builder by name. + + Args: + builder_name: Name of the builder (e.g., "double_sparsity", "vattention_oracle") + + Returns: + Instance of the requested builder + + Raises: + ValueError: If builder_name is not registered + """ + if builder_name not in _BUILDER_REGISTRY: + available = ", ".join(_BUILDER_REGISTRY.keys()) + raise ValueError(f"Unknown builder '{builder_name}'. Available builders: {available}") + + builder_class = _BUILDER_REGISTRY[builder_name] + return builder_class() + + +def get_all_config_builders() -> Dict[str, BaseConfigBuilder]: + """Get all registered configuration builders. 
+ + Returns: + Dictionary mapping builder names to builder instances + """ + return {name: get_config_builder(name) for name in _BUILDER_REGISTRY.keys()} + + +def build_all_configs( + weight_file: Optional[str] = None, + objective: str = "default", + builder_names: Optional[List[str]] = None, + **kwargs +) -> Tuple[List[Tuple[str, Optional[ResearchAttentionConfig], Optional[List]]], + List[Tuple[str, Optional[ResearchAttentionConfig], Optional[List]]]]: + """Build configs using all specified builders. + + Args: + weight_file: Path to weight file + objective: Objective function name + builder_names: List of builder names to use. If None, uses all builders. + **kwargs: Additional parameters passed to each builder + + Returns: + Tuple of (optimal_configs, to_optimize_configs) aggregated from all builders + """ + if builder_names is None: + builders = get_all_config_builders() + else: + builders = {name: get_config_builder(name) for name in builder_names} + + all_optimal_configs: List[Tuple[str, Optional[ResearchAttentionConfig], Optional[List]]] = [] + all_to_optimize_configs: List[Tuple[str, Optional[ResearchAttentionConfig], Optional[List]]] = [] + + for builder_name, builder in builders.items(): + optimal_configs, to_optimize_configs = builder.build_configs( + weight_file=weight_file, + objective=objective, + **kwargs + ) + all_optimal_configs.extend(optimal_configs) + all_to_optimize_configs.extend(to_optimize_configs) + + return all_optimal_configs, all_to_optimize_configs + diff --git a/benchmark/raytune/config_builders/hashattention_topk.py b/benchmark/raytune/config_builders/hashattention_topk.py new file mode 100644 index 00000000..2deddc5e --- /dev/null +++ b/benchmark/raytune/config_builders/hashattention_topk.py @@ -0,0 +1,62 @@ +"""Configuration builder for HashAttention TopK attention.""" + +from typing import List, Optional, Tuple + +from sparse_attention_hub.sparse_attention.research_attention import ResearchAttentionConfig +from 
sparse_attention_hub.sparse_attention.research_attention.maskers.fixed.implementations import ( + HashAttentionTopKMaskerConfig, + LocalMaskerConfig, + SinkMaskerConfig, +) + +from .base import BaseConfigBuilder +from .factory import register_builder +from .utility import get_masker_list_name + + +@register_builder("hashattention_topk") +class HashAttentionTopKConfigBuilder(BaseConfigBuilder): + """Builder for HashAttention TopK sparse attention configurations.""" + + def build_configs( + self, + weight_file: Optional[str] = None, + objective: str = "default", + **kwargs + ) -> Tuple[List[Tuple[str, Optional[ResearchAttentionConfig], Optional[List]]], + List[Tuple[str, Optional[ResearchAttentionConfig], Optional[List]]]]: + """Get all HashAttention TopK attention configurations. + + Returns list of (name, full_config, masker_classes) tuples. + + Note: The configs returned here are only used to determine which masker classes + to use. The actual parameter values will be determined by Ray Tune search. 
+ + Returns: + Tuple of (optimal_configs, to_optimize_configs) + """ + assert weight_file is not None, "Weight file is required for HashAttention Masker" + + optimal_configs: List[Tuple[str, Optional[ResearchAttentionConfig], Optional[List]]] = [] + to_optimize_configs: List[Tuple[str, Optional[ResearchAttentionConfig], Optional[List]]] = [] + + for heavy_size in [0.02, 0.05, 0.1, 0.2]: + classes = [SinkMaskerConfig, LocalMaskerConfig, HashAttentionTopKMaskerConfig] + name: str = get_masker_list_name(classes, other_params={"heavy_size": heavy_size}) + + config = ResearchAttentionConfig(masker_configs=[ + SinkMaskerConfig(sink_size=128), + LocalMaskerConfig(window_size=128), + HashAttentionTopKMaskerConfig( + heavy_size=heavy_size - (256.0 / 32768), + hat_bits=32, + hat_mlp_layers=3, + hat_mlp_hidden_size=128, + hat_mlp_activation="silu", + hat_weight_file=weight_file + ), + ]) + optimal_configs.append((name, config, classes)) + + return optimal_configs, to_optimize_configs + diff --git a/benchmark/raytune/config_builders/magicpig.py b/benchmark/raytune/config_builders/magicpig.py new file mode 100644 index 00000000..9d43b76e --- /dev/null +++ b/benchmark/raytune/config_builders/magicpig.py @@ -0,0 +1,65 @@ +"""Configuration builder for MagicPig attention.""" + +from typing import List, Optional, Tuple + +from ray import tune + +from sparse_attention_hub.sparse_attention.research_attention import ResearchAttentionConfig +from sparse_attention_hub.sparse_attention.research_attention.maskers.fixed.implementations import ( + LocalMaskerConfig, + SinkMaskerConfig, +) +from sparse_attention_hub.sparse_attention.research_attention.maskers.sampling.implementations import ( + MagicPigConfig, +) + +from .base import BaseConfigBuilder +from .factory import register_builder +from .utility import get_masker_list_name + + +@register_builder("magicpig") +class MagicPigConfigBuilder(BaseConfigBuilder): + """Builder for MagicPig sparse attention configurations.""" + + def 
build_configs( + self, + weight_file: Optional[str] = None, + objective: str = "default", + **kwargs + ) -> Tuple[List[Tuple[str, Optional[ResearchAttentionConfig], Optional[List]]], + List[Tuple[str, Optional[ResearchAttentionConfig], Optional[List]]]]: + """Get all MagicPig attention configurations. + + Returns list of (name, full_config, masker_classes) tuples. + + Note: The configs returned here are only used to determine which masker classes + to use. The actual parameter values will be determined by Ray Tune search. + + Returns: + Tuple of (optimal_configs, to_optimize_configs) + """ + optimal_configs: List[Tuple[str, Optional[ResearchAttentionConfig], Optional[List]]] = [] + to_optimize_configs: List[Tuple[str, Optional[ResearchAttentionConfig], Optional[List]]] = [] + + classes = [SinkMaskerConfig, LocalMaskerConfig, MagicPigConfig] + name: str = get_masker_list_name(classes, other_params={"objective": objective}) + + config = ResearchAttentionConfig(masker_configs=[ + SinkMaskerConfig(sink_size=128), + LocalMaskerConfig(window_size=128), + MagicPigConfig( + lsh_l=8, # Default value from search space + lsh_k=64 # Default value from search space + ) + ]) + + # Set up search space for LSH parameters + config.masker_configs[2].search_space = { + "lsh_l": tune.grid_search([16, 32, 64, 128]), + "lsh_k": tune.grid_search([2, 4, 8, 16, 32]), + } + + to_optimize_configs.append((name, config, classes)) + return optimal_configs, to_optimize_configs + diff --git a/benchmark/raytune/config_builders/oracle_topk.py b/benchmark/raytune/config_builders/oracle_topk.py new file mode 100644 index 00000000..54d98408 --- /dev/null +++ b/benchmark/raytune/config_builders/oracle_topk.py @@ -0,0 +1,55 @@ +"""Configuration builder for Oracle TopK attention.""" + +from typing import List, Optional, Tuple + +from ray import tune + +from sparse_attention_hub.sparse_attention.research_attention import ResearchAttentionConfig +from 
sparse_attention_hub.sparse_attention.research_attention.maskers.fixed.implementations import ( + LocalMaskerConfig, + OracleTopKConfig, + SinkMaskerConfig, +) + +from .base import BaseConfigBuilder +from .factory import register_builder +from .utility import get_masker_list_name + + +@register_builder("oracle_topk") +class OracleTopKConfigBuilder(BaseConfigBuilder): + """Builder for Oracle TopK sparse attention configurations.""" + + def build_configs( + self, + weight_file: Optional[str] = None, + objective: str = "default", + **kwargs + ) -> Tuple[List[Tuple[str, Optional[ResearchAttentionConfig], Optional[List]]], + List[Tuple[str, Optional[ResearchAttentionConfig], Optional[List]]]]: + """Get all Oracle TopK attention configurations. + + Returns list of (name, full_config, masker_classes) tuples. + + Note: The configs returned here are only used to determine which masker classes + to use. The actual parameter values will be determined by Ray Tune search. + + Returns: + Tuple of (optimal_configs, to_optimize_configs) + """ + optimal_configs: List[Tuple[str, Optional[ResearchAttentionConfig], Optional[List]]] = [] + to_optimize_configs: List[Tuple[str, Optional[ResearchAttentionConfig], Optional[List]]] = [] + + for heavy_size in [0.02, 0.05, 0.1, 0.2]: + classes = [SinkMaskerConfig, LocalMaskerConfig, OracleTopKConfig] + name: str = get_masker_list_name(classes, other_params={"heavy_size": heavy_size}) + + config = ResearchAttentionConfig(masker_configs=[ + SinkMaskerConfig(sink_size=128), + LocalMaskerConfig(window_size=128), + OracleTopKConfig(heavy_size=heavy_size - (256.0 / 32768)), # Default value + ]) + optimal_configs.append((name, config, classes)) + + return optimal_configs, to_optimize_configs + diff --git a/benchmark/raytune/config_builders/oracle_topp.py b/benchmark/raytune/config_builders/oracle_topp.py new file mode 100644 index 00000000..7bdb6ca5 --- /dev/null +++ b/benchmark/raytune/config_builders/oracle_topp.py @@ -0,0 +1,60 @@ 
+"""Configuration builder for Oracle TopP attention.""" + +from typing import List, Optional, Tuple + +from ray import tune + +from sparse_attention_hub.sparse_attention.research_attention import ResearchAttentionConfig +from sparse_attention_hub.sparse_attention.research_attention.maskers.fixed.implementations import ( + LocalMaskerConfig, + OracleTopPMaskerConfig, + SinkMaskerConfig, +) + +from .base import BaseConfigBuilder +from .factory import register_builder +from .utility import get_masker_list_name + + +@register_builder("oracle_topp") +class OracleTopPConfigBuilder(BaseConfigBuilder): + """Builder for Oracle TopP sparse attention configurations.""" + + def build_configs( + self, + weight_file: Optional[str] = None, + objective: str = "default", + **kwargs + ) -> Tuple[List[Tuple[str, Optional[ResearchAttentionConfig], Optional[List]]], + List[Tuple[str, Optional[ResearchAttentionConfig], Optional[List]]]]: + """Get all Oracle TopP attention configurations. + + Returns list of (name, full_config, masker_classes) tuples. + + Note: The configs returned here are only used to determine which masker classes + to use. The actual parameter values will be determined by Ray Tune search. 
+ + Returns: + Tuple of (optimal_configs, to_optimize_configs) + """ + optimal_configs: List[Tuple[str, Optional[ResearchAttentionConfig], Optional[List]]] = [] + to_optimize_configs: List[Tuple[str, Optional[ResearchAttentionConfig], Optional[List]]] = [] + + classes = [SinkMaskerConfig, LocalMaskerConfig, OracleTopPMaskerConfig] + name: str = get_masker_list_name(classes, other_params={"objective": objective}) + + config = ResearchAttentionConfig(masker_configs=[ + SinkMaskerConfig(sink_size=128), + LocalMaskerConfig(window_size=128), + OracleTopPMaskerConfig(top_p=0.7) # Default middle value from search space + ]) + + # Set up search space for top_p parameter + # Using the default search space from OracleTopPMaskerConfig + config.masker_configs[2].search_space = { + "top_p": tune.grid_search([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.85, 0.9, 0.95, 0.98, 0.99]), + } + + to_optimize_configs.append((name, config, classes)) + return optimal_configs, to_optimize_configs + diff --git a/benchmark/raytune/config_builders/quest_top_k.py b/benchmark/raytune/config_builders/quest_top_k.py new file mode 100644 index 00000000..af21357e --- /dev/null +++ b/benchmark/raytune/config_builders/quest_top_k.py @@ -0,0 +1,79 @@ +"""Configuration builder for Quest TopK attention.""" + +from typing import List, Optional, Tuple + +from ray import tune + +from sparse_attention_hub.sparse_attention.research_attention import ResearchAttentionConfig +from sparse_attention_hub.sparse_attention.research_attention.maskers.fixed.implementations import ( + LocalMaskerConfig, + QuestTopKMaskerConfig, + SinkMaskerConfig, +) + +from .base import BaseConfigBuilder +from .factory import register_builder +from .utility import get_masker_list_name + + +@register_builder("quest_top_k") +class QuestTopKConfigBuilder(BaseConfigBuilder): + """Builder for Quest TopK sparse attention configurations.""" + + def build_configs( + self, + weight_file: Optional[str] = None, + objective: str = "default", + 
memory_objective: Optional[str] = None, + **kwargs + ) -> Tuple[List[Tuple[str, Optional[ResearchAttentionConfig], Optional[List]]], + List[Tuple[str, Optional[ResearchAttentionConfig], Optional[List]]]]: + """Get all Quest TopK attention configurations. + + Returns list of (name, full_config, masker_classes) tuples. + + Note: The configs returned here are only used to determine which masker classes + to use. The actual parameter values will be determined by Ray Tune search. + + Args: + weight_file: Path to weight file (required but not used for QuestTopK) + objective: Objective function name (e.g., "sparsity_5") + memory_objective: Memory objective parameter (e.g., "32") - required + **kwargs: Additional parameters + + Returns: + Tuple of (optimal_configs, to_optimize_configs) + """ + assert weight_file is not None, "Weight file is required for QuestTopK Masker" + assert memory_objective is not None, "memory_objective is required for get_quest_top_k_configs" + + optimal_configs: List[Tuple[str, Optional[ResearchAttentionConfig], Optional[List]]] = [] + to_optimize_configs: List[Tuple[str, Optional[ResearchAttentionConfig], Optional[List]]] = [] + + heavy_size: float = float(objective.split("_")[1]) / 100.0 - (256.0 / 32768) + aux_mem: int = int(memory_objective) + + classes = [SinkMaskerConfig, LocalMaskerConfig, QuestTopKMaskerConfig] + name: str = get_masker_list_name(classes, other_params={"heavy_size": heavy_size, "aux_mem": aux_mem}) + + config = ResearchAttentionConfig(masker_configs=[ + SinkMaskerConfig(sink_size=128), + LocalMaskerConfig(window_size=128), + QuestTopKMaskerConfig( + heavy_size=heavy_size, + page_size=128, + label_bits=16), + ]) + + config.masker_configs[2].search_space = { + "page_size": tune.grid_search([8, 16, 32, 64, 128]), + "label_bits": tune.grid_search([2, 4, 8, 16]), + } + # Memory constraint: similar to double_sparsity pattern + # For quest_top_k, memory usage depends on page_size and label_bits + # Adjust this constraint based on 
actual memory requirements + config.validity_constraint = lambda config: (aux_mem == 2 * (128 * config.masker_configs[2].label_bits) / config.masker_configs[2].page_size ) + to_optimize_configs.append((name, config, classes)) + + return optimal_configs, to_optimize_configs + diff --git a/benchmark/raytune/config_builders/random_sampling.py b/benchmark/raytune/config_builders/random_sampling.py new file mode 100644 index 00000000..ccef0416 --- /dev/null +++ b/benchmark/raytune/config_builders/random_sampling.py @@ -0,0 +1,58 @@ +"""Configuration builder for Random Sampling attention.""" + +from typing import List, Optional, Tuple + +from ray import tune + +from sparse_attention_hub.sparse_attention.research_attention import ResearchAttentionConfig +from sparse_attention_hub.sparse_attention.research_attention.maskers.fixed.implementations import ( + LocalMaskerConfig, + SinkMaskerConfig, +) +from sparse_attention_hub.sparse_attention.research_attention.maskers.sampling.implementations import ( + RandomSamplingMaskerConfig, +) + +from .base import BaseConfigBuilder +from .factory import register_builder +from .utility import get_masker_list_name + + +@register_builder("random_sampling") +class RandomSamplingConfigBuilder(BaseConfigBuilder): + """Builder for Random Sampling sparse attention configurations.""" + + def build_configs( + self, + weight_file: Optional[str] = None, + objective: str = "default", + **kwargs + ) -> Tuple[List[Tuple[str, Optional[ResearchAttentionConfig], Optional[List]]], + List[Tuple[str, Optional[ResearchAttentionConfig], Optional[List]]]]: + """Get all Random Sampling attention configurations. + + Returns list of (name, full_config, masker_classes) tuples. + + Note: The configs returned here are only used to determine which masker classes + to use. The actual parameter values will be determined by Ray Tune search. 
+ + Returns: + Tuple of (optimal_configs, to_optimize_configs) + """ + optimal_configs: List[Tuple[str, Optional[ResearchAttentionConfig], Optional[List]]] = [] + to_optimize_configs: List[Tuple[str, Optional[ResearchAttentionConfig], Optional[List]]] = [] + + classes = [SinkMaskerConfig, LocalMaskerConfig, RandomSamplingMaskerConfig] + + + for budget_size in [0.02, 0.05, 0.1, 0.2]: + name: str = get_masker_list_name(classes, other_params={"budget_size": budget_size}) + config = ResearchAttentionConfig(masker_configs=[ + SinkMaskerConfig(sink_size=128), # Middle value from search space + LocalMaskerConfig(window_size=128), # Middle value from search space + RandomSamplingMaskerConfig(sampling_rate=budget_size- (256.0 / 32768)) # Middle value from search space + ]) + optimal_configs.append((name, config, classes)) + + return optimal_configs, to_optimize_configs + diff --git a/benchmark/raytune/utility.py b/benchmark/raytune/config_builders/utility.py similarity index 99% rename from benchmark/raytune/utility.py rename to benchmark/raytune/config_builders/utility.py index b84c0d83..62350fc0 100644 --- a/benchmark/raytune/utility.py +++ b/benchmark/raytune/config_builders/utility.py @@ -67,6 +67,7 @@ def objective(error: float, density: float) -> float: # Pre-defined objective functions for common sparsity levels OBJECTIVE_FUNCTIONS = { + "sparsity_2": create_sparsity_objective(0.02), "sparsity_5": create_sparsity_objective(0.05), "sparsity_10": create_sparsity_objective(0.10), "sparsity_15": create_sparsity_objective(0.15), diff --git a/benchmark/raytune/config_builders/vattention_hashattention.py b/benchmark/raytune/config_builders/vattention_hashattention.py new file mode 100644 index 00000000..0b7fa1d8 --- /dev/null +++ b/benchmark/raytune/config_builders/vattention_hashattention.py @@ -0,0 +1,136 @@ +"""Configuration builder for VAttention HashAttention TopK configurations.""" + +from typing import List, Optional, Tuple + +from ray import tune + +from 
sparse_attention_hub.sparse_attention.research_attention import ResearchAttentionConfig +from sparse_attention_hub.sparse_attention.research_attention.maskers.fixed.implementations import ( + HashAttentionTopKMaskerConfig, + LocalMaskerConfig, + SinkMaskerConfig, +) +from sparse_attention_hub.sparse_attention.research_attention.maskers.sampling.implementations import ( + AdaptiveSamplingMaskerConfig, +) + +from .base import BaseConfigBuilder +from .factory import register_builder +from .utility import get_masker_list_name + + +@register_builder("vattention_hashattention") +class VAttentionHashAttentionConfigBuilder(BaseConfigBuilder): + """Builder for VAttention HashAttention TopK sparse attention configurations.""" + + def build_configs( + self, + weight_file: Optional[str] = None, + objective: str = "default", + **kwargs + ) -> Tuple[List[Tuple[str, Optional[ResearchAttentionConfig], Optional[List]]], + List[Tuple[str, Optional[ResearchAttentionConfig], Optional[List]]]]: + """Get all sparse attention configurations. + + Returns list of (name, full_config, masker_classes) tuples. + + Note: The configs returned here are only used to determine which masker classes + to use. The actual parameter values will be determined by Ray Tune search. + + Args: + weight_file: Path to weight file (required for HashAttention) + objective: Objective function name (e.g., "sparsity_2", "sparsity_5", etc.) 
+ **kwargs: Additional parameters + + Returns: + Tuple of (optimal_configs, to_optimize_configs) + """ + assert weight_file is not None, "Weight file is required for HashAttention Masker" + + optimal_configs: List[Tuple[str, Optional[ResearchAttentionConfig], Optional[List]]] = [] + to_optimize_configs: List[Tuple[str, Optional[ResearchAttentionConfig], Optional[List]]] = [] + + classes = [SinkMaskerConfig, LocalMaskerConfig, HashAttentionTopKMaskerConfig, AdaptiveSamplingMaskerConfig] + name: str = get_masker_list_name(classes, other_params={"objective": objective}) + config = ResearchAttentionConfig(masker_configs=[ + SinkMaskerConfig(sink_size=128), + LocalMaskerConfig(window_size=128), + HashAttentionTopKMaskerConfig( + heavy_size=0.05, # Middle value from search space + hat_bits=32, # Required parameter + hat_mlp_layers=3, # Required parameter + hat_mlp_hidden_size=128, # Required parameter + hat_mlp_activation="silu", # Required parameter + hat_weight_file=weight_file # Weight file is required + ), + AdaptiveSamplingMaskerConfig( + base_rate_sampling=0.05, # Middle value + epsilon=0.05, # Middle value + delta=0.05, # Middle value + init_offset=128, # Middle value + local_offset=128 # Middle value + ) + ]) + + if objective == "sparsity_2": + # Adaptive sampling with HashAttention top k + config.masker_configs[2].search_space = { + "heavy_size": tune.grid_search([0.005, 0.01, 0.02 - (256.0 / 32768)]), + } + config.masker_configs[3].search_space = { + "base_rate_sampling": tune.grid_search([0, 0.005, 0.01]), + "epsilon": tune.grid_search([0.1, 0.2, 0.3, 0.4]), + "delta": tune.grid_search([0.1, 0.2, 0.3, 0.4]) + } + + elif objective == "sparsity_5": + # Adaptive sampling with HashAttention top k + config.masker_configs[2].search_space = { + "heavy_size": tune.grid_search([0.01, 0.025, 0.05]), + } + config.masker_configs[3].search_space = { + "base_rate_sampling": tune.grid_search([0, 0.01, 0.02, 0.03]), + "epsilon": tune.grid_search([0.05, 0.1, 0.2, 0.3]), + 
"delta": tune.grid_search([0.05, 0.1, 0.2, 0.3]) + } + + elif objective == "sparsity_10": + # Adaptive sampling with HashAttention top k + config.masker_configs[2].search_space = { + "heavy_size": tune.grid_search([0.025, 0.05, 0.075, 0.1]), + } + config.masker_configs[3].search_space = { + "base_rate_sampling": tune.grid_search([0, 0.025, 0.05, 0.075]), + "epsilon": tune.grid_search([0.025, 0.05, 0.075]), + "delta": tune.grid_search([0.025, 0.05, 0.075]) + } + elif objective == "sparsity_15": + # Adaptive sampling with HashAttention top k + config.masker_configs[2].search_space = { + "heavy_size": tune.grid_search([0.05, 0.1, 0.15]), + } + config.masker_configs[3].search_space = { + "base_rate_sampling": tune.grid_search([0, 0.04, 0.06, 0.1]), + "epsilon": tune.grid_search([0.01, 0.025, 0.05, 0.1]), + "delta": tune.grid_search([0.01, 0.025, 0.05, 0.1]) + } + + elif objective == "sparsity_20": + # Adaptive sampling with HashAttention top k + config.masker_configs[2].search_space = { + "heavy_size": tune.grid_search([0.05, 0.1, 0.15]), + } + config.masker_configs[3].search_space = { + "base_rate_sampling": tune.grid_search([0.05, 0.1, 0.15]), + "epsilon": tune.grid_search([0.01, 0.025, 0.05, 0.1]), + "delta": tune.grid_search([0.01, 0.025, 0.05, 0.1]) + } + else: + raise ValueError(f"objective not supported: {objective}") + + sparsity = float(objective.split("_")[1]) / 100.0 + config.validity_constraint = lambda config: ((config.masker_configs[2].heavy_size + config.masker_configs[3].base_rate_sampling) <= sparsity) + + to_optimize_configs.append((name, config, classes)) + return optimal_configs, to_optimize_configs + diff --git a/benchmark/raytune/config_builders/vattention_oracle.py b/benchmark/raytune/config_builders/vattention_oracle.py new file mode 100644 index 00000000..f920339c --- /dev/null +++ b/benchmark/raytune/config_builders/vattention_oracle.py @@ -0,0 +1,129 @@ +"""Configuration builder for VAttention Oracle TopK configurations.""" + +from typing 
import List, Optional, Tuple + +from ray import tune + +from sparse_attention_hub.sparse_attention.research_attention import ResearchAttentionConfig +from sparse_attention_hub.sparse_attention.research_attention.maskers.fixed.implementations import ( + LocalMaskerConfig, + OracleTopKConfig, + SinkMaskerConfig, +) +from sparse_attention_hub.sparse_attention.research_attention.maskers.sampling.implementations import ( + AdaptiveSamplingMaskerConfig, +) + +from .base import BaseConfigBuilder +from .factory import register_builder +from .utility import get_masker_list_name + + +@register_builder("vattention_oracle") +class VAttentionOracleConfigBuilder(BaseConfigBuilder): + """Builder for VAttention Oracle TopK sparse attention configurations.""" + + def build_configs( + self, + weight_file: Optional[str] = None, + objective: str = "default", + **kwargs + ) -> Tuple[List[Tuple[str, Optional[ResearchAttentionConfig], Optional[List]]], + List[Tuple[str, Optional[ResearchAttentionConfig], Optional[List]]]]: + """Get all sparse attention configurations. + + Returns list of (name, full_config, masker_classes) tuples. + + Note: The configs returned here are only used to determine which masker classes + to use. The actual parameter values will be determined by Ray Tune search. + + Args: + weight_file: Path to weight file (required but not used for this config) + objective: Objective function name (e.g., "sparsity_2", "sparsity_5", etc.) 
+ **kwargs: Additional parameters + + Returns: + Tuple of (optimal_configs, to_optimize_configs) + """ + assert weight_file is not None, "Weight file is required for HashAttention Masker" + + optimal_configs: List[Tuple[str, Optional[ResearchAttentionConfig], Optional[List]]] = [] + to_optimize_configs: List[Tuple[str, Optional[ResearchAttentionConfig], Optional[List]]] = [] + + classes = [SinkMaskerConfig, LocalMaskerConfig, OracleTopKConfig, AdaptiveSamplingMaskerConfig] + name: str = get_masker_list_name(classes, other_params={"objective": objective}) + config = ResearchAttentionConfig(masker_configs=[ + SinkMaskerConfig(sink_size=128), + LocalMaskerConfig(window_size=128), + OracleTopKConfig(heavy_size=0.05), # Middle value from search space + AdaptiveSamplingMaskerConfig( + base_rate_sampling=0.05, # Middle value + epsilon=0.05, # Middle value + delta=0.05, # Middle value + init_offset=128, # Middle value + local_offset=128 # Middle value + ) + ]) + + if objective == "sparsity_2": + #1. Adaptive sampling with oracle top k + config.masker_configs[2].search_space = { + "heavy_size": tune.grid_search([0.005, 0.01, 0.02 - (256.0 / 32768)]), + } + config.masker_configs[3].search_space = { + "base_rate_sampling": tune.grid_search([0, 0.005, 0.01]), + "epsilon": tune.grid_search([0.1, 0.2, 0.3, 0.4]), + "delta": tune.grid_search([0.1, 0.2, 0.3, 0.4]) + } + + elif objective == "sparsity_5": + #1. Adaptive sampling with oracle top k + config.masker_configs[2].search_space = { + "heavy_size": tune.grid_search([0.01, 0.025, 0.05]), + } + config.masker_configs[3].search_space = { + "base_rate_sampling": tune.grid_search([0, 0.01, 0.02, 0.03]), + "epsilon": tune.grid_search([0.05, 0.1, 0.2, 0.3]), + "delta": tune.grid_search([0.05, 0.1, 0.2, 0.3]) + } + + elif objective == "sparsity_10": + #1. 
Adaptive sampling with oracle top k + config.masker_configs[2].search_space = { + "heavy_size": tune.grid_search([0.025, 0.05, 0.075, 0.1]), + } + config.masker_configs[3].search_space = { + "base_rate_sampling": tune.grid_search([0, 0.025, 0.05, 0.075]), + "epsilon": tune.grid_search([0.025, 0.05, 0.075]), + "delta": tune.grid_search([0.025, 0.05, 0.075]) + } + elif objective == "sparsity_15": + #1. Adaptive sampling with oracle top k + config.masker_configs[2].search_space = { + "heavy_size": tune.grid_search([0.05, 0.1, 0.15]), + } + config.masker_configs[3].search_space = { + "base_rate_sampling": tune.grid_search([0, 0.04, 0.06, 0.1]), + "epsilon": tune.grid_search([0.01, 0.025, 0.05, 0.1]), + "delta": tune.grid_search([0.01, 0.025, 0.05, 0.1]) + } + + elif objective == "sparsity_20": + #1. Adaptive sampling with oracle top k + config.masker_configs[2].search_space = { + "heavy_size": tune.grid_search([0.05, 0.1, 0.15]), + } + config.masker_configs[3].search_space = { + "base_rate_sampling": tune.grid_search([0.05, 0.1, 0.15]), + "epsilon": tune.grid_search([0.01, 0.025, 0.05, 0.1]), + "delta": tune.grid_search([0.01, 0.025, 0.05, 0.1]) + } + else: + raise ValueError(f"objective not supported: {objective}") + + sparsity = float(objective.split("_")[1]) / 100.0 + config.validity_constraint = lambda config: ((config.masker_configs[2].heavy_size + config.masker_configs[3].base_rate_sampling) <= sparsity ) + + to_optimize_configs.append((name, config, classes)) + return optimal_configs, to_optimize_configs + diff --git a/benchmark/raytune/optimizer_factory.py b/benchmark/raytune/optimizer_factory.py index 6e2f849a..0930ca22 100755 --- a/benchmark/raytune/optimizer_factory.py +++ b/benchmark/raytune/optimizer_factory.py @@ -73,7 +73,10 @@ def create_config_from_params(self, params: Dict[str, Any]) -> ResearchAttention setattr(masker_config_copy, key, value) masker_instances.append(masker_config_copy) - return ResearchAttentionConfig(masker_configs=masker_instances) 
+ new_config = ResearchAttentionConfig(masker_configs=masker_instances) + if hasattr(self.research_attention_config, 'validity_constraint'): + new_config.validity_constraint = self.research_attention_config.validity_constraint + return new_config def create_optimizer(research_attention_config: Optional[ResearchAttentionConfig] = None) -> SparseConfigOptimizer: """ diff --git a/benchmark/raytune/run_config_dir.py b/benchmark/raytune/run_config_dir.py index d47dc76e..bf2463ba 100755 --- a/benchmark/raytune/run_config_dir.py +++ b/benchmark/raytune/run_config_dir.py @@ -38,7 +38,7 @@ from sparse_attention_hub.adapters.huggingface import ModelAdapterHF from sparse_attention_hub.sparse_attention.research_attention import ResearchAttentionConfig from sparse_attention_hub.metric_logging.logger import MicroMetricLogger -from utility import deserialize_sparse_config +from config_builders.utility import deserialize_sparse_config diff --git a/benchmark/raytune/run_optimize_configs.py b/benchmark/raytune/run_optimize_configs.py index 98265255..d0eefcb2 100755 --- a/benchmark/raytune/run_optimize_configs.py +++ b/benchmark/raytune/run_optimize_configs.py @@ -31,7 +31,7 @@ from sparse_attention_hub.adapters.huggingface import ModelAdapterHF from sparse_attention_hub.metric_logging.logger import MicroMetricLogger from optimizer_factory import create_optimizer -from utility import ( +from config_builders.utility import ( get_masker_list_name, create_sparsity_objective, OBJECTIVE_FUNCTIONS, @@ -40,22 +40,10 @@ serialize_sparse_config, deserialize_sparse_config, ) +from config_builders.factory import build_all_configs # Import all masker configs from sparse_attention_hub.sparse_attention.research_attention import ResearchAttentionConfig -from sparse_attention_hub.sparse_attention.research_attention.maskers.fixed.implementations import ( - LocalMaskerConfig, - SinkMaskerConfig, - OracleTopKConfig, - OracleTopPMaskerConfig, - HashAttentionTopKMaskerConfig, - 
DoubleSparsityTopKMaskerConfig, -) -from sparse_attention_hub.sparse_attention.research_attention.maskers.sampling.implementations import ( - AdaptiveSamplingMaskerConfig, - RandomSamplingMaskerConfig, - MagicPigConfig, -) class BenchmarkHelper: @@ -86,6 +74,12 @@ def __init__(self, config: dict): def __call__(self, attention_config, task_name: str, model_name: str) -> Tuple[float, float, float]: """Run benchmark and return (score, density, error) tuple.""" try: + # Early validation check - skip expensive benchmark if constraint fails + if hasattr(attention_config, 'validity_constraint') and attention_config.validity_constraint is not None: + if not attention_config.validity_constraint(attention_config): + logging.info(f"Config failed validity constraint, returning penalty score") + return 100.0, 1.0, 1.0 # Penalty score, worst density, worst error + benchmark_name, subset_name = task_name.split("/", 1) if "/" in task_name else (task_name, None) # Create result directory for this specific run @@ -477,181 +471,38 @@ def run_search(config: dict, actors_per_gpu: int = 1) -> Dict[str, OptimalConfig RUN_TASKS = [ "ruler32k/vt", + "ruler32k/qa_1", + "ruler32k/qa_2", + "ruler32k/fwe", + "ruler32k/niah_multikey_2", + "ruler32k/niah_multikey_3", ] -def get_all_sparse_configs(weight_file: str = None, objective: str = "default") -> List[Tuple[str, Optional[ResearchAttentionConfig], Optional[List]]]: +def get_all_sparse_configs(weight_file: str = None, objective: str = "default", memory_objective: str = None) -> List[Tuple[str, Optional[ResearchAttentionConfig], Optional[List]]]: """Get all sparse attention configurations. Returns list of (name, full_config, masker_classes) tuples. Note: The configs returned here are only used to determine which masker classes to use. The actual parameter values will be determined by Ray Tune search. 
+ + Args: + weight_file: Path to weight file (required) + objective: Objective function name (e.g., "sparsity_5") + memory_objective: Memory objective parameter for configs that need it + + Returns: + Tuple of (optimal_configs, to_optimize_configs) """ assert weight_file is not None, "Weight file is required for HashAttention Masker" - optimal_configs = [] - to_optimize_configs = [] - - - # ############################## optimal configs ############################## - #1. Dense baseline - optimal_configs.append(("dense", None, None)) - - # 2. Oracle top k (already included above with adaptive, but also standalone) - for heavy_size in [0.1]: - classes = [SinkMaskerConfig, LocalMaskerConfig, OracleTopKConfig] - name = get_masker_list_name(classes, other_params={"heavy_size": heavy_size}) - config = ResearchAttentionConfig(masker_configs=[ - SinkMaskerConfig(sink_size=128), - LocalMaskerConfig(window_size=128), - OracleTopKConfig(heavy_size=heavy_size) - ]) - optimal_configs.append((name, config, classes)) - - #3. HashAttention top k - for heavy_size in [0.1]: - classes = [SinkMaskerConfig, LocalMaskerConfig, HashAttentionTopKMaskerConfig] - name = get_masker_list_name(classes, other_params={"heavy_size": heavy_size}) - config = ResearchAttentionConfig(masker_configs=[ - SinkMaskerConfig(sink_size=128), - LocalMaskerConfig(window_size=128), - HashAttentionTopKMaskerConfig( - heavy_size=heavy_size, - hat_bits=32, - hat_mlp_layers=3, - hat_mlp_hidden_size=128, - hat_mlp_activation="silu", - hat_weight_file=weight_file - ), - ]) - optimal_configs.append((name, config, classes)) - - # 4. 
Random sampling with sink and local - classes = [SinkMaskerConfig, LocalMaskerConfig, RandomSamplingMaskerConfig] - name = get_masker_list_name(classes) - config = ResearchAttentionConfig(masker_configs=[ - SinkMaskerConfig(sink_size=128), # Middle value from search space [4, 8, 16, 32, 64, 128] - LocalMaskerConfig(window_size=128), # Middle value from search space [32, 64, 128, 256] - RandomSamplingMaskerConfig(sampling_rate=0.095) # Middle value from search space [0.01, 0.05, 0.1, 0.2, 0.3, 0.5] - ]) - optimal_configs.append((name, config, classes)) - - ############################# to optimize configs ############################## - - - #1. Adaptive sampling with oracle top k - classes = [SinkMaskerConfig, LocalMaskerConfig, OracleTopKConfig, AdaptiveSamplingMaskerConfig] - name = get_masker_list_name(classes, other_params={"objective": objective}) - config = ResearchAttentionConfig(masker_configs=[ - SinkMaskerConfig(sink_size=128), - LocalMaskerConfig(window_size=128), - OracleTopKConfig(heavy_size=0.10), # Middle value from search space - AdaptiveSamplingMaskerConfig( - base_rate_sampling=0.1, # Middle value - epsilon=0.25, # Middle value - delta=0.25, # Middle value - init_offset=128, # Middle value - local_offset=128 # Middle value - ) - ]) - config.masker_configs[2].search_space = { - "heavy_size": tune.grid_search([0.01, 0.02]), - } - config.masker_configs[3].search_space = { - "base_rate_sampling": tune.grid_search([0, 0.01, 0.02]), - "epsilon": tune.grid_search([0.05]), - "delta": tune.grid_search([0.05]), - "init_offset": tune.grid_search([0.01]), - "local_offset": tune.grid_search([0.01]), - } - to_optimize_configs.append((name, config, classes)) - - # 2. 
Adaptive sampling with oracle top p - - classes = [SinkMaskerConfig, LocalMaskerConfig, OracleTopPMaskerConfig, AdaptiveSamplingMaskerConfig] - name = get_masker_list_name(classes) - config = ResearchAttentionConfig(masker_configs=[ - SinkMaskerConfig(sink_size=128), - LocalMaskerConfig(window_size=128), - OracleTopPMaskerConfig(top_p=0.10), # Middle value from search space - AdaptiveSamplingMaskerConfig( - base_rate_sampling=0.1, # Middle value - epsilon=0.25, # Middle value - delta=0.25, # Middle value - init_offset=128, # Middle value - local_offset=128 # Middle value - ) - ]) - to_optimize_configs.append((name, config, classes)) - - # #3. Adaptive sampling with HAT top k - classes = [SinkMaskerConfig, LocalMaskerConfig, HashAttentionTopKMaskerConfig, AdaptiveSamplingMaskerConfig] - name = get_masker_list_name(classes, other_params={"objective": objective}) - config = ResearchAttentionConfig(masker_configs=[ - SinkMaskerConfig(sink_size=128), - LocalMaskerConfig(window_size=128), - HashAttentionTopKMaskerConfig( - heavy_size=0.05, # Required parameter - hat_bits=32, # Required parameter - hat_mlp_layers=3, # Required parameter - hat_mlp_hidden_size=128, # Required parameter - hat_mlp_activation="silu", # Required parameter - hat_weight_file=weight_file # Weight file is required - ), - AdaptiveSamplingMaskerConfig( - base_rate_sampling=0.1, - epsilon=0.25, - delta=0.25, - init_offset=128, - local_offset=128 - ) - ]) - to_optimize_configs.append((name, config, classes)) - - - # # 4. Oracle top p - classes = [SinkMaskerConfig, LocalMaskerConfig, OracleTopPMaskerConfig] - name = get_masker_list_name(classes, other_params={"objective": objective}) - config = ResearchAttentionConfig(masker_configs=[ - SinkMaskerConfig(sink_size=128), - LocalMaskerConfig(window_size=128), - OracleTopPMaskerConfig(top_p=0.7) # Default middle value from search space - ]) - to_optimize_configs.append((name, config, classes)) - - # # 5. 
MagicPig config - classes = [SinkMaskerConfig, LocalMaskerConfig, MagicPigConfig] - name = get_masker_list_name(classes) - config = ResearchAttentionConfig(masker_configs=[ - SinkMaskerConfig(sink_size=128), - LocalMaskerConfig(window_size=128), - MagicPigConfig( - lsh_l=8, # Default value from search space - lsh_k=8 # Default value from search space - ) - ]) - to_optimize_configs.append((name, config, classes)) - - - # 5. Double Sparsity Top K config - # sorted_channel_file is available in the author's repository - # https://github.com/andy-yang-1/DoubleSparse/tree/main/config - # TODO: fix the path via environment variable or something else - - for heavy_size in [0.1, 0.2]: - classes = [SinkMaskerConfig, LocalMaskerConfig, DoubleSparsityTopKMaskerConfig] - name = get_masker_list_name(classes, other_params={"heavy_size": heavy_size}) - - config = ResearchAttentionConfig(masker_configs=[ - SinkMaskerConfig(sink_size=128), - LocalMaskerConfig(window_size=128), - DoubleSparsityTopKMaskerConfig( - heavy_size=heavy_size, - group_factor=2, - label_bits=2, - sorted_channel_file="/home/ubuntu/DoubleSparse/config/meta-llama/Llama-3.1-8B-Instruct.json", - channel_selection="q_proj"), - ]) - optimal_configs.append((name, config, classes)) + # Use factory to build all configs + # Currently using double_sparsity builder, can be extended to use multiple builders + optimal_configs, to_optimize_configs = build_all_configs( + weight_file=weight_file, + objective=objective, + builder_names=["magicpig"], # Specify which builders to use + memory_objective=memory_objective + ) return optimal_configs, to_optimize_configs @@ -666,7 +517,8 @@ def get_run_configuration( search_max_requests: int, force_search: bool, optimal_configs_dir: str, - ray_results_dir: str + ray_results_dir: str, + memory_objective: str = None ) -> dict: """Build complete configuration from command-line arguments.""" num_gpus = torch.cuda.device_count() @@ -681,7 +533,7 @@ def get_run_configuration( 
print(f"Warning: HashAttention weights not found, using {weight_file}") # Get all sparse configs - optimal_configs, to_optimize_configs = get_all_sparse_configs(weight_file, objective=objective) + optimal_configs, to_optimize_configs = get_all_sparse_configs(weight_file, objective=objective, memory_objective=memory_objective) # Filter configs based on debug mode if debug: @@ -740,6 +592,7 @@ def main( ray_results_dir: str = "./ray_results", search_timeout: int = 900, actors_per_gpu: int = 1, + memory_objective: str = None, ): """ Hyperparameter search for sparse attention methods. @@ -756,6 +609,7 @@ def main( ray_results_dir: Directory for Ray Tune results (default: "./ray_results") search_timeout: Timeout per search trial in seconds (default: 900) actors_per_gpu: Number of actors per GPU for resource allocation (default: 1) + memory_objective: Memory objective parameter (e.g., "memory_32") for configs that need it (default: None) """ # Validate objective function if objective not in OBJECTIVE_FUNCTIONS: @@ -772,6 +626,7 @@ def main( force_search=force_search, optimal_configs_dir=optimal_configs_dir, ray_results_dir=ray_results_dir, + memory_objective=memory_objective, ) if not ray.is_initialized(): diff --git a/benchmark/scripts/single_benchmark_model_example.py b/benchmark/scripts/single_benchmark_model_example.py index ad70e8ce..f0a864d3 100644 --- a/benchmark/scripts/single_benchmark_model_example.py +++ b/benchmark/scripts/single_benchmark_model_example.py @@ -24,13 +24,15 @@ import sys # Change to directory two levels below current location -os.chdir('/home/ubuntu/sparse-attention-hub') -sys.path.insert(0, '/home/ubuntu/sparse-attention-hub') +os.chdir('/data/apdesai/code/sparse-attention-hub') +sys.path.insert(0, '/data/apdesai/code/sparse-attention-hub') from sparse_attention_hub.metric_logging.logger import MicroMetricLogger from sparse_attention_hub.sparse_attention.research_attention import ResearchAttentionConfig from 
sparse_attention_hub.sparse_attention.research_attention.maskers.fixed.implementations import ( - DoubleSparsityTopKMaskerConfig + SinkMaskerConfig, + LocalMaskerConfig, + QuestTopKMaskerConfig ) #from benchmark.longbench import LongBench @@ -38,20 +40,15 @@ from sparse_attention_hub.adapters import ModelAdapterHF def main(): - model_name = "meta-llama/Llama-3.1-8B-Instruct" + model_name = "Qwen/Qwen3-4B-Instruct-2507" device = 0 # sorted_channel_file is available in the author's repository # https://github.com/andy-yang-1/DoubleSparse/tree/main/config # TODO: is there a better way to use the paths in scripts? sparse_attention_config = ResearchAttentionConfig(masker_configs=[ - DoubleSparsityTopKMaskerConfig( - heavy_size=4096, - group_factor=2, - label_bits=2, - sorted_channel_file="/home/ubuntu/DoubleSparse/config/meta-llama/Llama-3.1-8B-Instruct.json", - channel_selection="q_proj" - ) + SinkMaskerConfig(sink_size=128), + LocalMaskerConfig(window_size=128) ]) print(" ✓ Loading model...") @@ -61,14 +58,14 @@ def main(): adapter = ModelAdapterHF( model_name=model_name, sparse_attention_config=sparse_attention_config, - model_kwargs= {"torch_dtype": torch.bfloat16, "attn_implementation": "flash_attention_3"}, + model_kwargs= {"torch_dtype": torch.bfloat16, "attn_implementation": "flash_attention_2"}, device=device ) #benchmark = LongBench(['passage_retrieval_en']) - benchmark = Ruler32K(['vt']) + benchmark = Ruler32K(['niah_multikey_2']) - result_dir = Path("./test_results.vt.4096.2.2.q_proj/") + result_dir = Path("./test_results.4B/") result_dir.mkdir(exist_ok=True) metric_logger = MicroMetricLogger() metric_logger.configure_logging( @@ -79,7 +76,7 @@ def main(): ], ) metric_logger.flush() - benchmark.run_benchmark(adapter, result_dir, request_kwargs={"max_requests": 10, "max_context_length": 1000000}, generation_kwargs={"max_new_tokens": 500}) + benchmark.run_benchmark(adapter, result_dir, request_kwargs={"max_requests": 100, "max_context_length": 1000000}, 
generation_kwargs={"max_new_tokens": 500}) if __name__ == "__main__": main() From af87a83c92f7ecda8b06280f2b6c3b075f158890 Mon Sep 17 00:00:00 2001 From: Aditya Desai Date: Fri, 7 Nov 2025 10:39:14 -0800 Subject: [PATCH 2/7] Refactoring Phase 2 Move out benchmark helper, search manager and experiment parameters --- benchmark/raytune/OPTIMIZATION_EXPERIMENT.py | 60 ++ benchmark/raytune/benchmark_helper.py | 189 ++++++ benchmark/raytune/run_optimize_configs.py | 665 ++++--------------- benchmark/raytune/search_manager.py | 266 ++++++++ 4 files changed, 655 insertions(+), 525 deletions(-) create mode 100644 benchmark/raytune/OPTIMIZATION_EXPERIMENT.py create mode 100644 benchmark/raytune/benchmark_helper.py create mode 100644 benchmark/raytune/search_manager.py diff --git a/benchmark/raytune/OPTIMIZATION_EXPERIMENT.py b/benchmark/raytune/OPTIMIZATION_EXPERIMENT.py new file mode 100644 index 00000000..bc04dbb8 --- /dev/null +++ b/benchmark/raytune/OPTIMIZATION_EXPERIMENT.py @@ -0,0 +1,60 @@ +"""Run configuration for hyperparameter search. + +All configuration parameters for the hyperparameter search are defined here. +Modify this file to change search behavior without editing the main script. 
+""" + +import os +from typing import Dict, List, Optional + +# Model configurations +# Weight files are loaded from SPARSE_ATTENTION_WEIGHTS_DIR environment variable +# Set it to the directory containing your HashAttention weight files +weights_dir: str = os.environ.get("SPARSE_ATTENTION_WEIGHTS_DIR", "./weights") + +MODEL_CONFIGS: Dict[str, Dict[str, str]] = { + "llama": { + "weight_file": os.path.join(weights_dir, "llama3.1-8b-patch.64K.v1.hat_weights.pkl"), + "model_name": "meta-llama/Llama-3.1-8B-Instruct" + }, + "deepseek": { + "weight_file": os.path.join(weights_dir, "DeepSeek-R1-Distill-Llama-8B-patch-layers2-dim64-max-context-24K_hat_weights.pkl"), + "model_name": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B" + }, + "mistral": { + "weight_file": os.path.join(weights_dir, "Mistral-7B-Instruct-v0.3.24K.20.500.hat_weights.pkl"), + "model_name": "mistralai/Mistral-7B-Instruct-v0.3" + } +} + +DEFAULT_MODEL: str = "llama" + +# Task configurations +DEBUG_TASKS: List[str] = ["loogle/shortdep_qa"] + +RUN_TASKS: List[str] = [ + "ruler32k/vt", + "ruler32k/qa_1", + "ruler32k/qa_2", + "ruler32k/fwe", + "ruler32k/niah_multikey_2", + "ruler32k/niah_multikey_3", +] + +# Hyperparameter search configuration +OBJECTIVE: str = "default" # Objective function to use for optimization +NUM_SAMPLES: int = 100 # Number of samples per hyperparameter search +SEARCH_MAX_NEW_TOKENS: int = 100 # Max new tokens for search trials +SEARCH_MAX_CONTEXT_LENGTH: int = 2048 # Max context length for search trials +SEARCH_MAX_REQUESTS: int = 10 # Max requests per search trial +DEBUG: bool = False # Debug mode with minimal configs +FORCE_SEARCH: bool = False # Force re-run of search even if configs exist +OPTIMAL_CONFIGS_DIR: str = "./debug" # Directory for storing optimal configurations +RAY_RESULTS_DIR: str = "./ray_results" # Directory for Ray Tune results +SEARCH_TIMEOUT: int = 900 # Timeout per search trial in seconds +ACTORS_PER_GPU: int = 1 # Number of actors per GPU for resource allocation 
+MEMORY_OBJECTIVE: Optional[str] = None # Memory objective parameter (e.g., "memory_32") for configs that need it + +# Config builder configuration +BUILDER_NAMES: List[str] = ["dense", "oracle_topk"] # Specify which builders to use (e.g., ["magicpig"], ["dense"], ["double_sparsity"]) + diff --git a/benchmark/raytune/benchmark_helper.py b/benchmark/raytune/benchmark_helper.py new file mode 100644 index 00000000..57708fa5 --- /dev/null +++ b/benchmark/raytune/benchmark_helper.py @@ -0,0 +1,189 @@ +"""Benchmark helper for executing individual benchmark runs during config search.""" + +import json +import logging +import math +import os +import sys +from pathlib import Path +from typing import Dict, Tuple + +# Path setup +current_dir = Path(__file__).parent +root_path = current_dir.parent.parent +sys.path.extend([str(current_dir), str(root_path)]) +os.environ["PYTHONPATH"] = os.environ.get("PYTHONPATH", "") + f":{current_dir}:{root_path}" + +import torch + +from benchmark.executor_config import AdapterConfig +from benchmark.benchmark_registry import create_benchmark_instance +from sparse_attention_hub.adapters.huggingface import ModelAdapterHF +from sparse_attention_hub.metric_logging.logger import MicroMetricLogger +from config_builders.utility import OBJECTIVE_FUNCTIONS + + +class BenchmarkHelper: + """Handles individual benchmark runs during config search. + + This class is responsible for executing a single benchmark run with a given + sparse attention configuration and returning the evaluation metrics (score, density, error). + """ + + def __init__(self, config: Dict[str, any]) -> None: + """Initialize the benchmark helper with configuration. 
+ + Args: + config: Dictionary containing benchmark configuration including: + - search_result_dir: Base directory for search results + - search_max_new_tokens: Maximum new tokens for generation + - search_max_context_length: Maximum context length + - search_max_requests: Maximum requests per trial + - objective_function: Name of objective function to use + """ + self.config: Dict[str, any] = config + self.base_result_dir: Path = Path(config["search_result_dir"]) + self.adapter_config: AdapterConfig = AdapterConfig( + adapter_name="huggingface", + model_kwargs={"torch_dtype": torch.bfloat16}, + tokenizer_kwargs={"padding_side": "left"}, + ) + self.generation_kwargs: Dict[str, any] = { + "max_new_tokens": config["search_max_new_tokens"], + "do_sample": False + } + self.request_kwargs: Dict[str, any] = { + "max_context_length": config["search_max_context_length"], + "max_requests": config["search_max_requests"], + } + + # Get objective function + self.objective_name: str = config.get("objective_function", "default") + self.objective_function = OBJECTIVE_FUNCTIONS.get(self.objective_name, OBJECTIVE_FUNCTIONS["default"]) + logging.info(f"Using objective function: {self.objective_name}") + + def __call__(self, attention_config: any, task_name: str, model_name: str) -> Tuple[float, float, float]: + """Run benchmark and return (score, density, error) tuple. 
+ + Args: + attention_config: Sparse attention configuration to test + task_name: Name of the benchmark task (may include subset, e.g., "benchmark/subset") + model_name: Name of the model to use + + Returns: + Tuple of (score, density, error) where: + - score: Combined objective score (lower is better) + - density: Attention density (0.0 to 1.0) + - error: Attention output error (0.0 to 1.0) + """ + try: + # Early validation check - skip expensive benchmark if constraint fails + if hasattr(attention_config, 'validity_constraint') and attention_config.validity_constraint is not None: + if not attention_config.validity_constraint(attention_config): + logging.info(f"Config failed validity constraint, returning penalty score") + return 100.0, 1.0, 1.0 # Penalty score, worst density, worst error + + benchmark_name: str + subset_name: str | None + benchmark_name, subset_name = task_name.split("/", 1) if "/" in task_name else (task_name, None) + + # Create result directory for this specific run + result_dir: Path = self.base_result_dir / f"{model_name}_{task_name}_{hash(str(attention_config)) % 1000000}" + result_dir.mkdir(parents=True, exist_ok=True) + + # Create model adapter + adapter: ModelAdapterHF = ModelAdapterHF( + model_name=model_name, + sparse_attention_config=attention_config, + model_kwargs=self.adapter_config.model_kwargs, + tokenizer_kwargs=self.adapter_config.tokenizer_kwargs + ) + + # Create benchmark instance + benchmark = create_benchmark_instance( + benchmark_name=benchmark_name, + subsets=[subset_name] if subset_name else None + ) + print("The result directory is ", result_dir, flush=True) + # Setup micro metric logger + metric_logger: MicroMetricLogger = MicroMetricLogger() + metric_logger.configure_logging( + log_path=str(result_dir), + enabled_metrics=["research_attention_density", "research_attention_output_error"], + ) + + # Run benchmark directly + metrics = benchmark.run_benchmark( + adapter=adapter, + result_dir=str(result_dir), + 
generation_kwargs=self.generation_kwargs, + request_kwargs=self.request_kwargs + ) + + # Flush the metric logger to ensure all metrics are written + metric_logger.flush() + + # Extract micro metrics for sparse attention evaluation + micro_metrics: Dict[str, float] = self._extract_micro_metrics(result_dir) + error: float = micro_metrics["attention_error"] + density: float = micro_metrics["density"] + + # For dense configuration (density=1.0, error=0.0), use a simple score + if density == 1.0 and error == 0.0: + # Dense baseline: use benchmark accuracy metrics instead of sparse metrics + score: float = 100.0 # Small baseline score for dense + else: + # Use the selected objective function + score = self.objective_function(error, density) + # Also print to stdout so the test script can detect it + print(f"Objective: {self.objective_name}, Error: {error:.4f}, Density: {density:.4f}, Score: {score:.4f}") + logging.info(f"Objective: {self.objective_name}, Error: {error:.4f}, Density: {density:.4f}, Score: {score:.4f}") + + return score, density, error + + except Exception as e: + logging.error(f"Benchmark failed: {e}") + import traceback + traceback.print_exc() + + return 5.0, 1.0, 1.0 # Penalty score, worst-case density, and worst-case error + + def _extract_micro_metrics(self, result_dir: Path) -> Dict[str, float]: + """Extract attention error and density from micro metrics. 
+ + Args: + result_dir: Directory containing the micro_metrics.jsonl file + + Returns: + Dictionary with keys: + - attention_error: Average attention output error (0.0 to 1.0) + - density: Average attention density (0.0 to 1.0) + """ + micro_metrics_file: Path = result_dir / "micro_metrics.jsonl" + if not micro_metrics_file.exists(): + # For dense configuration, micro_metrics.jsonl won't exist since no sparse attention is used + # Return default values: 0 error (perfect) and 1.0 density (fully dense) + logging.info(f"micro_metrics.jsonl not found in {result_dir}, using dense defaults") + return {"attention_error": 0.0, "density": 1.0} + + errors: list[float] = [] + densities: list[float] = [] + with open(micro_metrics_file, "r") as f: + for line in f: + try: + entry: dict = json.loads(line.strip()) + metric: str | None = entry.get("metric") + value: any = entry.get("value") + if value is not None and not (isinstance(value, float) and math.isnan(value)): + if metric == "research_attention_output_error": + errors.append(float(value)) + elif metric == "research_attention_density": + densities.append(float(value)) + except (json.JSONDecodeError, ValueError, TypeError): + continue + + return { + "attention_error": sum(errors) / len(errors) if errors else 1.0, + "density": sum(densities) / len(densities) if densities else 1.0 + } + diff --git a/benchmark/raytune/run_optimize_configs.py b/benchmark/raytune/run_optimize_configs.py index d0eefcb2..5adfd066 100755 --- a/benchmark/raytune/run_optimize_configs.py +++ b/benchmark/raytune/run_optimize_configs.py @@ -3,16 +3,9 @@ Hyperparameter search for optimal sparse attention configurations. 
""" -import fire -import json import logging -import math import os import sys -import time -import traceback -from dataclasses import asdict, dataclass, field -from datetime import datetime from pathlib import Path from typing import Any, Dict, List, Optional, Tuple @@ -24,352 +17,148 @@ import torch import ray -from ray import tune -from benchmark.executor_config import AdapterConfig, BenchmarkConfig -from benchmark.benchmark_registry import create_benchmark_instance -from sparse_attention_hub.adapters.huggingface import ModelAdapterHF -from sparse_attention_hub.metric_logging.logger import MicroMetricLogger -from optimizer_factory import create_optimizer -from config_builders.utility import ( - get_masker_list_name, - create_sparsity_objective, - OBJECTIVE_FUNCTIONS, - OptimalConfig, - get_all_masker_config_classes, - serialize_sparse_config, - deserialize_sparse_config, -) +from config_builders.utility import OBJECTIVE_FUNCTIONS, OptimalConfig from config_builders.factory import build_all_configs # Import all masker configs from sparse_attention_hub.sparse_attention.research_attention import ResearchAttentionConfig +# Import search manager +from search_manager import ConfigSearchManager + +# Import run configuration +from OPTIMIZATION_EXPERIMENT import ( + MODEL_CONFIGS, + DEFAULT_MODEL, + RUN_TASKS, + OBJECTIVE, + NUM_SAMPLES, + SEARCH_MAX_NEW_TOKENS, + SEARCH_MAX_CONTEXT_LENGTH, + SEARCH_MAX_REQUESTS, + FORCE_SEARCH, + OPTIMAL_CONFIGS_DIR, + RAY_RESULTS_DIR, + SEARCH_TIMEOUT, + ACTORS_PER_GPU, + MEMORY_OBJECTIVE, + BUILDER_NAMES, +) -class BenchmarkHelper: - """Handles individual benchmark runs during config search.""" +def get_all_sparse_configs(weight_file: str = None, objective: str = "default", memory_objective: str = None, builder_names: List[str] = None) -> List[Tuple[str, Optional[ResearchAttentionConfig], Optional[List]]]: + """Get all sparse attention configurations. + Returns list of (name, full_config, masker_classes) tuples. 
- def __init__(self, config: dict): - self.config = config - self.base_result_dir = Path(config["search_result_dir"]) - self.adapter_config = AdapterConfig( - adapter_name="huggingface", - model_kwargs={"torch_dtype": torch.bfloat16}, - tokenizer_kwargs={"padding_side": "left"}, - ) - self.generation_kwargs = { - "max_new_tokens": config["search_max_new_tokens"], - "do_sample": False - } - self.request_kwargs = { - "max_context_length": config["search_max_context_length"], - "max_requests": config["search_max_requests"], - } + Note: The configs returned here are only used to determine which masker classes + to use. The actual parameter values will be determined by Ray Tune search. + + Args: + weight_file: Path to weight file (required) + objective: Objective function name (e.g., "sparsity_5") + memory_objective: Memory objective parameter for configs that need it + builder_names: List of builder names to use - # Get objective function - self.objective_name = config.get("objective_function", "default") - self.objective_function = OBJECTIVE_FUNCTIONS.get(self.objective_name, OBJECTIVE_FUNCTIONS["default"]) - logging.info(f"Using objective function: {self.objective_name}") + Returns: + Tuple of (optimal_configs, to_optimize_configs) + """ + assert weight_file is not None, "Weight file is required for HashAttention Masker" + + # Use factory to build all configs + optimal_configs, to_optimize_configs = build_all_configs( + weight_file=weight_file, + objective=objective, + builder_names=builder_names or BUILDER_NAMES, + memory_objective=memory_objective + ) + + return optimal_configs, to_optimize_configs - def __call__(self, attention_config, task_name: str, model_name: str) -> Tuple[float, float, float]: - """Run benchmark and return (score, density, error) tuple.""" - try: - # Early validation check - skip expensive benchmark if constraint fails - if hasattr(attention_config, 'validity_constraint') and attention_config.validity_constraint is not None: - if not 
attention_config.validity_constraint(attention_config): - logging.info(f"Config failed validity constraint, returning penalty score") - return 100.0, 1.0, 1.0 # Penalty score, worst density, worst error - - benchmark_name, subset_name = task_name.split("/", 1) if "/" in task_name else (task_name, None) - - # Create result directory for this specific run - result_dir = self.base_result_dir / f"{model_name}_{task_name}_{hash(str(attention_config)) % 1000000}" - result_dir.mkdir(parents=True, exist_ok=True) - - # Create model adapter - adapter = ModelAdapterHF( - model_name=model_name, - sparse_attention_config=attention_config, - model_kwargs=self.adapter_config.model_kwargs, - tokenizer_kwargs=self.adapter_config.tokenizer_kwargs - ) - - # Create benchmark instance - benchmark = create_benchmark_instance( - benchmark_name=benchmark_name, - subsets=[subset_name] if subset_name else None - ) - print("The result directory is ", result_dir, flush=True) - # Setup micro metric logger - metric_logger = MicroMetricLogger() - metric_logger.configure_logging( - log_path=str(result_dir), - enabled_metrics=["research_attention_density", "research_attention_output_error"], - ) - - # Run benchmark directly - metrics = benchmark.run_benchmark( - adapter=adapter, - result_dir=str(result_dir), - generation_kwargs=self.generation_kwargs, - request_kwargs=self.request_kwargs - ) - - # Flush the metric logger to ensure all metrics are written - metric_logger.flush() - - # Extract micro metrics for sparse attention evaluation - micro_metrics = self._extract_micro_metrics(result_dir) - error, density = micro_metrics["attention_error"], micro_metrics["density"] - - # For dense configuration (density=1.0, error=0.0), use a simple score - if density == 1.0 and error == 0.0: - # Dense baseline: use benchmark accuracy metrics instead of sparse metrics - score = 100.0 # Small baseline score for dense - else: - # Use the selected objective function - score = self.objective_function(error, 
density) - # Also print to stdout so the test script can detect it - print(f"Objective: {self.objective_name}, Error: {error:.4f}, Density: {density:.4f}, Score: {score:.4f}") - logging.info(f"Objective: {self.objective_name}, Error: {error:.4f}, Density: {density:.4f}, Score: {score:.4f}") - - return score, density, error - - except Exception as e: - logging.error(f"Benchmark failed: {e}") - import traceback - traceback.print_exc() - - return 5.0, 1.0, 1.0 # Penalty score, worst-case density, and worst-case error + +def get_run_configuration() -> dict: + """Build complete configuration from RUN_CONFIG.py.""" + num_gpus: int = torch.cuda.device_count() - def _extract_micro_metrics(self, result_dir: Path) -> dict: - """Extract attention error and density from micro metrics.""" - micro_metrics_file = result_dir / "micro_metrics.jsonl" - if not micro_metrics_file.exists(): - # For dense configuration, micro_metrics.jsonl won't exist since no sparse attention is used - # Return default values: 0 error (perfect) and 1.0 density (fully dense) - logging.info(f"micro_metrics.jsonl not found in {result_dir}, using dense defaults") - return {"attention_error": 0.0, "density": 1.0} - - errors, densities = [], [] - with open(micro_metrics_file, "r") as f: - for line in f: - try: - entry = json.loads(line.strip()) - metric, value = entry.get("metric"), entry.get("value") - if value is not None and not (isinstance(value, float) and math.isnan(value)): - if metric == "research_attention_output_error": - errors.append(float(value)) - elif metric == "research_attention_density": - densities.append(float(value)) - except (json.JSONDecodeError, ValueError, TypeError): - continue - - return { - "attention_error": sum(errors) / len(errors) if errors else 1.0, - "density": sum(densities) / len(densities) if densities else 1.0 - } + # Get model configuration + model_config: Dict[str, str] = MODEL_CONFIGS[DEFAULT_MODEL] + weight_file: str = model_config["weight_file"] + model_name: str = 
model_config["model_name"] -class ConfigSearchManager: - """Manages Phase 1: Hyperparameter search for optimal configs.""" + if not os.path.exists(weight_file): + weight_file = "./hat_weights.pkl" + print(f"Warning: HashAttention weights not found, using {weight_file}") - def __init__(self, base_config: dict): - self.config = base_config - # Add timestamp to the results directory - timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") - base_dir = Path(base_config["optimal_configs_dir"]) - self.results_dir = base_dir / f"run_{timestamp}" - self.results_dir.mkdir(parents=True, exist_ok=True) - self.timestamp = timestamp - print(f"Saving optimal configs to: {self.results_dir}") - - def search_optimal_config( - self, - model: str, - task: str, - masker_name: str, - masker_classes: Optional[List], - full_sparse_config: Optional[ResearchAttentionConfig] = None, - actors_per_gpu: int = 1 - ) -> OptimalConfig: - """Search for optimal hyperparameters for a single combination.""" - - config_file = self.results_dir / f"{model}_{task}_{masker_name}.json".replace("/", "_") - - # Check if already exists - if config_file.exists() and not self.config.get("force_search", False): - print(f" → Loading existing config") - return self._load_config(config_file) - - # Handle dense config (no optimization needed) - if masker_classes is None: - optimal = OptimalConfig( - model=model, - task=task, - masker_name=masker_name, - sparse_config=None, - masker_classes=None, - hyperparams={}, - score=0.0, - search_time=0.0, - num_trials=1 - ) - self._save_config(optimal, config_file) - return optimal - - # Run hyperparameter search - start_time = time.time() - - try: - # Create optimizer with template config for fixed parameters - optimizer = create_optimizer(full_sparse_config) - - # Show what we're searching - search_space = optimizer.create_search_space(task) - print(f" → Search space parameters:") - for param, space_obj in search_space.items(): - # Extract actual values from Ray Tune objects - 
if hasattr(space_obj, 'categories'): - values = space_obj.categories - print(f" - {param}: {values}") - else: - print(f" - {param}: {space_obj}") - - # Create objective function - def objective(trial_config): - runner = BenchmarkHelper(self.config) - attention_config = optimizer.create_config_from_params(trial_config) - score, density, error = runner(attention_config, task, model) - return {"combined_score": score, "density": density, "error": error} - - # ### run a sample objective to ensure there are no errors - print("="*10, "Running a short test objective to ensure there are no errors", flush=True) - sample_config = { - "AdaptiveSamplingMaskerConfig_base_rate_sampling": 0.1, - "AdaptiveSamplingMaskerConfig_epsilon": 0.25, - "AdaptiveSamplingMaskerConfig_delta": 0.25 - } - result = objective(sample_config) - print("="*10, "Successfully ran a short test objective", flush=True) - print(sample_config) - print(result) - print("="*100, flush=True) - - # Run Ray Tune - sanitized_name = f"{model}_{task}_{masker_name}".replace("/", "_") - analysis = tune.run( - objective, - config=search_space, - metric="combined_score", - mode="min", - resources_per_trial={"CPU": 1, "GPU": 1.0 / actors_per_gpu}, - storage_path=os.path.abspath(self.config["ray_results_dir"]), - name=sanitized_name, - verbose=1, # Show Ray Tune progress - stop={"training_iteration": 1}, # One evaluation per config - ) - - # Get best config - best_trial = analysis.get_best_trial("combined_score", "min", "last") - best_config = optimizer.create_config_from_params(best_trial.config) - - # Save detailed trial information for post-analysis - trials_info = [] - for trial in analysis.trials: - trial_info = { - "trial_id": trial.trial_id, - "config": trial.config, - "score": trial.last_result.get("combined_score", float('inf')) if trial.last_result else float('inf'), - "status": trial.status, - "start_time": trial.start_time.isoformat() if hasattr(trial, 'start_time') and trial.start_time else None, - 
"metric_history": trial.metric_analysis.get("combined_score", {}) if hasattr(trial, 'metric_analysis') else {} - } - trials_info.append(trial_info) - - # Save trial details to separate file - trials_file = self.results_dir / f"{model}_{task}_{masker_name}_trials.json".replace("/", "_") - with open(trials_file, "w") as f: - json.dump({ - "model": model, - "task": task, - "masker_name": masker_name, - "objective_function": self.config.get("objective_function", "default"), - "best_trial_id": best_trial.trial_id, - "trials": trials_info, - "analysis_dataframe_path": str(self.results_dir / f"{model}_{task}_{masker_name}_analysis.csv".replace("/", "_")) - }, f, indent=2) - - # Save Ray analysis dataframe for detailed analysis - df = analysis.dataframe() - df.to_csv(self.results_dir / f"{model}_{task}_{masker_name}_analysis.csv".replace("/", "_"), index=False) - - optimal = OptimalConfig( - model=model, - task=task, - masker_name=masker_name, - sparse_config=best_config, - masker_classes=masker_classes, - hyperparams=best_trial.config, - score=best_trial.last_result["combined_score"], - search_time=time.time() - start_time, - num_trials=len(analysis.trials) - ) - - self._save_config(optimal, config_file) - return optimal - - except Exception as e: - print(f" ✗ Search failed: {e}") - traceback.print_exc() - # Return failure config - optimal = OptimalConfig( - model=model, - task=task, - masker_name=masker_name, - sparse_config=full_sparse_config, - masker_classes=masker_classes, - hyperparams={}, - score=5.0, - search_time=time.time() - start_time, - num_trials=0 - ) - self._save_config(optimal, config_file) - return optimal + # Get all sparse configs + optimal_configs, to_optimize_configs = get_all_sparse_configs( + weight_file, + objective=OBJECTIVE, + memory_objective=MEMORY_OBJECTIVE, + builder_names=BUILDER_NAMES + ) - def _save_config(self, config: OptimalConfig, filepath: Path): - """Save configuration to JSON.""" - data = asdict(config) - - # Convert sparse config 
to serializable format - if config.sparse_config: - data["sparse_config"] = serialize_sparse_config(config.sparse_config) - - # Convert masker classes to strings - if config.masker_classes: - data["masker_classes"] = [cls.__name__ for cls in config.masker_classes] - - with open(filepath, "w") as f: - json.dump(data, f, indent=2) + # Set models, tasks, and num_samples + models: List[str] = [model_name] + tasks: List[str] = RUN_TASKS + num_samples: int = NUM_SAMPLES - def _load_config(self, filepath: Path) -> OptimalConfig: - """Load configuration from JSON.""" - with open(filepath, "r") as f: - data = json.load(f) - - # Reconstruct sparse config if present - if data.get("sparse_config"): - data["sparse_config"] = deserialize_sparse_config(data["sparse_config"]) + # Build config maps + optimal_configs_map: Dict[str, tuple] = {} + to_optimize_configs_map: Dict[str, tuple] = {} + for name, full_config, classes in optimal_configs: + optimal_configs_map[name] = (classes, full_config) + for name, full_config, classes in to_optimize_configs: + to_optimize_configs_map[name] = (classes, full_config) + + return { + "models": models, + "tasks": tasks, + "optimal_configs": optimal_configs, + "to_optimize_configs": to_optimize_configs, + "optimal_configs_map": optimal_configs_map, + "to_optimize_configs_map": to_optimize_configs_map, + "gpu_ids": list(range(num_gpus)), + "num_samples": num_samples, + "objective_function": OBJECTIVE, - # Reconstruct masker classes from strings - if data.get("masker_classes"): - # Dynamically discover all available masker config classes - class_map = get_all_masker_config_classes() - data["masker_classes"] = [class_map[name] for name in data["masker_classes"]] + # Directories + "optimal_configs_dir": OPTIMAL_CONFIGS_DIR, + "ray_results_dir": RAY_RESULTS_DIR, + "search_result_dir": os.path.join(RAY_RESULTS_DIR, "search_runs"), - return OptimalConfig(**data) + # Search params + "search_timeout": SEARCH_TIMEOUT, + "search_max_new_tokens": 
SEARCH_MAX_NEW_TOKENS, + "search_max_context_length": SEARCH_MAX_CONTEXT_LENGTH, + "search_max_requests": SEARCH_MAX_REQUESTS, + "force_search": FORCE_SEARCH, + } + + +def run_search(config: Dict[str, Any], actors_per_gpu: int = 1) -> Dict[str, OptimalConfig]: + """Find optimal configurations for all combinations. -def run_search(config: dict, actors_per_gpu: int = 1) -> Dict[str, OptimalConfig]: - """Find optimal configurations for all combinations.""" + This function orchestrates the search process across all model/task/config + combinations, using ConfigSearchManager to handle individual searches. + + Args: + config: Dictionary containing search configuration with keys: + - models: List of model names + - tasks: List of task names + - optimal_configs: List of optimal configs (don't need search) + - to_optimize_configs: List of configs to optimize + - optimal_configs_map: Map of optimal configs + - to_optimize_configs_map: Map of configs to optimize + - num_samples: Number of samples per search + - objective_function: Objective function name + - search_max_new_tokens: Max new tokens for search + - search_max_context_length: Max context length + - search_max_requests: Max requests per trial + - search_timeout: Timeout per trial + actors_per_gpu: Number of actors per GPU for resource allocation + + Returns: + Dictionary mapping config keys to OptimalConfig objects + """ print("\n" + "="*80) print("HYPERPARAMETER SEARCH") print("="*80) @@ -383,7 +172,7 @@ def run_search(config: dict, actors_per_gpu: int = 1) -> Dict[str, OptimalConfig # Display objective function details if config['objective_function'].startswith('sparsity_'): - target = int(config['objective_function'].split('_')[1]) + target: int = int(config['objective_function'].split('_')[1]) print(f" → Targeting {target}% density (0.{target:02d} fraction)") print(f" → Formula: 0.99 * error + 0.01 * density + penalty for exceeding target") @@ -397,11 +186,11 @@ def run_search(config: dict, actors_per_gpu: int = 
1) -> Dict[str, OptimalConfig print("values (e.g., window_size, sink_size, sampling_rate) to find the best combination.") print("="*80) - manager = ConfigSearchManager(config) - optimal_configs = {} + manager: ConfigSearchManager = ConfigSearchManager(config) + optimal_configs: Dict[str, OptimalConfig] = {} - total = len(config["models"]) * len(config["tasks"]) * len(config["to_optimize_configs"]) + len(config["models"]) * len(config["tasks"]) * len(config["optimal_configs"]) - current = 0 + total: int = len(config["models"]) * len(config["tasks"]) * len(config["to_optimize_configs"]) + len(config["models"]) * len(config["tasks"]) * len(config["optimal_configs"]) + current: int = 0 for model in config["models"]: print(f"\nModel: {model}") @@ -410,10 +199,10 @@ def run_search(config: dict, actors_per_gpu: int = 1) -> Dict[str, OptimalConfig for task in config["tasks"]: for masker_name, (masker_classes, full_config) in config["to_optimize_configs_map"].items(): current += 1 - key = f"{model}_{task}_{masker_name}".replace("/", "_") + key: str = f"{model}_{task}_{masker_name}".replace("/", "_") print(f"\n[{current}/{total}] Task: {task} | Config: {masker_name}") - optimal = manager.search_optimal_config( + optimal: OptimalConfig = manager.search_optimal_config( model, task, masker_name, masker_classes, full_config, actors_per_gpu ) optimal_configs[key] = optimal @@ -443,197 +232,23 @@ def run_search(config: dict, actors_per_gpu: int = 1) -> Dict[str, OptimalConfig return optimal_configs -################################################################# CONFIGURE YOUR RUN HERE ################################################################# - -# Model configurations -# Weight files are loaded from SPARSE_ATTENTION_WEIGHTS_DIR environment variable -# Set it to the directory containing your HashAttention weight files -weights_dir = os.environ.get("SPARSE_ATTENTION_WEIGHTS_DIR", "./weights") -MODEL_CONFIGS = { - "llama": { - "weight_file": os.path.join(weights_dir, 
"llama3.1-8b-patch.64K.v1.hat_weights.pkl"), - "model_name": "meta-llama/Llama-3.1-8B-Instruct" - }, - "deepseek": { - "weight_file": os.path.join(weights_dir, "DeepSeek-R1-Distill-Llama-8B-patch-layers2-dim64-max-context-24K_hat_weights.pkl"), - "model_name": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B" - }, - "mistral": { - "weight_file": os.path.join(weights_dir, "Mistral-7B-Instruct-v0.3.24K.20.500.hat_weights.pkl"), - "model_name": "mistralai/Mistral-7B-Instruct-v0.3" - } -} - -DEFAULT_MODEL = "llama" - -# Task configurations -DEBUG_TASKS = ["loogle/shortdep_qa"] - -RUN_TASKS = [ - "ruler32k/vt", - "ruler32k/qa_1", - "ruler32k/qa_2", - "ruler32k/fwe", - "ruler32k/niah_multikey_2", - "ruler32k/niah_multikey_3", -] - -def get_all_sparse_configs(weight_file: str = None, objective: str = "default", memory_objective: str = None) -> List[Tuple[str, Optional[ResearchAttentionConfig], Optional[List]]]: - """Get all sparse attention configurations. - Returns list of (name, full_config, masker_classes) tuples. - - Note: The configs returned here are only used to determine which masker classes - to use. The actual parameter values will be determined by Ray Tune search. +def main() -> None: + """Hyperparameter search for sparse attention methods. 
- Args: - weight_file: Path to weight file (required) - objective: Objective function name (e.g., "sparsity_5") - memory_objective: Memory objective parameter for configs that need it - - Returns: - Tuple of (optimal_configs, to_optimize_configs) - """ - assert weight_file is not None, "Weight file is required for HashAttention Masker" - - # Use factory to build all configs - # Currently using double_sparsity builder, can be extended to use multiple builders - optimal_configs, to_optimize_configs = build_all_configs( - weight_file=weight_file, - objective=objective, - builder_names=["magicpig"], # Specify which builders to use - memory_objective=memory_objective - ) - - return optimal_configs, to_optimize_configs - - -def get_run_configuration( - objective: str, - debug: bool, - num_samples: int, - search_timeout: int, - search_max_new_tokens: int, - search_max_context_length: int, - search_max_requests: int, - force_search: bool, - optimal_configs_dir: str, - ray_results_dir: str, - memory_objective: str = None -) -> dict: - """Build complete configuration from command-line arguments.""" - num_gpus = torch.cuda.device_count() - - # Get model configuration - model_config = MODEL_CONFIGS[DEFAULT_MODEL] - weight_file = model_config["weight_file"] - model_name = model_config["model_name"] - - if not os.path.exists(weight_file): - weight_file = "./hat_weights.pkl" - print(f"Warning: HashAttention weights not found, using {weight_file}") - - # Get all sparse configs - optimal_configs, to_optimize_configs = get_all_sparse_configs(weight_file, objective=objective, memory_objective=memory_objective) - - # Filter configs based on debug mode - if debug: - sparse_configs = to_optimize_configs[:3] # Just first 3 for debug - models = [model_name] - tasks = DEBUG_TASKS - num_samples = 8 - else: - models = [model_name] - tasks = RUN_TASKS - # num_samples is already passed as parameter - - # Build config maps - optimal_configs_map = {} - to_optimize_configs_map = {} - for name, 
full_config, classes in optimal_configs: - optimal_configs_map[name] = (classes, full_config) - for name, full_config, classes in to_optimize_configs: - to_optimize_configs_map[name] = (classes, full_config) - - return { - "models": models, - "tasks": tasks, - "optimal_configs": optimal_configs, - "to_optimize_configs": to_optimize_configs, - "optimal_configs_map": optimal_configs_map, - "to_optimize_configs_map": to_optimize_configs_map, - "gpu_ids": list(range(num_gpus)), - "num_samples": num_samples, - "objective_function": objective, - - # Directories - "optimal_configs_dir": optimal_configs_dir, - "ray_results_dir": ray_results_dir, - "search_result_dir": os.path.join(ray_results_dir, "search_runs"), - - # Search params - "search_timeout": search_timeout, - "search_max_new_tokens": search_max_new_tokens, - "search_max_context_length": search_max_context_length, - "search_max_requests": search_max_requests, - "force_search": force_search, - } - -######################################################### CONFIGURATION ENDS HERE #########################################################`` - -def main( - objective: str, - num_samples: int, - search_max_new_tokens: int, - search_max_context_length: int, - search_max_requests: int, - debug: bool = False, - force_search: bool = False, - optimal_configs_dir: str = "./optimal_configs", - ray_results_dir: str = "./ray_results", - search_timeout: int = 900, - actors_per_gpu: int = 1, - memory_objective: str = None, -): - """ - Hyperparameter search for sparse attention methods. 
- - Args: - objective: Objective function to use for optimization (required) - num_samples: Number of samples per hyperparameter search (required) - search_max_new_tokens: Max new tokens for search trials (required) - search_max_context_length: Max context length for search trials (required) - search_max_requests: Max requests per search trial (required) - debug: Debug mode with minimal configs (default: False) - force_search: Force re-run of search even if configs exist (default: False) - optimal_configs_dir: Directory for storing optimal configurations (default: "./optimal_configs") - ray_results_dir: Directory for Ray Tune results (default: "./ray_results") - search_timeout: Timeout per search trial in seconds (default: 900) - actors_per_gpu: Number of actors per GPU for resource allocation (default: 1) - memory_objective: Memory objective parameter (e.g., "memory_32") for configs that need it (default: None) + All configuration is loaded from RUN_CONFIG.py. Modify that file to change + search parameters instead of passing command-line arguments. """ # Validate objective function - if objective not in OBJECTIVE_FUNCTIONS: - raise ValueError(f"Invalid objective function '{objective}'. Choose from: {list(OBJECTIVE_FUNCTIONS.keys())}") + if OBJECTIVE not in OBJECTIVE_FUNCTIONS: + raise ValueError(f"Invalid objective function '{OBJECTIVE}'. 
Choose from: {list(OBJECTIVE_FUNCTIONS.keys())}") - config = get_run_configuration( - objective=objective, - debug=debug, - num_samples=num_samples, - search_timeout=search_timeout, - search_max_new_tokens=search_max_new_tokens, - search_max_context_length=search_max_context_length, - search_max_requests=search_max_requests, - force_search=force_search, - optimal_configs_dir=optimal_configs_dir, - ray_results_dir=ray_results_dir, - memory_objective=memory_objective, - ) + config: Dict[str, Any] = get_run_configuration() if not ray.is_initialized(): ray.init(ignore_reinit_error=True, log_to_driver=False, runtime_env={"working_dir": str(root_path)}) - optimal_configs = run_search(config, actors_per_gpu) + optimal_configs: Dict[str, OptimalConfig] = run_search(config, ACTORS_PER_GPU) ray.shutdown() @@ -642,4 +257,4 @@ def main( level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s" ) - fire.Fire(main) + main() diff --git a/benchmark/raytune/search_manager.py b/benchmark/raytune/search_manager.py new file mode 100644 index 00000000..78bb263a --- /dev/null +++ b/benchmark/raytune/search_manager.py @@ -0,0 +1,266 @@ +"""Search manager for orchestrating Ray Tune hyperparameter search.""" + +import json +import os +import sys +import time +import traceback +from dataclasses import asdict +from datetime import datetime +from pathlib import Path +from typing import Dict, List, Optional + +# Path setup +current_dir = Path(__file__).parent +root_path = current_dir.parent.parent +sys.path.extend([str(current_dir), str(root_path)]) +os.environ["PYTHONPATH"] = os.environ.get("PYTHONPATH", "") + f":{current_dir}:{root_path}" + +from ray import tune + +from optimizer_factory import create_optimizer +from config_builders.utility import ( + OptimalConfig, + get_all_masker_config_classes, + serialize_sparse_config, + deserialize_sparse_config, +) +from sparse_attention_hub.sparse_attention.research_attention import ResearchAttentionConfig +from 
benchmark_helper import BenchmarkHelper + + +class ConfigSearchManager: + """Manages Phase 1: Hyperparameter search for optimal configs. + + This class orchestrates Ray Tune hyperparameter search to find optimal + sparse attention configurations for given model/task combinations. + """ + + def __init__(self, base_config: Dict[str, any]) -> None: + """Initialize the search manager with configuration. + + Args: + base_config: Dictionary containing search configuration including: + - optimal_configs_dir: Directory to save optimal configs + - force_search: Whether to force re-search even if configs exist + """ + self.config: Dict[str, any] = base_config + # Add timestamp to the results directory + timestamp: str = datetime.now().strftime("%Y%m%d_%H%M%S") + base_dir: Path = Path(base_config["optimal_configs_dir"]) + self.results_dir: Path = base_dir / f"run_{timestamp}" + self.results_dir.mkdir(parents=True, exist_ok=True) + self.timestamp: str = timestamp + print(f"Saving optimal configs to: {self.results_dir}") + + def search_optimal_config( + self, + model: str, + task: str, + masker_name: str, + masker_classes: Optional[List], + full_sparse_config: Optional[ResearchAttentionConfig] = None, + actors_per_gpu: int = 1 + ) -> OptimalConfig: + """Search for optimal hyperparameters for a single combination. 
+ + Args: + model: Model name to use + task: Task name to benchmark + masker_name: Name of the masker configuration + masker_classes: List of masker classes (None for dense configs) + full_sparse_config: Full sparse attention config template + actors_per_gpu: Number of actors per GPU for resource allocation + + Returns: + OptimalConfig containing the best configuration found + """ + config_file: Path = self.results_dir / f"{model}_{task}_{masker_name}.json".replace("/", "_") + + # Check if already exists + if config_file.exists() and not self.config.get("force_search", False): + print(f" → Loading existing config") + return self._load_config(config_file) + + # Handle dense config (no optimization needed) + if masker_classes is None: + optimal: OptimalConfig = OptimalConfig( + model=model, + task=task, + masker_name=masker_name, + sparse_config=None, + masker_classes=None, + hyperparams={}, + score=0.0, + search_time=0.0, + num_trials=1 + ) + self._save_config(optimal, config_file) + return optimal + + # Run hyperparameter search + start_time: float = time.time() + + try: + # Create optimizer with template config for fixed parameters + optimizer = create_optimizer(full_sparse_config) + + # Show what we're searching + search_space: Dict[str, any] = optimizer.create_search_space(task) + print(f" → Search space parameters:") + for param, space_obj in search_space.items(): + # Extract actual values from Ray Tune objects + if hasattr(space_obj, 'categories'): + values = space_obj.categories + print(f" - {param}: {values}") + else: + print(f" - {param}: {space_obj}") + + # Create objective function + def objective(trial_config: Dict[str, any]) -> Dict[str, float]: + runner: BenchmarkHelper = BenchmarkHelper(self.config) + attention_config = optimizer.create_config_from_params(trial_config) + score: float + density: float + error: float + score, density, error = runner(attention_config, task, model) + return {"combined_score": score, "density": density, "error": error} + + 
# ### run a sample objective to ensure there are no errors + print("="*10, "Running a short test objective to ensure there are no errors", flush=True) + sample_config: Dict[str, float] = { + "AdaptiveSamplingMaskerConfig_base_rate_sampling": 0.1, + "AdaptiveSamplingMaskerConfig_epsilon": 0.25, + "AdaptiveSamplingMaskerConfig_delta": 0.25 + } + result: Dict[str, float] = objective(sample_config) + print("="*10, "Successfully ran a short test objective", flush=True) + print(sample_config) + print(result) + print("="*100, flush=True) + + # Run Ray Tune + sanitized_name: str = f"{model}_{task}_{masker_name}".replace("/", "_") + analysis = tune.run( + objective, + config=search_space, + metric="combined_score", + mode="min", + resources_per_trial={"CPU": 1, "GPU": 1.0 / actors_per_gpu}, + storage_path=os.path.abspath(self.config["ray_results_dir"]), + name=sanitized_name, + verbose=1, # Show Ray Tune progress + stop={"training_iteration": 1}, # One evaluation per config + ) + + # Get best config + best_trial = analysis.get_best_trial("combined_score", "min", "last") + best_config = optimizer.create_config_from_params(best_trial.config) + + # Save detailed trial information for post-analysis + trials_info: List[Dict[str, any]] = [] + for trial in analysis.trials: + trial_info: Dict[str, any] = { + "trial_id": trial.trial_id, + "config": trial.config, + "score": trial.last_result.get("combined_score", float('inf')) if trial.last_result else float('inf'), + "status": trial.status, + "start_time": trial.start_time.isoformat() if hasattr(trial, 'start_time') and trial.start_time else None, + "metric_history": trial.metric_analysis.get("combined_score", {}) if hasattr(trial, 'metric_analysis') else {} + } + trials_info.append(trial_info) + + # Save trial details to separate file + trials_file: Path = self.results_dir / f"{model}_{task}_{masker_name}_trials.json".replace("/", "_") + with open(trials_file, "w") as f: + json.dump({ + "model": model, + "task": task, + 
"masker_name": masker_name, + "objective_function": self.config.get("objective_function", "default"), + "best_trial_id": best_trial.trial_id, + "trials": trials_info, + "analysis_dataframe_path": str(self.results_dir / f"{model}_{task}_{masker_name}_analysis.csv".replace("/", "_")) + }, f, indent=2) + + # Save Ray analysis dataframe for detailed analysis + df = analysis.dataframe() + df.to_csv(self.results_dir / f"{model}_{task}_{masker_name}_analysis.csv".replace("/", "_"), index=False) + + optimal = OptimalConfig( + model=model, + task=task, + masker_name=masker_name, + sparse_config=best_config, + masker_classes=masker_classes, + hyperparams=best_trial.config, + score=best_trial.last_result["combined_score"], + search_time=time.time() - start_time, + num_trials=len(analysis.trials) + ) + + self._save_config(optimal, config_file) + return optimal + + except Exception as e: + print(f" ✗ Search failed: {e}") + traceback.print_exc() + # Return failure config + optimal = OptimalConfig( + model=model, + task=task, + masker_name=masker_name, + sparse_config=full_sparse_config, + masker_classes=masker_classes, + hyperparams={}, + score=5.0, + search_time=time.time() - start_time, + num_trials=0 + ) + self._save_config(optimal, config_file) + return optimal + + def _save_config(self, config: OptimalConfig, filepath: Path) -> None: + """Save configuration to JSON. + + Args: + config: OptimalConfig to save + filepath: Path where to save the config + """ + data: Dict[str, any] = asdict(config) + + # Convert sparse config to serializable format + if config.sparse_config: + data["sparse_config"] = serialize_sparse_config(config.sparse_config) + + # Convert masker classes to strings + if config.masker_classes: + data["masker_classes"] = [cls.__name__ for cls in config.masker_classes] + + with open(filepath, "w") as f: + json.dump(data, f, indent=2) + + def _load_config(self, filepath: Path) -> OptimalConfig: + """Load configuration from JSON. 
+ + Args: + filepath: Path to the config file to load + + Returns: + OptimalConfig loaded from file + """ + with open(filepath, "r") as f: + data: Dict[str, any] = json.load(f) + + # Reconstruct sparse config if present + if data.get("sparse_config"): + data["sparse_config"] = deserialize_sparse_config(data["sparse_config"]) + + # Reconstruct masker classes from strings + if data.get("masker_classes"): + # Dynamically discover all available masker config classes + class_map: Dict[str, type] = get_all_masker_config_classes() + data["masker_classes"] = [class_map[name] for name in data["masker_classes"]] + + return OptimalConfig(**data) + From f8879526dfe7b731a49e4ab9a255a9ad36971ae4 Mon Sep 17 00:00:00 2001 From: Aditya Desai Date: Fri, 7 Nov 2025 11:11:19 -0800 Subject: [PATCH 3/7] Update model config --- benchmark/raytune/OPTIMIZATION_EXPERIMENT.py | 41 +++++++++++++++----- 1 file changed, 32 insertions(+), 9 deletions(-) diff --git a/benchmark/raytune/OPTIMIZATION_EXPERIMENT.py b/benchmark/raytune/OPTIMIZATION_EXPERIMENT.py index bc04dbb8..7605922d 100644 --- a/benchmark/raytune/OPTIMIZATION_EXPERIMENT.py +++ b/benchmark/raytune/OPTIMIZATION_EXPERIMENT.py @@ -10,21 +10,44 @@ # Model configurations # Weight files are loaded from SPARSE_ATTENTION_WEIGHTS_DIR environment variable # Set it to the directory containing your HashAttention weight files -weights_dir: str = os.environ.get("SPARSE_ATTENTION_WEIGHTS_DIR", "./weights") +hashattention_dir: str = os.environ.get("HASHATTENTION_WEIGHTS_DIR", "./") +doublesparsity_config_dir: str = os.environ.get("DOUBLE_SPARSITY_CONFIG_DIR", "./") + MODEL_CONFIGS: Dict[str, Dict[str, str]] = { - "llama": { - "weight_file": os.path.join(weights_dir, "llama3.1-8b-patch.64K.v1.hat_weights.pkl"), - "model_name": "meta-llama/Llama-3.1-8B-Instruct" + "llama3.1-8b": { + "model_name": "meta-llama/Llama-3.1-8B-Instruct", + "hash_attention_weight_file": os.path.join(hashattention_dir, "llama3.1-8b-patch.64K.v1.hat_weights.pkl"), + 
"double_sparsity_config_file": os.path.join(doublesparsity_config_dir, "meta-llama/Llama-3.1-8B-Instruct.json"), + }, + "llama3.2-1b": { + "model_name": "meta-llama/Llama-3.2-1B-Instruct", + "hash_attention_weight_file": os.path.join(hashattention_dir, "DNE.pkl"), + "double_sparsity_config_file": os.path.join(doublesparsity_config_dir, "meta-llama/Llama-3.2-1B-Instruct.json"), + }, + "llama3.2-3b": { + "model_name": "meta-llama/Llama-3.2-3B-Instruct", + "hash_attention_weight_file": os.path.join(hashattention_dir, "DNE.pkl"), + "double_sparsity_config_file": os.path.join(doublesparsity_config_dir, "meta-llama/Llama-3.2-3B-Instruct.json"), }, "deepseek": { - "weight_file": os.path.join(weights_dir, "DeepSeek-R1-Distill-Llama-8B-patch-layers2-dim64-max-context-24K_hat_weights.pkl"), - "model_name": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B" + "model_name": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B", + "hash_attention_weight_file": os.path.join(hashattention_dir, "DeepSeek-R1-Distill-Llama-8B-patch-layers2-dim64-max-context-24K_hat_weights.pkl"), }, "mistral": { - "weight_file": os.path.join(weights_dir, "Mistral-7B-Instruct-v0.3.24K.20.500.hat_weights.pkl"), - "model_name": "mistralai/Mistral-7B-Instruct-v0.3" - } + "model_name": "mistralai/Mistral-7B-Instruct-v0.3", + "hash_attention_weight_file": os.path.join(hashattention_dir, "Mistral-7B-Instruct-v0.3.24K.20.500.hat_weights.pkl"), + }, + "qwen3-30b-moe": { + "model_name": "Qwen/Qwen3-30B-A3B-Instruct-2507", + "hash_attention_weight_file": os.path.join(hashattention_dir, "DNE.pkl"), + "double_sparsity_config_file": os.path.join(doublesparsity_config_dir, "Qwen/Qwen3-30B-A3B-Instruct-2507.json"), + }, + "qwen3-4b": { + "model_name": "Qwen/Qwen3-4B-Instruct-2507", + "hash_attention_weight_file": os.path.join(hashattention_dir, "DNE.pkl"), + "double_sparsity_config_file": os.path.join(doublesparsity_config_dir, "Qwen/Qwen3-4B-Instruct-2507.json"), + }, } DEFAULT_MODEL: str = "llama" From 
ee4d8d048fa719b1335fa563593b615d4192ebb1 Mon Sep 17 00:00:00 2001 From: Aditya Desai Date: Fri, 7 Nov 2025 13:02:31 -0800 Subject: [PATCH 4/7] allow multiple models, memory and sparsity objectives all in a single run --- benchmark/raytune/OPTIMIZATION_EXPERIMENT.py | 90 +++++--- benchmark/raytune/benchmark_helper.py | 50 +++-- benchmark/raytune/config_builders/dense.py | 19 +- .../config_builders/double_sparsity.py | 93 ++++---- benchmark/raytune/config_builders/factory.py | 17 +- .../config_builders/hashattention_topk.py | 40 +++- benchmark/raytune/config_builders/magicpig.py | 61 ++--- .../raytune/config_builders/oracle_topk.py | 33 ++- .../raytune/config_builders/oracle_topp.py | 55 +++-- .../raytune/config_builders/quest_top_k.py | 85 +++---- .../config_builders/random_sampling.py | 31 ++- benchmark/raytune/config_builders/utility.py | 14 +- .../vattention_hashattention.py | 205 +++++++++-------- .../config_builders/vattention_oracle.py | 178 ++++++++------- benchmark/raytune/optimizer_factory.py | 2 + benchmark/raytune/run_config_dir.py | 4 +- benchmark/raytune/run_optimize_configs.py | 212 +++++------------- benchmark/raytune/search_manager.py | 69 +++--- 18 files changed, 657 insertions(+), 601 deletions(-) diff --git a/benchmark/raytune/OPTIMIZATION_EXPERIMENT.py b/benchmark/raytune/OPTIMIZATION_EXPERIMENT.py index 7605922d..3d53a610 100644 --- a/benchmark/raytune/OPTIMIZATION_EXPERIMENT.py +++ b/benchmark/raytune/OPTIMIZATION_EXPERIMENT.py @@ -10,8 +10,11 @@ # Model configurations # Weight files are loaded from SPARSE_ATTENTION_WEIGHTS_DIR environment variable # Set it to the directory containing your HashAttention weight files -hashattention_dir: str = os.environ.get("HASHATTENTION_WEIGHTS_DIR", "./") -doublesparsity_config_dir: str = os.environ.get("DOUBLE_SPARSITY_CONFIG_DIR", "./") + +HASHATTENTION_WEIGHTS_DIR: str = "/data/apdesai/code/HashAttention-1.0/artifacts" +DOUBLE_SPARSITY_CONFIG_DIR: str = "/data/apdesai/code/DoubleSparse/config" 
+hashattention_dir: str = HASHATTENTION_WEIGHTS_DIR +doublesparsity_config_dir: str = DOUBLE_SPARSITY_CONFIG_DIR MODEL_CONFIGS: Dict[str, Dict[str, str]] = { @@ -46,38 +49,73 @@ "qwen3-4b": { "model_name": "Qwen/Qwen3-4B-Instruct-2507", "hash_attention_weight_file": os.path.join(hashattention_dir, "DNE.pkl"), - "double_sparsity_config_file": os.path.join(doublesparsity_config_dir, "Qwen/Qwen3-4B-Instruct-2507.json"), + "double_sparsity_config_file": os.path.join(doublesparsity_config_dir, "Qwen/Qwen3-4B-Instruct-2507.json"), }, } -DEFAULT_MODEL: str = "llama" - -# Task configurations -DEBUG_TASKS: List[str] = ["loogle/shortdep_qa"] +MODELS : List[str] = [ + "llama3.1-8b", + "llama3.2-1b", + "llama3.2-3b", + "qwen3-4b", + "qwen3-30b-moe", +] -RUN_TASKS: List[str] = [ - "ruler32k/vt", - "ruler32k/qa_1", - "ruler32k/qa_2", - "ruler32k/fwe", - "ruler32k/niah_multikey_2", +TASKS: List[str] = [ + # "ruler32k/vt", + # "ruler32k/qa_1", + # "ruler32k/qa_2", + # "ruler32k/fwe", + # "ruler32k/niah_multikey_2", "ruler32k/niah_multikey_3", ] -# Hyperparameter search configuration -OBJECTIVE: str = "default" # Objective function to use for optimization -NUM_SAMPLES: int = 100 # Number of samples per hyperparameter search -SEARCH_MAX_NEW_TOKENS: int = 100 # Max new tokens for search trials -SEARCH_MAX_CONTEXT_LENGTH: int = 2048 # Max context length for search trials -SEARCH_MAX_REQUESTS: int = 10 # Max requests per search trial -DEBUG: bool = False # Debug mode with minimal configs -FORCE_SEARCH: bool = False # Force re-run of search even if configs exist -OPTIMAL_CONFIGS_DIR: str = "./debug" # Directory for storing optimal configurations -RAY_RESULTS_DIR: str = "./ray_results" # Directory for Ray Tune results +SPARSITY_OBJECTIVES: List[str] = [ + 2, + 5, + 10, + 20, +] + +MEMORY_OBJECTIVES: List[Optional[str]] = [ + 32, + 64, + 128, +] # Memory objective parameter (e.g., "memory_32") for configs that need it + +BUILDER_NAMES: List[str] = [ + # "dense", + # "double_sparsity", + # 
"hashattention_topk", + "magicpig", + # "oracle_topk", + # "oracle_topp", + # "quest_topk", + # "vattention_hashattention", + # "vattention_oracle", +] # Specify which builders to use (e.g., ["magicpig"], ["dense"], ["double_sparsity"]) + + +# SEARCH PARAMS +NUM_SAMPLES: int = 1 # Number of samples per hyperparameter search +SEARCH_MAX_NEW_TOKENS: int = 3 # Max new tokens for search trials +SEARCH_MAX_CONTEXT_LENGTH: int = 40000 # Max context length for search trials +SEARCH_MAX_REQUESTS: int = 3 # Max requests per search trial +OPTIMAL_CONFIGS_DIR: str = "/data/apdesai/DO_NOT_DELETE/magicpig_optimization" # Directory for storing optimal configurations +RAY_RESULTS_DIR: str = "/tmp/ray_results" # Directory for Ray Tune results SEARCH_TIMEOUT: int = 900 # Timeout per search trial in seconds ACTORS_PER_GPU: int = 1 # Number of actors per GPU for resource allocation -MEMORY_OBJECTIVE: Optional[str] = None # Memory objective parameter (e.g., "memory_32") for configs that need it -# Config builder configuration -BUILDER_NAMES: List[str] = ["dense", "oracle_topk"] # Specify which builders to use (e.g., ["magicpig"], ["dense"], ["double_sparsity"]) + +""" DRY RUN +if true , it will do everything except the actual running of benchmark helper -- it will just return +randomly generated scores for each trial and choose based on that +""" +DRY_RUN: bool = False + + +""" If you use Time stamp then by default it will perform entire search again. 
+""" +USE_TIMESTAMP_FOR_RESULTS_DIR: bool = False +FORCE_SEARCH: bool = False # Force re-run of search even if configs exist diff --git a/benchmark/raytune/benchmark_helper.py b/benchmark/raytune/benchmark_helper.py index 57708fa5..149efedc 100644 --- a/benchmark/raytune/benchmark_helper.py +++ b/benchmark/raytune/benchmark_helper.py @@ -21,6 +21,8 @@ from sparse_attention_hub.adapters.huggingface import ModelAdapterHF from sparse_attention_hub.metric_logging.logger import MicroMetricLogger from config_builders.utility import OBJECTIVE_FUNCTIONS +from OPTIMIZATION_EXPERIMENT import DRY_RUN +import random class BenchmarkHelper: @@ -30,7 +32,10 @@ class BenchmarkHelper: sparse attention configuration and returning the evaluation metrics (score, density, error). """ - def __init__(self, config: Dict[str, any]) -> None: + def __init__(self, + base_result_dir: Path, + generation_kwargs: Dict[str, any], + request_kwargs: Dict[str, any]) -> None: """Initialize the benchmark helper with configuration. 
Args: @@ -41,26 +46,14 @@ def __init__(self, config: Dict[str, any]) -> None: - search_max_requests: Maximum requests per trial - objective_function: Name of objective function to use """ - self.config: Dict[str, any] = config - self.base_result_dir: Path = Path(config["search_result_dir"]) + self.base_result_dir: Path = base_result_dir self.adapter_config: AdapterConfig = AdapterConfig( adapter_name="huggingface", model_kwargs={"torch_dtype": torch.bfloat16}, tokenizer_kwargs={"padding_side": "left"}, ) - self.generation_kwargs: Dict[str, any] = { - "max_new_tokens": config["search_max_new_tokens"], - "do_sample": False - } - self.request_kwargs: Dict[str, any] = { - "max_context_length": config["search_max_context_length"], - "max_requests": config["search_max_requests"], - } - - # Get objective function - self.objective_name: str = config.get("objective_function", "default") - self.objective_function = OBJECTIVE_FUNCTIONS.get(self.objective_name, OBJECTIVE_FUNCTIONS["default"]) - logging.info(f"Using objective function: {self.objective_name}") + self.generation_kwargs: Dict[str, any] = generation_kwargs + self.request_kwargs: Dict[str, any] = request_kwargs def __call__(self, attention_config: any, task_name: str, model_name: str) -> Tuple[float, float, float]: """Run benchmark and return (score, density, error) tuple. @@ -82,14 +75,25 @@ def __call__(self, attention_config: any, task_name: str, model_name: str) -> Tu if not attention_config.validity_constraint(attention_config): logging.info(f"Config failed validity constraint, returning penalty score") return 100.0, 1.0, 1.0 # Penalty score, worst density, worst error + else: + raise ValueError(f"No validity constraint found for attention configuration: {attention_config}. If there is no validity constraint . 
just set lambda: True in builder.") + + if hasattr(attention_config, 'objective') and attention_config.objective is not None: + objective_function = OBJECTIVE_FUNCTIONS[attention_config.objective] + logging.info(f"Using objective function: {objective_function.__name__} for attention configuration: {attention_config}") + else: + raise ValueError(f"No objective function found for attention configuration: {attention_config}. If config is objective agnostic just set default in builder.") + if DRY_RUN: + return random.random(), random.random(), random.random() + benchmark_name: str subset_name: str | None benchmark_name, subset_name = task_name.split("/", 1) if "/" in task_name else (task_name, None) # Create result directory for this specific run - result_dir: Path = self.base_result_dir / f"{model_name}_{task_name}_{hash(str(attention_config)) % 1000000}" - result_dir.mkdir(parents=True, exist_ok=True) + result_dir: Path = os.path.join(self.base_result_dir, f"{model_name}_{task_name}_{hash(str(attention_config)) % 1000000}") + os.makedirs(result_dir, exist_ok=True) # Create model adapter adapter: ModelAdapterHF = ModelAdapterHF( @@ -134,10 +138,10 @@ def __call__(self, attention_config: any, task_name: str, model_name: str) -> Tu score: float = 100.0 # Small baseline score for dense else: # Use the selected objective function - score = self.objective_function(error, density) + score = objective_function(error, density) # Also print to stdout so the test script can detect it - print(f"Objective: {self.objective_name}, Error: {error:.4f}, Density: {density:.4f}, Score: {score:.4f}") - logging.info(f"Objective: {self.objective_name}, Error: {error:.4f}, Density: {density:.4f}, Score: {score:.4f}") + print(f"Objective: {objective_function.__name__}, Error: {error:.4f}, Density: {density:.4f}, Score: {score:.4f}") + logging.info(f"Objective: {objective_function.__name__}, Error: {error:.4f}, Density: {density:.4f}, Score: {score:.4f}") return score, density, error @@ 
-159,8 +163,8 @@ def _extract_micro_metrics(self, result_dir: Path) -> Dict[str, float]: - attention_error: Average attention output error (0.0 to 1.0) - density: Average attention density (0.0 to 1.0) """ - micro_metrics_file: Path = result_dir / "micro_metrics.jsonl" - if not micro_metrics_file.exists(): + micro_metrics_file: Path = os.path.join(result_dir, "micro_metrics.jsonl") + if not os.path.exists(micro_metrics_file): # For dense configuration, micro_metrics.jsonl won't exist since no sparse attention is used # Return default values: 0 error (perfect) and 1.0 density (fully dense) logging.info(f"micro_metrics.jsonl not found in {result_dir}, using dense defaults") diff --git a/benchmark/raytune/config_builders/dense.py b/benchmark/raytune/config_builders/dense.py index 88cbf003..d084e94b 100644 --- a/benchmark/raytune/config_builders/dense.py +++ b/benchmark/raytune/config_builders/dense.py @@ -1,6 +1,6 @@ """Configuration builder for dense (no sparse attention) model.""" -from typing import List, Optional, Tuple +from typing import List, Optional, Tuple, Dict from sparse_attention_hub.sparse_attention.research_attention import ResearchAttentionConfig @@ -14,17 +14,18 @@ class DenseConfigBuilder(BaseConfigBuilder): def build_configs( self, - weight_file: Optional[str] = None, - objective: str = "default", + model_config: Dict[str, str], + sparsity_objectives: List[int], + memory_objectives: List[int], **kwargs ) -> Tuple[List[Tuple[str, Optional[ResearchAttentionConfig], Optional[List]]], List[Tuple[str, Optional[ResearchAttentionConfig], Optional[List]]]]: """Get dense baseline configuration. - - Returns list of (name, full_config, masker_classes) tuples. - - For dense models, sparse_config and masker_classes are None to indicate - no sparse attention is used. 
+ + Ignores: + sparsity_objectives: List[int] - List of sparsity objectives + memory_objectives: List[int] - List of memory objectives + model_config: Dict[str, str] - Model configuration Returns: Tuple of (optimal_configs, to_optimize_configs) @@ -33,6 +34,8 @@ def build_configs( to_optimize_configs: List[Tuple[str, Optional[ResearchAttentionConfig], Optional[List]]] = [] # Dense baseline: no sparse attention, so sparse_config and masker_classes are None + # Since dense doesn't depend on sparsity or memory objectives, we just return a single config + # with None values (no sparse attention configuration needed) optimal_configs.append(("dense", None, None)) return optimal_configs, to_optimize_configs diff --git a/benchmark/raytune/config_builders/double_sparsity.py b/benchmark/raytune/config_builders/double_sparsity.py index c6bb84ae..8b3d2ce9 100644 --- a/benchmark/raytune/config_builders/double_sparsity.py +++ b/benchmark/raytune/config_builders/double_sparsity.py @@ -1,7 +1,8 @@ """Configuration builder for DoubleSparsity attention.""" -from typing import List, Optional, Tuple - +from functools import partial +from typing import List, Optional, Tuple, Dict +import os from ray import tune from sparse_attention_hub.sparse_attention.research_attention import ResearchAttentionConfig @@ -15,6 +16,13 @@ from .factory import register_builder from .utility import get_masker_list_name +from logging import getLogger +logger = getLogger(__name__) + +def _validity_check(config: ResearchAttentionConfig, mem_obj: int) -> bool: + """Check if the config meets the memory objective constraint.""" + return (128 // config.masker_configs[2].group_factor) * config.masker_configs[2].label_bits == mem_obj + @register_builder("double_sparsity") class DoubleSparsityConfigBuilder(BaseConfigBuilder): @@ -22,58 +30,59 @@ class DoubleSparsityConfigBuilder(BaseConfigBuilder): def build_configs( self, - weight_file: Optional[str] = None, - objective: str = "default", - memory_objective: 
Optional[str] = None, + model_config: Dict[str, str], + sparsity_objectives: List[int], + memory_objectives: List[int], **kwargs ) -> Tuple[List[Tuple[str, Optional[ResearchAttentionConfig], Optional[List]]], List[Tuple[str, Optional[ResearchAttentionConfig], Optional[List]]]]: """Get all double sparsity attention configurations. + + Uses: + sparsity_objectives: List[int] - List of sparsity objectives to build the configurations. + memory_objectives: List[int] - List of memory objectives to build the configurations. + model_config: Dict[str, str] - Model configuration - Returns list of (name, full_config, masker_classes) tuples. - - Note: The configs returned here are only used to determine which masker classes - to use. The actual parameter values will be determined by Ray Tune search. - - Args: - weight_file: Path to weight file (required but not used for DoubleSparsity) - objective: Objective function name (e.g., "sparsity_5") - memory_objective: Memory objective parameter (e.g., "32") - required - **kwargs: Additional parameters - Returns: Tuple of (optimal_configs, to_optimize_configs) """ - assert weight_file is not None, "Weight file is required for HashAttention Masker" - assert memory_objective is not None, "memory_objective is required for get_double_sparsity_configs" - optimal_configs: List[Tuple[str, Optional[ResearchAttentionConfig], Optional[List]]] = [] to_optimize_configs: List[Tuple[str, Optional[ResearchAttentionConfig], Optional[List]]] = [] + + if model_config["double_sparsity_config_file"] is None or not os.path.exists(model_config["double_sparsity_config_file"]): + logger.warning(f"Double sparsity config file {model_config['double_sparsity_config_file']} for model {model_config['model_name']} does not exist. 
Skipping Double Sparsity configurations.") + return optimal_configs, to_optimize_configs - heavy_size: float = float(objective.split("_")[1]) / 100.0 - (256.0 / 32768) - aux_mem: int = int(memory_objective) - - classes = [SinkMaskerConfig, LocalMaskerConfig, DoubleSparsityTopKMaskerConfig] - name: str = get_masker_list_name(classes, other_params={"heavy_size": heavy_size, "aux_mem": aux_mem}) + for sparsity_objective in sparsity_objectives: + for memory_objective in memory_objectives: + heavy_size: float = float(sparsity_objective) / 100.0 + aux_mem: int = memory_objective + + classes = [SinkMaskerConfig, LocalMaskerConfig, DoubleSparsityTopKMaskerConfig] + name: str = get_masker_list_name(classes, other_params={"builder": "double_sparsity", "sparsity_obj": sparsity_objective, "memory_obj": memory_objective}) - config = ResearchAttentionConfig(masker_configs=[ - SinkMaskerConfig(sink_size=128), - LocalMaskerConfig(window_size=128), - DoubleSparsityTopKMaskerConfig( - heavy_size=heavy_size, - group_factor=8, - label_bits=2, - sorted_channel_file="/data/apdesai/code/DoubleSparse/config/meta-llama/Llama-3.1-8B-Instruct.json", - channel_selection="q_proj"), - ]) - - config.masker_configs[2].search_space = { - "channel_selection": tune.grid_search(["q_proj", "qk_proj"]), - "group_factor": tune.grid_search([2, 4, 8, 16]), - "label_bits": tune.grid_search([1, 2, 4, 8, 16]), - } - config.validity_constraint = lambda config: ((128 // config.masker_configs[2].group_factor) * config.masker_configs[2].label_bits == aux_mem) - to_optimize_configs.append((name, config, classes)) + config = ResearchAttentionConfig(masker_configs=[ + SinkMaskerConfig(sink_size=128), + LocalMaskerConfig(window_size=128), + DoubleSparsityTopKMaskerConfig( + heavy_size=heavy_size - (256.0 / 32768), + group_factor=8, + label_bits=2, + sorted_channel_file=model_config["double_sparsity_config_file"], + channel_selection="q_proj"), + ]) + + config.masker_configs[2].search_space = { + "channel_selection": 
tune.grid_search(["q_proj"]), + "group_factor": tune.grid_search([2, 4, 8, 16]), + "label_bits": tune.grid_search([1, 2, 4, 8, 16]), + } + # Set validity constraint to use the correct memory_objective for comparison + config.validity_constraint = partial(_validity_check, mem_obj=aux_mem) + # Set objective function + config.objective = sparsity_objective + + to_optimize_configs.append((name, config, classes)) return optimal_configs, to_optimize_configs diff --git a/benchmark/raytune/config_builders/factory.py b/benchmark/raytune/config_builders/factory.py index 7e97eec4..7560279a 100644 --- a/benchmark/raytune/config_builders/factory.py +++ b/benchmark/raytune/config_builders/factory.py @@ -59,9 +59,10 @@ def get_all_config_builders() -> Dict[str, BaseConfigBuilder]: def build_all_configs( - weight_file: Optional[str] = None, - objective: str = "default", - builder_names: Optional[List[str]] = None, + model_config: Dict[str, str], + sparsity_objectives: List[int], + memory_objectives: List[int], + builder_names: List[str], **kwargs ) -> Tuple[List[Tuple[str, Optional[ResearchAttentionConfig], Optional[List]]], List[Tuple[str, Optional[ResearchAttentionConfig], Optional[List]]]]: @@ -76,18 +77,16 @@ def build_all_configs( Returns: Tuple of (optimal_configs, to_optimize_configs) aggregated from all builders """ - if builder_names is None: - builders = get_all_config_builders() - else: - builders = {name: get_config_builder(name) for name in builder_names} + builders: Dict[str, BaseConfigBuilder] = {name: get_config_builder(name) for name in builder_names} all_optimal_configs: List[Tuple[str, Optional[ResearchAttentionConfig], Optional[List]]] = [] all_to_optimize_configs: List[Tuple[str, Optional[ResearchAttentionConfig], Optional[List]]] = [] for builder_name, builder in builders.items(): optimal_configs, to_optimize_configs = builder.build_configs( - weight_file=weight_file, - objective=objective, + model_config=model_config, + 
sparsity_objectives=sparsity_objectives, + memory_objectives=memory_objectives, **kwargs ) all_optimal_configs.extend(optimal_configs) diff --git a/benchmark/raytune/config_builders/hashattention_topk.py b/benchmark/raytune/config_builders/hashattention_topk.py index 2deddc5e..eabf1e5b 100644 --- a/benchmark/raytune/config_builders/hashattention_topk.py +++ b/benchmark/raytune/config_builders/hashattention_topk.py @@ -1,6 +1,6 @@ """Configuration builder for HashAttention TopK attention.""" -from typing import List, Optional, Tuple +from typing import List, Optional, Tuple, Dict from sparse_attention_hub.sparse_attention.research_attention import ResearchAttentionConfig from sparse_attention_hub.sparse_attention.research_attention.maskers.fixed.implementations import ( @@ -12,6 +12,10 @@ from .base import BaseConfigBuilder from .factory import register_builder from .utility import get_masker_list_name +import os +import logging + +logger = logging.getLogger(__name__) @register_builder("hashattention_topk") @@ -20,29 +24,38 @@ class HashAttentionTopKConfigBuilder(BaseConfigBuilder): def build_configs( self, - weight_file: Optional[str] = None, - objective: str = "default", + model_config: Dict[str, str], + sparsity_objectives: List[int], + memory_objectives: List[int], **kwargs ) -> Tuple[List[Tuple[str, Optional[ResearchAttentionConfig], Optional[List]]], List[Tuple[str, Optional[ResearchAttentionConfig], Optional[List]]]]: """Get all HashAttention TopK attention configurations. - - Returns list of (name, full_config, masker_classes) tuples. - - Note: The configs returned here are only used to determine which masker classes - to use. The actual parameter values will be determined by Ray Tune search. + + Uses: + sparsity_objectives: List[int] - List of sparsity objectives to build the configurations. 
+ model_config: Dict[str, str] - Model configuration (hash_attention_weight_file extracted from it) + + Ignores: + memory_objectives: List[int] - List of memory objectives Returns: Tuple of (optimal_configs, to_optimize_configs) """ - assert weight_file is not None, "Weight file is required for HashAttention Masker" + weight_file: str = model_config.get("hash_attention_weight_file") + optimal_configs: List[Tuple[str, Optional[ResearchAttentionConfig], Optional[List]]] = [] to_optimize_configs: List[Tuple[str, Optional[ResearchAttentionConfig], Optional[List]]] = [] - for heavy_size in [0.02, 0.05, 0.1, 0.2]: + if not weight_file or not os.path.isfile(weight_file): + logger.warning(f"Weight file {weight_file} for model {model_config['model_name']} does not exist. Skipping HashAttention TopK configurations.") + return optimal_configs, to_optimize_configs + + for sparsity_objective in sparsity_objectives: + heavy_size: float = float(sparsity_objective) / 100.0 classes = [SinkMaskerConfig, LocalMaskerConfig, HashAttentionTopKMaskerConfig] - name: str = get_masker_list_name(classes, other_params={"heavy_size": heavy_size}) + name: str = get_masker_list_name(classes, other_params={"builder": "hashattention_topk", "sparsity_obj": sparsity_objective}) config = ResearchAttentionConfig(masker_configs=[ SinkMaskerConfig(sink_size=128), @@ -56,6 +69,11 @@ def build_configs( hat_weight_file=weight_file ), ]) + # Set validity to default (doesn't depend on memory objectives) + config.validity_constraint = lambda config: True + # Set objective function + config.objective = sparsity_objective + optimal_configs.append((name, config, classes)) return optimal_configs, to_optimize_configs diff --git a/benchmark/raytune/config_builders/magicpig.py b/benchmark/raytune/config_builders/magicpig.py index 9d43b76e..45f1bb40 100644 --- a/benchmark/raytune/config_builders/magicpig.py +++ b/benchmark/raytune/config_builders/magicpig.py @@ -1,6 +1,6 @@ """Configuration builder for MagicPig 
attention.""" -from typing import List, Optional, Tuple +from typing import List, Optional, Tuple, Dict from ray import tune @@ -24,17 +24,19 @@ class MagicPigConfigBuilder(BaseConfigBuilder): def build_configs( self, - weight_file: Optional[str] = None, - objective: str = "default", + model_config: Dict[str, str], + sparsity_objectives: List[int], + memory_objectives: List[int], **kwargs ) -> Tuple[List[Tuple[str, Optional[ResearchAttentionConfig], Optional[List]]], List[Tuple[str, Optional[ResearchAttentionConfig], Optional[List]]]]: """Get all MagicPig attention configurations. - - Returns list of (name, full_config, masker_classes) tuples. - - Note: The configs returned here are only used to determine which masker classes - to use. The actual parameter values will be determined by Ray Tune search. + + Uses: + sparsity_objectives: List[int] - List of sparsity objectives to build the configurations. + Ignores: + memory_objectives: List[int] - List of memory objectives + model_config: Dict[str, str] - Model configuration Returns: Tuple of (optimal_configs, to_optimize_configs) @@ -42,24 +44,31 @@ def build_configs( optimal_configs: List[Tuple[str, Optional[ResearchAttentionConfig], Optional[List]]] = [] to_optimize_configs: List[Tuple[str, Optional[ResearchAttentionConfig], Optional[List]]] = [] - classes = [SinkMaskerConfig, LocalMaskerConfig, MagicPigConfig] - name: str = get_masker_list_name(classes, other_params={"objective": objective}) - - config = ResearchAttentionConfig(masker_configs=[ - SinkMaskerConfig(sink_size=128), - LocalMaskerConfig(window_size=128), - MagicPigConfig( - lsh_l=8, # Default value from search space - lsh_k=64 # Default value from search space - ) - ]) - - # Set up search space for LSH parameters - config.masker_configs[2].search_space = { - "lsh_l": tune.grid_search([16, 32, 64, 128]), - "lsh_k": tune.grid_search([2, 4, 8, 16, 32]), - } + for sparsity_objective in sparsity_objectives: + classes = [SinkMaskerConfig, LocalMaskerConfig, 
MagicPigConfig] + name: str = get_masker_list_name(classes, other_params={"builder": "magicpig", "sparsity_obj": sparsity_objective}) + + config = ResearchAttentionConfig(masker_configs=[ + SinkMaskerConfig(sink_size=128), + LocalMaskerConfig(window_size=128), + MagicPigConfig( + lsh_l=8, # Default value from search space + lsh_k=64 # Default value from search space + ) + ]) + + # Set up search space for LSH parameters + config.masker_configs[2].search_space = { + "lsh_l": tune.grid_search([16, 32, 64, 128]), + "lsh_k": tune.grid_search([2, 4, 8, 16]), + } + + # Set validity to default (doesn't depend on memory objectives) + config.validity_constraint = lambda config: True + # Set objective function + config.objective = sparsity_objective + + to_optimize_configs.append((name, config, classes)) - to_optimize_configs.append((name, config, classes)) return optimal_configs, to_optimize_configs diff --git a/benchmark/raytune/config_builders/oracle_topk.py b/benchmark/raytune/config_builders/oracle_topk.py index 54d98408..2504e88c 100644 --- a/benchmark/raytune/config_builders/oracle_topk.py +++ b/benchmark/raytune/config_builders/oracle_topk.py @@ -1,6 +1,6 @@ """Configuration builder for Oracle TopK attention.""" -from typing import List, Optional, Tuple +from typing import List, Optional, Tuple, Dict from ray import tune @@ -22,17 +22,19 @@ class OracleTopKConfigBuilder(BaseConfigBuilder): def build_configs( self, - weight_file: Optional[str] = None, - objective: str = "default", + model_config: Dict[str, str], + sparsity_objectives: List[int], + memory_objectives: List[int], **kwargs ) -> Tuple[List[Tuple[str, Optional[ResearchAttentionConfig], Optional[List]]], List[Tuple[str, Optional[ResearchAttentionConfig], Optional[List]]]]: - """Get all Oracle TopK attention configurations. - - Returns list of (name, full_config, masker_classes) tuples. - - Note: The configs returned here are only used to determine which masker classes - to use. 
The actual parameter values will be determined by Ray Tune search. + """Get all Oracle TopK attention configurations based on the sparsity and memory objectives. + + Uses: + sparsity_objectives: List[int] - List of sparsity objectives to build the configurations. + Ignores: + memory_objectives: List[int] - List of memory objectives + model_config: Dict[str, str] - Model configuration Returns: Tuple of (optimal_configs, to_optimize_configs) @@ -40,15 +42,22 @@ def build_configs( optimal_configs: List[Tuple[str, Optional[ResearchAttentionConfig], Optional[List]]] = [] to_optimize_configs: List[Tuple[str, Optional[ResearchAttentionConfig], Optional[List]]] = [] - for heavy_size in [0.02, 0.05, 0.1, 0.2]: + + for sparsity_objective in sparsity_objectives: + heavy_size = float(sparsity_objective) / 100.0 classes = [SinkMaskerConfig, LocalMaskerConfig, OracleTopKConfig] - name: str = get_masker_list_name(classes, other_params={"heavy_size": heavy_size}) + name: str = get_masker_list_name(classes, other_params={"builder": "oracle_topk", "sparsity_obj": sparsity_objective}) config = ResearchAttentionConfig(masker_configs=[ SinkMaskerConfig(sink_size=128), LocalMaskerConfig(window_size=128), - OracleTopKConfig(heavy_size=heavy_size - (256.0 / 32768)), # Default value + OracleTopKConfig(heavy_size=heavy_size - (256.0 / 32768)), ]) + # set validity to default + config.validity_constraint = lambda config: True + # set objective function + config.objective = sparsity_objective + optimal_configs.append((name, config, classes)) return optimal_configs, to_optimize_configs diff --git a/benchmark/raytune/config_builders/oracle_topp.py b/benchmark/raytune/config_builders/oracle_topp.py index 7bdb6ca5..738c77c4 100644 --- a/benchmark/raytune/config_builders/oracle_topp.py +++ b/benchmark/raytune/config_builders/oracle_topp.py @@ -1,6 +1,6 @@ """Configuration builder for Oracle TopP attention.""" -from typing import List, Optional, Tuple +from typing import List, Optional, Tuple, Dict 
from ray import tune @@ -22,17 +22,19 @@ class OracleTopPConfigBuilder(BaseConfigBuilder): def build_configs( self, - weight_file: Optional[str] = None, - objective: str = "default", + model_config: Dict[str, str], + sparsity_objectives: List[int], + memory_objectives: List[int], **kwargs ) -> Tuple[List[Tuple[str, Optional[ResearchAttentionConfig], Optional[List]]], List[Tuple[str, Optional[ResearchAttentionConfig], Optional[List]]]]: """Get all Oracle TopP attention configurations. - - Returns list of (name, full_config, masker_classes) tuples. - - Note: The configs returned here are only used to determine which masker classes - to use. The actual parameter values will be determined by Ray Tune search. + + Uses: + sparsity_objectives: List[int] - List of sparsity objectives to build the configurations. + Ignores: + memory_objectives: List[int] - List of memory objectives + model_config: Dict[str, str] - Model configuration Returns: Tuple of (optimal_configs, to_optimize_configs) @@ -40,21 +42,28 @@ def build_configs( optimal_configs: List[Tuple[str, Optional[ResearchAttentionConfig], Optional[List]]] = [] to_optimize_configs: List[Tuple[str, Optional[ResearchAttentionConfig], Optional[List]]] = [] - classes = [SinkMaskerConfig, LocalMaskerConfig, OracleTopPMaskerConfig] - name: str = get_masker_list_name(classes, other_params={"objective": objective}) - - config = ResearchAttentionConfig(masker_configs=[ - SinkMaskerConfig(sink_size=128), - LocalMaskerConfig(window_size=128), - OracleTopPMaskerConfig(top_p=0.7) # Default middle value from search space - ]) - - # Set up search space for top_p parameter - # Using the default search space from OracleTopPMaskerConfig - config.masker_configs[2].search_space = { - "top_p": tune.grid_search([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.85, 0.9, 0.95, 0.98, 0.99]), - } + for sparsity_objective in sparsity_objectives: + classes = [SinkMaskerConfig, LocalMaskerConfig, OracleTopPMaskerConfig] + name: str = 
get_masker_list_name(classes, other_params={"builder": "oracle_topp", "sparsity_obj": sparsity_objective}) + + config = ResearchAttentionConfig(masker_configs=[ + SinkMaskerConfig(sink_size=128), + LocalMaskerConfig(window_size=128), + OracleTopPMaskerConfig(top_p=0.7) # Default middle value from search space + ]) + + # Set up search space for top_p parameter + # Using the default search space from OracleTopPMaskerConfig + config.masker_configs[2].search_space = { + "top_p": tune.grid_search([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.85, 0.9, 0.95, 0.98, 0.99]), + } + + # Set validity to default (doesn't depend on memory objectives) + config.validity_constraint = lambda config: True + # Set objective function + config.objective = sparsity_objective + + to_optimize_configs.append((name, config, classes)) - to_optimize_configs.append((name, config, classes)) return optimal_configs, to_optimize_configs diff --git a/benchmark/raytune/config_builders/quest_top_k.py b/benchmark/raytune/config_builders/quest_top_k.py index af21357e..da65d86f 100644 --- a/benchmark/raytune/config_builders/quest_top_k.py +++ b/benchmark/raytune/config_builders/quest_top_k.py @@ -1,6 +1,7 @@ """Configuration builder for Quest TopK attention.""" -from typing import List, Optional, Tuple +from functools import partial +from typing import List, Optional, Tuple, Dict from ray import tune @@ -16,64 +17,64 @@ from .utility import get_masker_list_name -@register_builder("quest_top_k") +def _validity_check(config: ResearchAttentionConfig, mem_obj: int) -> bool: + """Check if the config meets the memory objective constraint.""" + return mem_obj == 2 * (128 * config.masker_configs[2].label_bits) / config.masker_configs[2].page_size + + +@register_builder("quest_topk") class QuestTopKConfigBuilder(BaseConfigBuilder): """Builder for Quest TopK sparse attention configurations.""" def build_configs( self, - weight_file: Optional[str] = None, - objective: str = "default", - memory_objective: Optional[str] 
= None, + model_config: Dict[str, str], + sparsity_objectives: List[int], + memory_objectives: List[int], **kwargs ) -> Tuple[List[Tuple[str, Optional[ResearchAttentionConfig], Optional[List]]], List[Tuple[str, Optional[ResearchAttentionConfig], Optional[List]]]]: """Get all Quest TopK attention configurations. + + Uses: + sparsity_objectives: List[int] - List of sparsity objectives to build the configurations. + memory_objectives: List[int] - List of memory objectives to build the configurations. + Ignores: + model_config: Dict[str, str] - Model configuration - Returns list of (name, full_config, masker_classes) tuples. - - Note: The configs returned here are only used to determine which masker classes - to use. The actual parameter values will be determined by Ray Tune search. - - Args: - weight_file: Path to weight file (required but not used for QuestTopK) - objective: Objective function name (e.g., "sparsity_5") - memory_objective: Memory objective parameter (e.g., "32") - required - **kwargs: Additional parameters - Returns: Tuple of (optimal_configs, to_optimize_configs) """ - assert weight_file is not None, "Weight file is required for QuestTopK Masker" - assert memory_objective is not None, "memory_objective is required for get_quest_top_k_configs" - optimal_configs: List[Tuple[str, Optional[ResearchAttentionConfig], Optional[List]]] = [] to_optimize_configs: List[Tuple[str, Optional[ResearchAttentionConfig], Optional[List]]] = [] - heavy_size: float = float(objective.split("_")[1]) / 100.0 - (256.0 / 32768) - aux_mem: int = int(memory_objective) - - classes = [SinkMaskerConfig, LocalMaskerConfig, QuestTopKMaskerConfig] - name: str = get_masker_list_name(classes, other_params={"heavy_size": heavy_size, "aux_mem": aux_mem}) + for sparsity_objective in sparsity_objectives: + for memory_objective in memory_objectives: + heavy_size: float = float(sparsity_objective) / 100.0 - (256.0 / 32768) + aux_mem: int = memory_objective + + classes = [SinkMaskerConfig, 
LocalMaskerConfig, QuestTopKMaskerConfig] + name: str = get_masker_list_name(classes, other_params={"builder": "quest_topk", "sparsity_obj": sparsity_objective, "memory_obj": memory_objective}) - config = ResearchAttentionConfig(masker_configs=[ - SinkMaskerConfig(sink_size=128), - LocalMaskerConfig(window_size=128), - QuestTopKMaskerConfig( - heavy_size=heavy_size, - page_size=128, - label_bits=16), - ]) - - config.masker_configs[2].search_space = { - "page_size": tune.grid_search([8, 16, 32, 64, 128]), - "label_bits": tune.grid_search([2, 4, 8, 16]), - } - # Memory constraint: similar to double_sparsity pattern - # For quest_top_k, memory usage depends on page_size and label_bits - # Adjust this constraint based on actual memory requirements - config.validity_constraint = lambda config: (aux_mem == 2 * (128 * config.masker_configs[2].label_bits) / config.masker_configs[2].page_size ) - to_optimize_configs.append((name, config, classes)) + config = ResearchAttentionConfig(masker_configs=[ + SinkMaskerConfig(sink_size=128), + LocalMaskerConfig(window_size=128), + QuestTopKMaskerConfig( + heavy_size=heavy_size - (256.0 / 32768), + page_size=128, + label_bits=16), + ]) + + config.masker_configs[2].search_space = { + "page_size": tune.grid_search([8, 16, 32, 64, 128]), + "label_bits": tune.grid_search([2, 4, 8, 16]), + } + # Set validity constraint to use the correct memory_objective for comparison + config.validity_constraint = partial(_validity_check, mem_obj=aux_mem) + # Set objective function + config.objective = sparsity_objective + + to_optimize_configs.append((name, config, classes)) return optimal_configs, to_optimize_configs diff --git a/benchmark/raytune/config_builders/random_sampling.py b/benchmark/raytune/config_builders/random_sampling.py index ccef0416..837038f4 100644 --- a/benchmark/raytune/config_builders/random_sampling.py +++ b/benchmark/raytune/config_builders/random_sampling.py @@ -1,6 +1,6 @@ """Configuration builder for Random Sampling 
attention.""" -from typing import List, Optional, Tuple +from typing import List, Optional, Tuple, Dict from ray import tune @@ -24,17 +24,19 @@ class RandomSamplingConfigBuilder(BaseConfigBuilder): def build_configs( self, - weight_file: Optional[str] = None, - objective: str = "default", + model_config: Dict[str, str], + sparsity_objectives: List[int], + memory_objectives: List[int], **kwargs ) -> Tuple[List[Tuple[str, Optional[ResearchAttentionConfig], Optional[List]]], List[Tuple[str, Optional[ResearchAttentionConfig], Optional[List]]]]: """Get all Random Sampling attention configurations. - - Returns list of (name, full_config, masker_classes) tuples. - - Note: The configs returned here are only used to determine which masker classes - to use. The actual parameter values will be determined by Ray Tune search. + + Uses: + sparsity_objectives: List[int] - List of sparsity objectives to build the configurations. + Ignores: + memory_objectives: List[int] - List of memory objectives + model_config: Dict[str, str] - Model configuration Returns: Tuple of (optimal_configs, to_optimize_configs) @@ -44,14 +46,19 @@ def build_configs( classes = [SinkMaskerConfig, LocalMaskerConfig, RandomSamplingMaskerConfig] - - for budget_size in [0.02, 0.05, 0.1, 0.2]: - name: str = get_masker_list_name(classes, other_params={"budget_size": budget_size}) + for sparsity_objective in sparsity_objectives: + budget_size: float = float(sparsity_objective) / 100.0 + name: str = get_masker_list_name(classes, other_params={"builder": "random_sampling", "sparsity_obj": sparsity_objective}) config = ResearchAttentionConfig(masker_configs=[ SinkMaskerConfig(sink_size=128), # Middle value from search space LocalMaskerConfig(window_size=128), # Middle value from search space - RandomSamplingMaskerConfig(sampling_rate=budget_size- (256.0 / 32768)) # Middle value from search space + RandomSamplingMaskerConfig(sampling_rate=budget_size - (256.0 / 32768)) # Middle value from search space ]) + # Set 
validity to default (doesn't depend on memory objectives) + config.validity_constraint = lambda config: True + # Set objective function + config.objective = sparsity_objective + optimal_configs.append((name, config, classes)) return optimal_configs, to_optimize_configs diff --git a/benchmark/raytune/config_builders/utility.py b/benchmark/raytune/config_builders/utility.py index 62350fc0..ad638599 100644 --- a/benchmark/raytune/config_builders/utility.py +++ b/benchmark/raytune/config_builders/utility.py @@ -67,13 +67,13 @@ def objective(error: float, density: float) -> float: # Pre-defined objective functions for common sparsity levels OBJECTIVE_FUNCTIONS = { - "sparsity_2": create_sparsity_objective(0.02), - "sparsity_5": create_sparsity_objective(0.05), - "sparsity_10": create_sparsity_objective(0.10), - "sparsity_15": create_sparsity_objective(0.15), - "sparsity_20": create_sparsity_objective(0.20), - "sparsity_25": create_sparsity_objective(0.25), - "default": lambda error, density: error + 0.1 * density + (5.0 if density > 0.5 else 0.0), + 2: create_sparsity_objective(0.02), + 5: create_sparsity_objective(0.05), + 10: create_sparsity_objective(0.10), + 15: create_sparsity_objective(0.15), + 20: create_sparsity_objective(0.20), + 25: create_sparsity_objective(0.25), + -1: lambda error, density: error + 0.1 * density + (5.0 if density > 0.5 else 0.0), } diff --git a/benchmark/raytune/config_builders/vattention_hashattention.py b/benchmark/raytune/config_builders/vattention_hashattention.py index 0b7fa1d8..23d6a841 100644 --- a/benchmark/raytune/config_builders/vattention_hashattention.py +++ b/benchmark/raytune/config_builders/vattention_hashattention.py @@ -1,6 +1,10 @@ -"""Configuration builder for VAttention HashAttention TopK configurations.""" +"""Configuration builder for VAttention HashAttention TopK configurations. +Currently works for 32 bits hash attention only. Need some changes to support + general bit-width hashattention in future. 
+""" -from typing import List, Optional, Tuple +from functools import partial +from typing import List, Optional, Tuple, Dict from ray import tune @@ -17,6 +21,16 @@ from .base import BaseConfigBuilder from .factory import register_builder from .utility import get_masker_list_name +import os +import logging + +logger = logging.getLogger(__name__) + +def _validity_check(config: ResearchAttentionConfig, sparsity_val: float) -> bool: + """Check if the config meets the sparsity constraint.""" + return ( + (config.masker_configs[2].heavy_size + config.masker_configs[3].base_rate_sampling) <= sparsity_val + ) @register_builder("vattention_hashattention") @@ -25,112 +39,119 @@ class VAttentionHashAttentionConfigBuilder(BaseConfigBuilder): def build_configs( self, - weight_file: Optional[str] = None, - objective: str = "default", + model_config: Dict[str, str], + sparsity_objectives: List[int], + memory_objectives: List[int], **kwargs ) -> Tuple[List[Tuple[str, Optional[ResearchAttentionConfig], Optional[List]]], List[Tuple[str, Optional[ResearchAttentionConfig], Optional[List]]]]: """Get all sparse attention configurations. + + Uses: + sparsity_objectives: List[int] - List of sparsity objectives to build the configurations. + Ignores: + model_config: Dict[str, str] - Model configuration (weight_file extracted from it) + memory_objectives: List[int] - List of memory objectives (bit-width) to build the configurations. - Returns list of (name, full_config, masker_classes) tuples. - - Note: The configs returned here are only used to determine which masker classes - to use. The actual parameter values will be determined by Ray Tune search. - - Args: - weight_file: Path to weight file (required for HashAttention) - objective: Objective function name (e.g., "sparsity_2", "sparsity_5", etc.) 
- **kwargs: Additional parameters - Returns: Tuple of (optimal_configs, to_optimize_configs) """ + weight_file: str = model_config.get("hash_attention_weight_file") assert weight_file is not None, "Weight file is required for HashAttention Masker" optimal_configs: List[Tuple[str, Optional[ResearchAttentionConfig], Optional[List]]] = [] to_optimize_configs: List[Tuple[str, Optional[ResearchAttentionConfig], Optional[List]]] = [] - - classes = [SinkMaskerConfig, LocalMaskerConfig, HashAttentionTopKMaskerConfig, AdaptiveSamplingMaskerConfig] - name: str = get_masker_list_name(classes, other_params={"objective": objective}) - config = ResearchAttentionConfig(masker_configs=[ - SinkMaskerConfig(sink_size=128), - LocalMaskerConfig(window_size=128), - HashAttentionTopKMaskerConfig( - heavy_size=0.05, # Middle value from search space - hat_bits=32, # Required parameter - hat_mlp_layers=3, # Required parameter - hat_mlp_hidden_size=128, # Required parameter - hat_mlp_activation="silu", # Required parameter - hat_weight_file=weight_file # Weight file is required - ), - AdaptiveSamplingMaskerConfig( - base_rate_sampling=0.05, # Middle value - epsilon=0.05, # Middle value - delta=0.05, # Middle value - init_offset=128, # Middle value - local_offset=128 # Middle value - ) - ]) - if objective == "sparsity_2": - # Adaptive sampling with HashAttention top k - config.masker_configs[2].search_space = { - "heavy_size": tune.grid_search([0.005, 0.01, 0.02 - (256.0 / 32768)]), - } - config.masker_configs[3].search_space = { - "base_rate_sampling": tune.grid_search([0, 0.005, 0.01]), - "epsilon": tune.grid_search([0.1, 0.2, 0.3, 0.4]), - "delta": tune.grid_search([0.1, 0.2, 0.3, 0.4]) - } + if not weight_file or not os.path.isfile(weight_file): + logger.warning(f"Weight file {weight_file} for model {model_config['model_name']} does not exist. 
Skipping HashAttention TopK configurations.") + return optimal_configs, to_optimize_configs - elif objective == "sparsity_5": - # Adaptive sampling with HashAttention top k - config.masker_configs[2].search_space = { - "heavy_size": tune.grid_search([0.01, 0.025, 0.05]), - } - config.masker_configs[3].search_space = { - "base_rate_sampling": tune.grid_search([0, 0.01, 0.02, 0.03]), - "epsilon": tune.grid_search([0.05, 0.1, 0.2, 0.3]), - "delta": tune.grid_search([0.05, 0.1, 0.2, 0.3]) - } + for sparsity_objective in sparsity_objectives: + sparsity_val: float = float(sparsity_objective) / 100.0 + classes = [SinkMaskerConfig, LocalMaskerConfig, HashAttentionTopKMaskerConfig, AdaptiveSamplingMaskerConfig] + name: str = get_masker_list_name(classes, other_params={"builder": "vattention_hashattention", "sparsity_obj": sparsity_objective}) + config = ResearchAttentionConfig(masker_configs=[ + SinkMaskerConfig(sink_size=128), + LocalMaskerConfig(window_size=128), + HashAttentionTopKMaskerConfig( + heavy_size=0.05, # Middle value from search space + hat_bits=32, # Required parameter + hat_mlp_layers=3, # Required parameter + hat_mlp_hidden_size=128, # Required parameter + hat_mlp_activation="silu", # Required parameter + hat_weight_file=weight_file # Weight file is required + ), + AdaptiveSamplingMaskerConfig( + base_rate_sampling=0.05, # Middle value + epsilon=0.05, # Middle value + delta=0.05, # Middle value + init_offset=128, # Middle value + local_offset=128 # Middle value + ) + ]) + + if sparsity_objective == 2: + # Adaptive sampling with HashAttention top k + config.masker_configs[2].search_space = { + "heavy_size": tune.grid_search([0.005, 0.01, 0.02 - (256.0 / 32768)]), + } + config.masker_configs[3].search_space = { + "base_rate_sampling": tune.grid_search([0, 0.005, 0.01]), + "epsilon": tune.grid_search([0.1, 0.2, 0.3, 0.4]), + "delta": tune.grid_search([0.1, 0.2, 0.3, 0.4]) + } - elif objective == "sparsity_10": - # Adaptive sampling with HashAttention top k - 
config.masker_configs[2].search_space = { - "heavy_size": tune.grid_search([0.025, 0.05, 0.075, 0.1]), - } - config.masker_configs[3].search_space = { - "base_rate_sampling": tune.grid_search([0, 0.025, 0.05, 0.075]), - "epsilon": tune.grid_search([0.025, 0.05, 0.075]), - "delta": tune.grid_search([0.025, 0.05, 0.075]) - } - elif objective == "sparsity_15": - # Adaptive sampling with HashAttention top k - config.masker_configs[2].search_space = { - "heavy_size": tune.grid_search([0.05, 0.1, 0.15]), - } - config.masker_configs[3].search_space = { - "base_rate_sampling": tune.grid_search([0, 0.04, 0.06, 0.1]), - "epsilon": tune.grid_search([0.01, 0.025, 0.05, 0.1]), - "delta": tune.grid_search([0.01, 0.025, 0.05, 0.1]) - } + elif sparsity_objective == 5: + # Adaptive sampling with HashAttention top k + config.masker_configs[2].search_space = { + "heavy_size": tune.grid_search([0.01, 0.025, 0.05]), + } + config.masker_configs[3].search_space = { + "base_rate_sampling": tune.grid_search([0, 0.01, 0.02, 0.03]), + "epsilon": tune.grid_search([0.05, 0.1, 0.2, 0.3]), + "delta": tune.grid_search([0.05, 0.1, 0.2, 0.3]) + } - elif objective == "sparsity_20": - # Adaptive sampling with HashAttention top k - config.masker_configs[2].search_space = { - "heavy_size": tune.grid_search([0.05, 0.1, 0.15]), - } - config.masker_configs[3].search_space = { - "base_rate_sampling": tune.grid_search([0.05, 0.1, 0.15]), - "epsilon": tune.grid_search([0.01, 0.025, 0.05, 0.1]), - "delta": tune.grid_search([0.01, 0.025, 0.05, 0.1]) - } - else: - raise ValueError(f"objective not supported: {objective}") - - sparsity = float(objective.split("_")[1]) / 100.0 - config.validity_constraint = lambda config: ((config.masker_configs[2].heavy_size + config.masker_configs[3].base_rate_sampling) <= sparsity) + elif sparsity_objective == 10: + # Adaptive sampling with HashAttention top k + config.masker_configs[2].search_space = { + "heavy_size": tune.grid_search([0.025, 0.05, 0.075, 0.1]), + } + 
config.masker_configs[3].search_space = { + "base_rate_sampling": tune.grid_search([0, 0.025, 0.05, 0.075]), + "epsilon": tune.grid_search([0.025, 0.05, 0.075]), + "delta": tune.grid_search([0.025, 0.05, 0.075]) + } + elif sparsity_objective == 15: + # Adaptive sampling with HashAttention top k + config.masker_configs[2].search_space = { + "heavy_size": tune.grid_search([0.05, 0.1, 0.15]), + } + config.masker_configs[3].search_space = { + "base_rate_sampling": tune.grid_search([0, 0.04, 0.06, 0.1]), + "epsilon": tune.grid_search([0.01, 0.025, 0.05, 0.1]), + "delta": tune.grid_search([0.01, 0.025, 0.05, 0.1]) + } - to_optimize_configs.append((name, config, classes)) + elif sparsity_objective == 20: + # Adaptive sampling with HashAttention top k + config.masker_configs[2].search_space = { + "heavy_size": tune.grid_search([0.05, 0.1, 0.15]), + } + config.masker_configs[3].search_space = { + "base_rate_sampling": tune.grid_search([0.05, 0.1, 0.15]), + "epsilon": tune.grid_search([0.01, 0.025, 0.05, 0.1]), + "delta": tune.grid_search([0.01, 0.025, 0.05, 0.1]) + } + else: + raise ValueError(f"sparsity_objective not supported: {sparsity_objective}") + + # Set validity constraint to use the correct sparsity value for comparison + config.validity_constraint = partial(_validity_check, sparsity_val=sparsity_val) + # Set objective function + config.objective = sparsity_objective + + to_optimize_configs.append((name, config, classes)) + return optimal_configs, to_optimize_configs diff --git a/benchmark/raytune/config_builders/vattention_oracle.py b/benchmark/raytune/config_builders/vattention_oracle.py index f920339c..00996751 100644 --- a/benchmark/raytune/config_builders/vattention_oracle.py +++ b/benchmark/raytune/config_builders/vattention_oracle.py @@ -1,6 +1,7 @@ """Configuration builder for VAttention Oracle TopK configurations.""" -from typing import List, Optional, Tuple +from functools import partial +from typing import List, Optional, Tuple, Dict from ray import tune 
@@ -19,111 +20,116 @@ from .utility import get_masker_list_name +def _validity_check(config: ResearchAttentionConfig, sparsity_val: float) -> bool: + """Check if the config meets the sparsity constraint.""" + return (config.masker_configs[2].heavy_size + config.masker_configs[3].base_rate_sampling) <= sparsity_val + + @register_builder("vattention_oracle") class VAttentionOracleConfigBuilder(BaseConfigBuilder): """Builder for VAttention Oracle TopK sparse attention configurations.""" def build_configs( self, - weight_file: Optional[str] = None, - objective: str = "default", + model_config: Dict[str, str], + sparsity_objectives: List[int], + memory_objectives: List[int], **kwargs ) -> Tuple[List[Tuple[str, Optional[ResearchAttentionConfig], Optional[List]]], List[Tuple[str, Optional[ResearchAttentionConfig], Optional[List]]]]: """Get all sparse attention configurations. + + Uses: + sparsity_objectives: List[int] - List of sparsity objectives to build the configurations. + Ignores: + memory_objectives: List[int] - List of memory objectives + model_config: Dict[str, str] - Model configuration - Returns list of (name, full_config, masker_classes) tuples. - - Note: The configs returned here are only used to determine which masker classes - to use. The actual parameter values will be determined by Ray Tune search. - - Args: - weight_file: Path to weight file (required but not used for this config) - objective: Objective function name (e.g., "sparsity_2", "sparsity_5", etc.) 
- **kwargs: Additional parameters - Returns: Tuple of (optimal_configs, to_optimize_configs) """ - assert weight_file is not None, "Weight file is required for HashAttention Masker" - optimal_configs: List[Tuple[str, Optional[ResearchAttentionConfig], Optional[List]]] = [] to_optimize_configs: List[Tuple[str, Optional[ResearchAttentionConfig], Optional[List]]] = [] - classes = [SinkMaskerConfig, LocalMaskerConfig, OracleTopKConfig, AdaptiveSamplingMaskerConfig] - name: str = get_masker_list_name(classes, other_params={"objective": objective}) - config = ResearchAttentionConfig(masker_configs=[ - SinkMaskerConfig(sink_size=128), - LocalMaskerConfig(window_size=128), - OracleTopKConfig(heavy_size=0.05), # Middle value from search space - AdaptiveSamplingMaskerConfig( - base_rate_sampling=0.05, # Middle value - epsilon=0.05, # Middle value - delta=0.05, # Middle value - init_offset=128, # Middle value - local_offset=128 # Middle value - ) - ]) - - if objective == "sparsity_2": - #1. Adaptive sampling with oracle top k - config.masker_configs[2].search_space = { - "heavy_size": tune.grid_search([0.005, 0.01, 0.02 - (256.0 / 32768)]), - } - config.masker_configs[3].search_space = { - "base_rate_sampling": tune.grid_search([0, 0.005, 0.01]), - "epsilon": tune.grid_search([0.1, 0.2, 0.3, 0.4]), - "delta": tune.grid_search([0.1, 0.2, 0.3, 0.4]) - } + for sparsity_objective in sparsity_objectives: + sparsity_val: float = float(sparsity_objective) / 100.0 + classes = [SinkMaskerConfig, LocalMaskerConfig, OracleTopKConfig, AdaptiveSamplingMaskerConfig] + name: str = get_masker_list_name(classes, other_params={"builder": "vattention_oracle", "sparsity_obj": sparsity_objective}) + config = ResearchAttentionConfig(masker_configs=[ + SinkMaskerConfig(sink_size=128), + LocalMaskerConfig(window_size=128), + OracleTopKConfig(heavy_size=0.05), # Middle value from search space + AdaptiveSamplingMaskerConfig( + base_rate_sampling=0.05, # Middle value + epsilon=0.05, # Middle value + 
delta=0.05, # Middle value + init_offset=128, # Middle value + local_offset=128 # Middle value + ) + ]) + + if sparsity_objective == 2: + # Adaptive sampling with oracle top k + config.masker_configs[2].search_space = { + "heavy_size": tune.grid_search([0.005, 0.01, 0.02 - (256.0 / 32768)]), + } + config.masker_configs[3].search_space = { + "base_rate_sampling": tune.grid_search([0, 0.005, 0.01]), + "epsilon": tune.grid_search([0.1, 0.2, 0.3, 0.4]), + "delta": tune.grid_search([0.1, 0.2, 0.3, 0.4]) + } - elif objective == "sparsity_5": - #1. Adaptive sampling with oracle top k - config.masker_configs[2].search_space = { - "heavy_size": tune.grid_search([0.01, 0.025, 0.05]), - } - config.masker_configs[3].search_space = { - "base_rate_sampling": tune.grid_search([0, 0.01, 0.02, 0.03]), - "epsilon": tune.grid_search([0.05, 0.1, 0.2, 0.3]), - "delta": tune.grid_search([0.05, 0.1, 0.2, 0.3]) - } + elif sparsity_objective == 5: + # Adaptive sampling with oracle top k + config.masker_configs[2].search_space = { + "heavy_size": tune.grid_search([0.01, 0.025, 0.05]), + } + config.masker_configs[3].search_space = { + "base_rate_sampling": tune.grid_search([0, 0.01, 0.02, 0.03]), + "epsilon": tune.grid_search([0.05, 0.1, 0.2, 0.3]), + "delta": tune.grid_search([0.05, 0.1, 0.2, 0.3]) + } - elif objective == "sparsity_10": - #1. Adaptive sampling with oracle top k - config.masker_configs[2].search_space = { - "heavy_size": tune.grid_search([0.025, 0.05, 0.075, 0.1]), - } - config.masker_configs[3].search_space = { - "base_rate_sampling": tune.grid_search([0, 0.025, 0.05, 0.075]), - "epsilon": tune.grid_search([0.025, 0.05, 0.075]), - "delta": tune.grid_search([0.025, 0.05, 0.075]) - } - elif objective == "sparsity_15": - #1. 
Adaptive sampling with oracle top k - config.masker_configs[2].search_space = { - "heavy_size": tune.grid_search([0.05, 0.1, 0.15]), - } - config.masker_configs[3].search_space = { - "base_rate_sampling": tune.grid_search([0, 0.04, 0.06, 0.1]), - "epsilon": tune.grid_search([0.01, 0.025, 0.05, 0.1]), - "delta": tune.grid_search([0.01, 0.025, 0.05, 0.1]) - } + elif sparsity_objective == 10: + # Adaptive sampling with oracle top k + config.masker_configs[2].search_space = { + "heavy_size": tune.grid_search([0.025, 0.05, 0.075, 0.1]), + } + config.masker_configs[3].search_space = { + "base_rate_sampling": tune.grid_search([0, 0.025, 0.05, 0.075]), + "epsilon": tune.grid_search([0.025, 0.05, 0.075]), + "delta": tune.grid_search([0.025, 0.05, 0.075]) + } + elif sparsity_objective == 15: + # Adaptive sampling with oracle top k + config.masker_configs[2].search_space = { + "heavy_size": tune.grid_search([0.05, 0.1, 0.15]), + } + config.masker_configs[3].search_space = { + "base_rate_sampling": tune.grid_search([0, 0.04, 0.06, 0.1]), + "epsilon": tune.grid_search([0.01, 0.025, 0.05, 0.1]), + "delta": tune.grid_search([0.01, 0.025, 0.05, 0.1]) + } - elif objective == "sparsity_20": - #1. 
Adaptive sampling with oracle top k - config.masker_configs[2].search_space = { - "heavy_size": tune.grid_search([0.05, 0.1, 0.15]), - } - config.masker_configs[3].search_space = { - "base_rate_sampling": tune.grid_search([0.05, 0.1, 0.15]), - "epsilon": tune.grid_search([0.01, 0.025, 0.05, 0.1]), - "delta": tune.grid_search([0.01, 0.025, 0.05, 0.1]) - } - else: - raise ValueError(f"objective not supported: {objective}") - - sparsity = float(objective.split("_")[1]) / 100.0 - config.validity_constraint = lambda config: ((config.masker_configs[2].heavy_size + config.masker_configs[3].base_rate_sampling) <= sparsity ) + elif sparsity_objective == 20: + # Adaptive sampling with oracle top k + config.masker_configs[2].search_space = { + "heavy_size": tune.grid_search([0.05, 0.1, 0.15]), + } + config.masker_configs[3].search_space = { + "base_rate_sampling": tune.grid_search([0.05, 0.1, 0.15]), + "epsilon": tune.grid_search([0.01, 0.025, 0.05, 0.1]), + "delta": tune.grid_search([0.01, 0.025, 0.05, 0.1]) + } + else: + raise ValueError(f"sparsity_objective not supported: {sparsity_objective}") + + # Set validity constraint to use the correct sparsity value for comparison + config.validity_constraint = partial(_validity_check, sparsity_val=sparsity_val) + # Set objective function + config.objective = sparsity_objective - to_optimize_configs.append((name, config, classes)) + to_optimize_configs.append((name, config, classes)) + return optimal_configs, to_optimize_configs diff --git a/benchmark/raytune/optimizer_factory.py b/benchmark/raytune/optimizer_factory.py index 0930ca22..3684bbae 100755 --- a/benchmark/raytune/optimizer_factory.py +++ b/benchmark/raytune/optimizer_factory.py @@ -76,6 +76,8 @@ def create_config_from_params(self, params: Dict[str, Any]) -> ResearchAttention new_config = ResearchAttentionConfig(masker_configs=masker_instances) if hasattr(self.research_attention_config, 'validity_constraint'): new_config.validity_constraint = 
self.research_attention_config.validity_constraint + if hasattr(self.research_attention_config, 'objective'): + new_config.objective = self.research_attention_config.objective return new_config def create_optimizer(research_attention_config: Optional[ResearchAttentionConfig] = None) -> SparseConfigOptimizer: diff --git a/benchmark/raytune/run_config_dir.py b/benchmark/raytune/run_config_dir.py index bf2463ba..4f4c3b90 100755 --- a/benchmark/raytune/run_config_dir.py +++ b/benchmark/raytune/run_config_dir.py @@ -296,10 +296,10 @@ def progress_reporter(total_tasks: int, result_queue: RayQueue) -> None: def main( configs_dir: str, - benchmark_results_dir: str = "./benchmark_vt_full_10pct", + benchmark_results_dir: str = "/data/apdesai/DO_NOT_DELETE/sparse_attention_hub", max_new_tokens: int = 1000, max_context_length: int = 100000, - max_requests: int = 1000, + max_requests: int = 100, actors_per_gpu: Optional[int] = None ): """Ray-based parallel benchmark runner with efficient resource management. 
diff --git a/benchmark/raytune/run_optimize_configs.py b/benchmark/raytune/run_optimize_configs.py index 5adfd066..9e5673a7 100755 --- a/benchmark/raytune/run_optimize_configs.py +++ b/benchmark/raytune/run_optimize_configs.py @@ -30,23 +30,25 @@ # Import run configuration from OPTIMIZATION_EXPERIMENT import ( MODEL_CONFIGS, - DEFAULT_MODEL, - RUN_TASKS, - OBJECTIVE, - NUM_SAMPLES, + MODELS, + TASKS, + SPARSITY_OBJECTIVES, + MEMORY_OBJECTIVES, SEARCH_MAX_NEW_TOKENS, SEARCH_MAX_CONTEXT_LENGTH, SEARCH_MAX_REQUESTS, FORCE_SEARCH, OPTIMAL_CONFIGS_DIR, RAY_RESULTS_DIR, - SEARCH_TIMEOUT, ACTORS_PER_GPU, - MEMORY_OBJECTIVE, BUILDER_NAMES, ) -def get_all_sparse_configs(weight_file: str = None, objective: str = "default", memory_objective: str = None, builder_names: List[str] = None) -> List[Tuple[str, Optional[ResearchAttentionConfig], Optional[List]]]: +def get_all_sparse_configs(model_config: Dict[str, str], + sparsity_objectives: List[int], + memory_objectives: List[int], + builder_names: List[str]) -> Tuple[List[Tuple[str, Optional[ResearchAttentionConfig], Optional[List]]], + List[Tuple[str, Optional[ResearchAttentionConfig], Optional[List]]]]: """Get all sparse attention configurations. Returns list of (name, full_config, masker_classes) tuples. 
@@ -62,156 +64,63 @@ def get_all_sparse_configs(weight_file: str = None, objective: str = "default", Returns: Tuple of (optimal_configs, to_optimize_configs) """ - assert weight_file is not None, "Weight file is required for HashAttention Masker" - # Use factory to build all configs optimal_configs, to_optimize_configs = build_all_configs( - weight_file=weight_file, - objective=objective, - builder_names=builder_names or BUILDER_NAMES, - memory_objective=memory_objective + model_config = model_config, + sparsity_objectives=sparsity_objectives, + memory_objectives=memory_objectives, + builder_names=builder_names, ) return optimal_configs, to_optimize_configs -def get_run_configuration() -> dict: - """Build complete configuration from RUN_CONFIG.py.""" - num_gpus: int = torch.cuda.device_count() - - # Get model configuration - model_config: Dict[str, str] = MODEL_CONFIGS[DEFAULT_MODEL] - weight_file: str = model_config["weight_file"] - model_name: str = model_config["model_name"] - - if not os.path.exists(weight_file): - weight_file = "./hat_weights.pkl" - print(f"Warning: HashAttention weights not found, using {weight_file}") - - # Get all sparse configs - optimal_configs, to_optimize_configs = get_all_sparse_configs( - weight_file, - objective=OBJECTIVE, - memory_objective=MEMORY_OBJECTIVE, - builder_names=BUILDER_NAMES - ) - - # Set models, tasks, and num_samples - models: List[str] = [model_name] - tasks: List[str] = RUN_TASKS - num_samples: int = NUM_SAMPLES - - # Build config maps - optimal_configs_map: Dict[str, tuple] = {} - to_optimize_configs_map: Dict[str, tuple] = {} - for name, full_config, classes in optimal_configs: - optimal_configs_map[name] = (classes, full_config) - for name, full_config, classes in to_optimize_configs: - to_optimize_configs_map[name] = (classes, full_config) - - return { - "models": models, - "tasks": tasks, - "optimal_configs": optimal_configs, - "to_optimize_configs": to_optimize_configs, - "optimal_configs_map": 
optimal_configs_map, - "to_optimize_configs_map": to_optimize_configs_map, - "gpu_ids": list(range(num_gpus)), - "num_samples": num_samples, - "objective_function": OBJECTIVE, - - # Directories - "optimal_configs_dir": OPTIMAL_CONFIGS_DIR, - "ray_results_dir": RAY_RESULTS_DIR, - "search_result_dir": os.path.join(RAY_RESULTS_DIR, "search_runs"), - - # Search params - "search_timeout": SEARCH_TIMEOUT, - "search_max_new_tokens": SEARCH_MAX_NEW_TOKENS, - "search_max_context_length": SEARCH_MAX_CONTEXT_LENGTH, - "search_max_requests": SEARCH_MAX_REQUESTS, - "force_search": FORCE_SEARCH, - } - - -def run_search(config: Dict[str, Any], actors_per_gpu: int = 1) -> Dict[str, OptimalConfig]: +def run_search() -> Dict[str, OptimalConfig]: """Find optimal configurations for all combinations. This function orchestrates the search process across all model/task/config combinations, using ConfigSearchManager to handle individual searches. + All configuration is loaded from OPTIMIZATION_EXPERIMENT.py. Args: - config: Dictionary containing search configuration with keys: - - models: List of model names - - tasks: List of task names - - optimal_configs: List of optimal configs (don't need search) - - to_optimize_configs: List of configs to optimize - - optimal_configs_map: Map of optimal configs - - to_optimize_configs_map: Map of configs to optimize - - num_samples: Number of samples per search - - objective_function: Objective function name - - search_max_new_tokens: Max new tokens for search - - search_max_context_length: Max context length - - search_max_requests: Max requests per trial - - search_timeout: Timeout per trial actors_per_gpu: Number of actors per GPU for resource allocation Returns: Dictionary mapping config keys to OptimalConfig objects """ - print("\n" + "="*80) - print("HYPERPARAMETER SEARCH") - print("="*80) - print(f"Models: {len(config['models'])}") - print(f"Tasks: {len(config['tasks'])}") - print(f"Optimal Configs: {len(config['optimal_configs'])}") - 
print(f"To Optimize Configs: {len(config['to_optimize_configs'])}") - print(f"Total Combinations to optimize: {len(config['models']) * len(config['tasks']) * len(config['to_optimize_configs'])}") - print(f"Samples per search: {config['num_samples']}") - print(f"Objective Function: {config['objective_function']}") - - # Display objective function details - if config['objective_function'].startswith('sparsity_'): - target: int = int(config['objective_function'].split('_')[1]) - print(f" → Targeting {target}% density (0.{target:02d} fraction)") - print(f" → Formula: 0.99 * error + 0.01 * density + penalty for exceeding target") - - print("\nSearch Configuration:") - print(f" → Max new tokens: {config['search_max_new_tokens']}") - print(f" → Max context length: {config['search_max_context_length']}") - print(f" → Max requests per trial: {config['search_max_requests']}") - print(f" → Timeout per trial: {config['search_timeout']}s") - - print("\nNote: For each sparse config, Ray Tune will search different hyperparameter") - print("values (e.g., window_size, sink_size, sampling_rate) to find the best combination.") - print("="*80) - - manager: ConfigSearchManager = ConfigSearchManager(config) - optimal_configs: Dict[str, OptimalConfig] = {} - - total: int = len(config["models"]) * len(config["tasks"]) * len(config["to_optimize_configs"]) + len(config["models"]) * len(config["tasks"]) * len(config["optimal_configs"]) - current: int = 0 - for model in config["models"]: - print(f"\nModel: {model}") - print("-" * 60) + manager: ConfigSearchManager = ConfigSearchManager( + optimal_configs_dir=OPTIMAL_CONFIGS_DIR, + force_search=FORCE_SEARCH, + generation_kwargs={ + "max_new_tokens": SEARCH_MAX_NEW_TOKENS, + "do_sample": False + }, + request_kwargs={ + "max_context_length": SEARCH_MAX_CONTEXT_LENGTH, + "max_requests": SEARCH_MAX_REQUESTS + }, + ray_results_dir=RAY_RESULTS_DIR + ) + final_optimal_configs: Dict[str, OptimalConfig] = {} + + # first run all the optimal configs + 
for model in MODELS: + # Get model configuration + model_config: Dict[str, str] = MODEL_CONFIGS[model] - for task in config["tasks"]: - for masker_name, (masker_classes, full_config) in config["to_optimize_configs_map"].items(): - current += 1 - key: str = f"{model}_{task}_{masker_name}".replace("/", "_") - - print(f"\n[{current}/{total}] Task: {task} | Config: {masker_name}") - optimal: OptimalConfig = manager.search_optimal_config( - model, task, masker_name, masker_classes, full_config, actors_per_gpu - ) - optimal_configs[key] = optimal - - for masker_name, (masker_classes, full_config) in config["optimal_configs_map"].items(): - current += 1 + # Get all sparse configs + optimal_configs, to_optimize_configs = get_all_sparse_configs( + model_config, + sparsity_objectives=SPARSITY_OBJECTIVES, + memory_objectives=MEMORY_OBJECTIVES, + builder_names=BUILDER_NAMES + ) + for task in TASKS: + for (masker_name, full_config, masker_classes) in optimal_configs: key = f"{model}_{task}_{masker_name}".replace("/", "_") optimal = OptimalConfig( - model=model, + model=model_config["model_name"], task=task, masker_name=masker_name, sparse_config=full_config, @@ -221,34 +130,33 @@ def run_search(config: Dict[str, Any], actors_per_gpu: int = 1) -> Dict[str, Opt search_time=0.0, num_trials=0 ) - manager._save_config(optimal, Path(manager.results_dir) / f"{key}.json") - optimal_configs[key] = optimal + manager._save_config(optimal, os.path.join(manager.results_dir, f"{key}.json")) + final_optimal_configs[key] = optimal - print(f"\n{'='*80}") - print(f"Search complete. 
Found {len(optimal_configs)} optimal configurations.") - print(f"Configs saved to: {manager.results_dir}") - print(f"Run identifier: {manager.timestamp}") - print(f"{'='*80}") - - return optimal_configs + + for task in TASKS: + for (masker_name, full_config, masker_classes) in to_optimize_configs: + key: str = f"{model}_{task}_{masker_name}".replace("/", "_") + + optimal: OptimalConfig = manager.search_optimal_config( + model_config["model_name"], task, masker_name, masker_classes, full_config, ACTORS_PER_GPU + ) + final_optimal_configs[key] = optimal + + return final_optimal_configs def main() -> None: """Hyperparameter search for sparse attention methods. - All configuration is loaded from RUN_CONFIG.py. Modify that file to change + All configuration is loaded from OPTIMIZATION_EXPERIMENT.py. Modify that file to change search parameters instead of passing command-line arguments. """ - # Validate objective function - if OBJECTIVE not in OBJECTIVE_FUNCTIONS: - raise ValueError(f"Invalid objective function '{OBJECTIVE}'. Choose from: {list(OBJECTIVE_FUNCTIONS.keys())}") - - config: Dict[str, Any] = get_run_configuration() - + if not ray.is_initialized(): ray.init(ignore_reinit_error=True, log_to_driver=False, runtime_env={"working_dir": str(root_path)}) - optimal_configs: Dict[str, OptimalConfig] = run_search(config, ACTORS_PER_GPU) + run_search() ray.shutdown() diff --git a/benchmark/raytune/search_manager.py b/benchmark/raytune/search_manager.py index 78bb263a..ae49de20 100644 --- a/benchmark/raytune/search_manager.py +++ b/benchmark/raytune/search_manager.py @@ -27,7 +27,7 @@ ) from sparse_attention_hub.sparse_attention.research_attention import ResearchAttentionConfig from benchmark_helper import BenchmarkHelper - +from OPTIMIZATION_EXPERIMENT import USE_TIMESTAMP_FOR_RESULTS_DIR class ConfigSearchManager: """Manages Phase 1: Hyperparameter search for optimal configs. 
@@ -36,7 +36,11 @@ class ConfigSearchManager: sparse attention configurations for given model/task combinations. """ - def __init__(self, base_config: Dict[str, any]) -> None: + def __init__(self, optimal_configs_dir: str, + force_search: bool, + generation_kwargs: Dict[str, any], + request_kwargs: Dict[str, any], + ray_results_dir: str) -> None: """Initialize the search manager with configuration. Args: @@ -44,13 +48,18 @@ def __init__(self, base_config: Dict[str, any]) -> None: - optimal_configs_dir: Directory to save optimal configs - force_search: Whether to force re-search even if configs exist """ - self.config: Dict[str, any] = base_config # Add timestamp to the results directory - timestamp: str = datetime.now().strftime("%Y%m%d_%H%M%S") - base_dir: Path = Path(base_config["optimal_configs_dir"]) - self.results_dir: Path = base_dir / f"run_{timestamp}" - self.results_dir.mkdir(parents=True, exist_ok=True) - self.timestamp: str = timestamp + if USE_TIMESTAMP_FOR_RESULTS_DIR: + timestamp: str = datetime.now().strftime("%Y%m%d_%H%M%S") + else: + timestamp: str = "default" + self.results_dir: str = os.path.join(optimal_configs_dir, f"run_{timestamp}") + os.makedirs(self.results_dir, exist_ok=True) + + self.force_search: bool = force_search + self.generation_kwargs: Dict[str, any] = generation_kwargs + self.request_kwargs: Dict[str, any] = request_kwargs + self.ray_results_dir: Path = ray_results_dir print(f"Saving optimal configs to: {self.results_dir}") def search_optimal_config( @@ -75,10 +84,10 @@ def search_optimal_config( Returns: OptimalConfig containing the best configuration found """ - config_file: Path = self.results_dir / f"{model}_{task}_{masker_name}.json".replace("/", "_") + config_file: Path = os.path.join(self.results_dir, f"{model}_{task}_{masker_name}.json".replace("/", "_")) # Check if already exists - if config_file.exists() and not self.config.get("force_search", False): + if os.path.exists(config_file) and not self.force_search: print(f" → 
Loading existing config") return self._load_config(config_file) @@ -118,7 +127,11 @@ def search_optimal_config( # Create objective function def objective(trial_config: Dict[str, any]) -> Dict[str, float]: - runner: BenchmarkHelper = BenchmarkHelper(self.config) + runner: BenchmarkHelper = BenchmarkHelper( + base_result_dir=self.results_dir, + generation_kwargs=self.generation_kwargs, + request_kwargs=self.request_kwargs + ) attention_config = optimizer.create_config_from_params(trial_config) score: float density: float @@ -126,18 +139,18 @@ def objective(trial_config: Dict[str, any]) -> Dict[str, float]: score, density, error = runner(attention_config, task, model) return {"combined_score": score, "density": density, "error": error} - # ### run a sample objective to ensure there are no errors - print("="*10, "Running a short test objective to ensure there are no errors", flush=True) - sample_config: Dict[str, float] = { - "AdaptiveSamplingMaskerConfig_base_rate_sampling": 0.1, - "AdaptiveSamplingMaskerConfig_epsilon": 0.25, - "AdaptiveSamplingMaskerConfig_delta": 0.25 - } - result: Dict[str, float] = objective(sample_config) - print("="*10, "Successfully ran a short test objective", flush=True) - print(sample_config) - print(result) - print("="*100, flush=True) + # # ### run a sample objective to ensure there are no errors + # print("="*10, "Running a short test objective to ensure there are no errors", flush=True) + # sample_config: Dict[str, float] = { + # "AdaptiveSamplingMaskerConfig_base_rate_sampling": 0.1, + # "AdaptiveSamplingMaskerConfig_epsilon": 0.25, + # "AdaptiveSamplingMaskerConfig_delta": 0.25 + # } + # result: Dict[str, float] = objective(sample_config) + # print("="*10, "Successfully ran a short test objective", flush=True) + # print(sample_config) + # print(result) + # print("="*100, flush=True) # Run Ray Tune sanitized_name: str = f"{model}_{task}_{masker_name}".replace("/", "_") @@ -147,7 +160,7 @@ def objective(trial_config: Dict[str, any]) -> 
Dict[str, float]: metric="combined_score", mode="min", resources_per_trial={"CPU": 1, "GPU": 1.0 / actors_per_gpu}, - storage_path=os.path.abspath(self.config["ray_results_dir"]), + storage_path=self.ray_results_dir, name=sanitized_name, verbose=1, # Show Ray Tune progress stop={"training_iteration": 1}, # One evaluation per config @@ -171,21 +184,21 @@ def objective(trial_config: Dict[str, any]) -> Dict[str, float]: trials_info.append(trial_info) # Save trial details to separate file - trials_file: Path = self.results_dir / f"{model}_{task}_{masker_name}_trials.json".replace("/", "_") + trials_file: Path = os.path.join(self.results_dir, f"{model}_{task}_{masker_name}_trials.json".replace("/", "_")) with open(trials_file, "w") as f: json.dump({ "model": model, "task": task, "masker_name": masker_name, - "objective_function": self.config.get("objective_function", "default"), + "objective_function": full_sparse_config.objective if full_sparse_config.objective else "None", "best_trial_id": best_trial.trial_id, "trials": trials_info, - "analysis_dataframe_path": str(self.results_dir / f"{model}_{task}_{masker_name}_analysis.csv".replace("/", "_")) + "analysis_dataframe_path": str(os.path.join(self.results_dir, f"{model}_{task}_{masker_name}_analysis.csv".replace("/", "_"))) }, f, indent=2) # Save Ray analysis dataframe for detailed analysis df = analysis.dataframe() - df.to_csv(self.results_dir / f"{model}_{task}_{masker_name}_analysis.csv".replace("/", "_"), index=False) + df.to_csv(os.path.join(self.results_dir, f"{model}_{task}_{masker_name}_analysis.csv".replace("/", "_")), index=False) optimal = OptimalConfig( model=model, From 4d839f1bcab6082f70523167cd5c34e4fc17c5cd Mon Sep 17 00:00:00 2001 From: Aditya Desai Date: Mon, 10 Nov 2025 09:01:27 -0800 Subject: [PATCH 5/7] config builder --- benchmark/raytune/OPTIMIZATION_EXPERIMENT.py | 9 ++- benchmark/raytune/config_builders/__init__.py | 2 + benchmark/raytune/config_builders/pqcache.py | 81 +++++++++++++++++++ 3 
files changed, 88 insertions(+), 4 deletions(-) create mode 100644 benchmark/raytune/config_builders/pqcache.py diff --git a/benchmark/raytune/OPTIMIZATION_EXPERIMENT.py b/benchmark/raytune/OPTIMIZATION_EXPERIMENT.py index 3d53a610..c4df7f7d 100644 --- a/benchmark/raytune/OPTIMIZATION_EXPERIMENT.py +++ b/benchmark/raytune/OPTIMIZATION_EXPERIMENT.py @@ -62,12 +62,12 @@ ] TASKS: List[str] = [ - # "ruler32k/vt", + "ruler32k/vt", # "ruler32k/qa_1", # "ruler32k/qa_2", # "ruler32k/fwe", # "ruler32k/niah_multikey_2", - "ruler32k/niah_multikey_3", + # "ruler32k/niah_multikey_3", ] SPARSITY_OBJECTIVES: List[str] = [ @@ -87,12 +87,13 @@ # "dense", # "double_sparsity", # "hashattention_topk", - "magicpig", + # "magicpig", # "oracle_topk", # "oracle_topp", # "quest_topk", # "vattention_hashattention", # "vattention_oracle", + "pqcache", ] # Specify which builders to use (e.g., ["magicpig"], ["dense"], ["double_sparsity"]) @@ -101,7 +102,7 @@ SEARCH_MAX_NEW_TOKENS: int = 3 # Max new tokens for search trials SEARCH_MAX_CONTEXT_LENGTH: int = 40000 # Max context length for search trials SEARCH_MAX_REQUESTS: int = 3 # Max requests per search trial -OPTIMAL_CONFIGS_DIR: str = "/data/apdesai/DO_NOT_DELETE/magicpig_optimization" # Directory for storing optimal configurations +OPTIMAL_CONFIGS_DIR: str = "/data/apdesai/code/sparse-attention-hub/debug/" # Directory for storing optimal configurations RAY_RESULTS_DIR: str = "/tmp/ray_results" # Directory for Ray Tune results SEARCH_TIMEOUT: int = 900 # Timeout per search trial in seconds ACTORS_PER_GPU: int = 1 # Number of actors per GPU for resource allocation diff --git a/benchmark/raytune/config_builders/__init__.py b/benchmark/raytune/config_builders/__init__.py index f1910409..38a9ec1a 100644 --- a/benchmark/raytune/config_builders/__init__.py +++ b/benchmark/raytune/config_builders/__init__.py @@ -12,6 +12,7 @@ from .oracle_topp import OracleTopPConfigBuilder # noqa: E402, F401 from .hashattention_topk import 
HashAttentionTopKConfigBuilder # noqa: E402, F401 from .magicpig import MagicPigConfigBuilder # noqa: E402, F401 +from .pqcache import PQCacheConfigBuilder # noqa: E402, F401 from .quest_top_k import QuestTopKConfigBuilder # noqa: E402, F401 from .random_sampling import RandomSamplingConfigBuilder # noqa: E402, F401 @@ -25,6 +26,7 @@ "OracleTopPConfigBuilder", "HashAttentionTopKConfigBuilder", "MagicPigConfigBuilder", + "PQCacheConfigBuilder", "QuestTopKConfigBuilder", "RandomSamplingConfigBuilder", "get_config_builder", diff --git a/benchmark/raytune/config_builders/pqcache.py b/benchmark/raytune/config_builders/pqcache.py new file mode 100644 index 00000000..c54c384c --- /dev/null +++ b/benchmark/raytune/config_builders/pqcache.py @@ -0,0 +1,81 @@ +"""Configuration builder for PQCache attention.""" + +from typing import List, Optional, Tuple, Dict + +from ray import tune + +from sparse_attention_hub.sparse_attention.research_attention import ResearchAttentionConfig +from sparse_attention_hub.sparse_attention.research_attention.maskers.fixed.implementations import ( + LocalMaskerConfig, + PQCacheConfig, + SinkMaskerConfig, +) + +from .base import BaseConfigBuilder +from .factory import register_builder +from .utility import get_masker_list_name + + +@register_builder("pqcache") +class PQCacheConfigBuilder(BaseConfigBuilder): + """Builder for PQCache sparse attention configurations.""" + + def build_configs( + self, + model_config: Dict[str, str], + sparsity_objectives: List[int], + memory_objectives: List[int], + **kwargs + ) -> Tuple[List[Tuple[str, Optional[ResearchAttentionConfig], Optional[List]]], + List[Tuple[str, Optional[ResearchAttentionConfig], Optional[List]]]]: + """Get all PQCache attention configurations. + + Uses: + sparsity_objectives: List[int] - List of sparsity objectives to build the configurations. 
+ Ignores: + memory_objectives: List[int] - List of memory objectives + model_config: Dict[str, str] - Model configuration + + Returns: + Tuple of (optimal_configs, to_optimize_configs) + """ + optimal_configs: List[Tuple[str, Optional[ResearchAttentionConfig], Optional[List]]] = [] + to_optimize_configs: List[Tuple[str, Optional[ResearchAttentionConfig], Optional[List]]] = [] + + for sparsity_objective in sparsity_objectives: + heavy_size: float = float(sparsity_objective) / 100.0 + classes = [SinkMaskerConfig, LocalMaskerConfig, PQCacheConfig] + name: str = get_masker_list_name(classes, other_params={"builder": "pqcache", "sparsity_obj": sparsity_objective}) + + config = ResearchAttentionConfig(masker_configs=[ + SinkMaskerConfig(sink_size=128), + LocalMaskerConfig(window_size=128), + PQCacheConfig( + heavy_size=heavy_size - (256.0 / 32768), + pq_group_factor=2, # Default value: head_dim=128 // pq_sub_dim=64 = 2 + pq_bits=6, # Default value from search space + kmeans_iter=10, # Default value from search space + init_offset=128, # Matches sink_size + metric="euclidean", # Default value from search space + ) + ]) + + # Set up search space for PQCache parameters + # Note: pq_group_factor = head_dim // pq_sub_dim + # Assuming head_dim=128: pq_sub_dim=64 -> pq_group_factor=2, pq_sub_dim=32 -> pq_group_factor=4 + config.masker_configs[2].search_space = { + "pq_group_factor": tune.grid_search([2, 4]), # Corresponds to pq_sub_dim=[64, 32] for head_dim=128 + "pq_bits": tune.grid_search([4, 6, 8]), + "kmeans_iter": tune.grid_search([10]), + "metric": tune.grid_search(["euclidean"]), + } + + # Set validity to default (doesn't depend on memory objectives) + config.validity_constraint = lambda config: True + # Set objective function + config.objective = sparsity_objective + + to_optimize_configs.append((name, config, classes)) + + return optimal_configs, to_optimize_configs + From 288599760d2217947a73f27dca87d064687740dd Mon Sep 17 00:00:00 2001 From: Aditya Desai Date: Mon, 
17 Nov 2025 12:43:59 -0800 Subject: [PATCH 6/7] vatt+pqcache builder --- benchmark/raytune/OPTIMIZATION_EXPERIMENT.py | 15 +- benchmark/raytune/config_builders/__init__.py | 2 + benchmark/raytune/config_builders/magicpig.py | 21 ++- .../config_builders/vattention_pqcache.py | 154 ++++++++++++++++++ benchmark/raytune/run_config_dir.py | 2 +- 5 files changed, 184 insertions(+), 10 deletions(-) create mode 100644 benchmark/raytune/config_builders/vattention_pqcache.py diff --git a/benchmark/raytune/OPTIMIZATION_EXPERIMENT.py b/benchmark/raytune/OPTIMIZATION_EXPERIMENT.py index c4df7f7d..b47c38ea 100644 --- a/benchmark/raytune/OPTIMIZATION_EXPERIMENT.py +++ b/benchmark/raytune/OPTIMIZATION_EXPERIMENT.py @@ -63,11 +63,11 @@ TASKS: List[str] = [ "ruler32k/vt", - # "ruler32k/qa_1", - # "ruler32k/qa_2", - # "ruler32k/fwe", - # "ruler32k/niah_multikey_2", - # "ruler32k/niah_multikey_3", + "ruler32k/qa_1", + "ruler32k/qa_2", + "ruler32k/fwe", + "ruler32k/niah_multikey_2", + "ruler32k/niah_multikey_3", ] SPARSITY_OBJECTIVES: List[str] = [ @@ -93,7 +93,8 @@ # "quest_topk", # "vattention_hashattention", # "vattention_oracle", - "pqcache", + # "pqcache", + "vattention_pqcache", ] # Specify which builders to use (e.g., ["magicpig"], ["dense"], ["double_sparsity"]) @@ -102,7 +103,7 @@ SEARCH_MAX_NEW_TOKENS: int = 3 # Max new tokens for search trials SEARCH_MAX_CONTEXT_LENGTH: int = 40000 # Max context length for search trials SEARCH_MAX_REQUESTS: int = 3 # Max requests per search trial -OPTIMAL_CONFIGS_DIR: str = "/data/apdesai/code/sparse-attention-hub/debug/" # Directory for storing optimal configurations +OPTIMAL_CONFIGS_DIR: str = "/data/apdesai/code/DO_NOT_DELETE/vattention_pqcache_optimization/" # Directory for storing optimal configurations RAY_RESULTS_DIR: str = "/tmp/ray_results" # Directory for Ray Tune results SEARCH_TIMEOUT: int = 900 # Timeout per search trial in seconds ACTORS_PER_GPU: int = 1 # Number of actors per GPU for resource allocation diff --git 
a/benchmark/raytune/config_builders/__init__.py b/benchmark/raytune/config_builders/__init__.py index 38a9ec1a..acf2661f 100644 --- a/benchmark/raytune/config_builders/__init__.py +++ b/benchmark/raytune/config_builders/__init__.py @@ -8,6 +8,7 @@ from .double_sparsity import DoubleSparsityConfigBuilder # noqa: E402, F401 from .vattention_oracle import VAttentionOracleConfigBuilder # noqa: E402, F401 from .vattention_hashattention import VAttentionHashAttentionConfigBuilder # noqa: E402, F401 +from .vattention_pqcache import VAttentionPQCacheConfigBuilder # noqa: E402, F401 from .oracle_topk import OracleTopKConfigBuilder # noqa: E402, F401 from .oracle_topp import OracleTopPConfigBuilder # noqa: E402, F401 from .hashattention_topk import HashAttentionTopKConfigBuilder # noqa: E402, F401 @@ -22,6 +23,7 @@ "DoubleSparsityConfigBuilder", "VAttentionOracleConfigBuilder", "VAttentionHashAttentionConfigBuilder", + "VAttentionPQCacheConfigBuilder", "OracleTopKConfigBuilder", "OracleTopPConfigBuilder", "HashAttentionTopKConfigBuilder", diff --git a/benchmark/raytune/config_builders/magicpig.py b/benchmark/raytune/config_builders/magicpig.py index 45f1bb40..b4c75aa9 100644 --- a/benchmark/raytune/config_builders/magicpig.py +++ b/benchmark/raytune/config_builders/magicpig.py @@ -18,6 +18,23 @@ from .utility import get_masker_list_name +def _validity_check(config: ResearchAttentionConfig) -> bool: + """Check if the config meets the LSH constraint. + + Returns True if lsh_l * lsh_k is greater than 64 * 64. + + Args: + config: ResearchAttentionConfig to validate. + + Returns: + True if lsh_l * lsh_k > 64 * 64, False otherwise. 
+ """ + magicpig_config = config.masker_configs[2] + # NOTE(review): the original note says products above this threshold cause too much memory usage for 32K context, yet the check below returns True only when lsh_l * lsh_k > 4096 -- confirm the intended comparison direction + return (magicpig_config.lsh_l * magicpig_config.lsh_k) > 4096 + + + @register_builder("magicpig") class MagicPigConfigBuilder(BaseConfigBuilder): """Builder for MagicPig sparse attention configurations.""" @@ -63,8 +80,8 @@ def build_configs( "lsh_k": tune.grid_search([2, 4, 8, 16]), } - # Set validity to default (doesn't depend on memory objectives) - config.validity_constraint = lambda config: True + # Set validity constraint + config.validity_constraint = _validity_check # Set objective function config.objective = sparsity_objective diff --git a/benchmark/raytune/config_builders/vattention_pqcache.py b/benchmark/raytune/config_builders/vattention_pqcache.py new file mode 100644 index 00000000..e1e4d486 --- /dev/null +++ b/benchmark/raytune/config_builders/vattention_pqcache.py @@ -0,0 +1,154 @@ +"""Configuration builder for VAttention PQCache configurations.""" + +from functools import partial +from typing import List, Optional, Tuple, Dict + +from ray import tune + +from sparse_attention_hub.sparse_attention.research_attention import ResearchAttentionConfig +from sparse_attention_hub.sparse_attention.research_attention.maskers.fixed.implementations import ( + LocalMaskerConfig, + PQCacheConfig, + SinkMaskerConfig, +) +from sparse_attention_hub.sparse_attention.research_attention.maskers.sampling.implementations import ( + AdaptiveSamplingMaskerConfig, +) + +from .base import BaseConfigBuilder +from .factory import register_builder +from .utility import get_masker_list_name + + +def _validity_check(config: ResearchAttentionConfig, sparsity_val: float) -> bool: + """Check if the config meets the sparsity constraint. + + Args: + config: ResearchAttentionConfig to validate. + sparsity_val: Target sparsity value as a float. + + Returns: + True if pqcache heavy_size + adaptive sampling base_rate_sampling <= sparsity_val, False otherwise.
+ """ + return (config.masker_configs[2].heavy_size + config.masker_configs[3].base_rate_sampling) <= sparsity_val + + +@register_builder("vattention_pqcache") +class VAttentionPQCacheConfigBuilder(BaseConfigBuilder): + """Builder for VAttention PQCache sparse attention configurations.""" + + def build_configs( + self, + model_config: Dict[str, str], + sparsity_objectives: List[int], + memory_objectives: List[int], + **kwargs + ) -> Tuple[List[Tuple[str, Optional[ResearchAttentionConfig], Optional[List]]], + List[Tuple[str, Optional[ResearchAttentionConfig], Optional[List]]]]: + """Get all VAttention PQCache attention configurations. + + Uses: + sparsity_objectives: List[int] - List of sparsity objectives to build the configurations. + Ignores: + memory_objectives: List[int] - List of memory objectives + model_config: Dict[str, str] - Model configuration + + Returns: + Tuple of (optimal_configs, to_optimize_configs) + """ + optimal_configs: List[Tuple[str, Optional[ResearchAttentionConfig], Optional[List]]] = [] + to_optimize_configs: List[Tuple[str, Optional[ResearchAttentionConfig], Optional[List]]] = [] + + for sparsity_objective in sparsity_objectives: + sparsity_val: float = float(sparsity_objective) / 100.0 + heavy_size: float = float(sparsity_objective) / 100.0 + classes = [SinkMaskerConfig, LocalMaskerConfig, PQCacheConfig, AdaptiveSamplingMaskerConfig] + name: str = get_masker_list_name(classes, other_params={"builder": "vattention_pqcache", "sparsity_obj": sparsity_objective}) + + config = ResearchAttentionConfig(masker_configs=[ + SinkMaskerConfig(sink_size=128), + LocalMaskerConfig(window_size=128), + PQCacheConfig( + heavy_size=heavy_size - (256.0 / 32768), + pq_group_factor=2, # Default value: head_dim=128 // pq_sub_dim=64 = 2 + pq_bits=6, # Default value from search space + kmeans_iter=10, # Default value from search space + init_offset=128, # Matches sink_size + metric="euclidean", # Default value from search space + ), + 
AdaptiveSamplingMaskerConfig( + base_rate_sampling=0.05, # Middle value + epsilon=0.05, # Middle value + delta=0.05, # Middle value + init_offset=128, # Matches sink_size + local_offset=128 # Matches window_size + ) + ]) + + # Set up search space for PQCache parameters (from pqcache builder) + # Note: pq_group_factor = head_dim // pq_sub_dim + # Assuming head_dim=128: pq_sub_dim=64 -> pq_group_factor=2, pq_sub_dim=32 -> pq_group_factor=4 + config.masker_configs[2].search_space = { + "pq_group_factor": tune.grid_search([2, 4]), # Corresponds to pq_sub_dim=[64, 32] for head_dim=128 + "pq_bits": tune.grid_search([4, 8]), + "kmeans_iter": tune.grid_search([10]), + "metric": tune.grid_search(["euclidean"]), + } + + # Set up search space for AdaptiveSamplingMaskerConfig (from vattention_hashattention builder) + if sparsity_objective == 2: + # Adaptive sampling with PQCache + config.masker_configs[2].search_space["heavy_size"] = tune.grid_search([0.005, 0.01, 0.02 - (256.0 / 32768)]) + config.masker_configs[3].search_space = { + "base_rate_sampling": tune.grid_search([0, 0.005, 0.01]), + "epsilon": tune.grid_search([0.2, 0.4]), + "delta": tune.grid_search([0.2, 0.4]) + } + + elif sparsity_objective == 5: + # Adaptive sampling with PQCache + config.masker_configs[2].search_space["heavy_size"] = tune.grid_search([0.01, 0.025, 0.05]) + config.masker_configs[3].search_space = { + "base_rate_sampling": tune.grid_search([0, 0.01, 0.025]), + "epsilon": tune.grid_search([0.15, 0.25]), + "delta": tune.grid_search([0.15, 0.25]) + } + + elif sparsity_objective == 10: + # Adaptive sampling with PQCache + config.masker_configs[2].search_space["heavy_size"] = tune.grid_search([0.025, 0.05, 0.075]) + config.masker_configs[3].search_space = { + "base_rate_sampling": tune.grid_search([0.025, 0.05, 0.075]), + "epsilon": tune.grid_search([0.025, 0.05, 0.075]), + "delta": tune.grid_search([0.025, 0.05, 0.075]) + } + elif sparsity_objective == 15: + # Adaptive sampling with PQCache + 
config.masker_configs[2].search_space["heavy_size"] = tune.grid_search([0.05, 0.1, 0.15]) + config.masker_configs[3].search_space = { + "base_rate_sampling": tune.grid_search([0, 0.05, 0.1]), + "epsilon": tune.grid_search([0.01, 0.04, 0.1]), + "delta": tune.grid_search([0.01, 0.04, 0.1]) + } + + elif sparsity_objective == 20: + # Adaptive sampling with PQCache + config.masker_configs[2].search_space["heavy_size"] = tune.grid_search([0.05, 0.1, 0.15]) + config.masker_configs[3].search_space = { + "base_rate_sampling": tune.grid_search([0.05, 0.1, 0.15]), + "epsilon": tune.grid_search([0.01, 0.04, 0.1]), + "delta": tune.grid_search([0.01, 0.04, 0.1]) + } + else: + raise ValueError(f"sparsity_objective not supported: {sparsity_objective}") + + # Set validity constraint to use the correct sparsity value for comparison + config.validity_constraint = partial(_validity_check, sparsity_val=sparsity_val) + # Set objective function + config.objective = sparsity_objective + + to_optimize_configs.append((name, config, classes)) + + return optimal_configs, to_optimize_configs + + diff --git a/benchmark/raytune/run_config_dir.py b/benchmark/raytune/run_config_dir.py index 4f4c3b90..e340e5c4 100755 --- a/benchmark/raytune/run_config_dir.py +++ b/benchmark/raytune/run_config_dir.py @@ -383,7 +383,7 @@ def main( # Create adapter config adapter_config = { "adapter_name": "huggingface", - "model_kwargs": {"torch_dtype": torch.bfloat16}, + "model_kwargs": {"torch_dtype": torch.bfloat16, "attn_implementation": "flash_attention_2"}, "tokenizer_kwargs": {"padding_side": "left"} } From 2581cc66c6969f49cc7c52003ad6f1207dc4805d Mon Sep 17 00:00:00 2001 From: Aditya Desai Date: Fri, 5 Dec 2025 14:51:28 -0800 Subject: [PATCH 7/7] Update Readme --- benchmark/raytune/README.md | 163 ++++++++++++++++++------------------ 1 file changed, 83 insertions(+), 80 deletions(-) diff --git a/benchmark/raytune/README.md b/benchmark/raytune/README.md index 0dea73df..51904c98 100644 --- 
a/benchmark/raytune/README.md +++ b/benchmark/raytune/README.md @@ -1,99 +1,102 @@ -# Ray Tune Benchmark Suite +## Ray Tune Benchmark Suite -A distributed benchmark suite for sparse attention configurations using Ray for parallel execution. +Distributed benchmark suite for sparse attention configurations using Ray. -## Setup +### 1. Quick Start (Run existing builders on new models / settings / objectives) -### Environment Variables +- **Optimize configs** -For HashAttention configurations, set the weights directory: + 1. Edit `benchmark/raytune/OPTIMIZATION_EXPERIMENT.py` to choose: + - **Models**: `MODEL_CONFIGS`, `MODELS` + - **Tasks**: `TASKS` + - **Objectives**: `SPARSITY_OBJECTIVES`, `MEMORY_OBJECTIVES` + - **Builders**: `BUILDER_NAMES` + - **Search/runtime**: samples, timeouts, context limits, output dirs + 2. Run the optimization: ```bash -export SPARSE_ATTENTION_WEIGHTS_DIR=/path/to/your/hashattention/weights +python3 benchmark/raytune/run_optimize_configs.py ``` -The directory should contain the HashAttention weight files for your models (e.g., `llama3.1-8b-patch.64K.v1.hat_weights.pkl`). + This writes one JSON config per (model, task, builder, objective) into the configured optimal-configs directory. -## Quick Start +- **Run benchmarks with optimized configs** -### 1. Optimize Configurations -Find optimal sparse attention configurations for your models: - -```bash -python3 benchmark/raytune/run_optimize_configs.py \ - --objective sparsity_10 \ - --optimal-configs-dir > \ - --num-samples 1 \ - --search-max-new-tokens 5 \ - --search-max-context-length 32678 \ - --search-max-requests 2 \ - --actors-per-gpu 1 -``` - -### 2. 
Run Benchmarks -Execute benchmarks using the optimized configurations: + Use the config directory produced above with `run_config_dir.py`: ```bash python3 benchmark/raytune/run_config_dir.py \ - --configs-dir \ + --configs-dir /path/to/optimal/configs \ --max-new-tokens 100 \ --max-context-length 32678 \ --max-requests 2 \ --actors-per-gpu 1 \ - --benchmark-results-dir ./test_bench.1/ + --benchmark-results-dir ./bench_results/ ``` -## Workflow - -### Phase 1: Configuration Optimization -Use `run_optimize_configs.py` to search for optimal sparse attention parameters: - -**Configuration Sources:** -- **Models**: Defined in `get_run_configuration()` function -- **Tasks**: Specified in the configuration -- **Sparse Configs**: Two types handled: - - `to_optimize_configs`: Configurations that need hyperparameter search - - `optimal_configs`: Pre-optimized configurations (used as-is) -- **Search Spaces**: Each config type can have its own search space defined separately. Example: - -```python -# Create a ResearchAttentionConfig with custom search spaces -config = ResearchAttentionConfig(masker_configs=[ - SinkMaskerConfig(sink_size=128), - LocalMaskerConfig(window_size=128), - OracleTopKConfig(heavy_size=0.10), - AdaptiveSamplingMaskerConfig( - base_rate_sampling=0.1, - epsilon=0.25, - delta=0.25, - init_offset=128, - local_offset=128 - ) -]) - -# Define search spaces for specific maskers -config.masker_configs[2].search_space = { - "heavy_size": tune.grid_search([0.01, 0.05, 0.1, 0.2]) -} -config.masker_configs[3].search_space = { - "base_rate_sampling": tune.grid_search([0.01, 0.02, 0.05]), - "epsilon": tune.grid_search([0.05, 0.1, 0.2]), - "delta": tune.grid_search([0.05, 0.1, 0.2]) -} -``` - -**Output**: Optimal configurations are written to `/run_/` directory with individual JSON files per model-task-config combination. 
- -### Phase 2: Benchmark Execution -Use `run_config_dir.py` to run full benchmarks with the found configurations: - -**Input**: Pass the config directory (e.g., `/run_/`) containing all the JSON configuration files generated in Phase 1. - -**Output**: Benchmark results saved to the specified `--benchmark-results-dir`. - -## Features - -- **Distributed Execution**: Ray-based parallel processing across multiple GPUs -- **Automatic Resource Management**: Efficient GPU utilization and task scheduling -- **Sparse Attention Support**: Multiple masker types and configurations -- **Comprehensive Metrics**: Detailed performance and accuracy measurements +### 2. Implementation of optimization + +- **Config builders**: For each sparse attention method, a config builder constructs a `ResearchAttentionConfig` (masker stack, defaults, and metadata) for a given model/task/objective. +- **Search spaces**: Builders attach Ray Tune search spaces (e.g. `config.masker_configs[i].search_space`) to selected hyperparameters; `run_optimize_configs.py` passes these to Ray. +- **Validity checker**: Each builder defines a small validity checker that rejects invalid hyperparameter combinations early so trials can be skipped before running the benchmark. + +High-level flow: + +```text +(model, task, objectives, builder name) + │ + ▼ + Config builder + ┌─────────┴────────────────────────────┐ + │ │ + ▼ ▼ +ResearchAttentionConfig Ray Tune search_space attached + │ + ▼ +Ray Tune iterates over configs ──► validity checker ──► + │ │ + ├─ valid ──► run benchmark trial + └─ invalid ──► skip early (no trial) +``` + +### 3. 
Adding a new builder + +- **Create a builder**: Copy an existing builder from `benchmark/raytune/config_builders/`, rename it, and adapt: + - masker composition and default parameters + - Ray Tune search spaces on the relevant hyperparameters + - the validity checker logic for early exit on bad configs +- **Wire it up**: + - Register the new builder name wherever builders are dispatched (e.g. builder registry/factory). + - Add the new name to `BUILDER_NAMES` in `OPTIMIZATION_EXPERIMENT.py` so it is included in optimization and benchmarking. + +**Example sketch (`vattention_pqcache`)** in `config_builders/vattention_pqcache.py` (see the file for details): + +- **1. Builder name**: + + - Decorator: `@register_builder("vattention_pqcache")` + - Class: `VAttentionPQCacheConfigBuilder` + +- **2. Search space**: + + - Base definition on the PQCache masker: + + ```python + config.masker_configs[2].search_space = { + "pq_group_factor": tune.grid_search([2, 4]), + "pq_bits": tune.grid_search([4, 8]), + "kmeans_iter": tune.grid_search([10]), + "metric": tune.grid_search(["euclidean"]), + } + ``` + + - Plus sparsity-dependent grids on PQCache + AdaptiveSampling (e.g. `config.masker_configs[2].search_space["heavy_size"] = ...`, `config.masker_configs[3].search_space = {...}` inside the `if sparsity_objective == ...` blocks). + +- **3. Validity checker**: + + - Function: `_validity_check(config, sparsity_val)` at the top of the file. + - Attached to the config with: + + ```python + config.validity_constraint = partial(_validity_check, sparsity_val=sparsity_val) + ``` +