diff --git a/benchmark/raytune/run_config_dir.py b/benchmark/raytune/run_config_dir.py index d47dc76e..9b7ecd9f 100755 --- a/benchmark/raytune/run_config_dir.py +++ b/benchmark/raytune/run_config_dir.py @@ -322,7 +322,7 @@ def main( print(f"\n{'='*80}") print(f"RAY BENCHMARK RUNNER") print(f"{'='*80}") - + # Initialize Ray if not ray.is_initialized(): ray.init(ignore_reinit_error=True) @@ -427,13 +427,17 @@ def main( pool.submit(lambda actor, task: actor.run_benchmark.remote(task), task) # Collect results + results: List[BenchmarkResult] = [] while pool.has_next(): result = pool.get_next() result_queue.put(result) + results.append(result) # Wait for progress reporter ray.get(progress_task) + failed_results: List[BenchmarkResult] = [r for r in results if not r.success] + # Get actor statistics print("\nActor statistics:") for actor in actors: @@ -455,6 +459,11 @@ def main( ray.shutdown() + if failed_results: + for result in failed_results: + print(f"[ERROR] Task {result.task_id} failed: {result.error}") + raise RuntimeError("One or more benchmark tasks failed. 
See errors above.") + if __name__ == "__main__": fire.Fire(main) diff --git a/benchmark/raytune/utility.py b/benchmark/raytune/utility.py index b84c0d83..a7890d1e 100644 --- a/benchmark/raytune/utility.py +++ b/benchmark/raytune/utility.py @@ -165,19 +165,21 @@ def deserialize_sparse_config(data: Optional[Dict]) -> Optional[Any]: # Dynamically discover all available masker config classes config_map = get_all_masker_config_classes() - # Reconstruct masker configs with error handling + #reconstruct masker configs so we let errors propagate for critical failures + #this ensures missing files or invalid configurations cause immediate failures instead of silently skipping maskers and producing misleading results masker_configs = [] for masker_data in data.get("masker_configs", []): config_class = config_map.get(masker_data["type"]) if config_class: - try: - # Create instance with parameters - params = masker_data.get("params", {}) - masker_configs.append(config_class(**params)) - except Exception as e: - logging.warning(f"Failed to create {masker_data['type']}: {e}") - continue - - # Import ResearchAttentionConfig here to avoid circular imports + #create instance with parameters so we let ValueError/FileNotFoundError propagate + params = masker_data.get("params", {}) + masker_configs.append(config_class(**params)) + else: + raise ValueError( + f"Unknown masker config type: {masker_data['type']}. " + f"Available types: {list(config_map.keys())}" + ) + + #import ResearchAttentionConfig here to avoid circular imports from sparse_attention_hub.sparse_attention.research_attention import ResearchAttentionConfig return ResearchAttentionConfig(masker_configs=masker_configs) if masker_configs else None