flash-algo
diff --git a/benchmarks/benchmark_mqar.py b/benchmarks/benchmark_mqar.py
Lines changed: 0 additions & 62 deletions
diff --git a/benchmarks/benchmark_forward_equivalence.py b/benchmarks/forward_equivalence.py
benchmarks/benchmark_forward_equivalence.py renamed to benchmarks/forward_equivalence.py
Lines changed: 26 additions & 26 deletions
diff --git a/benchmarks/benchmark_forward_performance.py b/benchmarks/forward_performance.py
benchmarks/benchmark_forward_performance.py renamed to benchmarks/forward_performance.py
Lines changed: 50 additions & 50 deletions
diff --git a/benchmarks/benchmark_grad.py b/benchmarks/grad_equivalence.py
benchmarks/benchmark_grad.py renamed to benchmarks/grad_equivalence.py
@@ -518,27 +518,27 @@ def test_cuda_forward_equivalence(accuracy_threshold=0.95):
     # If you encounter NAN issues when running multiple configurations, try running a single configuration
     test_configs = [
         # (batch_size, num_heads, num_kv_heads, query_len, key_len, head_dim, is_causal)
-        (1, 1, 1, 64, 64, 32, True),
-        (1, 1, 1, 64, 64, 32, False),
-        (1, 1, 1, 128, 128, 32, True),
-        (1, 1, 1, 128, 128, 32, False),
-        (1, 1, 1, 256, 256, 32, True),
-        (1, 1, 1, 256, 256, 32, False),
-        (1, 1, 1, 512, 512, 32, True),
-        (1, 1, 1, 512, 512, 32, False),
-        (1, 1, 1, 1024, 1024, 32, True),
-        (1, 1, 1, 1024, 1024, 32, False),
-        (1, 1, 1, 2048, 2048, 32, True),
-        (1, 1, 1, 2048, 2048, 32, False),
+        # (1, 1, 1, 64, 64, 32, True),
+        # (1, 1, 1, 64, 64, 32, False),
+        # (1, 1, 1, 128, 128, 32, True),
+        # (1, 1, 1, 128, 128, 32, False),
+        # (1, 1, 1, 256, 256, 32, True),
+        # (1, 1, 1, 256, 256, 32, False),
+        # (1, 1, 1, 512, 512, 32, True),
+        # (1, 1, 1, 512, 512, 32, False),
+        # (1, 1, 1, 1024, 1024, 32, True),
+        # (1, 1, 1, 1024, 1024, 32, False),
+        # (1, 1, 1, 2048, 2048, 32, True),
+        # (1, 1, 1, 2048, 2048, 32, False),
         (1, 1, 1, 4096, 4096, 32, True),
-        (1, 1, 1, 4096, 4096, 32, False),
-        (1, 2, 1, 64, 64, 32, True),
-        (2, 1, 1, 128, 128, 32, True),
-        (2, 2, 1, 128, 128, 32, True),
-        (1, 2, 1, 64, 64, 128, True),
-        (1, 2, 1, 128, 128, 128, True),
-        (1, 2, 1, 256, 256, 128, True),
-        (1, 2, 1, 512, 512, 128, True),
+        # (1, 1, 1, 4096, 4096, 32, False),
+        # (1, 2, 1, 64, 64, 32, True),
+        # (2, 1, 1, 128, 128, 32, True),
+        # (2, 2, 1, 128, 128, 32, True),
+        # (1, 2, 1, 64, 64, 128, True),
+        # (1, 2, 1, 128, 128, 128, True),
+        # (1, 2, 1, 256, 256, 128, True),
+        # (1, 2, 1, 512, 512, 128, True),
     ]
 
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
@@ -1050,13 +1050,13 @@ def main():
         print("\n" + "📍" + " Starting Standard Forward Pass Tests " + "📍")
         test_results['cuda'] = test_cuda_forward_equivalence(args.accuracy_threshold)
 
-    if args.test_type in ['all', 'triton']:
-        print("\n" + "🔥" + " Starting Python vs Triton Tests " + "🔥")
-        test_results['triton'] = test_triton_forward_equivalence(args.accuracy_threshold)
+    # if args.test_type in ['all', 'triton']:
+    #     print("\n" + "🔥" + " Starting Python vs Triton Tests " + "🔥")
+    #     test_results['triton'] = test_triton_forward_equivalence(args.accuracy_threshold)
 
-    if args.test_type in ['all', 'flex']:
-        print("\n" + "🌟" + " Starting Python vs Flex Attention Tests " + "🌟")
-        test_results['flex'] = test_flex_forward_equivalence(args.accuracy_threshold)
+    # if args.test_type in ['all', 'flex']:
+    #     print("\n" + "🌟" + " Starting Python vs Flex Attention Tests " + "🌟")
+    #     test_results['flex'] = test_flex_forward_equivalence(args.accuracy_threshold)
 
 
     # Print overall summary
 
@@ -732,57 +732,57 @@ def run_performance_benchmark(test_type='all', num_runs=3, warmup_runs=2):
         (1, 2, 1, 4096, 4096, 128, 2048, True),
         (1, 2, 1, 8192, 8192, 128, 2048, True),
         (1, 2, 1, 16384, 16384, 128, 2048, True),
-        (1, 2, 1, 32768, 32768, 128, 2048, True),
-
-        # Inference
-        (1, 2, 1, 2, 256, 128, 2048, True),
-        (1, 2, 1, 2, 512, 128, 2048, True),
-        (1, 2, 1, 2, 1024, 128, 2048, True),
-        (1, 2, 1, 2, 2048, 128, 2048, True),
-        (1, 2, 1, 2, 4096, 128, 2048, True),
-        (1, 2, 1, 2, 8192, 128, 2048, True),
-        (1, 2, 1, 2, 16384, 128, 2048, True),
-        (1, 2, 1, 2, 32768, 128, 2048, True),
+        # (1, 2, 1, 32768, 32768, 128, 2048, True),
+
+        # # Inference
+        # (1, 2, 1, 2, 256, 128, 2048, True),
+        # (1, 2, 1, 2, 512, 128, 2048, True),
+        # (1, 2, 1, 2, 1024, 128, 2048, True),
+        # (1, 2, 1, 2, 2048, 128, 2048, True),
+        # (1, 2, 1, 2, 4096, 128, 2048, True),
+        # (1, 2, 1, 2, 8192, 128, 2048, True),
+        # (1, 2, 1, 2, 16384, 128, 2048, True),
+        # (1, 2, 1, 2, 32768, 128, 2048, True),
         (1, 2, 1, 2, 65536, 128, 2048, True),
-        (1, 2, 1, 2, 131072, 128, 2048, True),
-        (1, 2, 1, 2, 262144, 128, 2048, True),
-        (1, 2, 1, 2, 524288, 128, 2048, True),
-        
-        # Vary batch size
-        (1, 2, 1, 4096, 4096, 32, 2048, True),
-        (2, 2, 1, 4096, 4096, 32, 2048, True),
-        (4, 2, 1, 4096, 4096, 32, 2048, True),
-        (8, 2, 1, 4096, 4096, 32, 2048, True),
-        
-        # Vary head count
-        (1, 1, 1, 4096, 4096, 32, 2048, True),
-        (1, 2, 1, 4096, 4096, 32, 2048, True),
-        (1, 4, 1, 4096, 4096, 32, 2048, True),
-        (1, 8, 2, 4096, 4096, 32, 2048, True),
-        
-        # Vary head dimension
-        (1, 2, 1, 4096, 4096, 32, 2048, True),
-        (1, 2, 1, 4096, 4096, 64, 2048, True),
-        (1, 2, 1, 4096, 4096, 96, 2048, True),
-        (1, 2, 1, 4096, 4096, 128, 2048, True),
-        (1, 2, 1, 4096, 4096, 192, 2048, True),
-        (1, 2, 1, 4096, 4096, 256, 2048, True),
-        
-        # Vary keep_window_size
-        (1, 2, 1, 32768, 32768, 128, 32, True),
-        (1, 2, 1, 32768, 32768, 128, 64, True),
-        (1, 2, 1, 32768, 32768, 128, 128, True),
-        (1, 2, 1, 32768, 32768, 128, 256, True),
-        (1, 2, 1, 32768, 32768, 128, 512, True),
-        (1, 2, 1, 32768, 32768, 128, 1024, True),
-        (1, 2, 1, 32768, 32768, 128, 2048, True),
-        (1, 2, 1, 32768, 32768, 128, 4096, True),
-        (1, 2, 1, 32768, 32768, 128, 8192, True),
-        (1, 2, 1, 32768, 32768, 128, 16384, True),
-        (1, 2, 1, 32768, 32768, 128, 32768, True),
-        
-        # Test non-causal
-        (1, 2, 1, 4096, 4096, 128, 2048, False),
+        # (1, 2, 1, 2, 131072, 128, 2048, True),
+        # (1, 2, 1, 2, 262144, 128, 2048, True),
+        # (1, 2, 1, 2, 524288, 128, 2048, True),
+        
+        # # Vary batch size
+        # (1, 2, 1, 4096, 4096, 32, 2048, True),
+        # (2, 2, 1, 4096, 4096, 32, 2048, True),
+        # (4, 2, 1, 4096, 4096, 32, 2048, True),
+        # (8, 2, 1, 4096, 4096, 32, 2048, True),
+        
+        # # Vary head count
+        # (1, 1, 1, 4096, 4096, 32, 2048, True),
+        # (1, 2, 1, 4096, 4096, 32, 2048, True),
+        # (1, 4, 1, 4096, 4096, 32, 2048, True),
+        # (1, 8, 2, 4096, 4096, 32, 2048, True),
+        
+        # # Vary head dimension
+        # (1, 2, 1, 4096, 4096, 32, 2048, True),
+        # (1, 2, 1, 4096, 4096, 64, 2048, True),
+        # (1, 2, 1, 4096, 4096, 96, 2048, True),
+        # (1, 2, 1, 4096, 4096, 128, 2048, True),
+        # (1, 2, 1, 4096, 4096, 192, 2048, True),
+        # (1, 2, 1, 4096, 4096, 256, 2048, True),
+        
+        # # Vary keep_window_size
+        # (1, 2, 1, 32768, 32768, 128, 32, True),
+        # (1, 2, 1, 32768, 32768, 128, 64, True),
+        # (1, 2, 1, 32768, 32768, 128, 128, True),
+        # (1, 2, 1, 32768, 32768, 128, 256, True),
+        # (1, 2, 1, 32768, 32768, 128, 512, True),
+        # (1, 2, 1, 32768, 32768, 128, 1024, True),
+        # (1, 2, 1, 32768, 32768, 128, 2048, True),
+        # (1, 2, 1, 32768, 32768, 128, 4096, True),
+        # (1, 2, 1, 32768, 32768, 128, 8192, True),
+        # (1, 2, 1, 32768, 32768, 128, 16384, True),
+        # (1, 2, 1, 32768, 32768, 128, 32768, True),
+        
+        # # Test non-causal
+        # (1, 2, 1, 4096, 4096, 128, 2048, False),
     ]
 
     print(f"\n📊 Benchmark Results (averaged over {num_runs} runs):")