Merge pull request #217 from david-thrower/216-add-support-for-gradient-accumulation-steps

Aidyn-Lopez · web-flow · commit 389d773456f2 · 2025-09-14T01:27:47.000-04:00
216 add support for gradient accumulation steps
diff --git a/.github/workflows/automerge.yml b/.github/workflows/automerge.yml
@@ -6,7 +6,7 @@ name: Python application
 on:
   push:
 
-    branches: [ "main", "208-refactor-nlp-example-to-tokenize-first" ]
+    branches: [ "main", "216-add-support-for-gradient-accumulation-steps" ]
 
 
 permissions:
diff --git a/cerebros/neuralnetworkfuture/neural_network_future.py b/cerebros/neuralnetworkfuture/neural_network_future.py
@@ -56,6 +56,7 @@ def __init__(
              metrics=[tf.keras.metrics.RootMeanSquaredError()],
              model_graph_file='test_model_graph.html',
              train_data_dtype=tf.float32,
+             gradient_accumulation_steps=1,
              *args,
              **kwargs):
         print(level_number)
@@ -76,6 +77,7 @@ def __init__(
         self.compiled_materialized_neural_network = []
         self.model_graph_file = model_graph_file
         self.train_data_dtype = train_data_dtype
+        self.gradient_accumulation_steps = gradient_accumulation_steps    
 
         # super().__init__(self,
         #                 *args,
@@ -328,15 +330,30 @@ def compile_neural_network(self):
             jit_compile = True
         else:
             jit_compile = False
+        if not isinstance(self.gradient_accumulation_steps, int):
+            raise ValueError(f"gradient_accumulation_steps must be an int >= 1. You set it as {self.gradient_accumulation_steps} type {type(self.gradient_accumulation_steps)}")
+        if self.gradient_accumulation_steps > 1:  # forward the setting to the optimizer only when accumulation is requested
+            self.materialized_neural_network.compile(
+                    loss=self.loss,
+                    metrics=self.metrics,
+                    optimizer=tf.keras.optimizers.AdamW(
+                        learning_rate=self.learning_rate,
+                        weight_decay=0.004,  # Add weight decay parameter
+                        gradient_accumulation_steps=self.gradient_accumulation_steps
+                    ),
+                    jit_compile=jit_compile)
+        elif self.gradient_accumulation_steps == 1:  # no accumulation: omit the argument (NOTE(review): Keras appears to reject values < 2 - confirm)
+            self.materialized_neural_network.compile(
+                    loss=self.loss,
+                    metrics=self.metrics,
+                    optimizer=tf.keras.optimizers.AdamW(
+                        learning_rate=self.learning_rate,
+                        weight_decay=0.004,  # Add weight decay parameter
+                    ),
+                    jit_compile=jit_compile)
+        else:  # zero or negative step counts are rejected
+            raise ValueError(f"gradient_accumulation_steps must be an int >= 1. You set it as {self.gradient_accumulation_steps} type {type(self.gradient_accumulation_steps)}")
 
-        self.materialized_neural_network.compile(
-            loss=self.loss,
-            metrics=self.metrics,
-            optimizer=tf.keras.optimizers.AdamW(
-                learning_rate=self.learning_rate,
-                weight_decay=0.004  # Add weight decay parameter
-            ),
-            jit_compile=jit_compile)
 
     def util_parse_connectivity_csv(self):
 
diff --git a/cerebros/simplecerebrosrandomsearch/simple_cerebros_random_search.py b/cerebros/simplecerebrosrandomsearch/simple_cerebros_random_search.py
@@ -314,6 +314,7 @@ def __init__(
                  patience=7,
                  project_name='cerebros-auto-ml-test',
                  batch_size=200,
+                 gradient_accumulation_steps=1,
                  meta_trial_number=0,
                  base_models=[''],
                  train_data_dtype=tf.float32,
@@ -373,6 +374,7 @@ def __init__(
         self.metrics = metrics
         self.epochs = epochs
         self.batch_size = batch_size
+        self.gradient_accumulation_steps=gradient_accumulation_steps
         self.meta_trial_number = meta_trial_number
         self.base_models = base_models
         self.best_model_path = ""
@@ -480,7 +482,8 @@ def run_moity_permutations(self, spec, subtrial_number, lock):
             loss=self.loss,
             metrics=self.metrics,
             model_graph_file=model_graph_file,
-            train_data_dtype=self.train_data_dtype
+            train_data_dtype=self.train_data_dtype,
+            gradient_accumulation_steps=self.gradient_accumulation_steps
             )
         tf.keras.backend.clear_session()
         collect()
diff --git a/phishing_email_detection_gpt2.py b/phishing_email_detection_gpt2.py
@@ -503,7 +503,8 @@ def from_config(cls, config):
     batch_size=batch_size,
     meta_trial_number=meta_trial_number,
     base_models=[cerebros_base_model],
-    train_data_dtype=tf.int32)  # Changed from tf.string to tf.int32
+    train_data_dtype=tf.int32,
+    gradient_accumulation_steps=2)
 
 cerebros_t0 = time.time()
 result = cerebros_automl.run_random_search()
@@ -516,8 +517,6 @@ def from_config(cls, config):
 
 print(f"Cerebros trained {models_tried} models FROM A COLD START in ONLY {cerebros_time_all_models_min} min. Cerebros took only {cerebros_time_per_model} minutes on average per model.")
 print(f"GPT2 took {gpt_time_on_one_model_min} just to FINE TUNE one PRE - TRAINED model for 3 epochs. Although this is a small scale test, this shows the advantage of scaling in ON timing VS ON**2 timing.")
-
-
 print(f'Cerebros best accuracy achieved is {result}')
 print(f'val set accuracy')