@@ -611,12 +611,22 @@ def forward(self, hidden_states, attention_mask,
              value_layer) = mpu.split_tensor_along_last_dim(mixed_x_layer, 3)
         elif self.attention_type == AttnType.self_attn and self.attention_head_type == 'multiquery':
             kv_input = hidden_states
-            if get_args().sequence_parallel:
-                # The linear layer doesn't gather the sequence-parallel.
-                kv_input = mpu.gather_from_sequence_parallel_region(kv_input, tensor_parallel_output_grad=False)
             # Attention heads [sq, b, h] --> [sq, b, (2 * hn)]
             mixed_kv_layer = self.key_value(kv_input)
 
+            # Reduce the KV gradients in the tensor-parallel direction.
+            # This is different from multi-head attention which reduces the KV input,
+            # because the sum over attn heads happens in the attn weight gradient instead of the KV layer:
+            #   A [b, n * sq, sk] = Q [b, n * sq, hn] x K^T [b, hn, sk]
+            #   G_K [b, sk, hn] = G_A [b, sk, n * sq] x Q [b, n * sq, hn]
+            #                   = sum_p (G_Ap [b, sk, np * sq] x Q_p [b, np * sq, hn])
+            if get_args().sequence_parallel:
+                # We switch to the tensor parallel regime here instead of at the KV input
+                # so that the KV layer is done in parallel instead of just duplicated.
+                mixed_kv_layer = mpu.gather_from_sequence_parallel_region(mixed_kv_layer, tensor_parallel_output_grad=True)
+            else:
+                mixed_kv_layer = mpu.copy_to_tensor_model_parallel_region(mixed_kv_layer)
+
             # [sq, b, (2 * hn)] --> [sq, b, np (expanded), 2 * hn]
             # new_tensor_shape = mixed_kv_layer.size()[:-1] + \
             #     (self.num_attention_heads_per_partition,
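
For intuition on the new comment about reducing the KV gradients in the tensor-parallel direction, here is a minimal single-process PyTorch sketch (not part of the patch; the shapes `b`, `n`, `sq`, `sk`, `hn` and the group count `p` are made up). It checks the identity from the comment: with a single shared key head, the full K gradient equals the sum of per-head-group contributions `G_Ap^T x Q_p`, which is exactly the sum an all-reduce over tensor-parallel ranks would perform.

```python
# Minimal single-process check, not Megatron code. Shapes are hypothetical.
import torch

b, n, sq, sk, hn = 2, 4, 5, 5, 8                 # batch, heads, query len, key len, head dim
q = torch.randn(b, n, sq, hn)                    # per-head queries
k = torch.randn(b, sk, hn, requires_grad=True)   # single shared key head (multi-query attention)

# A[b, n, sq, sk] = Q x K^T, summed into a scalar loss for a simple gradient check.
a = torch.einsum('bnqh,bkh->bnqk', q, k)
a.sum().backward()
full_grad = k.grad.clone()

# Recompute the K gradient as a sum over head groups, mimicking p tensor-parallel
# ranks that each own n/p heads: G_K = sum_p (G_Ap^T x Q_p).
p = 2
partial = torch.zeros_like(full_grad)
for rank in range(p):
    q_p = q[:, rank * (n // p):(rank + 1) * (n // p)]      # this "rank's" heads
    g_a_p = torch.ones(b, n // p, sq, sk)                   # dL/dA for those heads (loss is a.sum())
    partial += torch.einsum('bnqk,bnqh->bkh', g_a_p, q_p)   # G_Ap^T x Q_p

assert torch.allclose(full_grad, partial, atol=1e-5)
```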
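And a back-of-the-envelope sketch (plain Python, hypothetical sizes, not Megatron code) of why the gather is moved after the KV projection when `sequence_parallel` is on: each rank then projects only its `sq/p` slice of the sequence, and the all-gather moves the narrow `2*hn`-wide projection output instead of the `h`-wide hidden states.

```python
# Rough per-rank cost comparison for the two gather placements. All sizes are made up.
sq, b, h, hn, p = 4096, 8, 6144, 128, 8   # seq len, batch, hidden size, head dim, TP size

def per_rank_cost(gather_input_first: bool):
    tokens = sq * b if gather_input_first else sq * b // p  # tokens seen by the KV linear on each rank
    flops = 2 * tokens * h * (2 * hn)                        # matmul cost of the KV projection
    gathered_width = h if gather_input_first else 2 * hn     # per-token width that gets all-gathered
    comm_elems = sq * b * gathered_width * (p - 1) // p      # elements received per rank in the all-gather
    return flops, comm_elems

for mode, label in [(True, 'gather KV input (old)'), (False, 'gather KV output (new)')]:
    flops, comm = per_rank_cost(mode)
    print(f'{label:24s} per-rank GFLOPs={flops / 1e9:8.1f}  gathered elems={comm / 1e6:6.1f}M')
```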