From 04a72ed21ebf448a04833f3b4e5615575cf3b894 Mon Sep 17 00:00:00 2001 From: Raymond Li Date: Tue, 1 Mar 2022 14:50:04 -0500 Subject: [PATCH 001/144] add iteration argument to load_checkpoint --- megatron/checkpointing.py | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/megatron/checkpointing.py b/megatron/checkpointing.py index 61dab390dc..35a21d5adb 100644 --- a/megatron/checkpointing.py +++ b/megatron/checkpointing.py @@ -295,17 +295,7 @@ def fix_query_key_value_ordering(model, checkpoint_version): print_rank_0(" succesfully fixed query-key-values ordering for" " checkpoint version {}".format(checkpoint_version)) -def load_checkpoint(model, optimizer, lr_scheduler, load_arg='load', strict=True): - """Load a model checkpoint and return the iteration. - strict (bool): whether to strictly enforce that the keys in - :attr:`state_dict` of the checkpoint match the names of - parameters and buffers in model. - """ - args = get_args() - load_dir = getattr(args, load_arg) - - model = utils.unwrap_model(model) - +def get_iteration_release_from_tracker(load_dir): # Read the tracker file and set the iteration. tracker_filename = get_checkpoint_tracker_filename(load_dir) @@ -320,6 +310,23 @@ def load_checkpoint(model, optimizer, lr_scheduler, load_arg='load', strict=True # Otherwise, read the tracker file and either set the iteration or # mark it as a release checkpoint. iteration, release = read_metadata(tracker_filename) + return iteration, release + +def load_checkpoint(model, optimizer, lr_scheduler, load_arg='load', strict=True, iteration=None): + """Load a model checkpoint and return the iteration. + strict (bool): whether to strictly enforce that the keys in + :attr:`state_dict` of the checkpoint match the names of + parameters and buffers in model. + """ + args = get_args() + load_dir = getattr(args, load_arg) + + model = utils.unwrap_model(model) + + if iteration is None: + iteration, release = get_iteration_release_from_tracker(load_dir) + else: + release = False # Checkpoint. checkpoint_name = get_checkpoint_name(load_dir, iteration, release) From 22005a11fa9cb62ca6179f24dbf7ae63918386c0 Mon Sep 17 00:00:00 2001 From: Raymond Li Date: Thu, 10 Mar 2022 13:44:44 -0500 Subject: [PATCH 002/144] fix load_checkpoint when starting from scratch --- megatron/checkpointing.py | 33 +++++++++++++++------------------ 1 file changed, 15 insertions(+), 18 deletions(-) diff --git a/megatron/checkpointing.py b/megatron/checkpointing.py index 35a21d5adb..582108b253 100644 --- a/megatron/checkpointing.py +++ b/megatron/checkpointing.py @@ -295,23 +295,6 @@ def fix_query_key_value_ordering(model, checkpoint_version): print_rank_0(" succesfully fixed query-key-values ordering for" " checkpoint version {}".format(checkpoint_version)) -def get_iteration_release_from_tracker(load_dir): - # Read the tracker file and set the iteration. - tracker_filename = get_checkpoint_tracker_filename(load_dir) - - # If no tracker file, return iretation zero. - if not os.path.isfile(tracker_filename): - print_rank_0('WARNING: could not find the metadata file {} '.format( - tracker_filename)) - print_rank_0(' will not load any checkpoints and will start from ' - 'random') - return 0 - - # Otherwise, read the tracker file and either set the iteration or - # mark it as a release checkpoint. 
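For context on the tracker logic these first two patches refactor: the file written next to the checkpoints holds either an integer iteration or the literal string "release", and load_checkpoint falls back to a fresh start when it is absent. A minimal sketch of that convention, with the file name and helper chosen here for illustration rather than taken from the code:

    import os

    def read_tracker(load_dir, filename='latest_checkpointed_iteration.txt'):
        # Illustrative helper, not the Megatron implementation.
        path = os.path.join(load_dir, filename)
        if not os.path.isfile(path):
            return 0, False       # no checkpoint found: start from random init
        text = open(path).read().strip()
        if text == 'release':
            return 0, True        # release checkpoint: no iteration to resume
        return int(text), False   # resume training at this iteration
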
- iteration, release = read_metadata(tracker_filename) - return iteration, release - def load_checkpoint(model, optimizer, lr_scheduler, load_arg='load', strict=True, iteration=None): """Load a model checkpoint and return the iteration. strict (bool): whether to strictly enforce that the keys in @@ -324,8 +307,22 @@ def load_checkpoint(model, optimizer, lr_scheduler, load_arg='load', strict=True model = utils.unwrap_model(model) if iteration is None: - iteration, release = get_iteration_release_from_tracker(load_dir) + # Read the tracker file and set the iteration. + tracker_filename = get_checkpoint_tracker_filename(load_dir) + + # If no tracker file, return iretation zero. + if not os.path.isfile(tracker_filename): + print_rank_0('WARNING: could not find the metadata file {} '.format( + tracker_filename)) + print_rank_0(' will not load any checkpoints and will start from ' + 'random') + return 0 + + # Otherwise, read the tracker file and either set the iteration or + # mark it as a release checkpoint. + iteration, release = read_metadata(tracker_filename) else: + # Iteration given as argument: do nothing release = False # Checkpoint. From e51c3c5408bef2478a09eb5ba26b1e8c85a41101 Mon Sep 17 00:00:00 2001 From: Raymond Li Date: Tue, 19 Apr 2022 11:00:10 -0400 Subject: [PATCH 003/144] reset consumed_train_samples when finetune=True --- megatron/checkpointing.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/megatron/checkpointing.py b/megatron/checkpointing.py index 582108b253..938510ff73 100644 --- a/megatron/checkpointing.py +++ b/megatron/checkpointing.py @@ -372,11 +372,12 @@ def load_checkpoint(model, optimizer, lr_scheduler, load_arg='load', strict=True if 'args' in state_dict: checkpoint_args = state_dict['args'] check_checkpoint_args(checkpoint_args) - args.consumed_train_samples = getattr(checkpoint_args, - 'consumed_train_samples', 0) - update_num_microbatches(consumed_samples=args.consumed_train_samples) - args.consumed_valid_samples = getattr(checkpoint_args, - 'consumed_valid_samples', 0) + if not args.finetune: + args.consumed_train_samples = getattr(checkpoint_args, + 'consumed_train_samples', 0) + update_num_microbatches(consumed_samples=args.consumed_train_samples) + args.consumed_valid_samples = getattr(checkpoint_args, + 'consumed_valid_samples', 0) else: print_rank_0('could not find arguments in the checkpoint ...') From 884e8b8c0d7ea462cc990c139943b570e392f57e Mon Sep 17 00:00:00 2001 From: Raymond Li Date: Fri, 22 Apr 2022 17:11:42 -0400 Subject: [PATCH 004/144] add wandb reporting --- megatron/arguments.py | 10 ++++++++++ megatron/initialize.py | 13 +++++++++++++ megatron/training.py | 18 +++++++++++++++++- 3 files changed, 40 insertions(+), 1 deletion(-) diff --git a/megatron/arguments.py b/megatron/arguments.py index e50ce48fe3..be6d537850 100644 --- a/megatron/arguments.py +++ b/megatron/arguments.py @@ -268,6 +268,11 @@ def parse_args(extra_args_provider=None, defaults={}, 'distributed checkpoint activations are supported for pytorch ' \ 'v1.10 and above (Nvidia Pytorch container >= 21.07). Current ' \ 'pytorch version is v%s.%s.' 
% (TORCH_MAJOR, TORCH_MINOR) + + # Weights and Biases + if args.wandb_entity_name or args.wandb_project_name: + assert args.wandb_entity_name and args.wandb_project_name, \ + "Both entity and project name must be set in order to report to wandb" _print_args(args) return args @@ -382,6 +387,11 @@ def _add_logging_args(parser): group.add_argument('--log-world-size-to-tensorboard', action='store_true', help='Enable world size logging to tensorboard.') + + group.add_argument('--wandb-entity-name', type=str, default=None, + help="Name of wandb entity for reporting") + group.add_argument('--wandb-project-name', type=str, default=None, + help="Name of wandb project") return parser diff --git a/megatron/initialize.py b/megatron/initialize.py index 5c4c4e54b0..4e56298a5d 100644 --- a/megatron/initialize.py +++ b/megatron/initialize.py @@ -23,6 +23,8 @@ import torch from datetime import timedelta +import wandb + from megatron import fused_kernels from megatron import get_adlr_autoresume from megatron import get_args @@ -229,6 +231,17 @@ def write_args_to_tensorboard(): writer.add_text(arg, str(getattr(args, arg)), global_step=args.iteration) +def init_wandb(): + args = get_args() + if args.rank == (args.world_size - 1): + wandb.init( + name=os.path.basename(args.save), + entity=args.wandb_entity_name, + project=args.wandb_project_name, + group="mini_cluster", + config=args + ) + def _set_jit_fusion_options(): """Set PyTorch JIT layer fusion options.""" diff --git a/megatron/training.py b/megatron/training.py index 0f458e30d7..975db345a7 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -19,6 +19,8 @@ import math import sys import time + +import wandb # The earliest we can measure the start time. _TRAIN_START_TIME = time.time() @@ -41,7 +43,7 @@ from megatron.model import Float16Module from megatron.model import ModelType from megatron.optimizer import get_megatron_optimizer -from megatron.initialize import initialize_megatron +from megatron.initialize import init_wandb, initialize_megatron from megatron.initialize import write_args_to_tensorboard from megatron.learning_rates import AnnealingLR from megatron.model import DistributedDataParallel as LocalDDP @@ -598,6 +600,17 @@ def add_to_logging(name): mem_stats["allocation.all.current"], iteration, ) + + # Weights and biases reporting + if (iteration % args.log_interval == 0) and is_last_rank(): + metrics = { + 'learning-rate': learning_rate, + 'samples': args.consumed_train_samples, + 'loss-scale': loss_scale, + 'grad-norm': grad_norm, + **loss_dict + } + wandb.log(metrics, step=iteration) if iteration % args.log_interval == 0: elapsed_time = timers('interval-time').elapsed() @@ -667,6 +680,9 @@ def train(forward_step_func, model, optimizer, lr_scheduler, # Write args to tensorboard write_args_to_tensorboard() + # Init Weights and Biases + init_wandb() + # Turn on training mode which enables dropout. 
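The wandb reporting introduced in this patch is restricted to a single process: both init_wandb and the training-metrics call run only on the last rank, so a multi-GPU job produces one wandb run rather than world_size duplicates. Later patches in the series also make the import optional and skip logging when no project is configured; a rough sketch that folds those refinements together (helper name and signature are made up for illustration):

    try:
        import wandb
    except ModuleNotFoundError:
        wandb = None

    def maybe_log(metrics, step, rank, world_size, project_name=None):
        # Illustrative only: report from the last rank, and only when wandb
        # is installed and a project has actually been configured.
        if wandb is None or project_name is None:
            return
        if rank != world_size - 1:
            return
        wandb.log(metrics, step=step)
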
for model_module in model: model_module.train() From ad1f261b4a64426dddf3b08b6e124f4fbd6de4ef Mon Sep 17 00:00:00 2001 From: Raymond Li Date: Sun, 24 Apr 2022 22:46:22 -0400 Subject: [PATCH 005/144] add validation loss wandb reporting --- megatron/training.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/megatron/training.py b/megatron/training.py index 975db345a7..4ed1324368 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -849,6 +849,13 @@ def evaluate_and_print_results(prefix, forward_step_func, iteration) writer.add_scalar('{} validation ppl vs samples'.format(key), ppl, args.consumed_train_samples) + + # Weights and biases reporting + if is_last_rank(): + metrics = { + '{} validation'.format(key): total_loss_dict[key].item() for key in total_loss_dict + } + wandb.log(metrics, step=iteration) length = len(string) + 1 print_rank_last('-' * length) From cc54e57e1b5d2745d24040439f318bebd7b02bfb Mon Sep 17 00:00:00 2001 From: Raymond Li Date: Tue, 26 Apr 2022 11:49:58 -0400 Subject: [PATCH 006/144] skip wandb if not provided --- megatron/initialize.py | 3 +++ megatron/training.py | 4 ++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/megatron/initialize.py b/megatron/initialize.py index 4e56298a5d..3bce58ec7d 100644 --- a/megatron/initialize.py +++ b/megatron/initialize.py @@ -234,6 +234,9 @@ def write_args_to_tensorboard(): def init_wandb(): args = get_args() if args.rank == (args.world_size - 1): + if not (args.wandb_entity_name and args.wandb_project_name): + print('> Skipping wandb init ...', flush=True) + return wandb.init( name=os.path.basename(args.save), entity=args.wandb_entity_name, diff --git a/megatron/training.py b/megatron/training.py index 4ed1324368..3372e01c05 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -602,7 +602,7 @@ def add_to_logging(name): ) # Weights and biases reporting - if (iteration % args.log_interval == 0) and is_last_rank(): + if (iteration % args.log_interval == 0) and is_last_rank() and args.wandb_project_name: metrics = { 'learning-rate': learning_rate, 'samples': args.consumed_train_samples, @@ -851,7 +851,7 @@ def evaluate_and_print_results(prefix, forward_step_func, ppl, args.consumed_train_samples) # Weights and biases reporting - if is_last_rank(): + if is_last_rank() and args.wandb_project_name: metrics = { '{} validation'.format(key): total_loss_dict[key].item() for key in total_loss_dict } From 97954acb9834fedeff76e223ec06c4b1847daf17 Mon Sep 17 00:00:00 2001 From: Raymond Li Date: Thu, 12 May 2022 19:36:39 -0400 Subject: [PATCH 007/144] when finetuning, load optimizer state --- megatron/checkpointing.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/megatron/checkpointing.py b/megatron/checkpointing.py index 938510ff73..3d2efda219 100644 --- a/megatron/checkpointing.py +++ b/megatron/checkpointing.py @@ -395,7 +395,8 @@ def load_checkpoint(model, optimizer, lr_scheduler, load_arg='load', strict=True fix_query_key_value_ordering(model, checkpoint_version) # Optimizer. 
- if not release and not args.finetune and not args.no_load_optim: + # if not release and not args.finetune and not args.no_load_optim: + if not release and not args.no_load_optim: try: if optimizer is not None: optimizer.load_state_dict(state_dict['optimizer']) From 81c72abcdc563a206c43175692fbcac490dc5504 Mon Sep 17 00:00:00 2001 From: Raymond Li Date: Fri, 13 May 2022 09:54:11 -0400 Subject: [PATCH 008/144] do not load lr scheduler in finetuning --- megatron/checkpointing.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/megatron/checkpointing.py b/megatron/checkpointing.py index 3d2efda219..bcaf32386e 100644 --- a/megatron/checkpointing.py +++ b/megatron/checkpointing.py @@ -395,12 +395,13 @@ def load_checkpoint(model, optimizer, lr_scheduler, load_arg='load', strict=True fix_query_key_value_ordering(model, checkpoint_version) # Optimizer. + # for finetuning: load optimizer but not lr_scheduler. # if not release and not args.finetune and not args.no_load_optim: if not release and not args.no_load_optim: try: if optimizer is not None: optimizer.load_state_dict(state_dict['optimizer']) - if lr_scheduler is not None: + if lr_scheduler is not None and not args.finetune: lr_scheduler.load_state_dict(state_dict['lr_scheduler']) except KeyError: print_rank_0('Unable to load optimizer from checkpoint {}. ' From 03c1aa06517fc471223d517d34af825ac1eb6f31 Mon Sep 17 00:00:00 2001 From: Raymond Li Date: Tue, 17 May 2022 14:40:12 -0400 Subject: [PATCH 009/144] add torchrun support --- megatron/arguments.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/megatron/arguments.py b/megatron/arguments.py index be6d537850..07030ecf97 100644 --- a/megatron/arguments.py +++ b/megatron/arguments.py @@ -273,6 +273,10 @@ def parse_args(extra_args_provider=None, defaults={}, if args.wandb_entity_name or args.wandb_project_name: assert args.wandb_entity_name and args.wandb_project_name, \ "Both entity and project name must be set in order to report to wandb" + + # Local-rank from environment variable (if using torchrun) + if args.local_rank is None and "LOCAL_RANK" in os.environ: + args.local_rank = int(os.environ["LOCAL_RANK"]) _print_args(args) return args From 00cfaad19672a22cf4f7ecacf146f6f16bf0ae32 Mon Sep 17 00:00:00 2001 From: Raymond Li Date: Tue, 17 May 2022 14:42:18 -0400 Subject: [PATCH 010/144] reload weights into optimizer after loading model weights --- megatron/training.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/megatron/training.py b/megatron/training.py index 3372e01c05..334f695f0b 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -363,6 +363,9 @@ def setup_model_and_optimizer(model_provider_func, model_type): torch.distributed.barrier() timers('load-checkpoint').stop() timers.log(['load-checkpoint']) + # This is critical when only model is loaded. We should make sure + # main parameters are also updated. 
+ optimizer.reload_model_params() else: args.iteration = 0 From ea9ec200fad4751b923dab0be8e1fc1770569e88 Mon Sep 17 00:00:00 2001 From: Raymond Li Date: Wed, 18 May 2022 11:09:41 -0400 Subject: [PATCH 011/144] add --finetune-from argument, so there is no need to modify arguments to resume a finetuning job --- megatron/arguments.py | 3 +++ megatron/checkpointing.py | 28 ++++++++++++++++++++-------- 2 files changed, 23 insertions(+), 8 deletions(-) diff --git a/megatron/arguments.py b/megatron/arguments.py index 07030ecf97..aff0c5fc36 100644 --- a/megatron/arguments.py +++ b/megatron/arguments.py @@ -611,6 +611,9 @@ def _add_checkpointing_args(parser): help='Load model for finetuning. Do not load optimizer ' 'or rng state from checkpoint and set iteration to 0. ' 'Assumed when loading a release checkpoint.') + group.add_argument('--finetune-from', type=str, default=None, + help='Directory containing a model checkpoint for finetuning.' + 'Will be loaded if the `--load` directory contains no checkpoint') return parser diff --git a/megatron/checkpointing.py b/megatron/checkpointing.py index bcaf32386e..769015f68d 100644 --- a/megatron/checkpointing.py +++ b/megatron/checkpointing.py @@ -304,12 +304,24 @@ def load_checkpoint(model, optimizer, lr_scheduler, load_arg='load', strict=True args = get_args() load_dir = getattr(args, load_arg) - model = utils.unwrap_model(model) - if iteration is None: # Read the tracker file and set the iteration. tracker_filename = get_checkpoint_tracker_filename(load_dir) + # If no tracker file, and we are in finetuning, try to load from the `finetune_from` dir + if not os.path.isfile(tracker_filename) and args.finetune and load_arg != 'finetune_from': + print_rank_0('WARNING: could not find the metadata file {} '.format( + tracker_filename)) + print_rank_0(' will try to load from `--finetune-from` instead') + return load_checkpoint(model, optimizer, lr_scheduler, load_arg='finetune_from', strict=strict, iteration=iteration) + # If we are resuming an experiment: setting finetune to False. + elif load_arg != 'finetune_from': + args.finetune=False + print_rank_0(f"Resuming from {load_dir}") + # Finetuning + else: + print_rank_0(f"Finetuning from {load_dir}") + # If no tracker file, return iretation zero. if not os.path.isfile(tracker_filename): print_rank_0('WARNING: could not find the metadata file {} '.format( @@ -324,10 +336,12 @@ def load_checkpoint(model, optimizer, lr_scheduler, load_arg='load', strict=True else: # Iteration given as argument: do nothing release = False + + model = utils.unwrap_model(model) # Checkpoint. checkpoint_name = get_checkpoint_name(load_dir, iteration, release) - print_rank_0(f' loading checkpoint from {args.load} at iteration {iteration}') + print_rank_0(f' loading checkpoint from {load_dir} at iteration {iteration}') # Load the checkpoint. try: @@ -395,13 +409,11 @@ def load_checkpoint(model, optimizer, lr_scheduler, load_arg='load', strict=True fix_query_key_value_ordering(model, checkpoint_version) # Optimizer. - # for finetuning: load optimizer but not lr_scheduler. 
- # if not release and not args.finetune and not args.no_load_optim: - if not release and not args.no_load_optim: + if not release and not args.finetune and not args.no_load_optim: try: if optimizer is not None: optimizer.load_state_dict(state_dict['optimizer']) - if lr_scheduler is not None and not args.finetune: + if lr_scheduler is not None: lr_scheduler.load_state_dict(state_dict['lr_scheduler']) except KeyError: print_rank_0('Unable to load optimizer from checkpoint {}. ' @@ -450,7 +462,7 @@ def load_checkpoint(model, optimizer, lr_scheduler, load_arg='load', strict=True if torch.distributed.is_initialized(): torch.distributed.barrier() - print_rank_0(f' successfully loaded checkpoint from {args.load} ' + print_rank_0(f' successfully loaded checkpoint from {load_dir} ' f'at iteration {iteration}') return iteration From fdc697f6fed06ef13a392765dd618998adb7c4f9 Mon Sep 17 00:00:00 2001 From: Raymond Li Date: Wed, 1 Jun 2022 14:03:47 -0400 Subject: [PATCH 012/144] make wandb not a requirement --- megatron/initialize.py | 5 ++++- megatron/training.py | 5 ++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/megatron/initialize.py b/megatron/initialize.py index 3bce58ec7d..728b307e88 100644 --- a/megatron/initialize.py +++ b/megatron/initialize.py @@ -23,7 +23,10 @@ import torch from datetime import timedelta -import wandb +try: + import wandb +except ModuleNotFoundError: + print('Wandb import failed', flush=True) from megatron import fused_kernels from megatron import get_adlr_autoresume diff --git a/megatron/training.py b/megatron/training.py index 334f695f0b..585dfcf538 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -20,7 +20,10 @@ import sys import time -import wandb +try: + import wandb +except ModuleNotFoundError: + print('Wandb import failed', flush=True) # The earliest we can measure the start time. _TRAIN_START_TIME = time.time() From 80363928989508882f1acb479c60ca7071f6722d Mon Sep 17 00:00:00 2001 From: Raymond Li Date: Wed, 13 Jul 2022 10:57:56 -0400 Subject: [PATCH 013/144] add tokens_per_epoch print --- megatron/data/gpt_dataset.py | 1 + 1 file changed, 1 insertion(+) diff --git a/megatron/data/gpt_dataset.py b/megatron/data/gpt_dataset.py index e6c64e975d..fadc79fffd 100644 --- a/megatron/data/gpt_dataset.py +++ b/megatron/data/gpt_dataset.py @@ -197,6 +197,7 @@ def _build_index_mappings(name, data_prefix, documents, sizes, """ # Number of tokens in each epoch and number of required epochs. 
tokens_per_epoch = _num_tokens(documents, sizes) + print_rank_0(f' > Tokens per epoch: {tokens_per_epoch}') num_epochs = _num_epochs(tokens_per_epoch, seq_length, num_samples) # rng state np_rng = np.random.RandomState(seed=seed) From 652625b0a6fbe790606191def4e68fa7bb09a675 Mon Sep 17 00:00:00 2001 From: denisko Date: Fri, 22 Jul 2022 12:49:17 +0100 Subject: [PATCH 014/144] add: initial port of alibi from bigscience --- megatron/arguments.py | 21 ++++- megatron/checkpointing.py | 6 +- megatron/model/enums.py | 9 ++- megatron/model/glu_activations.py | 52 ++++++++++++ megatron/model/transformer.py | 128 +++++++++++++++++++++++------- 5 files changed, 183 insertions(+), 33 deletions(-) create mode 100644 megatron/model/glu_activations.py diff --git a/megatron/arguments.py b/megatron/arguments.py index 326139855f..4e1e1f86e9 100644 --- a/megatron/arguments.py +++ b/megatron/arguments.py @@ -20,6 +20,8 @@ import torch +from megatron.model.enums import PositionEmbeddingType + def parse_args(extra_args_provider=None, defaults={}, ignore_unknown_args=False): """Parse all arguments.""" @@ -256,10 +258,21 @@ def parse_args(extra_args_provider=None, defaults={}, assert args.encoder_seq_length is not None args.seq_length = args.encoder_seq_length - if args.seq_length is not None: - assert args.max_position_embeddings >= args.seq_length - if args.decoder_seq_length is not None: - assert args.max_position_embeddings >= args.decoder_seq_length + # NOTE: this was before integrating alibi + # if args.seq_length is not None: + # assert args.max_position_embeddings >= args.seq_length + # if args.decoder_seq_length is not None: + # assert args.max_position_embeddings >= args.decoder_seq_length + + if args.position_embedding_type == PositionEmbeddingType.absolute or args.position_embedding_type == PositionEmbeddingType.alibi: + assert args.max_position_embeddings is not None + if args.seq_length is not None: + assert args.max_position_embeddings >= args.seq_length + if args.decoder_seq_length is not None: + assert args.max_position_embeddings >= args.decoder_seq_length + else: + assert args.max_position_embeddings is None + if args.lr is not None: assert args.min_lr <= args.lr if args.save is not None: diff --git a/megatron/checkpointing.py b/megatron/checkpointing.py index d58aaab32f..0079a97cfb 100644 --- a/megatron/checkpointing.py +++ b/megatron/checkpointing.py @@ -27,6 +27,7 @@ print_rank_0, update_num_microbatches, utils) +from megatron.model.enums import PositionEmbeddingType _CHECKPOINT_VERSION = None @@ -60,8 +61,11 @@ def _compare(arg_name, old_arg_name=None): _compare('num_layers') _compare('hidden_size') _compare('num_attention_heads') - if args.vocab_file: + _compare('position_embedding_type') + # with alibi we can change `max_position_embeddings` + if args.position_embedding_type != PositionEmbeddingType.alibi: _compare('max_position_embeddings') + if args.vocab_file: _compare('make_vocab_size_divisible_by') _compare('padded_vocab_size') _compare('tokenizer_type') diff --git a/megatron/model/enums.py b/megatron/model/enums.py index 492d2c0c6c..90287bb498 100644 --- a/megatron/model/enums.py +++ b/megatron/model/enums.py @@ -29,4 +29,11 @@ class AttnType(enum.Enum): class AttnMaskType(enum.Enum): padding = 1 - causal = 2 + causal = 2 # Overrides `attention_mask` to be a lower triangular matrix + prefix = 3 + custom = 4 # Forces one to pass an `attention_mask` that's 1 if we need to mask. 
Tensor that can be broadcast to [micro_batch_size, n_head, seq_length, seq_length] + +class PositionEmbeddingType(enum.Enum): + rotary = 1 # NOTE: this one is not used so far, however for future compatibility the enum left as is + absolute = 2 + alibi = 3 diff --git a/megatron/model/glu_activations.py b/megatron/model/glu_activations.py new file mode 100644 index 0000000000..c479d96834 --- /dev/null +++ b/megatron/model/glu_activations.py @@ -0,0 +1,52 @@ +import torch +from torch import nn +from torch.nn import functional as F + +from megatron import logging +from megatron.model.utils import log_debug_usage + +logger = logging.get_logger(__name__) + +class _GLUBaseModule(nn.Module): + def __init__(self, activation_fn): + super().__init__() + self.activation_fn = activation_fn + + def forward(self, x): + # dim=-1 breaks in jit for pt<1.10 + x1, x2 = x.chunk(2, dim=(x.ndim - 1)) + return x1 * self.activation_fn(x2) + + +class LiGLU(_GLUBaseModule): + def __init__(self): + super().__init__(nn.Identity()) + + +class GEGLU(_GLUBaseModule): + def __init__(self): + super().__init__(F.gelu) + + +class ReGLU(_GLUBaseModule): + def __init__(self): + super().__init__(F.relu) + + +class SwiGLU(_GLUBaseModule): + def __init__(self): + super().__init__(F.silu) + + +liglu = log_debug_usage(logger, "Using GLU activation: LiGLU.")(torch.jit.script(LiGLU())) +geglu = log_debug_usage(logger, "Using GLU activation: GELU.")(torch.jit.script(GEGLU())) +reglu = log_debug_usage(logger, "Using GLU activation: ReGLU.")(torch.jit.script(ReGLU())) +swiglu = log_debug_usage(logger, "Using GLU activation: SwiGLU.")(torch.jit.script(SwiGLU())) + + +GLU_ACTIVATIONS = { + "geglu": geglu, + "liglu": liglu, + "reglu": reglu, + "swiglu": swiglu, +} diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index b9c1b79289..3d1775a194 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -18,17 +18,27 @@ from contextlib import nullcontext import torch import torch.nn.functional as F +from torch import nn from megatron import get_timers, get_args, get_global_memory_buffer from megatron import mpu from .module import MegatronModule -from megatron.model.enums import AttnMaskType, ModelType, LayerType, AttnType -from megatron.model import LayerNorm +from megatron.model.enums import AttnMaskType, ModelType, LayerType, AttnType, PositionEmbeddingType +from megatron.model.fused_layer_norm import MixedFusedLayerNorm as LayerNorm from megatron.model.fused_softmax import FusedScaleMaskSoftmax from megatron.model.fused_bias_gelu import bias_gelu_impl from megatron.model.utils import attention_mask_func, openai_gelu, erf_gelu +from .glu_activations import GLU_ACTIVATIONS + +# flags required to enable jit fusion kernels +torch._C._jit_set_profiling_mode(False) +torch._C._jit_set_profiling_executor(False) +torch._C._jit_override_can_fuse_on_cpu(True) +torch._C._jit_override_can_fuse_on_gpu(True) + + """ We use the following notation throughout this file: h: hidden size n: number of attention heads @@ -71,24 +81,28 @@ class ParallelMLP(MegatronModule): MLP will take the input with h hidden state, project it to 4*h hidden dimension, perform nonlinear transformation, and project the - state back into h hidden dimension. + state back into h hidden dimension. At the end, dropout is also + applied. """ def __init__(self, init_method, output_layer_init_method): super(ParallelMLP, self).__init__() args = get_args() - # Project to 4h. 
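All of the GLU variants registered above share the same gating recipe, which is why this patch doubles the width of dense_h_to_4h whenever one of them is selected: the activation splits the projection in half and multiplies one half by a nonlinearity of the other, so the activation output keeps the usual ffn_hidden_size width. A small numeric sketch (sizes are illustrative):

    import torch
    import torch.nn.functional as F

    x = torch.randn(4, 2 * 1024)   # dense_h_to_4h output when a GLU is used
    x1, x2 = x.chunk(2, dim=-1)
    out = x1 * F.silu(x2)          # swiglu; geglu/reglu/liglu swap the gate fn
    print(out.shape)               # torch.Size([4, 1024])
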
+ # Project to ffn_hidden_size self.dense_h_to_4h = mpu.ColumnParallelLinear( args.hidden_size, - args.ffn_hidden_size, + # GLU is a special activation that divides the dimension by a factor 2. + 2 * args.ffn_hidden_size if args.glu_activation else args.ffn_hidden_size, gather_output=False, init_method=init_method, skip_bias_add=True) self.bias_gelu_fusion = args.bias_gelu_fusion self.activation_func = F.gelu - if args.openai_gelu: + if args.glu_activation: + self.activation_func = GLU_ACTIVATIONS[args.glu_activation] + elif args.openai_gelu: self.activation_func = openai_gelu elif args.onnx_safe: self.activation_func = erf_gelu @@ -214,7 +228,7 @@ def __init__(self, layer_number, self.attention_dropout = torch.nn.Dropout(args.attention_dropout) def forward(self, query_layer, key_layer, - value_layer, attention_mask): + value_layer, attention_mask, alibi): # =================================== # Raw attention scores. [b, np, s, s] @@ -233,17 +247,36 @@ def forward(self, query_layer, key_layer, key_layer = key_layer.view(output_size[3], output_size[0] * output_size[1], -1) - # preallocting input tensor: [b * np, sq, sk] - matmul_input_buffer = get_global_memory_buffer().get_tensor( - (output_size[0]*output_size[1], output_size[2], output_size[3]), - query_layer.dtype, "mpu") + if alibi is None: + # preallocting input tensor: [b * np, sq, sk] + matmul_input_buffer = get_global_memory_buffer().get_tensor( + (output_size[0]*output_size[1], output_size[2], output_size[3]), + query_layer.dtype, "mpu") + else: + matmul_input_buffer = alibi[:output_size[0]*output_size[1], :, :output_size[3]] # Raw attention scores. [b * np, sq, sk] - matmul_result = torch.baddbmm( - matmul_input_buffer, - query_layer.transpose(0, 1), # [b * np, sq, hn] - key_layer.transpose(0, 1).transpose(1, 2), # [b * np, hn, sk] - beta=0.0, alpha=(1.0/self.norm_factor)) + if alibi is None: + matmul_result = torch.baddbmm( + matmul_input_buffer, + query_layer.transpose(0, 1), # [b * np, sq, hn] + key_layer.transpose(0, 1).transpose(1, 2), # [b * np, hn, sk] + beta=0.0, alpha=(1.0/self.norm_factor)) + else: + if not hasattr(self, "logged_alibi"): + print("Using Alibi.") + self.logged_alibi = True + + if self.apply_query_key_layer_scaling: + beta = 1.0 / self.layer_number + else: + beta = 1.0 + + matmul_result = torch.baddbmm( + matmul_input_buffer, + query_layer.transpose(0, 1), # [b * np, sq, hn] + key_layer.transpose(0, 1).transpose(1, 2), # [b * np, hn, sk] + beta=beta, alpha=(1.0 / self.norm_factor)) # change view to [b, np, sq, sk] attention_scores = matmul_result.view(*output_size) @@ -364,20 +397,21 @@ def __init__(self, init_method, skip_bias_add=True) def _checkpointed_attention_forward(self, query_layer, key_layer, - value_layer, attention_mask): + value_layer, attention_mask, alibi): """Forward method with activation checkpointing.""" def custom_forward(*inputs): query_layer = inputs[0] key_layer = inputs[1] value_layer = inputs[2] attention_mask = inputs[3] + alibi = inputs[4] output_ = self.core_attention(query_layer, key_layer, - value_layer, attention_mask) + value_layer, attention_mask, alibi) return output_ hidden_states = mpu.checkpoint( custom_forward, - False, query_layer, key_layer, value_layer, attention_mask) + False, query_layer, key_layer, value_layer, attention_mask, alibi) return hidden_states @@ -391,7 +425,7 @@ def _allocate_memory(self, inference_max_sequence_len, batch_size): device=torch.cuda.current_device()) def forward(self, hidden_states, attention_mask, - encoder_output=None, 
inference_params=None): + encoder_output=None, inference_params=None, alibi=None): # hidden_states: [sq, b, h] # ================================================= @@ -478,10 +512,10 @@ def forward(self, hidden_states, attention_mask, if self.checkpoint_core_attention: context_layer = self._checkpointed_attention_forward( - query_layer, key_layer, value_layer, attention_mask) + query_layer, key_layer, value_layer, attention_mask, alibi) else: context_layer = self.core_attention( - query_layer, key_layer, value_layer, attention_mask) + query_layer, key_layer, value_layer, attention_mask, alibi) # ================= # Output. [sq, b, h] @@ -588,13 +622,23 @@ def __init__(self, init_method, output_layer_init_method, else: self.mlp = ParallelMLP(init_method, output_layer_init_method) - # Set bias+dropout+add fusion grad_enable execution handler. + # Set bias+dropout+add fusion grad_enable execution handler. TORCH_MAJOR = int(torch.__version__.split('.')[0]) TORCH_MINOR = int(torch.__version__.split('.')[1]) use_nvfuser = TORCH_MAJOR > 1 or (TORCH_MAJOR == 1 and TORCH_MINOR >= 10) self.bias_dropout_add_exec_handler = \ nullcontext if use_nvfuser else torch.enable_grad + # Alibi + if args.position_embedding_type == PositionEmbeddingType.alibi: + self.alibi = self._build_alibi_tensor(args.seq_length, args.num_attention_heads, args.micro_batch_size).to(torch.cuda.current_device()) + if args.params_dtype == torch.float16: + self.alibi = self.alibi.to(torch.float16) + elif args.params_dtype == torch.bfloat16: + self.alibi = self.alibi.to(torch.bfloat16) + else: + self.alibi = None + def forward(self, hidden_states, attention_mask, encoder_output=None, enc_dec_attn_mask=None, inference_params=None): @@ -607,7 +651,8 @@ def forward(self, hidden_states, attention_mask, self.self_attention( layernorm_output, attention_mask, - inference_params=inference_params) + inference_params=inference_params, + alibi=self.alibi) # Residual connection. if self.apply_residual_connection_post_layernorm: @@ -699,6 +744,35 @@ def forward(self, hidden_states, attention_mask, return output + @staticmethod + def _build_alibi_tensor(max_seq_len, num_attention_heads, batch_size): + # Based on https://github.com/ofirpress/attention_with_linear_biases/blob/a35aaca144e0eb6b789dfcb46784c4b8e31b7983/fairseq/models/transformer.py#L742 + """Returns tensor shaped (batch_size * num_attention_heads, 1, max_seq_len)""" + + def get_slopes(n): + def get_slopes_power_of_2(n): + start = (2 ** (-2 ** -(math.log2(n) - 3))) + ratio = start + return [start * ratio ** i for i in range(n)] + + if math.log2(n).is_integer(): + return get_slopes_power_of_2(n) + else: + closest_power_of_2 = 2 ** math.floor(math.log2(n)) + return get_slopes_power_of_2(closest_power_of_2) + get_slopes(2 * closest_power_of_2)[0::2][ + :n - closest_power_of_2] + + slopes = torch.Tensor(get_slopes(num_attention_heads)) + alibi = slopes.unsqueeze(1).unsqueeze(1) * torch.arange(max_seq_len).unsqueeze(0).unsqueeze(0).expand( + num_attention_heads, -1, -1) + + #Select the part of the tensor that corresponds to our tensor parallel index. + tp_world_size = mpu.get_tensor_model_parallel_world_size() + tp_index = mpu.get_tensor_model_parallel_rank() + alibi = alibi.reshape((tp_world_size, -1, *alibi.shape[1:]))[tp_index] + + alibi = alibi.repeat(batch_size, 1, 1) + return alibi class NoopTransformerLayer(MegatronModule): """A single 'no-op' transformer layer. 
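The _build_alibi_tensor helper above reproduces the slope schedule from the ALiBi paper: each head gets a geometric slope, and the bias added to the attention scores grows linearly with key position, which is what lets the model drop learned position embeddings. A standalone sketch of just the slope and bias math (head count and sequence length are illustrative):

    import math
    import torch

    def alibi_slopes(n_heads):
        # Closed form for powers of two; other head counts interleave a
        # second geometric sequence, as in the patch above.
        def power_of_2(n):
            start = 2 ** (-2 ** -(math.log2(n) - 3))
            return [start * start ** i for i in range(n)]
        if math.log2(n_heads).is_integer():
            return power_of_2(n_heads)
        closest = 2 ** math.floor(math.log2(n_heads))
        return (power_of_2(closest)
                + alibi_slopes(2 * closest)[0::2][:n_heads - closest])

    slopes = torch.tensor(alibi_slopes(8))          # 1/2, 1/4, ..., 1/256
    bias = slopes.view(8, 1, 1) * torch.arange(16).view(1, 1, 16)
    print(bias.shape)                               # torch.Size([8, 1, 16])
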
@@ -732,7 +806,7 @@ class ParallelTransformer(MegatronModule): def __init__(self, init_method, output_layer_init_method, layer_type=LayerType.encoder, self_attn_mask_type=AttnMaskType.padding, - post_layer_norm=True, + post_layer_norm=True, pre_process=True, post_process=True, drop_path_rate=0.0): super(ParallelTransformer, self).__init__() @@ -886,8 +960,8 @@ def set_input_tensor(self, input_tensor): self.input_tensor = input_tensor def forward(self, hidden_states, attention_mask, - encoder_output=None, enc_dec_attn_mask=None, - inference_params=None): + encoder_output=None, enc_dec_attn_mask=None, + inference_params=None): # hidden_states: [s, b, h] # Checks. From 9734afd8f402f03d02871df6c0769550062ffd2e Mon Sep 17 00:00:00 2001 From: denisko Date: Tue, 26 Jul 2022 08:39:31 +0100 Subject: [PATCH 015/144] chg: change back import for this version of Megatron --- megatron/model/transformer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index 3d1775a194..7ca6348011 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -24,7 +24,7 @@ from megatron import mpu from .module import MegatronModule from megatron.model.enums import AttnMaskType, ModelType, LayerType, AttnType, PositionEmbeddingType -from megatron.model.fused_layer_norm import MixedFusedLayerNorm as LayerNorm +from megatron.model import LayerNorm from megatron.model.fused_softmax import FusedScaleMaskSoftmax from megatron.model.fused_bias_gelu import bias_gelu_impl from megatron.model.utils import attention_mask_func, openai_gelu, erf_gelu From f49506b81ad5fd6109283bec5ad04cf9a89e1bea Mon Sep 17 00:00:00 2001 From: denisko Date: Tue, 26 Jul 2022 11:54:29 +0100 Subject: [PATCH 016/144] cnh: direct import of MixedFusedLayerNorm --- megatron/model/__init__.py | 1 - megatron/model/bert_model.py | 2 +- megatron/model/transformer.py | 2 +- megatron/model/vision/esvit_swin_backbone.py | 2 +- megatron/model/vision/mit_backbone.py | 2 +- tasks/vision/segmentation/seg_heads.py | 2 +- 6 files changed, 5 insertions(+), 6 deletions(-) diff --git a/megatron/model/__init__.py b/megatron/model/__init__.py index ac226c1ac6..331ba78183 100644 --- a/megatron/model/__init__.py +++ b/megatron/model/__init__.py @@ -13,7 +13,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from .fused_layer_norm import MixedFusedLayerNorm as LayerNorm from .distributed import DistributedDataParallel from .bert_model import BertModel diff --git a/megatron/model/bert_model.py b/megatron/model/bert_model.py index 213565d25d..158fc84ef0 100644 --- a/megatron/model/bert_model.py +++ b/megatron/model/bert_model.py @@ -22,7 +22,7 @@ from megatron.model.enums import AttnMaskType from megatron.model.language_model import parallel_lm_logits from megatron.model.language_model import get_language_model -from megatron.model import LayerNorm +from .fused_layer_norm import MixedFusedLayerNorm as LayerNorm from megatron.model.utils import openai_gelu, erf_gelu from megatron.model.utils import get_linear_layer from megatron.model.utils import init_method_normal diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index 7ca6348011..f011770ee7 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -24,7 +24,7 @@ from megatron import mpu from .module import MegatronModule from megatron.model.enums import AttnMaskType, ModelType, LayerType, AttnType, PositionEmbeddingType -from megatron.model import LayerNorm +from .fused_layer_norm import MixedFusedLayerNorm as LayerNorm from megatron.model.fused_softmax import FusedScaleMaskSoftmax from megatron.model.fused_bias_gelu import bias_gelu_impl from megatron.model.utils import attention_mask_func, openai_gelu, erf_gelu diff --git a/megatron/model/vision/esvit_swin_backbone.py b/megatron/model/vision/esvit_swin_backbone.py index 70aee3db42..221ccf331f 100644 --- a/megatron/model/vision/esvit_swin_backbone.py +++ b/megatron/model/vision/esvit_swin_backbone.py @@ -17,7 +17,7 @@ from torch.nn.init import trunc_normal_ from megatron.model.transformer import DropPath from megatron import get_args -from megatron.model import LayerNorm +from megatron.model.fused_layer_norm import MixedFusedLayerNorm as LayerNorm import numpy as np from math import sqrt diff --git a/megatron/model/vision/mit_backbone.py b/megatron/model/vision/mit_backbone.py index c67ca2c62b..c68f10e764 100644 --- a/megatron/model/vision/mit_backbone.py +++ b/megatron/model/vision/mit_backbone.py @@ -12,7 +12,7 @@ from functools import partial from torch.nn.init import trunc_normal_ from megatron.model.transformer import DropPath -from megatron.model import LayerNorm +from megatron.model.fused_layer_norm import MixedFusedLayerNorm as LayerNorm class Mlp(nn.Module): diff --git a/tasks/vision/segmentation/seg_heads.py b/tasks/vision/segmentation/seg_heads.py index c87c3027af..349a440a2b 100644 --- a/tasks/vision/segmentation/seg_heads.py +++ b/tasks/vision/segmentation/seg_heads.py @@ -18,7 +18,7 @@ import apex import torch.nn.functional as F from megatron import get_args -from megatron.model import LayerNorm +from megatron.model.fused_layer_norm import MixedFusedLayerNorm as LayerNorm from megatron.model.module import MegatronModule from megatron.model.vision.utils import resize From d3ce0187f7a7379909330fb0e6ca150680663abe Mon Sep 17 00:00:00 2001 From: denisko Date: Tue, 26 Jul 2022 13:48:31 +0100 Subject: [PATCH 017/144] chg: enums moved to megatron root --- megatron/arguments.py | 2 +- megatron/checkpointing.py | 2 +- megatron/{model => }/enums.py | 0 megatron/fused_kernels/tests/test_fused_kernels.py | 2 +- megatron/model/bert_model.py | 2 +- megatron/model/biencoder_model.py | 2 +- megatron/model/classification.py | 2 +- megatron/model/fused_softmax.py | 2 +- megatron/model/language_model.py | 2 +- megatron/model/multiple_choice.py | 2 
+- megatron/model/realm_model.py | 2 +- megatron/model/t5_model.py | 2 +- megatron/model/transformer.py | 2 +- 13 files changed, 12 insertions(+), 12 deletions(-) rename megatron/{model => }/enums.py (100%) diff --git a/megatron/arguments.py b/megatron/arguments.py index 4e1e1f86e9..f2f0200f50 100644 --- a/megatron/arguments.py +++ b/megatron/arguments.py @@ -20,7 +20,7 @@ import torch -from megatron.model.enums import PositionEmbeddingType +from megatron.enums import PositionEmbeddingType def parse_args(extra_args_provider=None, defaults={}, ignore_unknown_args=False): diff --git a/megatron/checkpointing.py b/megatron/checkpointing.py index 0079a97cfb..906bf17b97 100644 --- a/megatron/checkpointing.py +++ b/megatron/checkpointing.py @@ -27,7 +27,7 @@ print_rank_0, update_num_microbatches, utils) -from megatron.model.enums import PositionEmbeddingType +from megatron.enums import PositionEmbeddingType _CHECKPOINT_VERSION = None diff --git a/megatron/model/enums.py b/megatron/enums.py similarity index 100% rename from megatron/model/enums.py rename to megatron/enums.py diff --git a/megatron/fused_kernels/tests/test_fused_kernels.py b/megatron/fused_kernels/tests/test_fused_kernels.py index f8d5027a1f..9052043d14 100644 --- a/megatron/fused_kernels/tests/test_fused_kernels.py +++ b/megatron/fused_kernels/tests/test_fused_kernels.py @@ -3,7 +3,7 @@ import torch from torch.nn import LayerNorm -from megatron.model.enums import AttnMaskType +from megatron.enums import AttnMaskType from megatron.model.fused_layer_norm import MixedFusedLayerNorm from megatron.model.fused_softmax import FusedScaleMaskSoftmax from megatron.model.utils import attention_mask_func diff --git a/megatron/model/bert_model.py b/megatron/model/bert_model.py index 158fc84ef0..d61da23a95 100644 --- a/megatron/model/bert_model.py +++ b/megatron/model/bert_model.py @@ -19,7 +19,7 @@ from megatron import get_args from megatron import mpu -from megatron.model.enums import AttnMaskType +from megatron.enums import AttnMaskType from megatron.model.language_model import parallel_lm_logits from megatron.model.language_model import get_language_model from .fused_layer_norm import MixedFusedLayerNorm as LayerNorm diff --git a/megatron/model/biencoder_model.py b/megatron/model/biencoder_model.py index 752c5752e9..d89f6dbee0 100644 --- a/megatron/model/biencoder_model.py +++ b/megatron/model/biencoder_model.py @@ -8,7 +8,7 @@ from megatron.checkpointing import get_checkpoint_name from megatron import mpu, get_tokenizer from megatron.model.bert_model import bert_position_ids -from megatron.model.enums import AttnMaskType +from megatron.enums import AttnMaskType from megatron.model.language_model import get_language_model from megatron.model.utils import get_linear_layer from megatron.model.utils import init_method_normal diff --git a/megatron/model/classification.py b/megatron/model/classification.py index d975072f77..94dc5fe7d8 100644 --- a/megatron/model/classification.py +++ b/megatron/model/classification.py @@ -19,7 +19,7 @@ from megatron import get_args, print_rank_last from megatron import mpu -from megatron.model.enums import AttnMaskType +from megatron.enums import AttnMaskType from megatron.model.bert_model import bert_extended_attention_mask, bert_position_ids from megatron.model.language_model import get_language_model from megatron.model.utils import get_linear_layer diff --git a/megatron/model/fused_softmax.py b/megatron/model/fused_softmax.py index 2409edd59f..096c8b2b4b 100644 --- a/megatron/model/fused_softmax.py +++ 
b/megatron/model/fused_softmax.py @@ -16,7 +16,7 @@ import torch import torch.nn as nn -from megatron.model.enums import AttnMaskType +from megatron.enums import AttnMaskType class ScaledUpperTriangMaskedSoftmax(torch.autograd.Function): diff --git a/megatron/model/language_model.py b/megatron/model/language_model.py index 6cec08cf0d..ca5613cd5d 100644 --- a/megatron/model/language_model.py +++ b/megatron/model/language_model.py @@ -21,7 +21,7 @@ from megatron import get_args from megatron import mpu from .module import MegatronModule -from megatron.model.enums import LayerType, AttnMaskType +from megatron.enums import LayerType, AttnMaskType from megatron.model.transformer import ParallelTransformer from megatron.model.utils import get_linear_layer from megatron.model.utils import init_method_normal, scaled_init_method_normal diff --git a/megatron/model/multiple_choice.py b/megatron/model/multiple_choice.py index c43bd969c0..a8445e0cb7 100644 --- a/megatron/model/multiple_choice.py +++ b/megatron/model/multiple_choice.py @@ -19,7 +19,7 @@ from megatron import get_args, print_rank_last from megatron import mpu -from megatron.model.enums import AttnMaskType +from megatron.enums import AttnMaskType from megatron.model.bert_model import bert_extended_attention_mask, bert_position_ids from megatron.model.language_model import get_language_model from megatron.model.utils import get_linear_layer diff --git a/megatron/model/realm_model.py b/megatron/model/realm_model.py index 5730a85e36..c57f51592a 100644 --- a/megatron/model/realm_model.py +++ b/megatron/model/realm_model.py @@ -6,7 +6,7 @@ from megatron.model import BertModel from .module import MegatronModule from megatron import mpu -from megatron.model.enums import AttnMaskType +from megatron.enums import AttnMaskType from megatron.model.utils import get_linear_layer from megatron.model.utils import init_method_normal from megatron.model.language_model import get_language_model diff --git a/megatron/model/t5_model.py b/megatron/model/t5_model.py index 3ed032c697..f6cefe8979 100644 --- a/megatron/model/t5_model.py +++ b/megatron/model/t5_model.py @@ -21,7 +21,7 @@ get_args, mpu ) -from megatron.model.enums import AttnMaskType +from megatron.enums import AttnMaskType from megatron.model.language_model import parallel_lm_logits, get_language_model from megatron.model.transformer import LayerNorm from megatron.model.utils import ( diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index f011770ee7..62664272a6 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -23,7 +23,7 @@ from megatron import get_timers, get_args, get_global_memory_buffer from megatron import mpu from .module import MegatronModule -from megatron.model.enums import AttnMaskType, ModelType, LayerType, AttnType, PositionEmbeddingType +from megatron.enums import AttnMaskType, ModelType, LayerType, AttnType, PositionEmbeddingType from .fused_layer_norm import MixedFusedLayerNorm as LayerNorm from megatron.model.fused_softmax import FusedScaleMaskSoftmax from megatron.model.fused_bias_gelu import bias_gelu_impl From 73c7130e08c052cb8034b99c3d7aa5187e2a6383 Mon Sep 17 00:00:00 2001 From: Denis Kocetkov Date: Tue, 26 Jul 2022 09:45:14 -0400 Subject: [PATCH 018/144] fix: commented logging funcionality which is not implemented yet --- megatron/model/glu_activations.py | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/megatron/model/glu_activations.py b/megatron/model/glu_activations.py index 
c479d96834..4fa821d3f1 100644 --- a/megatron/model/glu_activations.py +++ b/megatron/model/glu_activations.py @@ -2,10 +2,14 @@ from torch import nn from torch.nn import functional as F -from megatron import logging -from megatron.model.utils import log_debug_usage -logger = logging.get_logger(__name__) +# NOTE: logging funcionality commented for now as +# it is not implemented in this version so far + +#from megatron import logging +#from megatron.model.utils import log_debug_usage + +#logger = logging.get_logger(__name__) class _GLUBaseModule(nn.Module): def __init__(self, activation_fn): @@ -38,10 +42,15 @@ def __init__(self): super().__init__(F.silu) -liglu = log_debug_usage(logger, "Using GLU activation: LiGLU.")(torch.jit.script(LiGLU())) -geglu = log_debug_usage(logger, "Using GLU activation: GELU.")(torch.jit.script(GEGLU())) -reglu = log_debug_usage(logger, "Using GLU activation: ReGLU.")(torch.jit.script(ReGLU())) -swiglu = log_debug_usage(logger, "Using GLU activation: SwiGLU.")(torch.jit.script(SwiGLU())) +#liglu = log_debug_usage(logger, "Using GLU activation: LiGLU.")(torch.jit.script(LiGLU())) +#geglu = log_debug_usage(logger, "Using GLU activation: GELU.")(torch.jit.script(GEGLU())) +#reglu = log_debug_usage(logger, "Using GLU activation: ReGLU.")(torch.jit.script(ReGLU())) +#swiglu = log_debug_usage(logger, "Using GLU activation: SwiGLU.")(torch.jit.script(SwiGLU())) + +liglu = torch.jit.script(LiGLU()) +geglu = torch.jit.script(GEGLU()) +reglu = torch.jit.script(ReGLU()) +swiglu = torch.jit.script(SwiGLU()) GLU_ACTIVATIONS = { From b09a8d11e3b97cf111f844bf7a94a5e01153f260 Mon Sep 17 00:00:00 2001 From: denisko Date: Tue, 26 Jul 2022 14:53:29 +0100 Subject: [PATCH 019/144] chg: refactor for moved enums --- megatron/model/__init__.py | 2 +- megatron/model/gpt_model.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/megatron/model/__init__.py b/megatron/model/__init__.py index 331ba78183..18a4e8956d 100644 --- a/megatron/model/__init__.py +++ b/megatron/model/__init__.py @@ -20,4 +20,4 @@ from .t5_model import T5Model from .language_model import get_language_model from .module import Float16Module -from .enums import ModelType +from megatron.enums import ModelType diff --git a/megatron/model/gpt_model.py b/megatron/model/gpt_model.py index af6b5bf12e..5abdf1ecfe 100644 --- a/megatron/model/gpt_model.py +++ b/megatron/model/gpt_model.py @@ -21,7 +21,7 @@ from megatron import mpu from .module import MegatronModule -from .enums import AttnMaskType +from megatron.enums import AttnMaskType from .language_model import parallel_lm_logits from .language_model import get_language_model from .utils import init_method_normal From 4e07321686b0bf8fa7026388ebce05deafcc5c4b Mon Sep 17 00:00:00 2001 From: denisko Date: Wed, 27 Jul 2022 11:20:11 +0100 Subject: [PATCH 020/144] add: port support for positional embedding param from bigscience --- megatron/model/language_model.py | 65 +++++++++++++++++++------------- 1 file changed, 38 insertions(+), 27 deletions(-) diff --git a/megatron/model/language_model.py b/megatron/model/language_model.py index ca5613cd5d..759975e779 100644 --- a/megatron/model/language_model.py +++ b/megatron/model/language_model.py @@ -21,7 +21,7 @@ from megatron import get_args from megatron import mpu from .module import MegatronModule -from megatron.enums import LayerType, AttnMaskType +from megatron.enums import LayerType, AttnMaskType, PositionEmbeddingType from megatron.model.transformer import ParallelTransformer from megatron.model.utils 
import get_linear_layer from megatron.model.utils import init_method_normal, scaled_init_method_normal @@ -132,8 +132,6 @@ class Embedding(MegatronModule): Arguments: hidden_size: hidden size vocab_size: vocabulary size - max_sequence_length: maximum size of sequence. This - is used for positional embedding embedding_dropout_prob: dropout probability for embeddings init_method: weight initialization method num_tokentypes: size of the token-type embeddings. 0 value @@ -143,7 +141,6 @@ class Embedding(MegatronModule): def __init__(self, hidden_size, vocab_size, - max_sequence_length, embedding_dropout_prob, init_method, num_tokentypes=0): @@ -162,11 +159,17 @@ def __init__(self, self._word_embeddings_key = 'word_embeddings' # Position embedding (serial). - self.position_embeddings = torch.nn.Embedding( - max_sequence_length, self.hidden_size) - self._position_embeddings_key = 'position_embeddings' - # Initialize the position embeddings. - self.init_method(self.position_embeddings.weight) + self.position_embedding_type = args.position_embedding_type + if self.position_embedding_type == PositionEmbeddingType.absolute: + max_position_embeddings = args.max_position_embeddings + assert max_position_embeddings is not None + self.position_embeddings = torch.nn.Embedding( + max_position_embeddings, self.hidden_size) + self._position_embeddings_key = 'position_embeddings' + # Initialize the position embeddings. + self.init_method(self.position_embeddings.weight) + else: + self.position_embeddings = None # Token type embedding. # Add this as an optional field that can be added through @@ -190,8 +193,9 @@ def zero_parameters(self): """Zero out all parameters in embedding.""" self.word_embeddings.weight.data.fill_(0) self.word_embeddings.weight.shared = True - self.position_embeddings.weight.data.fill_(0) - self.position_embeddings.weight.shared = True + if self.position_embeddings is not None: + self.position_embeddings.weight.data.fill_(0) + self.position_embeddings.weight.shared = True if self.num_tokentypes > 0: self.tokentype_embeddings.weight.data.fill_(0) self.tokentype_embeddings.weight.shared = True @@ -216,8 +220,14 @@ def add_tokentype_embeddings(self, num_tokentypes): def forward(self, input_ids, position_ids, tokentype_ids=None): # Embeddings. 
words_embeddings = self.word_embeddings(input_ids) - position_embeddings = self.position_embeddings(position_ids) - embeddings = words_embeddings + position_embeddings + embeddings = words_embeddings + + if self.position_embedding_type == PositionEmbeddingType.absolute: + assert self.position_embeddings is not None + embeddings = embeddings + self.position_embeddings(position_ids) + else: + assert self.position_embeddings is None + if tokentype_ids is not None: assert self.tokentype_embeddings is not None embeddings = embeddings + self.tokentype_embeddings(tokentype_ids) @@ -248,9 +258,10 @@ def state_dict_for_save_checkpoint(self, destination=None, prefix='', state_dict_ = {} state_dict_[self._word_embeddings_key] \ = self.word_embeddings.state_dict(destination, prefix, keep_vars) - state_dict_[self._position_embeddings_key] \ - = self.position_embeddings.state_dict( - destination, prefix, keep_vars) + if self.position_embedding_type == PositionEmbeddingType.absolute: + state_dict_[self._position_embeddings_key] \ + = self.position_embeddings.state_dict( + destination, prefix, keep_vars) if self.num_tokentypes > 0: state_dict_[self._tokentype_embeddings_key] \ = self.tokentype_embeddings.state_dict( @@ -274,16 +285,17 @@ def load_state_dict(self, state_dict, strict=True): self.word_embeddings.load_state_dict(state_dict_, strict=strict) # Position embedding. - if self._position_embeddings_key in state_dict: - state_dict_ = state_dict[self._position_embeddings_key] - else: - # for backward compatibility. - state_dict_ = {} - for key in state_dict.keys(): - if 'position_embeddings' in key: - state_dict_[key.split('position_embeddings.')[1]] \ - = state_dict[key] - self.position_embeddings.load_state_dict(state_dict_, strict=strict) + if self.position_embedding_type == PositionEmbeddingType.absolute: + if self._position_embeddings_key in state_dict: + state_dict_ = state_dict[self._position_embeddings_key] + else: + # for backward compatibility. + state_dict_ = {} + for key in state_dict.keys(): + if 'position_embeddings' in key: + state_dict_[key.split('position_embeddings.')[1]] \ + = state_dict[key] + self.position_embeddings.load_state_dict(state_dict_, strict=strict) # Tokentype embedding. if self.num_tokentypes > 0: @@ -347,7 +359,6 @@ def __init__(self, if self.pre_process: self.embedding = Embedding(self.hidden_size, args.padded_vocab_size, - args.max_position_embeddings, args.hidden_dropout, self.init_method, self.num_tokentypes) From 4ccf2377718ef76666acc042064aa9504ca33651 Mon Sep 17 00:00:00 2001 From: denisko Date: Wed, 27 Jul 2022 11:55:04 +0100 Subject: [PATCH 021/144] add: port from bigscience for pos embeding and glu activations args --- megatron/arguments.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/megatron/arguments.py b/megatron/arguments.py index f2f0200f50..7ff8dae173 100644 --- a/megatron/arguments.py +++ b/megatron/arguments.py @@ -21,6 +21,7 @@ import torch from megatron.enums import PositionEmbeddingType +import megatron def parse_args(extra_args_provider=None, defaults={}, ignore_unknown_args=False): @@ -284,6 +285,10 @@ def parse_args(extra_args_provider=None, defaults={}, assert args.fp16 or args.bf16, \ 'residual connection in fp32 only supported when using fp16 or bf16.' 
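This patch also registers the new flags on the command line; --position-embedding-type is parsed directly into the enum, which keeps the downstream comparisons (as in the Embedding changes above) free of string matching. A self-contained sketch of that argparse pattern, with a local copy of the enum for illustration:

    import argparse
    import enum

    class PositionEmbeddingType(enum.Enum):   # local stand-in for megatron.enums
        rotary = 1
        absolute = 2
        alibi = 3

    parser = argparse.ArgumentParser()
    parser.add_argument('--position-embedding-type',
                        type=lambda x: PositionEmbeddingType[x],
                        choices=list(PositionEmbeddingType),
                        default=PositionEmbeddingType.absolute)
    args = parser.parse_args(['--position-embedding-type', 'alibi'])
    print(args.position_embedding_type)       # PositionEmbeddingType.alibi
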
+ # Activation function + if args.glu_activation is not None and args.bias_gelu_fusion: + raise ValueError("if glu-activation is used, please set --no-bias-gelu-fusion") + if args.weight_decay_incr_style == 'constant': assert args.start_weight_decay is None assert args.end_weight_decay is None @@ -418,6 +423,15 @@ def _add_network_size_args(parser): group.add_argument('--bert-no-binary-head', action='store_false', help='Disable BERT binary head.', dest='bert_binary_head') + group.add_argument('--position-embedding-type', type=lambda x: PositionEmbeddingType[x], + choices=list(PositionEmbeddingType), + default=PositionEmbeddingType.absolute, + help='Define position embedding type ("absolute" | "rotary" | "alibi"). "absolute" by default.' + ) + group.add_argument('--glu-activation', type=str, + choices=megatron.model.glu_activations.GLU_ACTIVATIONS.keys(), + help='GLU activations to use.' + ) group.add_argument('--num-experts', type=int, default=None, help='Number of Experts in Switch Transformer (None means no Switch)') return parser From 40649698d143f3b40b070581b31d2db0ac3d70d0 Mon Sep 17 00:00:00 2001 From: Raymond Li Date: Mon, 8 Aug 2022 18:26:32 -0400 Subject: [PATCH 022/144] add multi-query attention logic in attention module --- megatron/arguments.py | 4 ++ megatron/model/transformer.py | 87 +++++++++++++++++++++++++++++------ 2 files changed, 76 insertions(+), 15 deletions(-) diff --git a/megatron/arguments.py b/megatron/arguments.py index 326139855f..66663aa4c7 100644 --- a/megatron/arguments.py +++ b/megatron/arguments.py @@ -383,6 +383,10 @@ def _add_network_size_args(parser): 'attention. This is set to ' ' args.hidden_size // args.num_attention_heads ' 'if not provided.') + group.add_argument('--attention-head-type', type=str, default='multihead', + choices=['multihead', 'multiquery'], + help='Type of attention heads. `multihead` is the standard multi-head attention.' + '`multiquery` shares the values and keys across attention heads') group.add_argument('--max-position-embeddings', type=int, default=None, help='Maximum number of position embeddings to use. ' 'This is the size of position embedding.') diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index b9c1b79289..adb0b84cbe 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -26,7 +26,7 @@ from megatron.model import LayerNorm from megatron.model.fused_softmax import FusedScaleMaskSoftmax from megatron.model.fused_bias_gelu import bias_gelu_impl -from megatron.model.utils import attention_mask_func, openai_gelu, erf_gelu +from megatron.model.utils import attention_mask_func, openai_gelu, erf_gelu, get_linear_layer """ We use the following notation throughout this file: @@ -214,11 +214,12 @@ def __init__(self, layer_number, self.attention_dropout = torch.nn.Dropout(args.attention_dropout) def forward(self, query_layer, key_layer, - value_layer, attention_mask): + value_layer, attention_mask, expand_key_value=False): # =================================== # Raw attention scores. [b, np, s, s] # =================================== + np = query_layer.size(2) # [b, np, sq, sk] output_size = (query_layer.size(1), @@ -229,9 +230,15 @@ def forward(self, query_layer, key_layer, # [sq, b, np, hn] -> [sq, b * np, hn] query_layer = query_layer.view(output_size[2], output_size[0] * output_size[1], -1) + # [sk, b, 1, hn] -> [sk, b * np, hn] + # TODO: Check that we indeed get the speedup at inference. Isn't the reshape memory allocation a bottleneck? 
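Stepping back to the --glu-activation flag added in PATCH 021 above: the GLU_ACTIVATIONS registry it validates against lives in megatron/model/glu_activations.py (ported from the BigScience fork) and is not part of these hunks. A minimal sketch of the pattern, assuming the usual gated-linear-unit variants; the class name and the exact gating order are illustrative.

import torch
import torch.nn.functional as F

class _GLUVariant(torch.nn.Module):
    """Split the last dimension in half and gate one half with the other."""
    def __init__(self, activation):
        super().__init__()
        self.activation = activation

    def forward(self, x):
        value, gate = torch.chunk(x, 2, dim=-1)
        return value * self.activation(gate)

GLU_ACTIVATIONS = {
    "geglu": _GLUVariant(F.gelu),
    "liglu": _GLUVariant(torch.nn.Identity()),
    "reglu": _GLUVariant(F.relu),
    "swiglu": _GLUVariant(F.silu),
}

The new consistency check above forces --no-bias-gelu-fusion whenever a GLU activation is selected, presumably because the fused bias-GELU kernel assumes the plain GELU MLP path.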
+ if expand_key_value: + key_layer = key_layer.expand(output_size[3], output_size[0], np, -1) + key_layer = key_layer.reshape(output_size[3], output_size[0] * np, -1) # [sk, b, np, hn] -> [sk, b * np, hn] - key_layer = key_layer.view(output_size[3], - output_size[0] * output_size[1], -1) + else: + key_layer = key_layer.view(output_size[3], + output_size[0] * output_size[1], -1) # preallocting input tensor: [b * np, sq, sk] matmul_input_buffer = get_global_memory_buffer().get_tensor( @@ -274,13 +281,18 @@ def forward(self, query_layer, key_layer, # context layer shape: [b, np, sq, hn] output_size = (value_layer.size(1), - value_layer.size(2), + np, query_layer.size(0), value_layer.size(3)) - # change view [sk, b * np, hn] - value_layer = value_layer.view(value_layer.size(0), - output_size[0] * output_size[1], -1) + # [sk, b, 1, hn] -> [sk, b * np, hn] + if expand_key_value: + value_layer = value_layer.expand(value_layer.size(0), value_layer.size(1), np, -1) + value_layer = value_layer.reshape(value_layer.size(0), value_layer.size(1) * np, -1) + else: + # change view [sk, b * np, hn] + value_layer = value_layer.view(value_layer.size(0), + output_size[0] * output_size[1], -1) # change view [b * np, sq, sk] attention_probs = attention_probs.view(output_size[0] * output_size[1], @@ -320,6 +332,7 @@ def __init__(self, init_method, self.attention_type = attention_type self.attn_mask_type = attn_mask_type self.params_dtype = args.params_dtype + self.attention_head_type = args.attention_head_type projection_size = args.kv_channels * args.num_attention_heads @@ -331,12 +344,28 @@ def __init__(self, init_method, args.num_attention_heads, world_size) # Strided linear layer. - if attention_type == AttnType.self_attn: + if attention_type == AttnType.self_attn and self.attention_head_type == 'multihead': self.query_key_value = mpu.ColumnParallelLinear( args.hidden_size, 3 * projection_size, gather_output=False, init_method=init_method) + elif attention_type == AttnType.self_attn and self.attention_head_type == 'multiquery': + self.query = mpu.ColumnParallelLinear( + args.hidden_size, + projection_size, + gather_output=False, + init_method=init_method) + # In MultiQuery attention, keys and values are shared across heads + # Use args.kv_channels instead of projection_size + # No `.fork()` so the rng tracker is shared across tensor-parallel processes. + # with mpu.get_cuda_rng_tracker(): + self.key_value = get_linear_layer( + args.hidden_size, + 2 * args.kv_channels, + init_method=init_method) + print(f"KV WEIGHT {layer_number}", self.key_value.weight) + # TODO: add elif block for cross_attn and multiquery? 
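The key_value projection defined above is a plain, non-parallel linear layer that produces a single head's worth of keys and values on every tensor-parallel rank, while the query projection stays column-parallel and sharded over heads. A back-of-the-envelope helper for comparing the input-projection weight counts of the two --attention-head-type settings; the function name and the example config are made up, not part of the patch.

def attention_projection_params(hidden_size, num_attention_heads, kv_channels=None):
    """Weight-only parameter counts for the attention input projections of one layer."""
    kv_channels = kv_channels or hidden_size // num_attention_heads
    multihead = hidden_size * 3 * num_attention_heads * kv_channels       # fused QKV
    multiquery = (hidden_size * num_attention_heads * kv_channels         # Q, sharded over heads
                  + hidden_size * 2 * kv_channels)                        # shared K and V
    return {"multihead": multihead, "multiquery": multiquery}

print(attention_projection_params(6144, 48))
# {'multihead': 113246208, 'multiquery': 39321600}   (roughly a 3x saving on these projections)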
else: assert attention_type == AttnType.cross_attn self.query = mpu.ColumnParallelLinear( @@ -364,7 +393,7 @@ def __init__(self, init_method, skip_bias_add=True) def _checkpointed_attention_forward(self, query_layer, key_layer, - value_layer, attention_mask): + value_layer, attention_mask, expand_key_value): """Forward method with activation checkpointing.""" def custom_forward(*inputs): query_layer = inputs[0] @@ -372,7 +401,7 @@ def custom_forward(*inputs): value_layer = inputs[2] attention_mask = inputs[3] output_ = self.core_attention(query_layer, key_layer, - value_layer, attention_mask) + value_layer, attention_mask, expand_key_value) return output_ hidden_states = mpu.checkpoint( @@ -385,11 +414,12 @@ def _allocate_memory(self, inference_max_sequence_len, batch_size): return torch.empty( inference_max_sequence_len, batch_size, - self.num_attention_heads_per_partition, + self.num_attention_heads_per_partition if self.attention_head_type == "multihead" else 1, self.hidden_size_per_attention_head, dtype=self.params_dtype, device=torch.cuda.current_device()) + def forward(self, hidden_states, attention_mask, encoder_output=None, inference_params=None): # hidden_states: [sq, b, h] @@ -415,7 +445,7 @@ def forward(self, hidden_states, attention_mask, # Query, Key, and Value # ===================== - if self.attention_type == AttnType.self_attn: + if self.attention_type == AttnType.self_attn and self.attention_head_type == 'multihead': # Attention heads [sq, b, h] --> [sq, b, (np * 3 * hn)] mixed_x_layer, _ = self.query_key_value(hidden_states) @@ -429,6 +459,33 @@ def forward(self, hidden_states, attention_mask, (query_layer, key_layer, value_layer) = mpu.split_tensor_along_last_dim(mixed_x_layer, 3) + elif self.attention_type == AttnType.self_attn and self.attention_head_type == 'multiquery': + # Attention heads [sq, b, h] --> [sq, b, (2 * hn)] + mixed_kv_layer = self.key_value(hidden_states) + + # [sq, b, (2 * hn)] --> [sq, b, np (expanded), 2 * hn] + # new_tensor_shape = mixed_kv_layer.size()[:-1] + \ + # (self.num_attention_heads_per_partition, + # 2 * self.hidden_size_per_attention_head) + # mixed_kv_layer = mixed_kv_layer.unsqueeze(2).expand(*new_tensor_shape) + + # [sq, b, (2 * hn)] --> [sq, b, 1, 2 * hn] + new_tensor_shape = mixed_kv_layer.size()[:-1] + \ + (1, + 2 * self.hidden_size_per_attention_head) + mixed_kv_layer = mixed_kv_layer.view(*new_tensor_shape) + + # [sq, b, np, 2 * hn] --> 2 [sq, b, np, hn] + (key_layer, + value_layer) = mpu.split_tensor_along_last_dim(mixed_kv_layer, 2) + + # Attention head [sq, b, h] --> [sq, b, np * hn] + query_layer, _ = self.query(hidden_states) + # [sq, b, np * hn] --> [sq, b, np, hn] + new_tensor_shape = query_layer.size()[:-1] + \ + (self.num_attention_heads_per_partition, + self.hidden_size_per_attention_head) + query_layer = query_layer.view(*new_tensor_shape) else: # Attention heads [sk, b, h] --> [sk, b, (np * 2 * hn)] mixed_kv_layer, _ = self.key_value(encoder_output) @@ -478,10 +535,10 @@ def forward(self, hidden_states, attention_mask, if self.checkpoint_core_attention: context_layer = self._checkpointed_attention_forward( - query_layer, key_layer, value_layer, attention_mask) + query_layer, key_layer, value_layer, attention_mask, expand_key_value=True) else: context_layer = self.core_attention( - query_layer, key_layer, value_layer, attention_mask) + query_layer, key_layer, value_layer, attention_mask, expand_key_value=True) # ================= # Output. 
[sq, b, h] From 190e328617b629c10404822a261835e6e99f7f16 Mon Sep 17 00:00:00 2001 From: Raymond Li Date: Tue, 9 Aug 2022 11:56:00 -0400 Subject: [PATCH 023/144] add kv weight gradient reduction in tensor-parallel group --- megatron/mpu/layers.py | 3 ++- megatron/optimizer/optimizer.py | 20 ++++++++++++++++++++ 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/megatron/mpu/layers.py b/megatron/mpu/layers.py index 3ee9db274b..ac78c3adbe 100644 --- a/megatron/mpu/layers.py +++ b/megatron/mpu/layers.py @@ -264,7 +264,8 @@ def backward(ctx, grad_output): handle.wait() # Convert the tensor shapes to 2D for execution compatibility - grad_output = grad_output.view(grad_output.shape[0] * grad_output.shape[1], + # TODO: Is the reshape preventing us from getting a speedup here? + grad_output = grad_output.reshape(grad_output.shape[0] * grad_output.shape[1], grad_output.shape[2]) total_input = total_input.view(total_input.shape[0] * total_input.shape[1], total_input.shape[2]) diff --git a/megatron/optimizer/optimizer.py b/megatron/optimizer/optimizer.py index b265145a3d..6e83e6592d 100644 --- a/megatron/optimizer/optimizer.py +++ b/megatron/optimizer/optimizer.py @@ -265,6 +265,19 @@ def allreduce_embedding_grads(self, args): """All-reduce both word and position embeddings.""" self.allreduce_word_embedding_grads(args) self.allreduce_position_embedding_grads(args) + + def allreduce_key_value_grads(self, args): + # TODO: models[0] ? + unwrapped_model = self.models[0] + unwrapped_model = unwrap_model( + unwrapped_model, (torchDDP, LocalDDP, Float16Module)) + for layer in unwrapped_model.language_model.encoder.layers: + kv_weight = layer.self_attention.key_value.weight + if args.DDP_impl == 'local': + grad = kv_weight.main_grad + else: + grad = kv_weight.grad + torch.distributed.all_reduce(grad, group=mpu.get_tensor_model_parallel_group()) def allreduce_layernorm_grads(self, args): @@ -310,6 +323,13 @@ def reduce_model_grads(self, args, timers): self.allreduce_embedding_grads(args) timers('backward-embedding-all-reduce').stop() + # All-reduce key-value grads if needed. + if args.attention_head_type == "multiquery": + timers('backward-key-value-all-reduce').start() + self.allreduce_key_value_grads(args) + timers('backward-key-value-all-reduce').stop() + + class MixedPrecisionOptimizer(MegatronOptimizer): """Base class for both the float-16 and the distributed optimizer. From 6fd0c29dfe204bfa52f5c37602de954dae4e521b Mon Sep 17 00:00:00 2001 From: Raymond Li Date: Wed, 10 Aug 2022 12:46:56 -0400 Subject: [PATCH 024/144] more efficient multiquery attention --- megatron/model/transformer.py | 168 ++++++++++++++++++++++++++++++++-- 1 file changed, 161 insertions(+), 7 deletions(-) diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index adb0b84cbe..aa776b2232 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -215,7 +215,7 @@ def __init__(self, layer_number, def forward(self, query_layer, key_layer, value_layer, attention_mask, expand_key_value=False): - + timers = get_timers() # =================================== # Raw attention scores. [b, np, s, s] # =================================== @@ -230,27 +230,30 @@ def forward(self, query_layer, key_layer, # [sq, b, np, hn] -> [sq, b * np, hn] query_layer = query_layer.view(output_size[2], output_size[0] * output_size[1], -1) - # [sk, b, 1, hn] -> [sk, b * np, hn] - # TODO: Check that we indeed get the speedup at inference. Isn't the reshape memory allocation a bottleneck? 
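On the PATCH 023 hunks above: since the shared key_value weight is replicated on every tensor-parallel rank while the query heads are sharded, each rank only accumulates the gradient contribution of its local heads, so the gradients must be summed across the tensor-parallel group before the optimizer step, mirroring what reduce_model_grads already does for embeddings and layernorm parameters. A condensed sketch of that reduction; the function name and the replicated_params/tp_group arguments are placeholders, not Megatron APIs.

import torch

def allreduce_replicated_grads(replicated_params, tp_group, local_ddp=True):
    """Sum gradients of tensor-parallel-replicated weights so the replicas stay identical."""
    for param in replicated_params:
        grad = param.main_grad if local_ddp else param.grad
        torch.distributed.all_reduce(grad, group=tp_group)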
+ timers("CoreAttention: K view/reshape").start() if expand_key_value: + # [sk, b, 1, hn] -> [sk, b * np, hn] key_layer = key_layer.expand(output_size[3], output_size[0], np, -1) key_layer = key_layer.reshape(output_size[3], output_size[0] * np, -1) - # [sk, b, np, hn] -> [sk, b * np, hn] else: + # [sk, b, np, hn] -> [sk, b * np, hn] key_layer = key_layer.view(output_size[3], output_size[0] * output_size[1], -1) + timers("CoreAttention: K view/reshape").stop() # preallocting input tensor: [b * np, sq, sk] matmul_input_buffer = get_global_memory_buffer().get_tensor( (output_size[0]*output_size[1], output_size[2], output_size[3]), query_layer.dtype, "mpu") + timers("CoreAttention: QK matmul").start() # Raw attention scores. [b * np, sq, sk] matmul_result = torch.baddbmm( matmul_input_buffer, query_layer.transpose(0, 1), # [b * np, sq, hn] key_layer.transpose(0, 1).transpose(1, 2), # [b * np, hn, sk] beta=0.0, alpha=(1.0/self.norm_factor)) + timers("CoreAttention: QK matmul").stop() # change view to [b, np, sq, sk] attention_scores = matmul_result.view(*output_size) @@ -259,6 +262,7 @@ def forward(self, query_layer, key_layer, # Attention probs and dropout # =========================== + timers("CoreAttention: Softmax, dropout").start() # attention scores and attention mask [b, np, sq, sk] attention_probs = self.scale_mask_softmax(attention_scores, attention_mask) @@ -271,6 +275,7 @@ def forward(self, query_layer, key_layer, attention_probs = self.attention_dropout(attention_probs) else: attention_probs = self.attention_dropout(attention_probs) + timers("CoreAttention: Softmax, dropout").stop() # ========================= # Context layer. [sq, b, hp] @@ -285,6 +290,7 @@ def forward(self, query_layer, key_layer, query_layer.size(0), value_layer.size(3)) + timers("CoreAttention: V view/reshape").start() # [sk, b, 1, hn] -> [sk, b * np, hn] if expand_key_value: value_layer = value_layer.expand(value_layer.size(0), value_layer.size(1), np, -1) @@ -293,19 +299,137 @@ def forward(self, query_layer, key_layer, # change view [sk, b * np, hn] value_layer = value_layer.view(value_layer.size(0), output_size[0] * output_size[1], -1) + timers("CoreAttention: V view/reshape").stop() # change view [b * np, sq, sk] attention_probs = attention_probs.view(output_size[0] * output_size[1], output_size[2], -1) + timers("CoreAttention: V matmul").start() # matmul: [b * np, sq, hn] context_layer = torch.bmm(attention_probs, value_layer.transpose(0, 1)) + timers("CoreAttention: V matmul").stop() # change view [b, np, sq, hn] context_layer = context_layer.view(*output_size) + timers("CoreAttention: context contiguous").start() + # [b, np, sq, hn] --> [sq, b, np, hn] + context_layer = context_layer.permute(2, 0, 1, 3).contiguous() + timers("CoreAttention: context contiguous").stop() + + # [sq, b, np, hn] --> [sq, b, hp] + new_context_layer_shape = context_layer.size()[:-2] + \ + (self.hidden_size_per_partition,) + context_layer = context_layer.view(*new_context_layer_shape) + + return context_layer + + +class MultiQueryCoreAttention(CoreAttention): + + def __init__(self, *args, **kwargs) -> None: + super().__init__(*args, **kwargs) + + def forward(self, query_layer, key_layer, value_layer, attention_mask, expand_key_value=False): + timers = get_timers() + # =================================== + # Raw attention scores. 
[b, np, s, s] + # =================================== + sq = query_layer.size(0) + bs = query_layer.size(1) + np = query_layer.size(2) + + sk = key_layer.size(0) + # Only one head for key and values + assert key_layer.size(2) == 1 and value_layer.size(2) == 1 + + # [b, np, sq, sk] + output_size = (query_layer.size(1), + query_layer.size(2), + query_layer.size(0), + key_layer.size(0)) + + + timers("CoreAttention: K view/reshape").start() + # [sq, b, np, hn] -> [b, np * sq, hn] + query_layer = query_layer.permute([1, 2, 0, 3]).reshape(bs, np * sq, -1) + # [sk, b, 1, hn] -> [b, hn, sk] + key_layer = key_layer.squeeze(2).permute(1, 2, 0) + # [sk, b, 1, hn] -> [sk, b * np, hn] + # key_layer = key_layer.expand(output_size[3], output_size[0], np, -1) + # key_layer = key_layer.reshape(output_size[3], output_size[0] * np, -1) + + # preallocting input tensor: [b, np * sq, sk] + matmul_input_buffer = get_global_memory_buffer().get_tensor( + (bs, np * sq, sk), + query_layer.dtype, "mpu") + timers("CoreAttention: K view/reshape").stop() + + timers("CoreAttention: QK matmul").start() + # Raw attention scores. [b, np * sq, sk] + matmul_result = torch.baddbmm( + matmul_input_buffer, + query_layer, # [b, np * sq, hn] + key_layer, # [b, hn, sk] + beta=0.0, alpha=(1.0/self.norm_factor)) + timers("CoreAttention: QK matmul").stop() + + # change view to [b, np, sq, sk] + attention_scores = matmul_result.view(bs, np, sq, sk) + + # =========================== + # Attention probs and dropout + # =========================== + + timers("CoreAttention: Softmax, dropout").start() + # attention scores and attention mask [b, np, sq, sk] + attention_probs = self.scale_mask_softmax(attention_scores, + attention_mask) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + + if not self.sequence_parallel: + with mpu.get_cuda_rng_tracker().fork(): + attention_probs = self.attention_dropout(attention_probs) + else: + attention_probs = self.attention_dropout(attention_probs) + timers("CoreAttention: Softmax, dropout").stop() + + # ========================= + # Context layer. [sq, b, hp] + # ========================= + + # value_layer -> context layer. 
+ # [sk, b, np, hn] --> [b, np, sq, hn] + + # context layer shape: [b, np, sq, hn] + output_size = (value_layer.size(1), + np, + query_layer.size(0), + value_layer.size(3)) + + timers("CoreAttention: V view/reshape").start() + # [sk, b, 1, hn] -> [b, sk, hn] + value_layer = value_layer.squeeze(2).transpose(0, 1) + timers("CoreAttention: V view/reshape").stop() + + # change view [b, np * sq, sk] + attention_probs = attention_probs.view(bs, np * sq, -1) + + timers("CoreAttention: V matmul").start() + # matmul: [b, np * sq, hn] + context_layer = torch.bmm(attention_probs, value_layer) + timers("CoreAttention: V matmul").stop() + + # change view [b, np, sq, hn] + context_layer = context_layer.view(bs, np, sq, -1) + + timers("CoreAttention: context contiguous").start() # [b, np, sq, hn] --> [sq, b, np, hn] context_layer = context_layer.permute(2, 0, 1, 3).contiguous() + timers("CoreAttention: context contiguous").stop() # [sq, b, np, hn] --> [sq, b, hp] new_context_layer_shape = context_layer.size()[:-2] + \ @@ -380,8 +504,11 @@ def __init__(self, init_method, gather_output=False, init_method=init_method) - self.core_attention = CoreAttention(self.layer_number, - self.attn_mask_type) + if self.attention_head_type == 'multihead': + self.core_attention = CoreAttention(self.layer_number, + self.attn_mask_type) + else: + self.core_attention = MultiQueryCoreAttention(self.layer_number, self.attn_mask_type) self.checkpoint_core_attention = args.recompute_granularity == 'selective' # Output. @@ -423,10 +550,11 @@ def _allocate_memory(self, inference_max_sequence_len, batch_size): def forward(self, hidden_states, attention_mask, encoder_output=None, inference_params=None): # hidden_states: [sq, b, h] - + timers = get_timers() # ================================================= # Pre-allocate memory for key-values for inference. # ================================================= + timers("inference_params init").start() if inference_params: if self.layer_number not in inference_params.key_value_memory_dict: inf_max_seq_len = inference_params.max_sequence_len @@ -440,11 +568,13 @@ def forward(self, hidden_states, attention_mask, else: inference_key_memory, inference_value_memory = \ inference_params.key_value_memory_dict[self.layer_number] + timers("inference_params init").stop() # ===================== # Query, Key, and Value # ===================== + timers("KV forward").start() if self.attention_type == AttnType.self_attn and self.attention_head_type == 'multihead': # Attention heads [sq, b, h] --> [sq, b, (np * 3 * hn)] mixed_x_layer, _ = self.query_key_value(hidden_states) @@ -486,6 +616,8 @@ def forward(self, hidden_states, attention_mask, (self.num_attention_heads_per_partition, self.hidden_size_per_attention_head) query_layer = query_layer.view(*new_tensor_shape) + + # [sq, b, np, hn] -> [b, np * sq, hn] else: # Attention heads [sk, b, h] --> [sk, b, (np * 2 * hn)] mixed_kv_layer, _ = self.key_value(encoder_output) @@ -508,10 +640,15 @@ def forward(self, hidden_states, attention_mask, self.hidden_size_per_attention_head) query_layer = query_layer.view(*new_tensor_shape) + timers("KV forward").stop() + # ================================== # Adjust key and value for inference # ================================== + + timers("Inference params").start() + if inference_params: batch_start = inference_params.batch_size_offset batch_end = batch_start + key_layer.size(1) @@ -528,23 +665,30 @@ def forward(self, hidden_states, attention_mask, :sequence_end, batch_start:batch_end, ...] 
value_layer = inference_value_memory[ :sequence_end, batch_start:batch_end, ...] + + timers("Inference params").stop() # ================================== # core attention computation # ================================== + timers("Core attention forward").start() + if self.checkpoint_core_attention: context_layer = self._checkpointed_attention_forward( query_layer, key_layer, value_layer, attention_mask, expand_key_value=True) else: context_layer = self.core_attention( query_layer, key_layer, value_layer, attention_mask, expand_key_value=True) + timers("Core attention forward").stop() # ================= # Output. [sq, b, h] # ================= + timers("dense").start() output, bias = self.dense(context_layer) + timers("dense").stop() return output, bias @@ -655,16 +799,19 @@ def __init__(self, init_method, output_layer_init_method, def forward(self, hidden_states, attention_mask, encoder_output=None, enc_dec_attn_mask=None, inference_params=None): + timers = get_timers() # hidden_states: [s, b, h] # Layer norm at the beginning of the transformer layer. layernorm_output = self.input_layernorm(hidden_states) # Self attention. + timers("attention forward").start() attention_output, attention_bias = \ self.self_attention( layernorm_output, attention_mask, inference_params=inference_params) + timers("attention forward").stop() # Residual connection. if self.apply_residual_connection_post_layernorm: @@ -722,7 +869,9 @@ def forward(self, hidden_states, attention_mask, layernorm_output = self.post_inter_attention_layernorm(layernorm_input) # MLP. + timers("MLP forward").start() mlp_output, mlp_bias = self.mlp(layernorm_output) + timers("MLP forward").stop() # Second residual connection. if self.apply_residual_connection_post_layernorm: @@ -946,6 +1095,9 @@ def forward(self, hidden_states, attention_mask, encoder_output=None, enc_dec_attn_mask=None, inference_params=None): # hidden_states: [s, b, h] + timers = get_timers() + + timers("Transformer forward").start() # Checks. if inference_params: @@ -1003,4 +1155,6 @@ def forward(self, hidden_states, attention_mask, if self.post_process and self.post_layer_norm: hidden_states = self.final_layernorm(hidden_states) + timers("Transformer forward").stop() + return hidden_states From 254ff4b826a89d0b22610dea20471d7a0d92fd8c Mon Sep 17 00:00:00 2001 From: Raymond Li Date: Fri, 2 Sep 2022 11:04:41 -0400 Subject: [PATCH 025/144] raise if trying to uyse multi-query cross-atteention --- megatron/model/transformer.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index aa776b2232..07e83e3a93 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -475,6 +475,7 @@ def __init__(self, init_method, gather_output=False, init_method=init_method) elif attention_type == AttnType.self_attn and self.attention_head_type == 'multiquery': + # TODO: Find a way to merge the query and key-value computations? self.query = mpu.ColumnParallelLinear( args.hidden_size, projection_size, @@ -488,9 +489,7 @@ def __init__(self, init_method, args.hidden_size, 2 * args.kv_channels, init_method=init_method) - print(f"KV WEIGHT {layer_number}", self.key_value.weight) - # TODO: add elif block for cross_attn and multiquery? 
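The shape gymnastics in MultiQueryCoreAttention above amount to folding the head dimension into the query's sequence dimension so that the single shared key/value head is consumed directly, instead of being expanded num_heads times as in the earlier expand_key_value path. A standalone sketch of the score computation with the same tensor layouts; the helper name is illustrative, and the scaling tricks and preallocated buffer are omitted.

import torch

def multiquery_attention_scores(query, key, norm_factor):
    # query: [sq, b, np, hn]   key: [sk, b, 1, hn]   ->   scores: [b, np, sq, sk]
    sq, b, num_heads, hn = query.shape
    sk = key.size(0)
    q = query.permute(1, 2, 0, 3).reshape(b, num_heads * sq, hn)   # [b, np*sq, hn]
    k = key.squeeze(2).permute(1, 2, 0)                            # [b, hn, sk]
    scores = torch.bmm(q, k) / norm_factor                         # one bmm per batch element
    return scores.view(b, num_heads, sq, sk)

The value-side contraction is the mirror image: the probabilities viewed as [b, np*sq, sk] are multiplied by the shared value head viewed as [b, sk, hn].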
- else: + elif attention_type == AttnType.cross_attn and self.attention_head_type == 'multihead': assert attention_type == AttnType.cross_attn self.query = mpu.ColumnParallelLinear( args.hidden_size, @@ -503,6 +502,8 @@ def __init__(self, init_method, 2 * projection_size, gather_output=False, init_method=init_method) + else: + raise NotImplementedError("Multiquery attention not implemented for cross-attention.") if self.attention_head_type == 'multihead': self.core_attention = CoreAttention(self.layer_number, From eaf617466219fc9354ed61cfb0ca6a7d18cce4b9 Mon Sep 17 00:00:00 2001 From: Raymond Li Date: Fri, 2 Sep 2022 11:18:11 -0400 Subject: [PATCH 026/144] remove expand_key_value parameter since CoreAttention for multi-query is now in a separate class --- megatron/model/transformer.py | 34 ++++++++++++---------------------- 1 file changed, 12 insertions(+), 22 deletions(-) diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index 07e83e3a93..bf43d00bb4 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -214,7 +214,7 @@ def __init__(self, layer_number, self.attention_dropout = torch.nn.Dropout(args.attention_dropout) def forward(self, query_layer, key_layer, - value_layer, attention_mask, expand_key_value=False): + value_layer, attention_mask): timers = get_timers() # =================================== # Raw attention scores. [b, np, s, s] @@ -231,14 +231,9 @@ def forward(self, query_layer, key_layer, query_layer = query_layer.view(output_size[2], output_size[0] * output_size[1], -1) timers("CoreAttention: K view/reshape").start() - if expand_key_value: - # [sk, b, 1, hn] -> [sk, b * np, hn] - key_layer = key_layer.expand(output_size[3], output_size[0], np, -1) - key_layer = key_layer.reshape(output_size[3], output_size[0] * np, -1) - else: - # [sk, b, np, hn] -> [sk, b * np, hn] - key_layer = key_layer.view(output_size[3], - output_size[0] * output_size[1], -1) + # [sk, b, np, hn] -> [sk, b * np, hn] + key_layer = key_layer.view(output_size[3], + output_size[0] * output_size[1], -1) timers("CoreAttention: K view/reshape").stop() # preallocting input tensor: [b * np, sq, sk] @@ -291,14 +286,9 @@ def forward(self, query_layer, key_layer, value_layer.size(3)) timers("CoreAttention: V view/reshape").start() - # [sk, b, 1, hn] -> [sk, b * np, hn] - if expand_key_value: - value_layer = value_layer.expand(value_layer.size(0), value_layer.size(1), np, -1) - value_layer = value_layer.reshape(value_layer.size(0), value_layer.size(1) * np, -1) - else: - # change view [sk, b * np, hn] - value_layer = value_layer.view(value_layer.size(0), - output_size[0] * output_size[1], -1) + # change view [sk, b * np, hn] + value_layer = value_layer.view(value_layer.size(0), + output_size[0] * output_size[1], -1) timers("CoreAttention: V view/reshape").stop() # change view [b * np, sq, sk] @@ -331,7 +321,7 @@ class MultiQueryCoreAttention(CoreAttention): def __init__(self, *args, **kwargs) -> None: super().__init__(*args, **kwargs) - def forward(self, query_layer, key_layer, value_layer, attention_mask, expand_key_value=False): + def forward(self, query_layer, key_layer, value_layer, attention_mask): timers = get_timers() # =================================== # Raw attention scores. 
[b, np, s, s] @@ -521,7 +511,7 @@ def __init__(self, init_method, skip_bias_add=True) def _checkpointed_attention_forward(self, query_layer, key_layer, - value_layer, attention_mask, expand_key_value): + value_layer, attention_mask): """Forward method with activation checkpointing.""" def custom_forward(*inputs): query_layer = inputs[0] @@ -529,7 +519,7 @@ def custom_forward(*inputs): value_layer = inputs[2] attention_mask = inputs[3] output_ = self.core_attention(query_layer, key_layer, - value_layer, attention_mask, expand_key_value) + value_layer, attention_mask) return output_ hidden_states = mpu.checkpoint( @@ -677,10 +667,10 @@ def forward(self, hidden_states, attention_mask, if self.checkpoint_core_attention: context_layer = self._checkpointed_attention_forward( - query_layer, key_layer, value_layer, attention_mask, expand_key_value=True) + query_layer, key_layer, value_layer, attention_mask) else: context_layer = self.core_attention( - query_layer, key_layer, value_layer, attention_mask, expand_key_value=True) + query_layer, key_layer, value_layer, attention_mask) timers("Core attention forward").stop() # ================= From 15131378f67b8e27af3df6b20df62f1dbe78fdd0 Mon Sep 17 00:00:00 2001 From: Raymond Li Date: Fri, 2 Sep 2022 13:02:09 -0400 Subject: [PATCH 027/144] remove most timers --- megatron/arguments.py | 3 + megatron/model/transformer.py | 53 +--------- tools/text_generation_benchmark.py | 163 +++++++++++++++++++++++++++++ 3 files changed, 169 insertions(+), 50 deletions(-) create mode 100644 tools/text_generation_benchmark.py diff --git a/megatron/arguments.py b/megatron/arguments.py index 66663aa4c7..c5f4e42fee 100644 --- a/megatron/arguments.py +++ b/megatron/arguments.py @@ -454,6 +454,9 @@ def _add_logging_args(parser): help="Name of wandb entity for reporting") group.add_argument('--wandb-project-name', type=str, default=None, help="Name of wandb project") + group.add_argument('--transformer-timers', action='store_true', + help="If set, activate the timers within the transformer layers." + "Only for debugging, as this slows down the model.") return parser diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index bf43d00bb4..9bcfb56a2a 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -215,7 +215,6 @@ def __init__(self, layer_number, def forward(self, query_layer, key_layer, value_layer, attention_mask): - timers = get_timers() # =================================== # Raw attention scores. [b, np, s, s] # =================================== @@ -230,25 +229,21 @@ def forward(self, query_layer, key_layer, # [sq, b, np, hn] -> [sq, b * np, hn] query_layer = query_layer.view(output_size[2], output_size[0] * output_size[1], -1) - timers("CoreAttention: K view/reshape").start() # [sk, b, np, hn] -> [sk, b * np, hn] key_layer = key_layer.view(output_size[3], output_size[0] * output_size[1], -1) - timers("CoreAttention: K view/reshape").stop() # preallocting input tensor: [b * np, sq, sk] matmul_input_buffer = get_global_memory_buffer().get_tensor( (output_size[0]*output_size[1], output_size[2], output_size[3]), query_layer.dtype, "mpu") - timers("CoreAttention: QK matmul").start() # Raw attention scores. 
[b * np, sq, sk] matmul_result = torch.baddbmm( matmul_input_buffer, query_layer.transpose(0, 1), # [b * np, sq, hn] key_layer.transpose(0, 1).transpose(1, 2), # [b * np, hn, sk] beta=0.0, alpha=(1.0/self.norm_factor)) - timers("CoreAttention: QK matmul").stop() # change view to [b, np, sq, sk] attention_scores = matmul_result.view(*output_size) @@ -257,7 +252,6 @@ def forward(self, query_layer, key_layer, # Attention probs and dropout # =========================== - timers("CoreAttention: Softmax, dropout").start() # attention scores and attention mask [b, np, sq, sk] attention_probs = self.scale_mask_softmax(attention_scores, attention_mask) @@ -270,7 +264,6 @@ def forward(self, query_layer, key_layer, attention_probs = self.attention_dropout(attention_probs) else: attention_probs = self.attention_dropout(attention_probs) - timers("CoreAttention: Softmax, dropout").stop() # ========================= # Context layer. [sq, b, hp] @@ -285,28 +278,22 @@ def forward(self, query_layer, key_layer, query_layer.size(0), value_layer.size(3)) - timers("CoreAttention: V view/reshape").start() # change view [sk, b * np, hn] value_layer = value_layer.view(value_layer.size(0), output_size[0] * output_size[1], -1) - timers("CoreAttention: V view/reshape").stop() # change view [b * np, sq, sk] attention_probs = attention_probs.view(output_size[0] * output_size[1], output_size[2], -1) - timers("CoreAttention: V matmul").start() # matmul: [b * np, sq, hn] context_layer = torch.bmm(attention_probs, value_layer.transpose(0, 1)) - timers("CoreAttention: V matmul").stop() # change view [b, np, sq, hn] context_layer = context_layer.view(*output_size) - timers("CoreAttention: context contiguous").start() # [b, np, sq, hn] --> [sq, b, np, hn] context_layer = context_layer.permute(2, 0, 1, 3).contiguous() - timers("CoreAttention: context contiguous").stop() # [sq, b, np, hn] --> [sq, b, hp] new_context_layer_shape = context_layer.size()[:-2] + \ @@ -322,7 +309,6 @@ def __init__(self, *args, **kwargs) -> None: super().__init__(*args, **kwargs) def forward(self, query_layer, key_layer, value_layer, attention_mask): - timers = get_timers() # =================================== # Raw attention scores. [b, np, s, s] # =================================== @@ -340,8 +326,6 @@ def forward(self, query_layer, key_layer, value_layer, attention_mask): query_layer.size(0), key_layer.size(0)) - - timers("CoreAttention: K view/reshape").start() # [sq, b, np, hn] -> [b, np * sq, hn] query_layer = query_layer.permute([1, 2, 0, 3]).reshape(bs, np * sq, -1) # [sk, b, 1, hn] -> [b, hn, sk] @@ -354,16 +338,13 @@ def forward(self, query_layer, key_layer, value_layer, attention_mask): matmul_input_buffer = get_global_memory_buffer().get_tensor( (bs, np * sq, sk), query_layer.dtype, "mpu") - timers("CoreAttention: K view/reshape").stop() - timers("CoreAttention: QK matmul").start() # Raw attention scores. 
[b, np * sq, sk] matmul_result = torch.baddbmm( matmul_input_buffer, query_layer, # [b, np * sq, hn] key_layer, # [b, hn, sk] beta=0.0, alpha=(1.0/self.norm_factor)) - timers("CoreAttention: QK matmul").stop() # change view to [b, np, sq, sk] attention_scores = matmul_result.view(bs, np, sq, sk) @@ -372,7 +353,6 @@ def forward(self, query_layer, key_layer, value_layer, attention_mask): # Attention probs and dropout # =========================== - timers("CoreAttention: Softmax, dropout").start() # attention scores and attention mask [b, np, sq, sk] attention_probs = self.scale_mask_softmax(attention_scores, attention_mask) @@ -385,7 +365,6 @@ def forward(self, query_layer, key_layer, value_layer, attention_mask): attention_probs = self.attention_dropout(attention_probs) else: attention_probs = self.attention_dropout(attention_probs) - timers("CoreAttention: Softmax, dropout").stop() # ========================= # Context layer. [sq, b, hp] @@ -400,26 +379,20 @@ def forward(self, query_layer, key_layer, value_layer, attention_mask): query_layer.size(0), value_layer.size(3)) - timers("CoreAttention: V view/reshape").start() # [sk, b, 1, hn] -> [b, sk, hn] value_layer = value_layer.squeeze(2).transpose(0, 1) - timers("CoreAttention: V view/reshape").stop() # change view [b, np * sq, sk] attention_probs = attention_probs.view(bs, np * sq, -1) - timers("CoreAttention: V matmul").start() # matmul: [b, np * sq, hn] context_layer = torch.bmm(attention_probs, value_layer) - timers("CoreAttention: V matmul").stop() # change view [b, np, sq, hn] context_layer = context_layer.view(bs, np, sq, -1) - timers("CoreAttention: context contiguous").start() # [b, np, sq, hn] --> [sq, b, np, hn] context_layer = context_layer.permute(2, 0, 1, 3).contiguous() - timers("CoreAttention: context contiguous").stop() # [sq, b, np, hn] --> [sq, b, hp] new_context_layer_shape = context_layer.size()[:-2] + \ @@ -541,11 +514,9 @@ def _allocate_memory(self, inference_max_sequence_len, batch_size): def forward(self, hidden_states, attention_mask, encoder_output=None, inference_params=None): # hidden_states: [sq, b, h] - timers = get_timers() # ================================================= # Pre-allocate memory for key-values for inference. 
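This pre-allocated inference key/value memory is where multi-query attention pays off most: _allocate_memory (changed back in PATCH 022) stores a single head per layer instead of num_attention_heads_per_partition, shrinking the generation-time cache and the bandwidth needed to stream it. A rough sizing helper, assuming 2-byte (fp16/bf16) elements and ignoring tensor parallelism; the function name and the example config are hypothetical.

def inference_kv_cache_bytes(batch_size, max_seq_len, num_layers,
                             num_attention_heads, head_dim,
                             multiquery=False, bytes_per_element=2):
    """Approximate size of the preallocated key + value cache for generation."""
    heads_stored = 1 if multiquery else num_attention_heads
    return 2 * num_layers * max_seq_len * batch_size * heads_stored * head_dim * bytes_per_element

# 40 layers, 48 heads, head_dim 128, batch 256, 2048 tokens:
#   multihead ~515 GB    multiquery ~10.7 GB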
# ================================================= - timers("inference_params init").start() if inference_params: if self.layer_number not in inference_params.key_value_memory_dict: inf_max_seq_len = inference_params.max_sequence_len @@ -559,13 +530,11 @@ def forward(self, hidden_states, attention_mask, else: inference_key_memory, inference_value_memory = \ inference_params.key_value_memory_dict[self.layer_number] - timers("inference_params init").stop() # ===================== # Query, Key, and Value # ===================== - timers("KV forward").start() if self.attention_type == AttnType.self_attn and self.attention_head_type == 'multihead': # Attention heads [sq, b, h] --> [sq, b, (np * 3 * hn)] mixed_x_layer, _ = self.query_key_value(hidden_states) @@ -631,15 +600,11 @@ def forward(self, hidden_states, attention_mask, self.hidden_size_per_attention_head) query_layer = query_layer.view(*new_tensor_shape) - timers("KV forward").stop() - # ================================== # Adjust key and value for inference # ================================== - timers("Inference params").start() - if inference_params: batch_start = inference_params.batch_size_offset batch_end = batch_start + key_layer.size(1) @@ -656,30 +621,22 @@ def forward(self, hidden_states, attention_mask, :sequence_end, batch_start:batch_end, ...] value_layer = inference_value_memory[ :sequence_end, batch_start:batch_end, ...] - - timers("Inference params").stop() # ================================== # core attention computation # ================================== - timers("Core attention forward").start() - if self.checkpoint_core_attention: context_layer = self._checkpointed_attention_forward( query_layer, key_layer, value_layer, attention_mask) else: context_layer = self.core_attention( query_layer, key_layer, value_layer, attention_mask) - timers("Core attention forward").stop() # ================= # Output. [sq, b, h] # ================= - - timers("dense").start() output, bias = self.dense(context_layer) - timers("dense").stop() return output, bias @@ -790,19 +747,16 @@ def __init__(self, init_method, output_layer_init_method, def forward(self, hidden_states, attention_mask, encoder_output=None, enc_dec_attn_mask=None, inference_params=None): - timers = get_timers() # hidden_states: [s, b, h] # Layer norm at the beginning of the transformer layer. layernorm_output = self.input_layernorm(hidden_states) # Self attention. - timers("attention forward").start() attention_output, attention_bias = \ self.self_attention( layernorm_output, attention_mask, inference_params=inference_params) - timers("attention forward").stop() # Residual connection. if self.apply_residual_connection_post_layernorm: @@ -860,9 +814,7 @@ def forward(self, hidden_states, attention_mask, layernorm_output = self.post_inter_attention_layernorm(layernorm_input) # MLP. - timers("MLP forward").start() mlp_output, mlp_bias = self.mlp(layernorm_output) - timers("MLP forward").stop() # Second residual connection. if self.apply_residual_connection_post_layernorm: @@ -1087,8 +1039,9 @@ def forward(self, hidden_states, attention_mask, inference_params=None): # hidden_states: [s, b, h] timers = get_timers() + args = get_args() - timers("Transformer forward").start() + if args.transformer_timers: timers("Transformer forward").start() # Checks. 
if inference_params: @@ -1146,6 +1099,6 @@ def forward(self, hidden_states, attention_mask, if self.post_process and self.post_layer_norm: hidden_states = self.final_layernorm(hidden_states) - timers("Transformer forward").stop() + if args.transformer_timers: timers("Transformer forward").stop() return hidden_states diff --git a/tools/text_generation_benchmark.py b/tools/text_generation_benchmark.py new file mode 100644 index 0000000000..ee458f377c --- /dev/null +++ b/tools/text_generation_benchmark.py @@ -0,0 +1,163 @@ + +"""Sample Generate GPT""" +import os +import sys +import re +sys.path.append(os.path.abspath(os.path.join( + os.getcwd(), + "Megatron-LM", +))) +from megatron import get_args +from megatron import print_rank_0 +from megatron import get_timers +from megatron import mpu +from megatron.checkpointing import load_checkpoint +from megatron.initialize import initialize_megatron +from megatron.model import GPTModel +from megatron.training import get_model +from megatron.text_generation import generate_and_post_process +import torch +from human_eval.data import write_jsonl, read_problems +from tqdm import tqdm + + +GENERATE_NUM = 0 + +# End on unindented code +# EOF_STRINGS = ["\nclass", "\ndef", "\n#", "\n@", "\nprint", "\nif"] + + +BATCH_SIZE = 512 +TOKENS_TO_GENERATE = 128 +PROMPT_LENGTH = 128 +NUM_BATCHES = 8 + + +# NUM_SAMPLES_PER_TASK = 5 +# # Number of human-eval tasks +# NUM_TASKS = 200 + +def send_do_generate(): + choice = torch.cuda.LongTensor([GENERATE_NUM]) + torch.distributed.broadcast(choice, 0) + + +def model_provider(pre_process=True, post_process=True): + """Build the model.""" + + print_rank_0('building GPT model ...') + model = GPTModel(num_tokentypes=0, parallel_output=False, pre_process=pre_process, post_process=post_process) + + return model + +def get_batches(prompts, batch_size): + for start_idx in tqdm(range(0, len(prompts), batch_size)): + actual_batch_size = min(batch_size, len(prompts) - start_idx) + yield prompts[start_idx: start_idx + actual_batch_size] + + +def unbatch(d: dict): + return [dict(zip(d.keys(), t)) for t in zip(*d.values())] + + +# Use fixed-length prompts +def load_evaluation_data(args): + # HumanEval data + # problems = read_problems() + + # batches = get_batches( + # [ + # problems[task_id]["prompt"] + # for task_id in problems + # for _ in range(5) + # ], + # BATCH_SIZE + # ) + # return batches + + prompt = " ".join(["one"] * PROMPT_LENGTH) + prompts = [prompt] * (BATCH_SIZE * NUM_BATCHES) + + batches = get_batches(prompts, BATCH_SIZE) + return batches + + +if __name__ == "__main__": + # Initialize Megatron + initialize_megatron(extra_args_provider=None, + args_defaults={'tokenizer_type': 'GPT2BPETokenizer', + 'no_load_rng': True, + 'no_load_optim': True}) + + args = get_args() + timers = get_timers() + + if args.num_layers_per_virtual_pipeline_stage is not None: + print("Interleaved pipeline schedule is not yet supported for text generation.") + exit() + # Setup model and load checkpoint + model = get_model(model_provider, wrap_with_ddp=False) + + if args.load is not None: + iteration = load_checkpoint(model, None, None, iteration=None) + else: + iteration = None + + assert len(model) == 1 + model = model[0] + + def generate(prompts): + response, response_seg, response_logprobs, tokens = \ + generate_and_post_process( + model, + prompts=prompts, + tokens_to_generate=TOKENS_TO_GENERATE, + return_output_log_probs=True, + use_eod_token_for_early_termination=False) + + assert all([r.startswith(p) for r, p in zip(response, prompts)]) + 
result = { + "response": response, + "response_seg": response_seg, + "raw_completion": [r[len(p):] for r, p in zip(response, prompts)] + } + # The "completion" field contains the string that is actually going to be evaluated by the HumanEval script + # result["completion"] = [post_process_completion(c) for c in result["raw_completion"]] + # Return a list of dicts + return unbatch(result) + + # if mpu.is_pipeline_first_stage() and mpu.get_tensor_model_parallel_rank() == 0: + # server = MegatronServer(model) + # server.run("0.0.0.0") + + # while True: + # choice = torch.cuda.LongTensor(1) + # torch.distributed.broadcast(choice, 0) + # if choice[0].item() == 0: + # generate_and_post_process(model) + + + # Evaluation data iterator + batches = load_evaluation_data(args) + + timers('generate').start() + # Generate + samples = [ + generate_dict + for batch in batches + for generate_dict in generate(batch) + ] + timers('generate').stop() + + elapsed = timers.timers['generate'].elapsed(reset=False) + num_tokens = TOKENS_TO_GENERATE * NUM_BATCHES * BATCH_SIZE + print(f"{elapsed * 1000 / (num_tokens)} ms per token") + timers.log(['generate']) + if args.transformer_timers: + timers.log(["Transformer forward"]) + print("DONE") + + # Write results to file + # if mpu.is_pipeline_first_stage() and mpu.get_tensor_model_parallel_rank() == 0: + # write_jsonl(args.output_file.format(iteration), samples) + From b4d6017791b1efb31378a6904b1362f41bc5e357 Mon Sep 17 00:00:00 2001 From: Denis Kocetkov Date: Wed, 7 Sep 2022 07:00:24 -0400 Subject: [PATCH 028/144] chg: move enums back to model --- megatron/arguments.py | 2 +- megatron/checkpointing.py | 2 +- megatron/fused_kernels/tests/test_fused_kernels.py | 2 +- megatron/model/__init__.py | 2 +- megatron/model/bert_model.py | 2 +- megatron/model/biencoder_model.py | 2 +- megatron/model/classification.py | 2 +- megatron/{ => model}/enums.py | 0 megatron/model/fused_softmax.py | 2 +- megatron/model/gpt_model.py | 2 +- megatron/model/language_model.py | 2 +- megatron/model/multiple_choice.py | 2 +- megatron/model/realm_model.py | 2 +- megatron/model/t5_model.py | 2 +- megatron/model/transformer.py | 2 +- 15 files changed, 14 insertions(+), 14 deletions(-) rename megatron/{ => model}/enums.py (100%) diff --git a/megatron/arguments.py b/megatron/arguments.py index 7ff8dae173..4ab086b3fd 100644 --- a/megatron/arguments.py +++ b/megatron/arguments.py @@ -20,8 +20,8 @@ import torch -from megatron.enums import PositionEmbeddingType import megatron +from megatron.model.enums import PositionEmbeddingType def parse_args(extra_args_provider=None, defaults={}, ignore_unknown_args=False): diff --git a/megatron/checkpointing.py b/megatron/checkpointing.py index 906bf17b97..0079a97cfb 100644 --- a/megatron/checkpointing.py +++ b/megatron/checkpointing.py @@ -27,7 +27,7 @@ print_rank_0, update_num_microbatches, utils) -from megatron.enums import PositionEmbeddingType +from megatron.model.enums import PositionEmbeddingType _CHECKPOINT_VERSION = None diff --git a/megatron/fused_kernels/tests/test_fused_kernels.py b/megatron/fused_kernels/tests/test_fused_kernels.py index 9052043d14..f8d5027a1f 100644 --- a/megatron/fused_kernels/tests/test_fused_kernels.py +++ b/megatron/fused_kernels/tests/test_fused_kernels.py @@ -3,7 +3,7 @@ import torch from torch.nn import LayerNorm -from megatron.enums import AttnMaskType +from megatron.model.enums import AttnMaskType from megatron.model.fused_layer_norm import MixedFusedLayerNorm from megatron.model.fused_softmax import FusedScaleMaskSoftmax 
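A quick illustration of the unbatch helper used by the benchmark script above, which turns the dict-of-lists returned for a batch of prompts into one record per prompt; the field values here are made up.

batched = {
    "response": ["prompt A completion A", "prompt B completion B"],
    "raw_completion": ["completion A", "completion B"],
}
records = [dict(zip(batched.keys(), row)) for row in zip(*batched.values())]
# records == [{'response': 'prompt A completion A', 'raw_completion': 'completion A'},
#             {'response': 'prompt B completion B', 'raw_completion': 'completion B'}]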
from megatron.model.utils import attention_mask_func diff --git a/megatron/model/__init__.py b/megatron/model/__init__.py index 18a4e8956d..7bff1cbdd3 100644 --- a/megatron/model/__init__.py +++ b/megatron/model/__init__.py @@ -20,4 +20,4 @@ from .t5_model import T5Model from .language_model import get_language_model from .module import Float16Module -from megatron.enums import ModelType +from megatron.model.enums import ModelType diff --git a/megatron/model/bert_model.py b/megatron/model/bert_model.py index d61da23a95..158fc84ef0 100644 --- a/megatron/model/bert_model.py +++ b/megatron/model/bert_model.py @@ -19,7 +19,7 @@ from megatron import get_args from megatron import mpu -from megatron.enums import AttnMaskType +from megatron.model.enums import AttnMaskType from megatron.model.language_model import parallel_lm_logits from megatron.model.language_model import get_language_model from .fused_layer_norm import MixedFusedLayerNorm as LayerNorm diff --git a/megatron/model/biencoder_model.py b/megatron/model/biencoder_model.py index d89f6dbee0..752c5752e9 100644 --- a/megatron/model/biencoder_model.py +++ b/megatron/model/biencoder_model.py @@ -8,7 +8,7 @@ from megatron.checkpointing import get_checkpoint_name from megatron import mpu, get_tokenizer from megatron.model.bert_model import bert_position_ids -from megatron.enums import AttnMaskType +from megatron.model.enums import AttnMaskType from megatron.model.language_model import get_language_model from megatron.model.utils import get_linear_layer from megatron.model.utils import init_method_normal diff --git a/megatron/model/classification.py b/megatron/model/classification.py index 94dc5fe7d8..d975072f77 100644 --- a/megatron/model/classification.py +++ b/megatron/model/classification.py @@ -19,7 +19,7 @@ from megatron import get_args, print_rank_last from megatron import mpu -from megatron.enums import AttnMaskType +from megatron.model.enums import AttnMaskType from megatron.model.bert_model import bert_extended_attention_mask, bert_position_ids from megatron.model.language_model import get_language_model from megatron.model.utils import get_linear_layer diff --git a/megatron/enums.py b/megatron/model/enums.py similarity index 100% rename from megatron/enums.py rename to megatron/model/enums.py diff --git a/megatron/model/fused_softmax.py b/megatron/model/fused_softmax.py index 096c8b2b4b..2409edd59f 100644 --- a/megatron/model/fused_softmax.py +++ b/megatron/model/fused_softmax.py @@ -16,7 +16,7 @@ import torch import torch.nn as nn -from megatron.enums import AttnMaskType +from megatron.model.enums import AttnMaskType class ScaledUpperTriangMaskedSoftmax(torch.autograd.Function): diff --git a/megatron/model/gpt_model.py b/megatron/model/gpt_model.py index 5abdf1ecfe..b6a1d7b5e9 100644 --- a/megatron/model/gpt_model.py +++ b/megatron/model/gpt_model.py @@ -21,7 +21,7 @@ from megatron import mpu from .module import MegatronModule -from megatron.enums import AttnMaskType +from megatron.model.enums import AttnMaskType from .language_model import parallel_lm_logits from .language_model import get_language_model from .utils import init_method_normal diff --git a/megatron/model/language_model.py b/megatron/model/language_model.py index 759975e779..9a061526a3 100644 --- a/megatron/model/language_model.py +++ b/megatron/model/language_model.py @@ -21,7 +21,7 @@ from megatron import get_args from megatron import mpu from .module import MegatronModule -from megatron.enums import LayerType, AttnMaskType, PositionEmbeddingType +from 
megatron.model.enums import LayerType, AttnMaskType, PositionEmbeddingType from megatron.model.transformer import ParallelTransformer from megatron.model.utils import get_linear_layer from megatron.model.utils import init_method_normal, scaled_init_method_normal diff --git a/megatron/model/multiple_choice.py b/megatron/model/multiple_choice.py index a8445e0cb7..c43bd969c0 100644 --- a/megatron/model/multiple_choice.py +++ b/megatron/model/multiple_choice.py @@ -19,7 +19,7 @@ from megatron import get_args, print_rank_last from megatron import mpu -from megatron.enums import AttnMaskType +from megatron.model.enums import AttnMaskType from megatron.model.bert_model import bert_extended_attention_mask, bert_position_ids from megatron.model.language_model import get_language_model from megatron.model.utils import get_linear_layer diff --git a/megatron/model/realm_model.py b/megatron/model/realm_model.py index c57f51592a..5730a85e36 100644 --- a/megatron/model/realm_model.py +++ b/megatron/model/realm_model.py @@ -6,7 +6,7 @@ from megatron.model import BertModel from .module import MegatronModule from megatron import mpu -from megatron.enums import AttnMaskType +from megatron.model.enums import AttnMaskType from megatron.model.utils import get_linear_layer from megatron.model.utils import init_method_normal from megatron.model.language_model import get_language_model diff --git a/megatron/model/t5_model.py b/megatron/model/t5_model.py index f6cefe8979..3ed032c697 100644 --- a/megatron/model/t5_model.py +++ b/megatron/model/t5_model.py @@ -21,7 +21,7 @@ get_args, mpu ) -from megatron.enums import AttnMaskType +from megatron.model.enums import AttnMaskType from megatron.model.language_model import parallel_lm_logits, get_language_model from megatron.model.transformer import LayerNorm from megatron.model.utils import ( diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index 62664272a6..f011770ee7 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -23,7 +23,7 @@ from megatron import get_timers, get_args, get_global_memory_buffer from megatron import mpu from .module import MegatronModule -from megatron.enums import AttnMaskType, ModelType, LayerType, AttnType, PositionEmbeddingType +from megatron.model.enums import AttnMaskType, ModelType, LayerType, AttnType, PositionEmbeddingType from .fused_layer_norm import MixedFusedLayerNorm as LayerNorm from megatron.model.fused_softmax import FusedScaleMaskSoftmax from megatron.model.fused_bias_gelu import bias_gelu_impl From 691226a7194513dc8fb9764d1f538bc4fa4628a9 Mon Sep 17 00:00:00 2001 From: denisko Date: Wed, 7 Sep 2022 13:59:07 +0100 Subject: [PATCH 029/144] fix: breaking circular import --- megatron/global_vars.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/global_vars.py b/megatron/global_vars.py index 03b0ef0f44..eb7331b427 100644 --- a/megatron/global_vars.py +++ b/megatron/global_vars.py @@ -24,7 +24,6 @@ from megatron import dist_signal_handler from megatron.tokenizer import build_tokenizer -from .arguments import parse_args from .microbatches import build_num_microbatches_calculator _GLOBAL_ARGS = None @@ -116,6 +115,7 @@ def set_global_variables(extra_args_provider=None, args_defaults={}, def _parse_args(extra_args_provider=None, defaults={}, ignore_unknown_args=False): """Parse entire arguments.""" + from .arguments import parse_args global _GLOBAL_ARGS _ensure_var_is_not_initialized(_GLOBAL_ARGS, 'args') _GLOBAL_ARGS = 
parse_args(extra_args_provider=extra_args_provider, From d4ba492ff7ce21e59490d56e8838b5e7b6352eb8 Mon Sep 17 00:00:00 2001 From: Raymond Li Date: Wed, 7 Sep 2022 10:15:10 -0400 Subject: [PATCH 030/144] allow to load old checkpoints --- megatron/checkpointing.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/megatron/checkpointing.py b/megatron/checkpointing.py index 0079a97cfb..44a03fc733 100644 --- a/megatron/checkpointing.py +++ b/megatron/checkpointing.py @@ -61,7 +61,12 @@ def _compare(arg_name, old_arg_name=None): _compare('num_layers') _compare('hidden_size') _compare('num_attention_heads') - _compare('position_embedding_type') + try: + _compare('position_embedding_type') + except AttributeError as e: + print_rank_0(f" Warning, trying to load an old checkpoint: {e}") + assert args.position_embedding_type == PositionEmbeddingType.absolute, \ + f"Checkpoint uses PositionEmbeddingType.absolute, but input argument value was: {args.position_embedding_type}" # with alibi we can change `max_position_embeddings` if args.position_embedding_type != PositionEmbeddingType.alibi: _compare('max_position_embeddings') From 5045d6f191480c87027132b90cd451f1a954503f Mon Sep 17 00:00:00 2001 From: Raymond Li Date: Wed, 7 Sep 2022 10:22:15 -0400 Subject: [PATCH 031/144] resolve conflict --- megatron/model/transformer.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index 31af14f431..08e5fb2fb0 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -228,12 +228,8 @@ def __init__(self, layer_number, self.attention_dropout = torch.nn.Dropout(args.attention_dropout) def forward(self, query_layer, key_layer, -<<<<<<< HEAD - value_layer, attention_mask): -======= value_layer, attention_mask, alibi): ->>>>>>> load-iter # =================================== # Raw attention scores. [b, np, s, s] # =================================== From 21170585ab38bce077f1e28d31405d861d3e4cb5 Mon Sep 17 00:00:00 2001 From: Raymond Li Date: Wed, 7 Sep 2022 11:15:45 -0400 Subject: [PATCH 032/144] implement alibi in multiquery core-attention --- megatron/model/transformer.py | 47 ++++++++++++++++++++++++++--------- 1 file changed, 35 insertions(+), 12 deletions(-) diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index 08e5fb2fb0..57d6d9fde2 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -254,6 +254,7 @@ def forward(self, query_layer, key_layer, (output_size[0]*output_size[1], output_size[2], output_size[3]), query_layer.dtype, "mpu") else: + # alibi: (batch_size * num_attention_heads, 1, max_seq_len) matmul_input_buffer = alibi[:output_size[0]*output_size[1], :, :output_size[3]] # Raw attention scores. [b * np, sq, sk] @@ -342,7 +343,7 @@ class MultiQueryCoreAttention(CoreAttention): def __init__(self, *args, **kwargs) -> None: super().__init__(*args, **kwargs) - def forward(self, query_layer, key_layer, value_layer, attention_mask): + def forward(self, query_layer, key_layer, value_layer, attention_mask, alibi): # =================================== # Raw attention scores. 
[b, np, s, s] # =================================== @@ -368,17 +369,39 @@ def forward(self, query_layer, key_layer, value_layer, attention_mask): # key_layer = key_layer.expand(output_size[3], output_size[0], np, -1) # key_layer = key_layer.reshape(output_size[3], output_size[0] * np, -1) - # preallocting input tensor: [b, np * sq, sk] - matmul_input_buffer = get_global_memory_buffer().get_tensor( - (bs, np * sq, sk), - query_layer.dtype, "mpu") - - # Raw attention scores. [b, np * sq, sk] - matmul_result = torch.baddbmm( - matmul_input_buffer, - query_layer, # [b, np * sq, hn] - key_layer, # [b, hn, sk] - beta=0.0, alpha=(1.0/self.norm_factor)) + if alibi is None: + # preallocting input tensor: [b, np * sq, sk] + matmul_input_buffer = get_global_memory_buffer().get_tensor( + (bs, np * sq, sk), + query_layer.dtype, "mpu") + else: + # alibi: (batch_size * num_attention_heads, 1, max_seq_len) + # TODO: ideally, alibi would have the shape: (1, num_heads * sq, sk) + matmul_input_buffer = alibi[:bs * np, :, :sk].view(bs, np, sk) + matmul_input_buffer = matmul_input_buffer.repeat(1, sq, 1) # [b, np * sq, sk] + + if alibi is None: + # Raw attention scores. [b, np * sq, sk] + matmul_result = torch.baddbmm( + matmul_input_buffer, + query_layer, # [b, np * sq, hn] + key_layer, # [b, hn, sk] + beta=0.0, alpha=(1.0/self.norm_factor)) + else: + if not hasattr(self, "logged_alibi"): + print("Using Alibi.") + self.logged_alibi = True + + if self.apply_query_key_layer_scaling: + beta = 1.0 / self.layer_number + else: + beta = 1.0 + + matmul_result = torch.baddbmm( + matmul_input_buffer, + query_layer, + key_layer, + beta=beta, alpha=(1.0 / self.norm_factor)) # change view to [b, np, sq, sk] attention_scores = matmul_result.view(bs, np, sq, sk) From e8d47a97a8b68d55f9da84b54c89ddd9749782de Mon Sep 17 00:00:00 2001 From: Raymond Li Date: Mon, 17 Oct 2022 13:29:24 -0400 Subject: [PATCH 033/144] add necessary fixes for toolkit-infiniband-example --- megatron/fused_kernels/__init__.py | 8 ++------ megatron/initialize.py | 12 +++++++----- 2 files changed, 9 insertions(+), 11 deletions(-) diff --git a/megatron/fused_kernels/__init__.py b/megatron/fused_kernels/__init__.py index 6d063e6686..6a44db2282 100644 --- a/megatron/fused_kernels/__init__.py +++ b/megatron/fused_kernels/__init__.py @@ -19,12 +19,8 @@ from torch.utils import cpp_extension -# Setting this param to a list has a problem of generating different -# compilation commands (with diferent order of architectures) and -# leading to recompilation of fused kernels. 
Set it to empty string -# to avoid recompilation and assign arch flags explicity in -# extra_cuda_cflags below -os.environ["TORCH_CUDA_ARCH_LIST"] = "" +# Do not override TORCH_CUDA_ARCH_LIST to allow for pre-compilation in Dockerfile +# os.environ["TORCH_CUDA_ARCH_LIST"] = "" def load(args): diff --git a/megatron/initialize.py b/megatron/initialize.py index 1f656406b3..5104964e99 100644 --- a/megatron/initialize.py +++ b/megatron/initialize.py @@ -189,11 +189,13 @@ def _initialize_distributed(): else: args.local_rank = device torch.cuda.set_device(device) - # Call the init process - torch.distributed.init_process_group( - backend=args.distributed_backend, - world_size=args.world_size, rank=args.rank, - timeout=timedelta(minutes=10)) + # Include this torch.distributed.init_process_group() code in the `else` branch because + # we do not want to reinitialize if torch.distributed.is_initialized() returns True + # Call the init process + torch.distributed.init_process_group( + backend=args.distributed_backend, + world_size=args.world_size, rank=args.rank, + timeout=timedelta(minutes=10)) # Set the tensor model-parallel, pipeline model-parallel, and # data-parallel communicators. From a360666ba791c7622c068c962235aa4eb2100ba2 Mon Sep 17 00:00:00 2001 From: Raymond Li Date: Wed, 2 Nov 2022 17:33:47 -0400 Subject: [PATCH 034/144] add FIM code from EleutherAI, some comments and todo --- megatron/arguments.py | 5 ++ megatron/data/gpt_dataset.py | 107 ++++++++++++++++++++++++++++++++++- 2 files changed, 110 insertions(+), 2 deletions(-) diff --git a/megatron/arguments.py b/megatron/arguments.py index 842c7eb7f3..82612658b7 100644 --- a/megatron/arguments.py +++ b/megatron/arguments.py @@ -910,6 +910,11 @@ def _add_data_args(parser): 'end-of-document token.') group.add_argument('--eod-mask-loss', action='store_true', help='Mask loss for the end of document tokens.') + group.add_argument('--fim-rate', type=float, default=0., + help='Probability to convert a training sample into a "Fill-in-the-Middle" format. Must be between 0 and 1.') + group.add_argument('--fim-spm-rate', type=float, default=0.5, + help='Probability that the a FIM sample uses the SPM format over the PSM format. ' + 'At 1, exclusively train with SPM. 
At 0, exclusively train with PSM') return parser diff --git a/megatron/data/gpt_dataset.py b/megatron/data/gpt_dataset.py index fadc79fffd..604098f28a 100644 --- a/megatron/data/gpt_dataset.py +++ b/megatron/data/gpt_dataset.py @@ -21,7 +21,7 @@ import numpy as np import torch -from megatron import mpu, print_rank_0 +from megatron import mpu, print_rank_0, get_args, get_tokenizer from megatron.data.blendable_dataset import BlendableDataset from megatron.data.dataset_utils import get_datasets_weights_and_num_samples from megatron.data.dataset_utils import get_train_valid_test_split_ @@ -152,6 +152,10 @@ def __init__(self, name, data_prefix, documents, indexed_dataset, self.doc_idx, self.sample_idx, self.shuffle_idx = _build_index_mappings( self.name, data_prefix, documents, self.indexed_dataset.sizes, num_samples, seq_length, seed) + + self.args = get_args() + self.tokenizer = get_tokenizer() + self.np_rng = np.random.RandomState(seed=seed) # rng state for FIM def __len__(self): # -1 is due to data structure used to retieve the index: @@ -183,8 +187,44 @@ def __getitem__(self, idx): self.doc_idx[doc_index_l], length=offset_l + 1)) sample = np.concatenate(sample_list) + + # Code from: https://github.com/EleutherAI/gpt-neox/blob/FIM-clean/megatron/data/gpt2_dataset.py#L109 + # TODO(Hailey): can merge the code below this line with code above this line. + # TODO(Hailey), cont: above already iterates through loop, so just add the permuting in there? + sample = np.array(sample, dtype=np.int64) + # # print(sample, sample.shape) + # # do FIM here, if enabled + fim_rate = self.args.fim_rate + + if fim_rate != 0: + assert (fim_rate <= 1 and fim_rate >= 0), "FIM rate must be a probability 0 <= rate <= 1" + + eod = self.tokenizer.eod + + segment_breaks = np.argwhere(sample == eod) # split sample by document + + if segment_breaks.shape != (0, 1): # then there is an EOD token in this example + curr_start_position = 0 + for loc in np.nditer(segment_breaks): + # print(loc - curr_start_position, flush=True) + # permute {prefix, suffix, middle} or {suffix, prefix, middle} + # try: + if loc - curr_start_position > 10: # sometimes examples start with EOD or are too short. so avoid this case + sample[curr_start_position:loc], self.np_rng = \ + permute(sample[curr_start_position:loc], self.np_rng, self.args, self.tokenizer) + # except ValueError: + # # print(loc - curr_start_position, flush=True) + # pass + + curr_start_position = loc + 1 # jump over the EOD token + # TODO: Check that we are not skipping the last subsequence (after the last eod)? + else: + sample, self.np_rng = permute(sample, self.np_rng, self.args, self.tokenizer) + + # end FIM-specific code - return {'text': np.array(sample, dtype=np.int64)} + return {"text": sample} + # return {'text': np.array(sample, dtype=np.int64)} def _build_index_mappings(name, data_prefix, documents, sizes, @@ -429,3 +469,66 @@ def _build_shuffle_idx(num_samples, total_size, np_rng): np_rng.shuffle(shuffle_idx_last) return np.concatenate((shuffle_idx_first, shuffle_idx_last)) + + +# From https://github.com/EleutherAI/gpt-neox/blob/FIM-clean/megatron/data/gpt2_dataset.py#L339 +def permute(sample, np_rng, args, tokenizer): + """ + Take in a sample (np array w/ size (0,chunklength)) and perform a FIM transformation on it. + Maintain the same sample length (if transform creates a few extra tokens, drop them). + """ + fim_rate = args.fim_rate + + # hardcode these for now. TODO(Hailey): should add a way to access all mask tokens in a tokenizer easily. 
+ # TODO(Hailey): also, check to ensure there's not an off-by-one error here. + # TODO(Hailey): when testing models trained with this workaround, need to add special tokens w/ the correct indices to the tokenizer. + + # TODO: Add special tokens to tokenizer._GPT2BPETokenizer? Then we could create a new `GPT2BPETokenizerWithFIM` tokenizer type + suffix_tok_id, prefix_tok_id, middle_tok_id = 50277, 50278, 50279 + + if np_rng.binomial(1, fim_rate): # sample bernoulli dist + + contents = tokenizer.detokenize(sample) + + try: + boundaries = list(np_rng.randint(low=1, high=len(contents) - 1, size=2)) + boundaries.sort() + except ValueError as e: + print(len(contents), contents) + print(e) + raise e + + prefix = contents[:boundaries[0]] + middle = contents[boundaries[0]:boundaries[1]] + suffix = contents[boundaries[1]:] + + + suffix = np.array([suffix_tok_id, *tokenizer.tokenize(suffix)]) + prefix = np.array([prefix_tok_id, *tokenizer.tokenize(prefix)]) + middle = np.array([middle_tok_id, *tokenizer.tokenize(middle)]) + + # need to make same length as the input + new_length = suffix.shape[0] + prefix.shape[0] + middle.shape[0] + diff = new_length - sample.shape[0] + + # print(new_length, sample.shape, suffix.shape, diff) + if diff > 0: # too long + # TODO: How to prevent this from happening? + if suffix.shape[0] <= diff: # if there's no space to truncate the suffix: stop and report it. atm i should have stopped this from happening + return sample, np_rng + suffix = suffix[:suffix.shape[0] - diff] + elif diff < 0: # too short + # TODO: Does this really happen in practice? pad is not used by the GPT2 BPE tokenizer. + suffix = np.concatenate([suffix, np.full((-1 * diff), tokenizer.pad)]) + + new_sample = np.concatenate([ # TODO(Hailey): add a branch here + a param to select SPM or PSM mode + suffix, + prefix, + middle, + ]) + else: + # don't do FIM preproc + new_sample = sample + + + return new_sample, np_rng From 4390812723f694e9eccc05cb5702872baee3449b Mon Sep 17 00:00:00 2001 From: Raymond Li Date: Thu, 3 Nov 2022 15:38:43 -0400 Subject: [PATCH 035/144] add a tokenizer-type for FIM --- megatron/arguments.py | 3 ++- megatron/data/gpt_dataset.py | 8 ++------ megatron/tokenizer/tokenizer.py | 12 ++++++++++-- 3 files changed, 14 insertions(+), 9 deletions(-) diff --git a/megatron/arguments.py b/megatron/arguments.py index 82612658b7..25c0995ebb 100644 --- a/megatron/arguments.py +++ b/megatron/arguments.py @@ -898,7 +898,8 @@ def _add_data_args(parser): default=None, choices=['BertWordPieceLowerCase', 'BertWordPieceCase', - 'GPT2BPETokenizer'], + 'GPT2BPETokenizer', + 'GPT2BPETokenizerWithFIM'], help='What type of tokenizer to use.') group.add_argument('--data-impl', type=str, default='infer', choices=['lazy', 'cached', 'mmap', 'infer'], diff --git a/megatron/data/gpt_dataset.py b/megatron/data/gpt_dataset.py index 604098f28a..bb3f268b0c 100644 --- a/megatron/data/gpt_dataset.py +++ b/megatron/data/gpt_dataset.py @@ -26,6 +26,7 @@ from megatron.data.dataset_utils import get_datasets_weights_and_num_samples from megatron.data.dataset_utils import get_train_valid_test_split_ from megatron.data.indexed_dataset import make_dataset as make_indexed_dataset +from megatron.tokenizer.tokenizer import FIM_MIDDLE, FIM_PREFIX, FIM_SUFFIX def build_train_valid_test_datasets(data_prefix, data_impl, splits_string, @@ -479,12 +480,7 @@ def permute(sample, np_rng, args, tokenizer): """ fim_rate = args.fim_rate - # hardcode these for now. TODO(Hailey): should add a way to access all mask tokens in a tokenizer easily. 
- # TODO(Hailey): also, check to ensure there's not an off-by-one error here. - # TODO(Hailey): when testing models trained with this workaround, need to add special tokens w/ the correct indices to the tokenizer. - - # TODO: Add special tokens to tokenizer._GPT2BPETokenizer? Then we could create a new `GPT2BPETokenizerWithFIM` tokenizer type - suffix_tok_id, prefix_tok_id, middle_tok_id = 50277, 50278, 50279 + suffix_tok_id, prefix_tok_id, middle_tok_id = (tokenizer.tokenizer.special_tokens[tok] for tok in [FIM_SUFFIX, FIM_PREFIX, FIM_MIDDLE]) if np_rng.binomial(1, fim_rate): # sample bernoulli dist diff --git a/megatron/tokenizer/tokenizer.py b/megatron/tokenizer/tokenizer.py index 13085a81c9..1f3ba91d48 100644 --- a/megatron/tokenizer/tokenizer.py +++ b/megatron/tokenizer/tokenizer.py @@ -22,6 +22,10 @@ from .gpt2_tokenization import GPT2Tokenizer +FIM_PREFIX = "" +FIM_MIDDLE = "" +FIM_SUFFIX = "" + def build_tokenizer(args): """Initialize tokenizer.""" if args.rank == 0: @@ -41,6 +45,9 @@ def build_tokenizer(args): elif args.tokenizer_type == 'GPT2BPETokenizer': assert args.merge_file is not None tokenizer = _GPT2BPETokenizer(args.vocab_file, args.merge_file) + elif args.tokenizer_type == 'GPT2BPETokenizerWithFIM': + assert args.merge_file is not None + tokenizer = _GPT2BPETokenizer(args.vocab_file, args.merge_file, special_tokens=[FIM_PREFIX, FIM_MIDDLE, FIM_SUFFIX]) else: raise NotImplementedError('{} tokenizer is not ' 'implemented.'.format(args.tokenizer_type)) @@ -260,12 +267,13 @@ def additional_special_tokens(self, value): class _GPT2BPETokenizer(AbstractTokenizer): """Original GPT2 BPE tokenizer.""" - def __init__(self, vocab_file, merge_file): + def __init__(self, vocab_file, merge_file, special_tokens=None): name = 'GPT2 BPE' super().__init__(name) + special_tokens = special_tokens if special_tokens is not None else [] self.tokenizer = GPT2Tokenizer(vocab_file, merge_file, errors='replace', - special_tokens=[], max_len=None) + special_tokens=special_tokens, max_len=None) self.eod_id = self.tokenizer.encoder['<|endoftext|>'] @property From 5ab8702f5179cd7d7fa64e542cf201355314cc79 Mon Sep 17 00:00:00 2001 From: Raymond Li Date: Thu, 3 Nov 2022 17:03:09 -0400 Subject: [PATCH 036/144] add spm+psm variants --- megatron/data/gpt_dataset.py | 65 ++++++++++++++++++++++++++---------- 1 file changed, 48 insertions(+), 17 deletions(-) diff --git a/megatron/data/gpt_dataset.py b/megatron/data/gpt_dataset.py index bb3f268b0c..6e703aa0a9 100644 --- a/megatron/data/gpt_dataset.py +++ b/megatron/data/gpt_dataset.py @@ -498,30 +498,61 @@ def permute(sample, np_rng, args, tokenizer): middle = contents[boundaries[0]:boundaries[1]] suffix = contents[boundaries[1]:] - - suffix = np.array([suffix_tok_id, *tokenizer.tokenize(suffix)]) - prefix = np.array([prefix_tok_id, *tokenizer.tokenize(prefix)]) - middle = np.array([middle_tok_id, *tokenizer.tokenize(middle)]) - - # need to make same length as the input - new_length = suffix.shape[0] + prefix.shape[0] + middle.shape[0] + prefix = np.array([*tokenizer.tokenize(prefix)]) + middle = np.array([*tokenizer.tokenize(middle)]) + suffix = np.array([*tokenizer.tokenize(suffix)]) + + # TODO: here we truncate each given segment to fit the same length as it was before + # A consequence is that we never reach the end of a file? + # Should we rather truncate at the context-level? + # need to make same length as the input. 
Take the 3 sentinel tokens into account + new_length = suffix.shape[0] + prefix.shape[0] + middle.shape[0] + 3 diff = new_length - sample.shape[0] - - # print(new_length, sample.shape, suffix.shape, diff) if diff > 0: # too long # TODO: How to prevent this from happening? if suffix.shape[0] <= diff: # if there's no space to truncate the suffix: stop and report it. atm i should have stopped this from happening return sample, np_rng suffix = suffix[:suffix.shape[0] - diff] elif diff < 0: # too short - # TODO: Does this really happen in practice? pad is not used by the GPT2 BPE tokenizer. - suffix = np.concatenate([suffix, np.full((-1 * diff), tokenizer.pad)]) - - new_sample = np.concatenate([ # TODO(Hailey): add a branch here + a param to select SPM or PSM mode - suffix, - prefix, - middle, - ]) + raise ValueError("It is not clear how this can happen and how to handle it") + + if np_rng.binomial(1, args.fim_spm_rate): + # SPM (variant 2 from FIM paper) + new_sample = np.concatenate([ + [prefix_tok_id, suffix_tok_id], suffix, + [middle_tok_id], prefix, middle + ]) + else: + # PSM + new_sample = np.concatenate([ + [prefix_tok_id], prefix, + [suffix_tok_id], suffix, + [middle_tok_id], middle + ]) + + # suffix = np.array([suffix_tok_id, *tokenizer.tokenize(suffix)]) + # prefix = np.array([prefix_tok_id, *tokenizer.tokenize(prefix)]) + # middle = np.array([middle_tok_id, *tokenizer.tokenize(middle)]) + + # # need to make same length as the input + # new_length = suffix.shape[0] + prefix.shape[0] + middle.shape[0] + # diff = new_length - sample.shape[0] + + # # print(new_length, sample.shape, suffix.shape, diff) + # if diff > 0: # too long + # # TODO: How to prevent this from happening? + # if suffix.shape[0] <= diff: # if there's no space to truncate the suffix: stop and report it. atm i should have stopped this from happening + # return sample, np_rng + # suffix = suffix[:suffix.shape[0] - diff] + # elif diff < 0: # too short + # # TODO: Does this really happen in practice? pad is not used by the GPT2 BPE tokenizer. + # suffix = np.concatenate([suffix, np.full((-1 * diff), tokenizer.pad)]) + + # new_sample = np.concatenate([ # TODO(Hailey): add a branch here + a param to select SPM or PSM mode + # suffix, + # prefix, + # middle, + # ]) else: # don't do FIM preproc new_sample = sample From 1f85184dbd5d3c310c4cc5dfda7ec16dad02d134 Mon Sep 17 00:00:00 2001 From: Raymond Li Date: Fri, 4 Nov 2022 16:06:14 -0400 Subject: [PATCH 037/144] also permute the segment after last eod token, fix permute boundaries --- megatron/data/gpt_dataset.py | 43 ++++++++++++++++++++++++------------ 1 file changed, 29 insertions(+), 14 deletions(-) diff --git a/megatron/data/gpt_dataset.py b/megatron/data/gpt_dataset.py index 6e703aa0a9..fd7cfb649e 100644 --- a/megatron/data/gpt_dataset.py +++ b/megatron/data/gpt_dataset.py @@ -15,6 +15,7 @@ """GPT style dataset.""" +import itertools import os import time @@ -136,6 +137,8 @@ def get_indexed_dataset_(data_prefix, data_impl, skip_warmup): return indexed_dataset +SEGMENT_OK = 0 +SEGMENT_TOO_SHORT = 0 class GPTDataset(torch.utils.data.Dataset): @@ -195,7 +198,10 @@ def __getitem__(self, idx): sample = np.array(sample, dtype=np.int64) # # print(sample, sample.shape) # # do FIM here, if enabled + # TODO: Do we handle the following point from FIM paper? + # To transform data in the character space for context-level FIM, the tokenized documents have to be decoded back into strings before FIM augmentation. 
Depending on the vocabulary, some care has to be given to ensure decoding does not introduce any spurious characters into training. For example, utf-8 characters are encoded as multiple tokens with a BPE vocabulary; they can result in fragments from chunking and fail to decode. To prevent unforeseen errors midway through training, we encourage checking for these fragments at the beginning or end of a context and removing them. fim_rate = self.args.fim_rate + global SEGMENT_OK, SEGMENT_TOO_SHORT if fim_rate != 0: assert (fim_rate <= 1 and fim_rate >= 0), "FIM rate must be a probability 0 <= rate <= 1" @@ -206,24 +212,29 @@ def __getitem__(self, idx): if segment_breaks.shape != (0, 1): # then there is an EOD token in this example curr_start_position = 0 - for loc in np.nditer(segment_breaks): + # Also permute the segment after the last EOD + for loc in itertools.chain.from_iterable((np.nditer(segment_breaks), [len(sample)])): # print(loc - curr_start_position, flush=True) # permute {prefix, suffix, middle} or {suffix, prefix, middle} # try: if loc - curr_start_position > 10: # sometimes examples start with EOD or are too short. so avoid this case sample[curr_start_position:loc], self.np_rng = \ permute(sample[curr_start_position:loc], self.np_rng, self.args, self.tokenizer) + # SEGMENT_OK += 1 + # print(f"SEGMENT TOO SHORT fraction: {SEGMENT_TOO_SHORT / (SEGMENT_TOO_SHORT + SEGMENT_OK)}") + else: + SEGMENT_TOO_SHORT += 1 + # print(f"SEGMENT TOO SHORT fraction: {SEGMENT_TOO_SHORT / (SEGMENT_TOO_SHORT + SEGMENT_OK)}") # except ValueError: # # print(loc - curr_start_position, flush=True) # pass curr_start_position = loc + 1 # jump over the EOD token - # TODO: Check that we are not skipping the last subsequence (after the last eod)? else: sample, self.np_rng = permute(sample, self.np_rng, self.args, self.tokenizer) # end FIM-specific code - + print(sample, flush=True) return {"text": sample} # return {'text': np.array(sample, dtype=np.int64)} @@ -473,7 +484,7 @@ def _build_shuffle_idx(num_samples, total_size, np_rng): # From https://github.com/EleutherAI/gpt-neox/blob/FIM-clean/megatron/data/gpt2_dataset.py#L339 -def permute(sample, np_rng, args, tokenizer): +def permute(sample, np_rng, args, tokenizer, truncate_or_pad=True): """ Take in a sample (np array w/ size (0,chunklength)) and perform a FIM transformation on it. Maintain the same sample length (if transform creates a few extra tokens, drop them). @@ -487,7 +498,10 @@ def permute(sample, np_rng, args, tokenizer): contents = tokenizer.detokenize(sample) try: - boundaries = list(np_rng.randint(low=1, high=len(contents) - 1, size=2)) + # A boundary can be =0 (prefix will be empty) + # a boundary can be =len(contents) (suffix will be empty) + # The two boundaries can be equal (middle will be empty) + boundaries = list(np_rng.randint(low=0, high=len(contents) + 1, size=2)) boundaries.sort() except ValueError as e: print(len(contents), contents) @@ -506,15 +520,16 @@ def permute(sample, np_rng, args, tokenizer): # A consequence is that we never reach the end of a file? # Should we rather truncate at the context-level? # need to make same length as the input. Take the 3 sentinel tokens into account - new_length = suffix.shape[0] + prefix.shape[0] + middle.shape[0] + 3 - diff = new_length - sample.shape[0] - if diff > 0: # too long - # TODO: How to prevent this from happening? - if suffix.shape[0] <= diff: # if there's no space to truncate the suffix: stop and report it. 
atm i should have stopped this from happening - return sample, np_rng - suffix = suffix[:suffix.shape[0] - diff] - elif diff < 0: # too short - raise ValueError("It is not clear how this can happen and how to handle it") + if truncate_or_pad: + new_length = suffix.shape[0] + prefix.shape[0] + middle.shape[0] + 3 + diff = new_length - sample.shape[0] + if diff > 0: # too long + # TODO: How to prevent this from happening? + if suffix.shape[0] <= diff: # if there's no space to truncate the suffix: stop and report it. atm i should have stopped this from happening + return sample, np_rng + suffix = suffix[:suffix.shape[0] - diff] + elif diff < 0: # too short + raise ValueError("It is not clear how this can happen and how to handle it") if np_rng.binomial(1, args.fim_spm_rate): # SPM (variant 2 from FIM paper) From 1290e49fadafe24fdc34f4c21a3b95f710d7db11 Mon Sep 17 00:00:00 2001 From: Raymond Li Date: Sat, 5 Nov 2022 10:58:45 -0400 Subject: [PATCH 038/144] fix data type in permutation --- megatron/data/gpt_dataset.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/megatron/data/gpt_dataset.py b/megatron/data/gpt_dataset.py index fd7cfb649e..c749e7d0d0 100644 --- a/megatron/data/gpt_dataset.py +++ b/megatron/data/gpt_dataset.py @@ -217,7 +217,7 @@ def __getitem__(self, idx): # print(loc - curr_start_position, flush=True) # permute {prefix, suffix, middle} or {suffix, prefix, middle} # try: - if loc - curr_start_position > 10: # sometimes examples start with EOD or are too short. so avoid this case + if loc - curr_start_position > 0: sample[curr_start_position:loc], self.np_rng = \ permute(sample[curr_start_position:loc], self.np_rng, self.args, self.tokenizer) # SEGMENT_OK += 1 @@ -512,9 +512,9 @@ def permute(sample, np_rng, args, tokenizer, truncate_or_pad=True): middle = contents[boundaries[0]:boundaries[1]] suffix = contents[boundaries[1]:] - prefix = np.array([*tokenizer.tokenize(prefix)]) - middle = np.array([*tokenizer.tokenize(middle)]) - suffix = np.array([*tokenizer.tokenize(suffix)]) + prefix = np.array([*tokenizer.tokenize(prefix)], dtype=np.int64) + middle = np.array([*tokenizer.tokenize(middle)], dtype=np.int64) + suffix = np.array([*tokenizer.tokenize(suffix)], dtype=np.int64) # TODO: here we truncate each given segment to fit the same length as it was before # A consequence is that we never reach the end of a file? 
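For readers following the FIM work across these patches, the following is a minimal, self-contained sketch of the character-level transform that the `permute` function implements: pick two cut points in character space, re-tokenize the three pieces, and emit them in PSM or SPM order. The sentinel ids, rates, and the `tokenize`/`detokenize` callables are illustrative stand-ins, not the actual Megatron objects.

    import numpy as np

    # Illustrative sentinel ids; in the patches they come from the tokenizer's special tokens.
    PREFIX_ID, MIDDLE_ID, SUFFIX_ID = 1, 2, 3

    def fim_permute(tokens, np_rng, tokenize, detokenize, fim_rate=0.5, spm_rate=0.5):
        """Apply the character-level FIM transform to one document's token ids."""
        if not np_rng.binomial(1, fim_rate):
            return np.asarray(tokens, dtype=np.int64)  # keep ordinary left-to-right order
        text = detokenize(tokens)
        # Two cut points in character space; prefix, middle or suffix may be empty.
        lo, hi = sorted(np_rng.randint(0, len(text) + 1, size=2))
        prefix = tokenize(text[:lo])
        middle = tokenize(text[lo:hi])
        suffix = tokenize(text[hi:])
        if np_rng.binomial(1, spm_rate):
            # SPM variant: <prefix><suffix> suffix <middle> prefix middle
            pieces = [[PREFIX_ID, SUFFIX_ID], suffix, [MIDDLE_ID], prefix, middle]
        else:
            # PSM variant: <prefix> prefix <suffix> suffix <middle> middle
            pieces = [[PREFIX_ID], prefix, [SUFFIX_ID], suffix, [MIDDLE_ID], middle]
        return np.concatenate([np.asarray(p, dtype=np.int64) for p in pieces])

The patches additionally truncate or pad so that a sample keeps its original length; that bookkeeping is sketched a little further down.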
@@ -572,5 +572,4 @@ def permute(sample, np_rng, args, tokenizer, truncate_or_pad=True): # don't do FIM preproc new_sample = sample - return new_sample, np_rng From a5161d7e08f9e3c73c30d0b7cce13423b67d5a70 Mon Sep 17 00:00:00 2001 From: Raymond Li Date: Sat, 5 Nov 2022 12:00:00 -0400 Subject: [PATCH 039/144] truncate or pad after all segments are joined back --- megatron/data/gpt_dataset.py | 41 +++++++++++++++---------- megatron/tokenizer/gpt2_tokenization.py | 2 +- megatron/tokenizer/tokenizer.py | 3 +- 3 files changed, 27 insertions(+), 19 deletions(-) diff --git a/megatron/data/gpt_dataset.py b/megatron/data/gpt_dataset.py index c749e7d0d0..b33bfe93f3 100644 --- a/megatron/data/gpt_dataset.py +++ b/megatron/data/gpt_dataset.py @@ -27,7 +27,7 @@ from megatron.data.dataset_utils import get_datasets_weights_and_num_samples from megatron.data.dataset_utils import get_train_valid_test_split_ from megatron.data.indexed_dataset import make_dataset as make_indexed_dataset -from megatron.tokenizer.tokenizer import FIM_MIDDLE, FIM_PREFIX, FIM_SUFFIX +from megatron.tokenizer.tokenizer import FIM_MIDDLE, FIM_PAD, FIM_PREFIX, FIM_SUFFIX def build_train_valid_test_datasets(data_prefix, data_impl, splits_string, @@ -196,6 +196,7 @@ def __getitem__(self, idx): # TODO(Hailey): can merge the code below this line with code above this line. # TODO(Hailey), cont: above already iterates through loop, so just add the permuting in there? sample = np.array(sample, dtype=np.int64) + sample_len = sample.shape[0] # # print(sample, sample.shape) # # do FIM here, if enabled # TODO: Do we handle the following point from FIM paper? @@ -207,34 +208,40 @@ def __getitem__(self, idx): assert (fim_rate <= 1 and fim_rate >= 0), "FIM rate must be a probability 0 <= rate <= 1" eod = self.tokenizer.eod + pad = self.tokenizer.tokenizer.special_tokens[FIM_PAD] segment_breaks = np.argwhere(sample == eod) # split sample by document if segment_breaks.shape != (0, 1): # then there is an EOD token in this example curr_start_position = 0 - # Also permute the segment after the last EOD - for loc in itertools.chain.from_iterable((np.nditer(segment_breaks), [len(sample)])): - # print(loc - curr_start_position, flush=True) - # permute {prefix, suffix, middle} or {suffix, prefix, middle} - # try: + new_samples = [] + for loc in np.nditer(segment_breaks): + # Only permute non-empty segments. 
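The `__getitem__` rewrite in this patch splits a training sample at end-of-document tokens, permutes each non-empty segment, rejoins everything, and only then restores the original length. A rough sketch of that control flow, reusing the `fim_permute` helper from the earlier sketch and illustrative `eod`/`pad` ids:

    def fim_sample(sample, np_rng, eod, pad, **fim_kwargs):
        """Split at EOD, FIM-permute each document, then truncate or pad to the original length."""
        sample = np.asarray(sample, dtype=np.int64)
        target_len = sample.shape[0]
        pieces, start = [], 0
        for loc in np.argwhere(sample == eod).flatten():
            if loc > start:  # skip empty segments, e.g. back-to-back EOD tokens
                pieces.append(fim_permute(sample[start:loc], np_rng, **fim_kwargs))
            pieces.append(np.array([eod], dtype=np.int64))
            start = loc + 1
        if start < target_len:  # the segment after the last EOD
            pieces.append(fim_permute(sample[start:], np_rng, **fim_kwargs))
        out = np.concatenate(pieces) if pieces else sample
        if out.shape[0] > target_len:
            out = out[:target_len]
        elif out.shape[0] < target_len:
            out = np.concatenate([out, np.full(target_len - out.shape[0], pad, dtype=np.int64)])
        return out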
if loc - curr_start_position > 0: - sample[curr_start_position:loc], self.np_rng = \ - permute(sample[curr_start_position:loc], self.np_rng, self.args, self.tokenizer) - # SEGMENT_OK += 1 - # print(f"SEGMENT TOO SHORT fraction: {SEGMENT_TOO_SHORT / (SEGMENT_TOO_SHORT + SEGMENT_OK)}") - else: - SEGMENT_TOO_SHORT += 1 - # print(f"SEGMENT TOO SHORT fraction: {SEGMENT_TOO_SHORT / (SEGMENT_TOO_SHORT + SEGMENT_OK)}") - # except ValueError: - # # print(loc - curr_start_position, flush=True) - # pass + # permute {prefix, suffix, middle} or {suffix, prefix, middle} + permuted, self.np_rng = \ + permute(sample[curr_start_position:loc], self.np_rng, self.args, self.tokenizer, truncate_or_pad=False) + new_samples += [permuted, [eod]] curr_start_position = loc + 1 # jump over the EOD token + # Permute the segment after the last EOD + permuted, self.np_rng = \ + permute(sample[curr_start_position:], self.np_rng, self.args, self.tokenizer, truncate_or_pad=False) + new_samples.append(permuted) + + sample = np.concatenate(new_samples) else: sample, self.np_rng = permute(sample, self.np_rng, self.args, self.tokenizer) + # Truncate or pad sequence to max-length + diff = sample.shape[0] - sample_len + if diff > 0: # too long + sample = sample[:sample_len] + elif diff < 0: # too short + sample = np.concatenate([sample, np.full((-1 * diff), pad)]) + + assert sample.shape[0] == sample_len # end FIM-specific code - print(sample, flush=True) return {"text": sample} # return {'text': np.array(sample, dtype=np.int64)} diff --git a/megatron/tokenizer/gpt2_tokenization.py b/megatron/tokenizer/gpt2_tokenization.py index 3f37e44908..ff89504351 100644 --- a/megatron/tokenizer/gpt2_tokenization.py +++ b/megatron/tokenizer/gpt2_tokenization.py @@ -281,7 +281,7 @@ def encode(self, text): return self.convert_tokens_to_ids(self.tokenize(text)) def decode(self, tokens): - text = ''.join([self.decoder[token] for token in tokens]) + text = ''.join(self.convert_ids_to_tokens(tokens)) text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors=self.errors) return text diff --git a/megatron/tokenizer/tokenizer.py b/megatron/tokenizer/tokenizer.py index 1f3ba91d48..353df22750 100644 --- a/megatron/tokenizer/tokenizer.py +++ b/megatron/tokenizer/tokenizer.py @@ -25,6 +25,7 @@ FIM_PREFIX = "" FIM_MIDDLE = "" FIM_SUFFIX = "" +FIM_PAD = "" def build_tokenizer(args): """Initialize tokenizer.""" @@ -47,7 +48,7 @@ def build_tokenizer(args): tokenizer = _GPT2BPETokenizer(args.vocab_file, args.merge_file) elif args.tokenizer_type == 'GPT2BPETokenizerWithFIM': assert args.merge_file is not None - tokenizer = _GPT2BPETokenizer(args.vocab_file, args.merge_file, special_tokens=[FIM_PREFIX, FIM_MIDDLE, FIM_SUFFIX]) + tokenizer = _GPT2BPETokenizer(args.vocab_file, args.merge_file, special_tokens=[FIM_PREFIX, FIM_MIDDLE, FIM_SUFFIX, FIM_PAD]) else: raise NotImplementedError('{} tokenizer is not ' 'implemented.'.format(args.tokenizer_type)) From 641af1d1bec4c01e5339a1b2dc43a41eb176fd2b Mon Sep 17 00:00:00 2001 From: Raymond Li Date: Sat, 5 Nov 2022 12:09:07 -0400 Subject: [PATCH 040/144] some cleanup --- megatron/data/gpt_dataset.py | 39 ++++++------------------------------ 1 file changed, 6 insertions(+), 33 deletions(-) diff --git a/megatron/data/gpt_dataset.py b/megatron/data/gpt_dataset.py index b33bfe93f3..16a5f66099 100644 --- a/megatron/data/gpt_dataset.py +++ b/megatron/data/gpt_dataset.py @@ -137,8 +137,6 @@ def get_indexed_dataset_(data_prefix, data_impl, skip_warmup): return indexed_dataset -SEGMENT_OK = 0 
-SEGMENT_TOO_SHORT = 0 class GPTDataset(torch.utils.data.Dataset): @@ -202,7 +200,6 @@ def __getitem__(self, idx): # TODO: Do we handle the following point from FIM paper? # To transform data in the character space for context-level FIM, the tokenized documents have to be decoded back into strings before FIM augmentation. Depending on the vocabulary, some care has to be given to ensure decoding does not introduce any spurious characters into training. For example, utf-8 characters are encoded as multiple tokens with a BPE vocabulary; they can result in fragments from chunking and fail to decode. To prevent unforeseen errors midway through training, we encourage checking for these fragments at the beginning or end of a context and removing them. fim_rate = self.args.fim_rate - global SEGMENT_OK, SEGMENT_TOO_SHORT if fim_rate != 0: assert (fim_rate <= 1 and fim_rate >= 0), "FIM rate must be a probability 0 <= rate <= 1" @@ -231,7 +228,7 @@ def __getitem__(self, idx): sample = np.concatenate(new_samples) else: - sample, self.np_rng = permute(sample, self.np_rng, self.args, self.tokenizer) + sample, self.np_rng = permute(sample, self.np_rng, self.args, self.tokenizer, truncate_or_pad=False) # Truncate or pad sequence to max-length diff = sample.shape[0] - sample_len @@ -498,7 +495,7 @@ def permute(sample, np_rng, args, tokenizer, truncate_or_pad=True): """ fim_rate = args.fim_rate - suffix_tok_id, prefix_tok_id, middle_tok_id = (tokenizer.tokenizer.special_tokens[tok] for tok in [FIM_SUFFIX, FIM_PREFIX, FIM_MIDDLE]) + suffix_tok_id, prefix_tok_id, middle_tok_id, pad_tok_id = (tokenizer.tokenizer.special_tokens[tok] for tok in [FIM_SUFFIX, FIM_PREFIX, FIM_MIDDLE, FIM_PAD]) if np_rng.binomial(1, fim_rate): # sample bernoulli dist @@ -523,20 +520,19 @@ def permute(sample, np_rng, args, tokenizer, truncate_or_pad=True): middle = np.array([*tokenizer.tokenize(middle)], dtype=np.int64) suffix = np.array([*tokenizer.tokenize(suffix)], dtype=np.int64) - # TODO: here we truncate each given segment to fit the same length as it was before + # here we truncate each given segment to fit the same length as it was before # A consequence is that we never reach the end of a file? - # Should we rather truncate at the context-level? - # need to make same length as the input. Take the 3 sentinel tokens into account + # we should rather truncate at the context-level if truncate_or_pad: + # need to make same length as the input. Take the 3 sentinel tokens into account new_length = suffix.shape[0] + prefix.shape[0] + middle.shape[0] + 3 diff = new_length - sample.shape[0] if diff > 0: # too long - # TODO: How to prevent this from happening? if suffix.shape[0] <= diff: # if there's no space to truncate the suffix: stop and report it. 
atm i should have stopped this from happening return sample, np_rng suffix = suffix[:suffix.shape[0] - diff] elif diff < 0: # too short - raise ValueError("It is not clear how this can happen and how to handle it") + suffix = np.concatenate([suffix, np.full((-1 * diff), pad_tok_id)]) if np_rng.binomial(1, args.fim_spm_rate): # SPM (variant 2 from FIM paper) @@ -551,30 +547,7 @@ def permute(sample, np_rng, args, tokenizer, truncate_or_pad=True): [suffix_tok_id], suffix, [middle_tok_id], middle ]) - - # suffix = np.array([suffix_tok_id, *tokenizer.tokenize(suffix)]) - # prefix = np.array([prefix_tok_id, *tokenizer.tokenize(prefix)]) - # middle = np.array([middle_tok_id, *tokenizer.tokenize(middle)]) - # # need to make same length as the input - # new_length = suffix.shape[0] + prefix.shape[0] + middle.shape[0] - # diff = new_length - sample.shape[0] - - # # print(new_length, sample.shape, suffix.shape, diff) - # if diff > 0: # too long - # # TODO: How to prevent this from happening? - # if suffix.shape[0] <= diff: # if there's no space to truncate the suffix: stop and report it. atm i should have stopped this from happening - # return sample, np_rng - # suffix = suffix[:suffix.shape[0] - diff] - # elif diff < 0: # too short - # # TODO: Does this really happen in practice? pad is not used by the GPT2 BPE tokenizer. - # suffix = np.concatenate([suffix, np.full((-1 * diff), tokenizer.pad)]) - - # new_sample = np.concatenate([ # TODO(Hailey): add a branch here + a param to select SPM or PSM mode - # suffix, - # prefix, - # middle, - # ]) else: # don't do FIM preproc new_sample = sample From 66e61e70c217b902e03c3ac5f0acf5ec998195a9 Mon Sep 17 00:00:00 2001 From: Raymond Li Date: Mon, 7 Nov 2022 14:14:21 -0500 Subject: [PATCH 041/144] add preprocessing of HF datasets directly --- tools/preprocess_data.py | 31 ++++++++++++++++++++++++------- 1 file changed, 24 insertions(+), 7 deletions(-) diff --git a/tools/preprocess_data.py b/tools/preprocess_data.py index a86035f214..31ffbb781c 100644 --- a/tools/preprocess_data.py +++ b/tools/preprocess_data.py @@ -31,6 +31,7 @@ except ImportError: nltk_available = False +from datasets import load_dataset from megatron.tokenizer import build_tokenizer from megatron.data import indexed_dataset @@ -74,9 +75,8 @@ def initializer(self): else: Encoder.splitter = IdentitySplitter() - - def encode(self, json_line): - data = json.loads(json_line) + + def _encode_data(self, data): ids = {} for key in self.args.json_keys: text = data[key] @@ -88,7 +88,17 @@ def encode(self, json_line): if len(doc_ids) > 0 and self.args.append_eod: doc_ids[-1].append(Encoder.tokenizer.eod) ids[key] = doc_ids + return ids + + def encode(self, json_line): + data = json.loads(json_line) + ids = self._encode_data(data) return ids, len(json_line) + + def encode_hf(self, sample): + ids = self._encode_data(sample) + return ids, 1 + def get_args(): parser = argparse.ArgumentParser() @@ -147,8 +157,6 @@ def main(): args = get_args() startup_start = time.time() - print("Opening", args.input) - fin = open(args.input, 'r', encoding='utf-8') if nltk_available and args.split_sentences: nltk.download("punkt", quiet=True) @@ -156,8 +164,17 @@ def main(): encoder = Encoder(args) tokenizer = build_tokenizer(args) pool = multiprocessing.Pool(args.workers, initializer=encoder.initializer) - encoded_docs = pool.imap(encoder.encode, fin, args.chunk_size) - #encoded_docs = map(encoder.encode, fin) + print("Opening", args.input) + + if args.input.endswith(".jsonl"): + print("Input is a jsonl file") + fin = 
open(args.input, 'r', encoding='utf-8') + encoded_docs = pool.imap(encoder.encode, fin, args.chunk_size) + #encoded_docs = map(encoder.encode, fin) + else: + print("Input is not a jsonl file, will try to load from HF datasets") + ds = load_dataset(args.input, use_auth_token=True, streaming=True, split="train") + encoded_docs = pool.imap(encoder.encode_hf, ds, args.chunk_size) level = "document" if args.split_sentences: From a79988a88b1cf620c9348104f64c429d8e1fcffc Mon Sep 17 00:00:00 2001 From: Raymond Li Date: Tue, 8 Nov 2022 10:54:26 -0500 Subject: [PATCH 042/144] modify max seq-length from 2048 to 8192 --- megatron/fused_kernels/scaled_masked_softmax.h | 18 +++++++++++++++--- .../scaled_masked_softmax_cuda.cu | 2 +- megatron/fused_kernels/scaled_softmax_cuda.cu | 2 +- .../scaled_upper_triang_masked_softmax.h | 4 ++-- .../scaled_upper_triang_masked_softmax_cuda.cu | 2 +- megatron/initialize.py | 2 +- megatron/model/fused_softmax.py | 4 ++-- 7 files changed, 23 insertions(+), 11 deletions(-) diff --git a/megatron/fused_kernels/scaled_masked_softmax.h b/megatron/fused_kernels/scaled_masked_softmax.h index e57fd04c62..f9ca0bbc7e 100644 --- a/megatron/fused_kernels/scaled_masked_softmax.h +++ b/megatron/fused_kernels/scaled_masked_softmax.h @@ -447,7 +447,7 @@ void dispatch_scaled_softmax_forward( int batches, int attn_heads) { - TORCH_INTERNAL_ASSERT(key_seq_len >= 0 && key_seq_len <= 4096 ); + TORCH_INTERNAL_ASSERT(key_seq_len >= 0 && key_seq_len <= 8192 ); if (key_seq_len == 0) { return; } else { @@ -523,6 +523,10 @@ void dispatch_scaled_softmax_forward( scaled_softmax_warp_forward <<>>(dst, src, scale, batch_count, key_seq_len); break; + case 13: // 8192 + scaled_softmax_warp_forward + <<>>(dst, src, scale, batch_count, key_seq_len); + break; default: break; } @@ -541,7 +545,7 @@ void dispatch_scaled_masked_softmax_forward( int attn_heads, int pad_batches) { - TORCH_INTERNAL_ASSERT(key_seq_len >= 0 && key_seq_len <= 4096 ); + TORCH_INTERNAL_ASSERT(key_seq_len >= 0 && key_seq_len <= 8192 ); if (key_seq_len == 0) { return; } else { @@ -617,6 +621,10 @@ void dispatch_scaled_masked_softmax_forward( scaled_masked_softmax_warp_forward <<>>(dst, src, mask, scale, batch_count, key_seq_len, pad_batches); break; + case 13: // 8192 + scaled_masked_softmax_warp_forward + <<>>(dst, src, mask, scale, batch_count, key_seq_len, pad_batches); + break; default: break; } @@ -634,7 +642,7 @@ void dispatch_scaled_masked_softmax_backward( int batches, int attn_heads) { - TORCH_INTERNAL_ASSERT( key_seq_len >= 0 && key_seq_len <= 4096 ); + TORCH_INTERNAL_ASSERT( key_seq_len >= 0 && key_seq_len <= 8192 ); if (key_seq_len == 0) { return; } else { @@ -709,6 +717,10 @@ void dispatch_scaled_masked_softmax_backward( scaled_masked_softmax_warp_backward <<>>(grad_input, grad, output, scale, batch_count, key_seq_len); break; + case 13: // 8192 + scaled_masked_softmax_warp_backward + <<>>(grad_input, grad, output, scale, batch_count, key_seq_len); + break; default: break; diff --git a/megatron/fused_kernels/scaled_masked_softmax_cuda.cu b/megatron/fused_kernels/scaled_masked_softmax_cuda.cu index 2efee39a6d..ba48f86c3f 100644 --- a/megatron/fused_kernels/scaled_masked_softmax_cuda.cu +++ b/megatron/fused_kernels/scaled_masked_softmax_cuda.cu @@ -44,7 +44,7 @@ torch::Tensor fwd_cuda( const int attn_heads = input.size(1); const int query_seq_len = input.size(2); const int key_seq_len = input.size(3); - TORCH_INTERNAL_ASSERT(key_seq_len <= 4096); + TORCH_INTERNAL_ASSERT(key_seq_len <= 8192); 
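Looping back to the preprocessing patch just above (loading a Hugging Face dataset when the input is not a .jsonl file): the streaming pattern it relies on looks roughly like the sketch below. The dataset name and text field are placeholders, not values used by the patch.

    from datasets import load_dataset

    # streaming=True yields samples lazily instead of materializing the whole dataset first.
    ds = load_dataset("org/some-dataset", split="train", streaming=True)  # placeholder name
    for i, sample in enumerate(ds):
        text = sample["content"]  # the field name depends on the dataset schema
        if i >= 3:
            break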
TORCH_INTERNAL_ASSERT(query_seq_len > 1); TORCH_INTERNAL_ASSERT(pad_batches == 1 || pad_batches == batches); TORCH_INTERNAL_ASSERT(mask.size(1) == 1); diff --git a/megatron/fused_kernels/scaled_softmax_cuda.cu b/megatron/fused_kernels/scaled_softmax_cuda.cu index efc0822481..664e831918 100644 --- a/megatron/fused_kernels/scaled_softmax_cuda.cu +++ b/megatron/fused_kernels/scaled_softmax_cuda.cu @@ -37,7 +37,7 @@ torch::Tensor fwd_cuda( const int attn_heads = input.size(1); const int query_seq_len = input.size(2); const int key_seq_len = input.size(3); - TORCH_INTERNAL_ASSERT(key_seq_len <= 4096); + TORCH_INTERNAL_ASSERT(key_seq_len <= 8192); TORCH_INTERNAL_ASSERT(query_seq_len > 1); // Output diff --git a/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h b/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h index 6df83fc103..c0801357ec 100644 --- a/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h +++ b/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h @@ -340,7 +340,7 @@ void dispatch_scaled_upper_triang_masked_softmax_forward( int softmax_elements_stride, int attn_batches) { - TORCH_INTERNAL_ASSERT(softmax_elements >= 0 && softmax_elements <= 2048 ); + TORCH_INTERNAL_ASSERT(softmax_elements >= 0 && softmax_elements <= 8192 ); if (softmax_elements == 0) { return; } else { @@ -431,7 +431,7 @@ void dispatch_scaled_upper_triang_masked_softmax_backward( int softmax_elements_stride, int attn_batches) { - TORCH_INTERNAL_ASSERT( softmax_elements >= 0 && softmax_elements <= 2048 ); + TORCH_INTERNAL_ASSERT( softmax_elements >= 0 && softmax_elements <= 8192 ); if (softmax_elements == 0) { return; } else { diff --git a/megatron/fused_kernels/scaled_upper_triang_masked_softmax_cuda.cu b/megatron/fused_kernels/scaled_upper_triang_masked_softmax_cuda.cu index 5efc3d4128..6a0e5a8c7a 100644 --- a/megatron/fused_kernels/scaled_upper_triang_masked_softmax_cuda.cu +++ b/megatron/fused_kernels/scaled_upper_triang_masked_softmax_cuda.cu @@ -35,7 +35,7 @@ torch::Tensor fwd_cuda( // input is a 3d tensor with dimensions [attn_batches, seq_len, seq_len] const int attn_batches = input.size(0); const int seq_len = input.size(1); - TORCH_INTERNAL_ASSERT(seq_len <= 2048); + TORCH_INTERNAL_ASSERT(seq_len <= 8192); // Output auto act_options = input.options().requires_grad(false); diff --git a/megatron/initialize.py b/megatron/initialize.py index 5104964e99..7333c2e0e6 100644 --- a/megatron/initialize.py +++ b/megatron/initialize.py @@ -131,7 +131,7 @@ def _compile_dependencies(): args.micro_batch_size # Constraints on sequence length and attn_batch_size to enable warp based # optimization and upper triangular optimization (for causal mask) - custom_kernel_constraint = seq_len > 16 and seq_len <=4096 and \ + custom_kernel_constraint = seq_len > 16 and seq_len <=8192 and \ seq_len % 4 == 0 and attn_batch_size % 4 == 0 # Print a warning. 
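The switch statements touched in this and the next patch dispatch one warp-level softmax kernel per power-of-two bucket of the key sequence length, which is why raising the limit to 8192 also requires the new `case 12` and `case 13` branches. A small illustration of the bucket computation (a Python re-statement of the kernels' `log2_ceil`, for explanation only):

    import math

    def log2_ceil(n: int) -> int:
        """Smallest k such that 2**k >= n; the fused-softmax switch dispatches on this."""
        return max(0, math.ceil(math.log2(n)))

    for seq_len in (2048, 4096, 8192):
        print(seq_len, "-> case", log2_ceil(seq_len))  # prints 11, 12 and 13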
if not ((args.fp16 or args.bf16) and diff --git a/megatron/model/fused_softmax.py b/megatron/model/fused_softmax.py index 2409edd59f..dcdad69702 100644 --- a/megatron/model/fused_softmax.py +++ b/megatron/model/fused_softmax.py @@ -168,11 +168,11 @@ def is_kernel_available(self, mask, b, np, sq, sk): if ( self.scaled_masked_softmax_fusion # user want to fuse and self.input_in_float16 # input must be fp16 - and 16 < sk <= 4096 # sk must be 16 ~ 2048 + and 16 < sk <= 8192 # sk must be 16 ~ 8192 and sq % 4 == 0 # sq must be divisor of 4 and attn_batches % 4 == 0 # np * b must be divisor of 4 ): - if 0 <= sk <= 4096: + if 0 <= sk <= 8192: batch_per_block = self.get_batch_per_block(sq, sk, b, np) if self.attn_mask_type == AttnMaskType.causal: From db3809b7c89c8eb3da7e1c27432173924e3de849 Mon Sep 17 00:00:00 2001 From: Raymond Li Date: Mon, 14 Nov 2022 10:54:43 -0500 Subject: [PATCH 043/144] add missing cases in fused kernels --- .../scaled_upper_triang_masked_softmax.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h b/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h index c0801357ec..aae153b078 100644 --- a/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h +++ b/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h @@ -415,6 +415,14 @@ void dispatch_scaled_upper_triang_masked_softmax_forward( scaled_upper_triang_masked_softmax_warp_forward <<>>(dst, src, scale, batch_count, softmax_elements_stride, softmax_elements); break; + case 12: // 4096 + scaled_upper_triang_masked_softmax_warp_forward + <<>>(dst, src, scale, batch_count, softmax_elements_stride, softmax_elements); + break; + case 13: // 8192 + scaled_upper_triang_masked_softmax_warp_forward + <<>>(dst, src, scale, batch_count, softmax_elements_stride, softmax_elements); + break; default: break; } @@ -506,6 +514,14 @@ void dispatch_scaled_upper_triang_masked_softmax_backward( scaled_upper_triang_masked_softmax_warp_backward <<>>(grad_input, grad, output, scale, batch_count, softmax_elements_stride, softmax_elements); break; + case 12: // 4096 + scaled_upper_triang_masked_softmax_warp_backward + <<>>(grad_input, grad, output, scale, batch_count, softmax_elements_stride, softmax_elements); + break; + case 13: // 8192 + scaled_upper_triang_masked_softmax_warp_backward + <<>>(grad_input, grad, output, scale, batch_count, softmax_elements_stride, softmax_elements); + break; default: break; } From acda62738fc44af61d0d39ae046477c247677de6 Mon Sep 17 00:00:00 2001 From: Raymond Li Date: Mon, 14 Nov 2022 14:44:41 -0500 Subject: [PATCH 044/144] add longer sequence lengths in fused kernels test --- .../fused_kernels/tests/test_fused_kernels.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/megatron/fused_kernels/tests/test_fused_kernels.py b/megatron/fused_kernels/tests/test_fused_kernels.py index f8d5027a1f..f1a6a2a4a5 100644 --- a/megatron/fused_kernels/tests/test_fused_kernels.py +++ b/megatron/fused_kernels/tests/test_fused_kernels.py @@ -23,17 +23,18 @@ def test_load_fused_kernels(): def test_fused_softmax(): - bert = BertModel.from_pretrained("bert-base-cased").cuda().half() + bert = BertModel.from_pretrained("bert-base-cased", max_position_embeddings=8192, ignore_mismatched_sizes=True).cuda().half() tokenizer = BertTokenizer.from_pretrained("bert-base-cased") test_text = ( "Hello. How are you? I am fine thank you and you? yes Good. 
" - "hi hi hi hi hi hi hi hi hi hi hi hi hi" # 32 + "hi hi hi hi hi hi hi hi hi hi hi hi hi" * 256 # 32 * 256 ) tokens = tokenizer( [test_text] * 4, return_tensors="pt", ) + print("tokens :", tokens["input_ids"].shape) embedding_output = bert.embeddings( input_ids=tokens["input_ids"].cuda(), @@ -121,17 +122,18 @@ def test_fused_softmax(): def test_fused_upper_triangle_mask_softmax(): - gpt = GPT2Model.from_pretrained("gpt2").cuda().half() + gpt = GPT2Model.from_pretrained("gpt2", max_position_embeddings=8192, ignore_mismatched_sizes=True).cuda().half() tokenizer = GPT2Tokenizer.from_pretrained("gpt2") test_text = ( "Hello. How are you? I am fine thank you and you? yes Good. " - "hi hi hi hi hi hi hi" # 24 + "hi hi hi hi hi hi hi" * 256 # 24 * 256 ) tokens = tokenizer( [test_text] * 4, return_tensors="pt", ) + print("tokens :", tokens["input_ids"].shape) attention_mask = tokens["attention_mask"].cuda() attention_mask = attention_mask.view(attention_mask.size(0), -1) @@ -221,17 +223,18 @@ def test_fused_upper_triangle_mask_softmax(): def test_layer_norm(): - bert = BertModel.from_pretrained("bert-base-cased").cuda().half() + bert = BertModel.from_pretrained("bert-base-cased", max_position_embeddings=8192, ignore_mismatched_sizes=True).cuda().half() tokenizer = BertTokenizer.from_pretrained("bert-base-cased") test_text = ( "Hello. How are you? I am fine thank you and you? yes Good. " - "hi hi hi hi hi hi hi hi hi hi hi hi hi" # 32 + "hi hi hi hi hi hi hi hi hi hi hi hi hi" * 256 # 32 ) tokens = tokenizer( [test_text] * 4, return_tensors="pt", ) + print("tokens :", tokens["input_ids"].shape) # [bsz, seq_len, d_model] embedding_output = ( From d59c85bbe8d87a3aca69b3eba5128cbf3a290185 Mon Sep 17 00:00:00 2001 From: Raymond Li Date: Mon, 14 Nov 2022 14:45:12 -0500 Subject: [PATCH 045/144] larger MAX_TOKENS_TO_OOM --- megatron/text_generation/generation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/text_generation/generation.py b/megatron/text_generation/generation.py index 2b81e89d0a..13f69f0867 100644 --- a/megatron/text_generation/generation.py +++ b/megatron/text_generation/generation.py @@ -28,7 +28,7 @@ from .sampling import sample from .beam_utils import BeamHypotheses -MAX_TOKENS_TO_OOM = 12000 # (rprenger) Perfect value depends on hardware and network +MAX_TOKENS_TO_OOM = 128000 # (rprenger) Perfect value depends on hardware and network def score_and_return_on_first_stage(model, tokens, lengths): """Function for just scoring. From 7b0cee27e65e406ec898941257978492b5fda29b Mon Sep 17 00:00:00 2001 From: Raymond Li Date: Fri, 18 Nov 2022 05:03:21 +0000 Subject: [PATCH 046/144] use custom barrier with device_ids --- megatron/checkpointing.py | 8 +++++--- megatron/initialize.py | 11 ++++++----- megatron/training.py | 13 +++++++------ megatron/utils.py | 14 +++++++++++++- 4 files changed, 31 insertions(+), 15 deletions(-) diff --git a/megatron/checkpointing.py b/megatron/checkpointing.py index 6a240810f6..21e0a699cd 100644 --- a/megatron/checkpointing.py +++ b/megatron/checkpointing.py @@ -28,6 +28,7 @@ from .utils import (unwrap_model, print_rank_0) from megatron.model.enums import PositionEmbeddingType +from megatron.utils import barrier _CHECKPOINT_VERSION = None @@ -291,12 +292,13 @@ def save_checkpoint(iteration, model, optimizer, opt_param_scheduler): # Save model and optimizer together. 
state_dict = {**model_state_dict, **optim_state_dict} if state_dict: # only saves if populated (i.e., inherits conditions above) + print("Trying to save ...", flush=True) ensure_directory_exists(model_checkpoint_name) torch.save(state_dict, model_checkpoint_name) # Wait so everyone is done (necessary) if torch.distributed.is_initialized(): - torch.distributed.barrier() + barrier() print_rank_0(' successfully saved checkpoint at iteration {:7d} to {}'.format( iteration, args.save)) @@ -309,7 +311,7 @@ def save_checkpoint(iteration, model, optimizer, opt_param_scheduler): # Wait so everyone is done (not necessary) if torch.distributed.is_initialized(): - torch.distributed.barrier() + barrier() def _transpose_first_dim(t, num_splits, num_splits_first, model): input_shape = t.size() @@ -677,7 +679,7 @@ def load_checkpoint(model, optimizer, opt_param_scheduler, load_arg='load', stri # Some utilities want to load a checkpoint without distributed being initialized if torch.distributed.is_initialized(): - torch.distributed.barrier() + barrier() print_rank_0(f' successfully loaded checkpoint from {load_dir} ' f'at iteration {iteration}') diff --git a/megatron/initialize.py b/megatron/initialize.py index 7333c2e0e6..df8738a7cd 100644 --- a/megatron/initialize.py +++ b/megatron/initialize.py @@ -40,6 +40,7 @@ set_tensor_model_parallel_world_size) from megatron.model.transformer import bias_dropout_add_fused_train from megatron.model.fused_bias_gelu import bias_gelu +from megatron.utils import barrier def initialize_megatron(extra_args_provider=None, args_defaults={}, @@ -147,15 +148,15 @@ def _compile_dependencies(): start_time = time.time() print('> compiling and loading fused kernels ...', flush=True) fused_kernels.load(args) - torch.distributed.barrier() + barrier() else: - torch.distributed.barrier() + barrier() fused_kernels.load(args) # Simple barrier to make sure all ranks have passed the # compilation phase successfully before moving on to the # rest of the program. We think this might ensure that # the lock is released. - torch.distributed.barrier() + barrier() if torch.distributed.get_rank() == 0: print('>>> done with compiling and loading fused kernels. ' 'Compilation time: {:.3f} seconds'.format( @@ -213,9 +214,9 @@ def _init_autoresume(): """Set autoresume start time.""" autoresume = get_adlr_autoresume() if autoresume: - torch.distributed.barrier() + barrier() autoresume.init() - torch.distributed.barrier() + barrier() def _set_random_seed(seed_, data_parallel_random_init=False): diff --git a/megatron/training.py b/megatron/training.py index 16d190472f..b7728cdc32 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -57,11 +57,12 @@ from megatron.schedules import get_forward_backward_func from megatron.utils import report_memory from megatron.model.vision.knn_monitor import compute_feature_bank +from megatron.utils import barrier def print_datetime(string): """Note that this call will sync across all ranks.""" - torch.distributed.barrier() + barrier() time_str = datetime.now().strftime('%Y-%m-%d %H:%M:%S') print_rank_0('[' + string + '] datetime: {} '.format(time_str)) @@ -380,10 +381,10 @@ def setup_model_and_optimizer(model_provider_func, timers = get_timers() # Extra barrier is added to make sure all ranks report the # max time. 
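For context on this patch: the custom `barrier()` added to `megatron/utils.py` further down pins the NCCL barrier to the rank's own GPU via `BarrierOptions.device_ids`, instead of letting the backend pick a device. On reasonably recent PyTorch releases the public API can express the same intent directly; a hedged equivalent, assuming `local_rank` is this rank's GPU index:

    import torch

    def pinned_barrier(local_rank: int) -> None:
        # Run the barrier's collective on this rank's GPU, avoiding spurious
        # CUDA contexts (or hangs) on device 0.
        torch.cuda.set_device(local_rank)
        torch.distributed.barrier(device_ids=[local_rank])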
- torch.distributed.barrier() + barrier() timers('load-checkpoint').start() args.iteration = load_checkpoint(model, optimizer, opt_param_scheduler) - torch.distributed.barrier() + barrier() timers('load-checkpoint').stop() timers.log(['load-checkpoint']) # This is critical when only model is loaded. We should make sure @@ -672,10 +673,10 @@ def save_checkpoint_and_time(iteration, model, optimizer, opt_param_scheduler): timers = get_timers() # Extra barrier is added to make sure # all ranks report the max time. - torch.distributed.barrier() + barrier() timers('save-checkpoint').start() save_checkpoint(iteration, model, optimizer, opt_param_scheduler) - torch.distributed.barrier() + barrier() timers('save-checkpoint').stop() timers.log(['save-checkpoint']) @@ -782,7 +783,7 @@ def train(forward_step_func, model, optimizer, opt_param_scheduler, if not saved_checkpoint: save_checkpoint_and_time(iteration, model, optimizer, opt_param_scheduler) - torch.distributed.barrier() + barrier() print_datetime('exiting program at iteration {}'.format(iteration)) sys.exit() diff --git a/megatron/utils.py b/megatron/utils.py index 89bdba94aa..7ec209431a 100644 --- a/megatron/utils.py +++ b/megatron/utils.py @@ -19,6 +19,7 @@ import torch from torch.nn.parallel import DistributedDataParallel as torchDDP +from torch.distributed import BarrierOptions, GroupMember from apex.multi_tensor_apply import multi_tensor_applier import amp_C @@ -132,7 +133,7 @@ def check_adlr_autoresume_termination(iteration, model, args = get_args() autoresume = get_adlr_autoresume() # Add barrier to ensure consistnecy. - torch.distributed.barrier() + barrier() if autoresume.termination_requested(): if args.save: save_checkpoint(iteration, model, optimizer, opt_param_scheduler) @@ -222,3 +223,14 @@ def print_rank_last(message): print(message, flush=True) else: print(message, flush=True) + + +def barrier(): + args = get_args() + + opts = BarrierOptions() + opts.device_ids = [args.local_rank] + + group = GroupMember.WORLD + work = group.barrier(opts=opts) + work.wait() From 93cb6a0f7fea69b8e74625d8dbf162bcb92ee4b0 Mon Sep 17 00:00:00 2001 From: Raymond Li Date: Tue, 22 Nov 2022 04:48:50 +0000 Subject: [PATCH 047/144] add HF tokenizer --- megatron/arguments.py | 6 ++++- megatron/tokenizer/tokenizer.py | 42 +++++++++++++++++++++++++++++++++ 2 files changed, 47 insertions(+), 1 deletion(-) diff --git a/megatron/arguments.py b/megatron/arguments.py index 25c0995ebb..3aad6d23e4 100644 --- a/megatron/arguments.py +++ b/megatron/arguments.py @@ -870,6 +870,8 @@ def _add_data_args(parser): help='Path to the vocab file.') group.add_argument('--merge-file', type=str, default=None, help='Path to the BPE merge file.') + group.add_argument('--tokenizer-file', type=str, default=None, + help='Path to the tokenizer.json file.') group.add_argument('--vocab-extra-ids', type=int, default=0, help='Number of additional vocabulary tokens. 
' 'They are used for span masking in the T5 model') @@ -899,7 +901,9 @@ def _add_data_args(parser): choices=['BertWordPieceLowerCase', 'BertWordPieceCase', 'GPT2BPETokenizer', - 'GPT2BPETokenizerWithFIM'], + 'GPT2BPETokenizerWithFIM', + 'TokenizerFromFile', + 'TokenizerFromFileWithFIM'], help='What type of tokenizer to use.') group.add_argument('--data-impl', type=str, default='infer', choices=['lazy', 'cached', 'mmap', 'infer'], diff --git a/megatron/tokenizer/tokenizer.py b/megatron/tokenizer/tokenizer.py index 353df22750..8b656609b3 100644 --- a/megatron/tokenizer/tokenizer.py +++ b/megatron/tokenizer/tokenizer.py @@ -18,6 +18,7 @@ from abc import ABC from abc import abstractmethod +from transformers import PreTrainedTokenizerFast from .bert_tokenization import FullTokenizer as FullBertTokenizer from .gpt2_tokenization import GPT2Tokenizer @@ -49,6 +50,12 @@ def build_tokenizer(args): elif args.tokenizer_type == 'GPT2BPETokenizerWithFIM': assert args.merge_file is not None tokenizer = _GPT2BPETokenizer(args.vocab_file, args.merge_file, special_tokens=[FIM_PREFIX, FIM_MIDDLE, FIM_SUFFIX, FIM_PAD]) + elif args.tokenizer_type == "TokenizerFromFile": + assert args.tokenizer_file is not None + tokenizer = _HFTokenizer(args.tokenizer_file) + elif args.tokenizer_type == "TokenizerFromFileWithFIM": + assert args.tokenizer_file is not None + tokenizer = _HFTokenizer(args.tokenizer_file, special_tokens=[FIM_PREFIX, FIM_MIDDLE, FIM_SUFFIX, FIM_PAD]) else: raise NotImplementedError('{} tokenizer is not ' 'implemented.'.format(args.tokenizer_type)) @@ -298,3 +305,38 @@ def detokenize(self, token_ids): @property def eod(self): return self.eod_id + + +class _HFTokenizer(AbstractTokenizer): + """HF Tokenizer.""" + + def __init__(self, tokenizer_file, special_tokens=None): + name = 'HF Tokenizer' + super().__init__(name) + + special_tokens = special_tokens if special_tokens is not None else [] + self.tokenizer = PreTrainedTokenizerFast(tokenizer_file=tokenizer_file, errors='replace', + special_tokens=special_tokens, max_len=None) + self.eod_id = self.tokenizer.encoder['<|endoftext|>'] + + @property + def vocab_size(self): + return len(self.tokenizer.encoder) + + @property + def vocab(self): + return self.tokenizer.encoder + + @property + def inv_vocab(self): + return self.tokenizer.decoder + + def tokenize(self, text): + return self.tokenizer.encode(text) + + def detokenize(self, token_ids): + return self.tokenizer.decode(token_ids) + + @property + def eod(self): + return self.eod_id From 9f2c4428b3be4834de2cf09f1cf06a14cb64a7c8 Mon Sep 17 00:00:00 2001 From: Raymond Li Date: Tue, 22 Nov 2022 00:35:36 -0500 Subject: [PATCH 048/144] add special tokens in HF tokenizer --- megatron/tokenizer/tokenizer.py | 27 ++++++++++++++++++--------- tools/preprocess_data.py | 4 +++- 2 files changed, 21 insertions(+), 10 deletions(-) diff --git a/megatron/tokenizer/tokenizer.py b/megatron/tokenizer/tokenizer.py index 8b656609b3..545b3fc645 100644 --- a/megatron/tokenizer/tokenizer.py +++ b/megatron/tokenizer/tokenizer.py @@ -27,6 +27,8 @@ FIM_MIDDLE = "" FIM_SUFFIX = "" FIM_PAD = "" +EOD = "<|endoftext|>" + def build_tokenizer(args): """Initialize tokenizer.""" @@ -35,7 +37,10 @@ def build_tokenizer(args): flush=True) # Select and instantiate the tokenizer. 
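As a usage illustration for the `TokenizerFromFile` path and the EOD/FIM special-token registration wired up in these two patches: loading a `tokenizer.json` with `transformers` and registering FIM sentinels follows roughly this pattern (the file path and token strings are placeholders):

    from transformers import PreTrainedTokenizerFast

    FIM_TOKENS = ["<fim-prefix>", "<fim-middle>", "<fim-suffix>", "<fim-pad>"]  # placeholder strings

    tok = PreTrainedTokenizerFast(tokenizer_file="tokenizer.json")  # placeholder path
    tok.add_special_tokens({"additional_special_tokens": FIM_TOKENS})

    ids = tok.encode("def hello():\n    return 42")
    print(len(tok))                                   # vocab size including the added specials
    print(tok.convert_tokens_to_ids("<fim-prefix>"))  # id assigned to the new sentinel
    print(tok.decode(ids))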
- assert args.vocab_file is not None + if args.tokenizer_type in ['BertWordPieceLowerCase', 'BertWordPieceCase', 'GPT2BPETokenizer', 'GPT2BPETokenizerWithFIM']: + assert args.vocab_file is not None + else: + assert args.tokenizer_file is not None if args.tokenizer_type == 'BertWordPieceLowerCase': tokenizer = _BertWordPieceTokenizer(vocab_file=args.vocab_file, lower_case=True, @@ -52,15 +57,17 @@ def build_tokenizer(args): tokenizer = _GPT2BPETokenizer(args.vocab_file, args.merge_file, special_tokens=[FIM_PREFIX, FIM_MIDDLE, FIM_SUFFIX, FIM_PAD]) elif args.tokenizer_type == "TokenizerFromFile": assert args.tokenizer_file is not None - tokenizer = _HFTokenizer(args.tokenizer_file) + tokenizer = _HFTokenizer(args.tokenizer_file, special_tokens=[EOD]) elif args.tokenizer_type == "TokenizerFromFileWithFIM": assert args.tokenizer_file is not None - tokenizer = _HFTokenizer(args.tokenizer_file, special_tokens=[FIM_PREFIX, FIM_MIDDLE, FIM_SUFFIX, FIM_PAD]) + tokenizer = _HFTokenizer(args.tokenizer_file, special_tokens=[EOD, FIM_PREFIX, FIM_MIDDLE, FIM_SUFFIX, FIM_PAD]) else: raise NotImplementedError('{} tokenizer is not ' 'implemented.'.format(args.tokenizer_type)) # Add vocab size. + # TODO: For most tokenizers, vocab_size does not take special_tokens into account. + # Might cause an issue if vocab_size + len(special_tokens) exceeds padded_vocab_size? args.padded_vocab_size = _vocab_size_with_padding(tokenizer.vocab_size, args) @@ -315,21 +322,23 @@ def __init__(self, tokenizer_file, special_tokens=None): super().__init__(name) special_tokens = special_tokens if special_tokens is not None else [] - self.tokenizer = PreTrainedTokenizerFast(tokenizer_file=tokenizer_file, errors='replace', - special_tokens=special_tokens, max_len=None) - self.eod_id = self.tokenizer.encoder['<|endoftext|>'] + self.tokenizer = PreTrainedTokenizerFast(tokenizer_file=tokenizer_file, errors='replace', max_len=None) + self.tokenizer.add_special_tokens({'additional_special_tokens': special_tokens}) + self.eod_id = self.tokenizer.vocab[EOD] + + self._inv_vocab = {v: k for k, v in self.tokenizer.vocab.items()} @property def vocab_size(self): - return len(self.tokenizer.encoder) + return self.tokenizer.vocab_size @property def vocab(self): - return self.tokenizer.encoder + return self.tokenizer.vocab @property def inv_vocab(self): - return self.tokenizer.decoder + return self._inv_vocab def tokenize(self, text): return self.tokenizer.encode(text) diff --git a/tools/preprocess_data.py b/tools/preprocess_data.py index 31ffbb781c..5810d29297 100644 --- a/tools/preprocess_data.py +++ b/tools/preprocess_data.py @@ -115,12 +115,14 @@ def get_args(): group = parser.add_argument_group(title='tokenizer') group.add_argument('--tokenizer-type', type=str, required=True, choices=['BertWordPieceLowerCase','BertWordPieceCase', - 'GPT2BPETokenizer'], + 'GPT2BPETokenizer', 'TokenizerFromFile'], help='What type of tokenizer to use.') group.add_argument('--vocab-file', type=str, default=None, help='Path to the vocab file') group.add_argument('--merge-file', type=str, default=None, help='Path to the BPE merge file (if necessary).') + group.add_argument('--tokenizer-file', type=str, default=None, + help='Path to the tokenizer file') group.add_argument('--append-eod', action='store_true', help='Append an token to the end of a document.') From 9fe3bcb1a4c43adba6e458f92cee2df608f84a70 Mon Sep 17 00:00:00 2001 From: Raymond Li Date: Tue, 22 Nov 2022 14:38:48 -0500 Subject: [PATCH 049/144] fix vocab_size in _HFTokenizer --- 
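For a `PreTrainedTokenizerFast`, `vocab_size` only counts the base vocabulary, while `len(tokenizer)` also counts tokens added afterwards (such as the EOD/FIM special tokens registered in `_HFTokenizer.__init__`), which is what the change below relies on. A minimal sketch of the difference, assuming a hypothetical local `tokenizer.json` whose base vocabulary does not already contain the placeholder token:

```python
from transformers import PreTrainedTokenizerFast

tok = PreTrainedTokenizerFast(tokenizer_file="tokenizer.json")  # hypothetical path
base_size = tok.vocab_size
tok.add_special_tokens({"additional_special_tokens": ["<my-special-token>"]})  # placeholder token

assert tok.vocab_size == base_size   # base vocabulary only
assert len(tok) == base_size + 1     # includes the added special token
```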
megatron/tokenizer/tokenizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/tokenizer/tokenizer.py b/megatron/tokenizer/tokenizer.py index 545b3fc645..3532238c3e 100644 --- a/megatron/tokenizer/tokenizer.py +++ b/megatron/tokenizer/tokenizer.py @@ -330,7 +330,7 @@ def __init__(self, tokenizer_file, special_tokens=None): @property def vocab_size(self): - return self.tokenizer.vocab_size + return len(self.tokenizer) @property def vocab(self): From 6982c4ee30787df990a3ce51c465c1aac4a66d3f Mon Sep 17 00:00:00 2001 From: Raymond Li Date: Tue, 22 Nov 2022 21:11:30 +0000 Subject: [PATCH 050/144] fix: initialize tokenizer with TokenizerFromFile --- megatron/checkpointing.py | 2 +- megatron/global_vars.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/megatron/checkpointing.py b/megatron/checkpointing.py index 21e0a699cd..dc2ab5030d 100644 --- a/megatron/checkpointing.py +++ b/megatron/checkpointing.py @@ -72,7 +72,7 @@ def _compare(arg_name, old_arg_name=None): # with alibi we can change `max_position_embeddings` if args.position_embedding_type != PositionEmbeddingType.alibi: _compare('max_position_embeddings') - if args.vocab_file: + if args.vocab_file or args.tokenizer_file: _compare('make_vocab_size_divisible_by') _compare('padded_vocab_size') _compare('tokenizer_type') diff --git a/megatron/global_vars.py b/megatron/global_vars.py index f2b2741444..4a9b2a16da 100644 --- a/megatron/global_vars.py +++ b/megatron/global_vars.py @@ -104,7 +104,7 @@ def set_global_variables(args): set_args(args) _build_num_microbatches_calculator(args) - if args.vocab_file: + if args.vocab_file or args.tokenizer_file: _ = _build_tokenizer(args) _set_tensorboard_writer(args) _set_adlr_autoresume(args) From 4f060a265619a3a757876522f9a78b7ea9d5f6b7 Mon Sep 17 00:00:00 2001 From: Raymond Li Date: Tue, 22 Nov 2022 21:45:39 +0000 Subject: [PATCH 051/144] fix: add special_tokens dict for FIM --- megatron/data/gpt_dataset.py | 4 ++-- megatron/tokenizer/tokenizer.py | 6 +++++- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/megatron/data/gpt_dataset.py b/megatron/data/gpt_dataset.py index 16a5f66099..02bfad8142 100644 --- a/megatron/data/gpt_dataset.py +++ b/megatron/data/gpt_dataset.py @@ -205,7 +205,7 @@ def __getitem__(self, idx): assert (fim_rate <= 1 and fim_rate >= 0), "FIM rate must be a probability 0 <= rate <= 1" eod = self.tokenizer.eod - pad = self.tokenizer.tokenizer.special_tokens[FIM_PAD] + pad = self.tokenizer.special_tokens[FIM_PAD] segment_breaks = np.argwhere(sample == eod) # split sample by document @@ -495,7 +495,7 @@ def permute(sample, np_rng, args, tokenizer, truncate_or_pad=True): """ fim_rate = args.fim_rate - suffix_tok_id, prefix_tok_id, middle_tok_id, pad_tok_id = (tokenizer.tokenizer.special_tokens[tok] for tok in [FIM_SUFFIX, FIM_PREFIX, FIM_MIDDLE, FIM_PAD]) + suffix_tok_id, prefix_tok_id, middle_tok_id, pad_tok_id = (tokenizer.special_tokens[tok] for tok in [FIM_SUFFIX, FIM_PREFIX, FIM_MIDDLE, FIM_PAD]) if np_rng.binomial(1, fim_rate): # sample bernoulli dist diff --git a/megatron/tokenizer/tokenizer.py b/megatron/tokenizer/tokenizer.py index 3532238c3e..f9cad7b642 100644 --- a/megatron/tokenizer/tokenizer.py +++ b/megatron/tokenizer/tokenizer.py @@ -290,6 +290,7 @@ def __init__(self, vocab_file, merge_file, special_tokens=None): self.tokenizer = GPT2Tokenizer(vocab_file, merge_file, errors='replace', special_tokens=special_tokens, max_len=None) self.eod_id = self.tokenizer.encoder['<|endoftext|>'] + self.special_tokens = 
self.tokenizer.special_tokens @property def vocab_size(self): @@ -325,7 +326,10 @@ def __init__(self, tokenizer_file, special_tokens=None): self.tokenizer = PreTrainedTokenizerFast(tokenizer_file=tokenizer_file, errors='replace', max_len=None) self.tokenizer.add_special_tokens({'additional_special_tokens': special_tokens}) self.eod_id = self.tokenizer.vocab[EOD] - + # Token->id mapping for additional special-tokens + self.special_tokens = { + tok: self.tokenizer.vocab[tok] for tok in special_tokens + } self._inv_vocab = {v: k for k, v in self.tokenizer.vocab.items()} @property From 332e8dba8834ac2bf2f3e5b6919b07c8fcec970b Mon Sep 17 00:00:00 2001 From: Raymond Li Date: Wed, 23 Nov 2022 22:23:23 +0000 Subject: [PATCH 052/144] load attention-head-type from checkpoint --- megatron/checkpointing.py | 1 + 1 file changed, 1 insertion(+) diff --git a/megatron/checkpointing.py b/megatron/checkpointing.py index dc2ab5030d..7d293ce8c8 100644 --- a/megatron/checkpointing.py +++ b/megatron/checkpointing.py @@ -506,6 +506,7 @@ def _set_arg(arg_name, old_arg_name=None, force=False): _set_arg('max_position_embeddings') _set_arg('tokenizer_type') _set_arg('padded_vocab_size') + _set_arg('attention_head_type') if checkpoint_version < 3.0: _set_arg('tensor_model_parallel_size', 'model_parallel_size') From 0717dab7034a24869332021296ea6b507e9b2454 Mon Sep 17 00:00:00 2001 From: Raymond Li Date: Wed, 23 Nov 2022 23:08:10 +0000 Subject: [PATCH 053/144] attention-head-type defaults to None instead --- megatron/arguments.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/arguments.py b/megatron/arguments.py index 3aad6d23e4..fa0a16cfc0 100644 --- a/megatron/arguments.py +++ b/megatron/arguments.py @@ -405,7 +405,7 @@ def _add_network_size_args(parser): 'attention. This is set to ' ' args.hidden_size // args.num_attention_heads ' 'if not provided.') - group.add_argument('--attention-head-type', type=str, default='multihead', + group.add_argument('--attention-head-type', type=str, default=None, choices=['multihead', 'multiquery'], help='Type of attention heads. `multihead` is the standard multi-head attention.' 
'`multiquery` shares the values and keys across attention heads') From 96daa55e2ecd36ca50fd52581d0d9b4197ca4f11 Mon Sep 17 00:00:00 2001 From: Raymond Li Date: Thu, 24 Nov 2022 20:20:56 +0000 Subject: [PATCH 054/144] use detokenize method un text_generation --- megatron/text_generation/tokenization.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/megatron/text_generation/tokenization.py b/megatron/text_generation/tokenization.py index 623a96c619..e850ed9329 100644 --- a/megatron/text_generation/tokenization.py +++ b/megatron/text_generation/tokenization.py @@ -43,10 +43,11 @@ def detokenize_generations(tokens_gpu_tensor, if return_segments: words = [] for token in sequence_tokens: - word = tokenizer.tokenizer.decoder[token] - word = bytearray( - [tokenizer.tokenizer.byte_decoder[c] for c in word]).decode( - 'utf-8', errors='replace') + # word = tokenizer.tokenizer.decoder[token] + # word = bytearray( + # [tokenizer.tokenizer.byte_decoder[c] for c in word]).decode( + # 'utf-8', errors='replace') + word = tokenizer.detokenize([token]) words.append(word) prompts_plus_generations_segments.append(words) From 2d36c14ac66167c7fb2d1f86b256d2e41c805774 Mon Sep 17 00:00:00 2001 From: Raymond Li Date: Thu, 1 Dec 2022 21:13:00 -0500 Subject: [PATCH 055/144] add mqa conversion to huggingface --- tools/hf_transformers/__init__.py | 0 .../hf_transformers/configuration_gpt2_mq.py | 201 ++++++++++ tools/hf_transformers/convert_checkpoint.py | 334 +++++++++++++++++ tools/hf_transformers/modeling_gpt2_mq.py | 346 ++++++++++++++++++ 4 files changed, 881 insertions(+) create mode 100644 tools/hf_transformers/__init__.py create mode 100644 tools/hf_transformers/configuration_gpt2_mq.py create mode 100644 tools/hf_transformers/convert_checkpoint.py create mode 100644 tools/hf_transformers/modeling_gpt2_mq.py diff --git a/tools/hf_transformers/__init__.py b/tools/hf_transformers/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tools/hf_transformers/configuration_gpt2_mq.py b/tools/hf_transformers/configuration_gpt2_mq.py new file mode 100644 index 0000000000..1a71e73915 --- /dev/null +++ b/tools/hf_transformers/configuration_gpt2_mq.py @@ -0,0 +1,201 @@ +# coding=utf-8 +# Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" Custom GPT-2 configuration""" +from collections import OrderedDict +from typing import Any, List, Mapping, Optional +from enum import Enum + +from transformers import PreTrainedTokenizer, TensorType, is_torch_available + +from transformers.configuration_utils import PretrainedConfig +from transformers.onnx import OnnxConfigWithPast, PatchingSpec +from transformers.utils import logging + + +logger = logging.get_logger(__name__) + +GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP = { + "gpt2": "https://huggingface.co/gpt2/resolve/main/config.json", + "gpt2-medium": "https://huggingface.co/gpt2-medium/resolve/main/config.json", + "gpt2-large": "https://huggingface.co/gpt2-large/resolve/main/config.json", + "gpt2-xl": "https://huggingface.co/gpt2-xl/resolve/main/config.json", + "distilgpt2": "https://huggingface.co/distilgpt2/resolve/main/config.json", +} + +MULTI_HEAD = "multihead" +MULTI_QUERY = "multiquery" + + +class GPT2CustomConfig(PretrainedConfig): + """ + This is the configuration class to store the configuration of a [`GPT2Model`] or a [`TFGPT2Model`]. It is used to + instantiate a GPT-2 model according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the GPT-2 + [gpt2](https://huggingface.co/gpt2) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 50257): + Vocabulary size of the GPT-2 model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`GPT2Model`] or [`TFGPT2Model`]. + n_positions (`int`, *optional*, defaults to 1024): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + n_embd (`int`, *optional*, defaults to 768): + Dimensionality of the embeddings and hidden states. + n_layer (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + n_head (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + n_inner (`int`, *optional*, defaults to None): + Dimensionality of the inner feed-forward layers. `None` will set it to 4 times n_embd + activation_function (`str`, *optional*, defaults to `"gelu"`): + Activation function, to be selected in the list `["relu", "silu", "gelu", "tanh", "gelu_new"]`. + resid_pdrop (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + embd_pdrop (`int`, *optional*, defaults to 0.1): + The dropout ratio for the embeddings. + attn_pdrop (`float`, *optional*, defaults to 0.1): + The dropout ratio for the attention. + layer_norm_epsilon (`float`, *optional*, defaults to 1e-5): + The epsilon to use in the layer normalization layers. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + summary_type (`string`, *optional*, defaults to `"cls_index"`): + Argument used when doing sequence summary, used in the models [`GPT2DoubleHeadsModel`] and + [`TFGPT2DoubleHeadsModel`]. + + Has to be one of the following options: + + - `"last"`: Take the last token hidden state (like XLNet). 
+ - `"first"`: Take the first token hidden state (like BERT). + - `"mean"`: Take the mean of all tokens hidden states. + - `"cls_index"`: Supply a Tensor of classification token position (like GPT/GPT-2). + - `"attn"`: Not implemented now, use multi-head attention. + summary_use_proj (`bool`, *optional*, defaults to `True`): + Argument used when doing sequence summary, used in the models [`GPT2DoubleHeadsModel`] and + [`TFGPT2DoubleHeadsModel`]. + + Whether or not to add a projection after the vector extraction. + summary_activation (`str`, *optional*): + Argument used when doing sequence summary. Used in for the multiple choice head in + [`GPT2DoubleHeadsModel`]. + + Pass `"tanh"` for a tanh activation to the output, any other value will result in no activation. + summary_proj_to_labels (`bool`, *optional*, defaults to `True`): + Argument used when doing sequence summary, used in the models [`GPT2DoubleHeadsModel`] and + [`TFGPT2DoubleHeadsModel`]. + + Whether the projection outputs should have `config.num_labels` or `config.hidden_size` classes. + summary_first_dropout (`float`, *optional*, defaults to 0.1): + Argument used when doing sequence summary, used in the models [`GPT2DoubleHeadsModel`] and + [`TFGPT2DoubleHeadsModel`]. + + The dropout ratio to be used after the projection and activation. + scale_attn_weights (`bool`, *optional*, defaults to `True`): + Scale attention weights by dividing by sqrt(head_dim).. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). + scale_attn_by_inverse_layer_idx (`bool`, *optional*, defaults to `False`): + Whether to additionally scale attention weights by `1 / layer_idx + 1`. + reorder_and_upcast_attn (`bool`, *optional*, defaults to `False`): + Whether to scale keys (K) prior to computing attention (dot-product) and upcast attention + dot-product/softmax to float() when training with mixed precision. 
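+ attention_head_type (`str`, *optional*, defaults to `"multihead"`):
+ Type of attention heads: `"multihead"` for standard multi-head attention, or `"multiquery"` to share a single
+ key and value head across all query heads.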
+ + Example: + + ```python + >>> from transformers import GPT2Config, GPT2Model + + >>> # Initializing a GPT2 configuration + >>> configuration = GPT2Config() + + >>> # Initializing a model (with random weights) from the configuration + >>> model = GPT2Model(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "gpt2" + keys_to_ignore_at_inference = ["past_key_values"] + attribute_map = { + "hidden_size": "n_embd", + "max_position_embeddings": "n_positions", + "num_attention_heads": "n_head", + "num_hidden_layers": "n_layer", + } + + def __init__( + self, + vocab_size=50257, + n_positions=1024, + n_embd=768, + n_layer=12, + n_head=12, + n_inner=None, + activation_function="gelu_new", + resid_pdrop=0.1, + embd_pdrop=0.1, + attn_pdrop=0.1, + layer_norm_epsilon=1e-5, + initializer_range=0.02, + summary_type="cls_index", + summary_use_proj=True, + summary_activation=None, + summary_proj_to_labels=True, + summary_first_dropout=0.1, + scale_attn_weights=True, + use_cache=True, + bos_token_id=50256, + eos_token_id=50256, + scale_attn_by_inverse_layer_idx=False, + reorder_and_upcast_attn=False, + attention_head_type=MULTI_HEAD, + **kwargs, + ): + self.vocab_size = vocab_size + self.n_positions = n_positions + self.n_embd = n_embd + self.n_layer = n_layer + self.n_head = n_head + self.n_inner = n_inner + self.activation_function = activation_function + self.resid_pdrop = resid_pdrop + self.embd_pdrop = embd_pdrop + self.attn_pdrop = attn_pdrop + self.layer_norm_epsilon = layer_norm_epsilon + self.initializer_range = initializer_range + self.summary_type = summary_type + self.summary_use_proj = summary_use_proj + self.summary_activation = summary_activation + self.summary_first_dropout = summary_first_dropout + self.summary_proj_to_labels = summary_proj_to_labels + self.scale_attn_weights = scale_attn_weights + self.use_cache = use_cache + self.scale_attn_by_inverse_layer_idx = scale_attn_by_inverse_layer_idx + self.reorder_and_upcast_attn = reorder_and_upcast_attn + self.attention_head_type = attention_head_type + # assert attention_head_type in [AttentionType.MULTI_HEAD, AttentionType.MULTI_QUERY] + assert attention_head_type in [MULTI_HEAD, MULTI_QUERY] + + self.bos_token_id = bos_token_id + self.eos_token_id = eos_token_id + + super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) diff --git a/tools/hf_transformers/convert_checkpoint.py b/tools/hf_transformers/convert_checkpoint.py new file mode 100644 index 0000000000..1229d7f5f4 --- /dev/null +++ b/tools/hf_transformers/convert_checkpoint.py @@ -0,0 +1,334 @@ +import argparse +import os +import re +import torch + +from transformers import AutoTokenizer +from transformers.models.megatron_gpt2.convert_megatron_gpt2_checkpoint import fix_query_key_value_ordering, recursive_print +from tools.hf_transformers.configuration_gpt2_mq import GPT2CustomConfig +from tools.hf_transformers.modeling_gpt2_mq import GPT2LMHeadCustomModel + + +#################################################################################################### + + +def convert_megatron_checkpoint(args, input_state_dict, config): + # The converted output model. 
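+ # The keys of the result follow the transformers GPT-2 naming scheme, for example (illustrative):
+ # language_model.embedding.word_embeddings.weight -> transformer.wte.weight
+ # language_model.encoder.layers.0.self_attention.dense.weight -> transformer.h.0.attn.c_proj.weight (transposed)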
+ output_state_dict = {} + + # old versions did not store training args + ds_args = input_state_dict.get("args", None) + if ds_args is not None: + # do not make the user write a config file when the exact dimensions/sizes are already in the checkpoint + # from pprint import pprint + # pprint(vars(ds_args)) + + config.vocab_size = ds_args.padded_vocab_size + config.n_positions = ds_args.max_position_embeddings + config.n_embd = ds_args.hidden_size + config.n_layer = ds_args.num_layers + config.n_head = ds_args.num_attention_heads + config.n_inner = ds_args.ffn_hidden_size + config.attention_head_type = ds_args.attention_head_type + # also set `scale_attn_weights` and `scale_attn_by_inverse_layer_idx` ? + # Uncommenting the next line makes the converted model output different logits. + # config.scale_attn_by_inverse_layer_idx = ds_args.apply_query_key_layer_scaling + # pprint(config) + + # The number of heads. + heads = config.n_head + # The hidden_size per head. + hidden_size_per_head = config.n_embd // config.n_head + # Megatron-LM checkpoint version + if "checkpoint_version" in input_state_dict.keys(): + checkpoint_version = input_state_dict["checkpoint_version"] + else: + checkpoint_version = 0.0 + + # The model. + model = input_state_dict["model"] + # The language model. + lm = model["language_model"] + # The embeddings. + embeddings = lm["embedding"] + + # The word embeddings. + word_embeddings = embeddings["word_embeddings"]["weight"] + # Truncate the embedding table to vocab_size rows. + word_embeddings = word_embeddings[: config.vocab_size, :] + output_state_dict["transformer.wte.weight"] = word_embeddings + + # The position embeddings. + pos_embeddings = embeddings["position_embeddings"]["weight"] + # Read the causal mask dimension (seqlen). [max_sequence_length, hidden_size] + n_positions = pos_embeddings.size(0) + if n_positions != config.n_positions: + raise ValueError( + f"pos_embeddings.max_sequence_length={n_positions} and config.n_positions={config.n_positions} don't match" + ) + # Store the position embeddings. + output_state_dict["transformer.wpe.weight"] = pos_embeddings + + # The transformer. + transformer = lm["transformer"] if "transformer" in lm.keys() else lm["encoder"] + + # The regex to extract layer names. + layer_re = re.compile("layers\.(\d+)\.([a-z0-9_.]+)\.([a-z]+)") + + # The simple map of names for "automated" rules. + megatron_to_transformers = { + "attention.dense": ".attn.c_proj.", + "self_attention.dense": ".attn.c_proj.", + "mlp.dense_h_to_4h": ".mlp.c_fc.", + "mlp.dense_4h_to_h": ".mlp.c_proj.", + } + + # Extract the layers. + for key, val in transformer.items(): + # Match the name. + m = layer_re.match(key) + + # Stop if that's not a layer + if m is None: + break + + # The index of the layer. + layer_idx = int(m.group(1)) + # The name of the operation. + op_name = m.group(2) + # Is it a weight or a bias? + weight_or_bias = m.group(3) + + # The name of the layer. + layer_name = f"transformer.h.{layer_idx}" + + # For layernorm(s), simply store the layer norm. + if op_name.endswith("layernorm"): + + ln_name = "ln_1" if op_name.startswith("input") else "ln_2" + output_state_dict[layer_name + "." + ln_name + "." + weight_or_bias] = val + + # Transpose the QKV matrix. + elif ( + op_name == "attention.query_key_value" or op_name == "self_attention.query_key_value" + ) and weight_or_bias == "weight": + + # Insert a tensor of 1x1xDxD bias. 
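+ # For example, torch.tril(torch.ones((4, 4))) is the lower-triangular matrix
+ # [[1., 0., 0., 0.],
+ #  [1., 1., 0., 0.],
+ #  [1., 1., 1., 0.],
+ #  [1., 1., 1., 1.]]
+ # which .view(1, 1, n_positions, n_positions) then reshapes into the buffer layout GPT-2 expects.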
+ causal_mask = torch.tril(torch.ones((n_positions, n_positions), dtype=torch.float16)).view( + 1, 1, n_positions, n_positions + ) + output_state_dict[layer_name + ".attn.bias"] = causal_mask + + # Insert a "dummy" tensor for masked_bias. + masked_bias = torch.tensor(-1e4, dtype=torch.float16) + output_state_dict[layer_name + ".attn.masked_bias"] = masked_bias + + out_val = fix_query_key_value_ordering(val, checkpoint_version, 3, heads, hidden_size_per_head) + # Megatron stores (3*D) x D but transformers-GPT2 expects D x 3*D. + out_val = out_val.transpose(0, 1).contiguous() + # Store. + output_state_dict[layer_name + ".attn.c_attn.weight"] = out_val + + # Tranpose the Q matrix (for MQA) + elif ( + op_name == "self_attention.query" + ) and weight_or_bias == "weight": + # Insert a tensor of 1x1xDxD bias. + causal_mask = torch.tril(torch.ones((n_positions, n_positions), dtype=torch.float16)).view( + 1, 1, n_positions, n_positions + ) + output_state_dict[layer_name + ".attn.bias"] = causal_mask + + # Insert a "dummy" tensor for masked_bias. + masked_bias = torch.tensor(-1e4, dtype=torch.float16) + output_state_dict[layer_name + ".attn.masked_bias"] = masked_bias + + out_val = fix_query_key_value_ordering(val, checkpoint_version, 1, heads, hidden_size_per_head) + # Megatron stores (out x in) but transformers-GPT2 expects (in x out). + out_val = out_val.transpose(0, 1).contiguous() + # Store. + output_state_dict[layer_name + ".attn.q_attn.weight"] = out_val + + # Tranpose the KV matrix (for MQA) + elif ( + op_name == "self_attention.key_value" + ) and weight_or_bias == "weight": + # Key-values are shared across heads + out_val = fix_query_key_value_ordering(val, checkpoint_version, 2, 1, hidden_size_per_head) + # Megatron stores (out x in) but transformers-GPT2 expects (in x out). + out_val = out_val.transpose(0, 1).contiguous() + # Store. + output_state_dict[layer_name + ".attn.kv_attn.weight"] = out_val + + # Transpose the bias. + elif ( + op_name == "attention.query_key_value" or op_name == "self_attention.query_key_value" + ) and weight_or_bias == "bias": + + out_val = fix_query_key_value_ordering(val, checkpoint_version, 3, heads, hidden_size_per_head) + # Store. No change of shape. + output_state_dict[layer_name + ".attn.c_attn.bias"] = out_val + + # Transpose the Q bias (MQA) + elif ( + op_name == "self_attention.query" + ) and weight_or_bias == "bias": + + out_val = fix_query_key_value_ordering(val, checkpoint_version, 1, heads, hidden_size_per_head) + # Store. No change of shape. + output_state_dict[layer_name + ".attn.q_attn.bias"] = out_val + + # Transpose the KV bias (MQA) + elif ( + op_name == "self_attention.key_value" + ) and weight_or_bias == "bias": + + out_val = fix_query_key_value_ordering(val, checkpoint_version, 2, 1, hidden_size_per_head) + # Store. No change of shape. + output_state_dict[layer_name + ".attn.kv_attn.bias"] = out_val + + # Transpose the weights. + elif weight_or_bias == "weight": + + out_name = megatron_to_transformers[op_name] + output_state_dict[layer_name + out_name + "weight"] = val.transpose(0, 1) + + # Copy the bias. + elif weight_or_bias == "bias": + + out_name = megatron_to_transformers[op_name] + output_state_dict[layer_name + out_name + "bias"] = val + + # DEBUG. + assert config.n_layer == layer_idx + 1 + + # The final layernorm. 
+ output_state_dict["transformer.ln_f.weight"] = transformer["final_layernorm.weight"] + output_state_dict["transformer.ln_f.bias"] = transformer["final_layernorm.bias"] + + # For LM head, transformers' wants the matrix to weight embeddings. + output_state_dict["lm_head.weight"] = word_embeddings + + # It should be done! + return output_state_dict + + +#################################################################################################### + + +def main(): + # Create the argument parser. + parser = argparse.ArgumentParser() + parser.add_argument("--print-checkpoint-structure", action="store_true") + parser.add_argument( + "--path_to_checkpoint", + type=str, + help="Path to the checkpoint file (.zip archive or direct .pt file)", + ) + parser.add_argument( + "--output-dir", + type=str, + help="Ouptut directory where HF checkpoint will be written", + ) + args = parser.parse_args() + + os.makedirs(args.output_dir, exist_ok=True) + + # Load the model. + # the .zip is very optional, let's keep it for backward compatibility + print(f"Extracting PyTorch state dictionary from {args.path_to_checkpoint}") + input_state_dict = torch.load(args.path_to_checkpoint, map_location="cpu") + + ds_args = input_state_dict.get("args", None) + + # Read the config, or default to the model released by NVIDIA. + + if ds_args is not None: + if ds_args.bias_gelu_fusion: + activation_function = "gelu_fast" + elif ds_args.openai_gelu: + activation_function = "gelu_new" + else: + activation_function = "gelu" + else: + # in the very early days this used to be "gelu_new" + activation_function = "gelu_new" + + # Spell out all parameters in case the defaults change. + config = GPT2CustomConfig( + vocab_size=50257, + n_positions=1024, + n_embd=1024, + n_layer=24, + n_head=16, + n_inner=4096, + activation_function=activation_function, + resid_pdrop=0.1, + embd_pdrop=0.1, + attn_pdrop=0.1, + layer_norm_epsilon=1e-5, + initializer_range=0.02, + summary_type="cls_index", + summary_use_proj=True, + summary_activation=None, + summary_proj_to_labels=True, + summary_first_dropout=0.1, + scale_attn_weights=True, + use_cache=True, + bos_token_id=50256, + eos_token_id=50256, + ) + # TODO: also set bos and eos? + + config.architectures = ["GPT2LMHeadCustomModel"] + + # Convert. + print("Converting") + output_state_dict = convert_megatron_checkpoint(args, input_state_dict, config) + # TODO: also set `scale_attn_weights` and `scale_attn_by_inverse_layer_idx` ? + + # Print the structure of converted state dict. + if args.print_checkpoint_structure: + recursive_print(None, output_state_dict) + + # Add tokenizer class info to config + # see https://github.com/huggingface/transformers/issues/13906) + if ds_args is not None: + tokenizer_type = ds_args.tokenizer_type + if tokenizer_type == "GPT2BPETokenizer": + tokenizer_model_name = "gpt2" + elif tokenizer_type == "PretrainedFromHF": + tokenizer_model_name = ds_args.tokenizer_name_or_path + else: + raise ValueError(f"Unrecognized tokenizer_type {tokenizer_type}") + else: + tokenizer_model_name = "gpt2" + + tokenizer = AutoTokenizer.from_pretrained(tokenizer_model_name) + tokenizer_class = type(tokenizer).__name__ + config.tokenizer_class = tokenizer_class + + # Store the config to file. 
+ print("Saving config") + config.save_pretrained(args.output_dir) + + # Save tokenizer based on args + print(f"Adding {tokenizer_class} tokenizer files") + tokenizer.save_pretrained(args.output_dir) + + + # Save custom model + GPT2CustomConfig.register_for_auto_class() + GPT2LMHeadCustomModel.register_for_auto_class("AutoModelForCausalLM") + hf_model = GPT2LMHeadCustomModel(config) + hf_model.load_state_dict(output_state_dict) + hf_model.push_to_hub("custom_gpt2_mqa") + hf_model.save_pretrained(args.output_dir) + + # Store the state_dict to file. + # print(f'Saving checkpoint to "{output_checkpoint_file}"') + # torch.save(output_state_dict, output_checkpoint_file) + + +if __name__ == "__main__": + main() diff --git a/tools/hf_transformers/modeling_gpt2_mq.py b/tools/hf_transformers/modeling_gpt2_mq.py new file mode 100644 index 0000000000..2669601f77 --- /dev/null +++ b/tools/hf_transformers/modeling_gpt2_mq.py @@ -0,0 +1,346 @@ +"""PyTorch OpenAI GPT-2 model modified with MultiQuery attention""" + + +import math +import os +from dataclasses import dataclass +from typing import Optional, Tuple, Union + +import torch +import torch.utils.checkpoint +from torch import nn +from torch.cuda.amp import autocast +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss + +from transformers.activations import ACT2FN +from transformers.modeling_outputs import ( + BaseModelOutputWithPastAndCrossAttentions, + CausalLMOutputWithCrossAttentions, + SequenceClassifierOutputWithPast, + TokenClassifierOutput, +) +from transformers.modeling_utils import PreTrainedModel, SequenceSummary +from transformers.pytorch_utils import Conv1D, find_pruneable_heads_and_indices, prune_conv1d_layer + +from transformers.utils import ( + ModelOutput, + add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, + replace_return_docstrings, +) +from transformers.utils.model_parallel_utils import assert_device_map, get_device_map +from transformers.models.gpt2.modeling_gpt2 import GPT2Model, GPT2Block, GPT2PreTrainedModel, GPT2LMHeadModel +from tools.hf_transformers.configuration_gpt2_mq import GPT2CustomConfig, MULTI_QUERY, MULTI_HEAD + + + +class GPT2MQAttention(nn.Module): + def __init__(self, config, is_cross_attention=False, layer_idx=None): + super().__init__() + assert config.attention_head_type == MULTI_QUERY + + max_positions = config.max_position_embeddings + self.register_buffer( + "bias", + torch.tril(torch.ones((max_positions, max_positions), dtype=torch.uint8)).view( + 1, 1, max_positions, max_positions + ), + ) + self.register_buffer("masked_bias", torch.tensor(-1e4)) + + self.embed_dim = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.embed_dim // self.num_heads + self.split_size = self.embed_dim + if self.head_dim * self.num_heads != self.embed_dim: + raise ValueError( + f"`embed_dim` must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" + f" {self.num_heads})." 
+ ) + + self.scale_attn_weights = config.scale_attn_weights + if is_cross_attention: + raise NotImplementedError("Cross-attention not implemented for MQA") + self.is_cross_attention = is_cross_attention + + # Layer-wise attention scaling, reordering, and upcasting + self.scale_attn_by_inverse_layer_idx = config.scale_attn_by_inverse_layer_idx + self.layer_idx = layer_idx + self.reorder_and_upcast_attn = config.reorder_and_upcast_attn + + if self.is_cross_attention: + self.c_attn = Conv1D(2 * self.embed_dim, self.embed_dim) + self.q_attn = Conv1D(self.embed_dim, self.embed_dim) + else: + # self.c_attn = Conv1D(3 * self.embed_dim, self.embed_dim) + self.q_attn = Conv1D(self.embed_dim, self.embed_dim) + # Keys and values are shared across heads + self.kv_attn = Conv1D(2 * self.head_dim, self.embed_dim) + self.c_proj = Conv1D(self.embed_dim, self.embed_dim) + + self.attn_dropout = nn.Dropout(config.attn_pdrop) + self.resid_dropout = nn.Dropout(config.resid_pdrop) + + self.pruned_heads = set() + + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices(heads, self.num_heads, self.head_dim, self.pruned_heads) + index_attn = torch.cat([index, index + self.split_size, index + (2 * self.split_size)]) + + # Prune conv1d layers + self.c_attn = prune_conv1d_layer(self.c_attn, index_attn, dim=1) + self.c_proj = prune_conv1d_layer(self.c_proj, index, dim=0) + + # Update hyper params + self.split_size = (self.split_size // self.num_heads) * (self.num_heads - len(heads)) + self.num_heads = self.num_heads - len(heads) + self.pruned_heads = self.pruned_heads.union(heads) + + def _attn(self, query, key, value, attention_mask=None, head_mask=None): + # query: (b, num_heads * sq, head_dim) + # key: (b, head_dim, sk) + # value: (b, sk, head_dim) + batch_size = query.size(0) + query_length = query.size(1) // self.num_heads + key_length = key.size(2) + # (b, num_heads * sq, head_dim) x (b, head_dim, sk) -> (b, num_heads * sq, sk) + attn_weights = torch.bmm(query, key) + # -> (b, num_heads, sq, sk) + attn_weights = attn_weights.view(batch_size, self.num_heads, query_length, key_length) + + if self.scale_attn_weights: + attn_weights = attn_weights / torch.tensor( + value.size(-1) ** 0.5, dtype=attn_weights.dtype, device=attn_weights.device + ) + + # Layer-wise attention scaling + if self.scale_attn_by_inverse_layer_idx: + attn_weights = attn_weights / float(self.layer_idx + 1) + + if not self.is_cross_attention: + # if only "normal" attention layer implements causal mask + causal_mask = self.bias[:, :, key_length - query_length : key_length, :key_length].to(torch.bool) + mask_value = torch.finfo(attn_weights.dtype).min + # Need to be a tensor, otherwise we get error: `RuntimeError: expected scalar type float but found double`. 
+ # Need to be on the same device, otherwise `RuntimeError: ..., x and y to be on the same device` + mask_value = torch.tensor(mask_value, dtype=attn_weights.dtype).to(attn_weights.device) + attn_weights = torch.where(causal_mask, attn_weights, mask_value) + + if attention_mask is not None: + # Apply the attention mask + attn_weights = attn_weights + attention_mask + + attn_weights = nn.functional.softmax(attn_weights, dim=-1) + + # Downcast (if necessary) back to V's dtype (if in mixed-precision) -- No-Op otherwise + attn_weights = attn_weights.type(value.dtype) + attn_weights = self.attn_dropout(attn_weights) + + # Mask heads if we want to + if head_mask is not None: + attn_weights = attn_weights * head_mask + + # (b, num_heads, sq, sk) -> (b, num_heads * sq, sk) + _attn_weights = attn_weights.view(batch_size, self.num_heads * query_length, key_length) + # (b, num_heads * sq, sk) x (b, sk, head_dim) -> (b, num_heads * sq, head_dim) + attn_output = torch.bmm(_attn_weights, value) + attn_output = attn_output.view(batch_size, self.num_heads, query_length, self.head_dim) + + return attn_output, attn_weights + + def _upcast_and_reordered_attn(self, query, key, value, attention_mask=None, head_mask=None): + # Use `torch.baddbmm` (a bit more efficient w/ alpha param for scaling -- from Megatron-LM) + bsz, num_heads, q_seq_len, dk = query.size() + _, _, k_seq_len, _ = key.size() + + # Preallocate attn_weights for `baddbmm` + attn_weights = torch.empty(bsz * num_heads, q_seq_len, k_seq_len, dtype=torch.float32, device=query.device) + + # Compute Scale Factor + scale_factor = 1.0 + if self.scale_attn_weights: + scale_factor /= float(value.size(-1)) ** 0.5 + + if self.scale_attn_by_inverse_layer_idx: + scale_factor /= float(self.layer_idx + 1) + + # Upcast (turn off autocast) and reorder (Scale K by 1 / root(dk)) + with autocast(enabled=False): + q, k = query.reshape(-1, q_seq_len, dk), key.transpose(-1, -2).reshape(-1, dk, k_seq_len) + attn_weights = torch.baddbmm(attn_weights, q.float(), k.float(), beta=0, alpha=scale_factor) + attn_weights = attn_weights.reshape(bsz, num_heads, q_seq_len, k_seq_len) + + if not self.is_cross_attention: + # if only "normal" attention layer implements causal mask + query_length, key_length = query.size(-2), key.size(-2) + causal_mask = self.bias[:, :, key_length - query_length : key_length, :key_length].bool() + mask_value = torch.finfo(attn_weights.dtype).min + # Need to be a tensor, otherwise we get error: `RuntimeError: expected scalar type float but found double`. 
+ # Need to be on the same device, otherwise `RuntimeError: ..., x and y to be on the same device` + mask_value = torch.tensor(mask_value, dtype=attn_weights.dtype).to(attn_weights.device) + attn_weights = torch.where(causal_mask, attn_weights, mask_value) + + if attention_mask is not None: + # Apply the attention mask + attn_weights = attn_weights + attention_mask + + attn_weights = nn.functional.softmax(attn_weights, dim=-1) + + # Downcast (if necessary) back to V's dtype (if in mixed-precision) -- No-Op if otherwise + if attn_weights.dtype != torch.float32: + raise RuntimeError("Error with upcasting, attn_weights does not have dtype torch.float32") + attn_weights = attn_weights.type(value.dtype) + attn_weights = self.attn_dropout(attn_weights) + + # Mask heads if we want to + if head_mask is not None: + attn_weights = attn_weights * head_mask + + attn_output = torch.matmul(attn_weights, value) + + return attn_output, attn_weights + + def _split_heads(self, tensor, num_heads, attn_head_size): + """ + Splits hidden_size dim into attn_head_size and num_heads + """ + new_shape = tensor.size()[:-1] + (num_heads, attn_head_size) + tensor = tensor.view(new_shape) + return tensor.permute(0, 2, 1, 3) # (batch, head, seq_length, head_features) + + def _merge_heads(self, tensor, num_heads, attn_head_size): + """ + Merges attn_head_size dim and num_attn_heads dim into hidden_size + """ + tensor = tensor.permute(0, 2, 1, 3).contiguous() + new_shape = tensor.size()[:-2] + (num_heads * attn_head_size,) + return tensor.view(new_shape) + + def forward( + self, + hidden_states: Optional[Tuple[torch.FloatTensor]], + layer_past: Optional[Tuple[torch.Tensor]] = None, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = False, + output_attentions: Optional[bool] = False, + ) -> Tuple[Union[torch.Tensor, Tuple[torch.Tensor]], ...]: + if encoder_hidden_states is not None: + raise NotImplementedError("Cross-attention not implemented for MQA") + if not hasattr(self, "q_attn"): + raise ValueError( + "If class is used as cross attention, the weights `q_attn` have to be defined. " + "Please make sure to instantiate class with `GPT2Attention(..., is_cross_attention=True)`." 
+ ) + + query = self.q_attn(hidden_states) + key, value = self.c_attn(encoder_hidden_states).split(self.split_size, dim=2) + attention_mask = encoder_attention_mask + else: + query = self.q_attn(hidden_states) + key, value = self.kv_attn(hidden_states).split(self.head_dim, dim=2) + + + batch_size, seq_length = query.shape[:2] + # (query_length, batch, num_heads, head_dim) + # (batch, num_heads * query_length, head_dim)\ + + # (batch, query_length, hidden_size) -> (batch, num_heads, query_length, head_dim) + query = query.view(batch_size, seq_length, self.num_heads, self.head_dim).permute([0, 2, 1, 3]) + # -> (batch, num_heads * query_length, head_dim) + query = query.reshape(batch_size, self.num_heads * seq_length, self.head_dim) + + # (batch, query_length, hidden_size) -> (batch, query_length * num_heads, head_dim) + # query = query.view( + # batch_size, seq_length, self.num_heads, self.head_dim, + # ).reshape( + # batch_size, seq_length * self.num_heads, self.head_dim + # ) + key = key.permute(0, 2, 1) # (batch_size, head_dim, seq_length) + # value (batch_size, seq_length, head_dim) + + if layer_past is not None: + past_key, past_value = layer_past + # Concatenate on sequence dimension + key = torch.cat((past_key, key), dim=-1) + value = torch.cat((past_value, value), dim=-2) + + if use_cache is True: + present = (key, value) + else: + present = None + + if self.reorder_and_upcast_attn: + raise NotImplementedError("Reorder and upcast attention not implemented for MQA") + attn_output, attn_weights = self._upcast_and_reordered_attn(query, key, value, attention_mask, head_mask) + else: + attn_output, attn_weights = self._attn(query, key, value, attention_mask, head_mask) + + attn_output = self._merge_heads(attn_output, self.num_heads, self.head_dim) + attn_output = self.c_proj(attn_output) + attn_output = self.resid_dropout(attn_output) + + outputs = (attn_output, present) + if output_attentions: + outputs += (attn_weights,) + + return outputs # a, present, (attentions) + + +# inherit from gpt_modeling.py, and override `attn` module +class GPT2CustomBlock(GPT2Block): + + def __init__(self, config: GPT2CustomConfig, layer_idx=None): + super().__init__(config, layer_idx) + # Override attention module if using multiquery + if config.attention_head_type == MULTI_QUERY: + self.attn = GPT2MQAttention(config, layer_idx=layer_idx) + if config.add_cross_attention: + raise NotImplementedError("Cross-attention not implemented for MQA") + + +# inherit from gpt_modeling.py and override `__init__` method +class GPT2CustomModel(GPT2Model): + config_class = GPT2CustomConfig + + def __init__(self, config): + GPT2PreTrainedModel.__init__(self, config) + + self.embed_dim = config.hidden_size + + self.wte = nn.Embedding(config.vocab_size, self.embed_dim) + self.wpe = nn.Embedding(config.max_position_embeddings, self.embed_dim) + + self.drop = nn.Dropout(config.embd_pdrop) + self.h = nn.ModuleList([GPT2CustomBlock(config, layer_idx=i) for i in range(config.num_hidden_layers)]) + self.ln_f = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon) + + # Model parallel + self.model_parallel = False + self.device_map = None + self.gradient_checkpointing = False + + # Initialize weights and apply final processing + self.post_init() + + +class GPT2LMHeadCustomModel(GPT2LMHeadModel): + config_class = GPT2CustomConfig + + def __init__(self, config): + GPT2PreTrainedModel.__init__(self, config) + self.transformer = GPT2CustomModel(config) + self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False) + + # 
Model parallel + self.model_parallel = False + self.device_map = None + + # Initialize weights and apply final processing + self.post_init() \ No newline at end of file From 760eed9649d2ebcec4d2b8ce6757e56a6e4f1c3a Mon Sep 17 00:00:00 2001 From: Raymond Li Date: Thu, 1 Dec 2022 21:37:47 -0500 Subject: [PATCH 056/144] remove config and tokenizer save --- tools/hf_transformers/convert_checkpoint.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/tools/hf_transformers/convert_checkpoint.py b/tools/hf_transformers/convert_checkpoint.py index 1229d7f5f4..4440bbffdb 100644 --- a/tools/hf_transformers/convert_checkpoint.py +++ b/tools/hf_transformers/convert_checkpoint.py @@ -308,15 +308,6 @@ def main(): tokenizer_class = type(tokenizer).__name__ config.tokenizer_class = tokenizer_class - # Store the config to file. - print("Saving config") - config.save_pretrained(args.output_dir) - - # Save tokenizer based on args - print(f"Adding {tokenizer_class} tokenizer files") - tokenizer.save_pretrained(args.output_dir) - - # Save custom model GPT2CustomConfig.register_for_auto_class() GPT2LMHeadCustomModel.register_for_auto_class("AutoModelForCausalLM") From baa7b3b4bf71b016578f92584f408999b3e40e5d Mon Sep 17 00:00:00 2001 From: Raymond Li Date: Thu, 1 Dec 2022 21:48:02 -0500 Subject: [PATCH 057/144] add Readme --- tools/hf_transformers/README.md | 10 ++++++++++ tools/hf_transformers/convert_checkpoint.py | 4 +--- tools/hf_transformers/push_checkpoints.py | 6 ++++++ 3 files changed, 17 insertions(+), 3 deletions(-) create mode 100644 tools/hf_transformers/README.md create mode 100644 tools/hf_transformers/push_checkpoints.py diff --git a/tools/hf_transformers/README.md b/tools/hf_transformers/README.md new file mode 100644 index 0000000000..56e5efdc34 --- /dev/null +++ b/tools/hf_transformers/README.md @@ -0,0 +1,10 @@ +# Conversion of Megatron checkpoints to HF transformers +These scripts support MQA. +Only supports 1-way tensor/pipeline parallelism for now (use `checkpoint_util` to merge checkpoints if needed). + +Convert a megatron checkpoint to a HF-transformer checkpoint that can be directly pushed to the hub: +``` +python -m tools.hf_transformers.convert_checkpoint --path_to_checkpoint /checkpoint_dir/iter_{num_iter}/mp_rank_00/model_optim_rng.pt --output-dir /checkpoint_dir/hf_checkpoints/iter_{num_iter} +``` + + diff --git a/tools/hf_transformers/convert_checkpoint.py b/tools/hf_transformers/convert_checkpoint.py index 4440bbffdb..3264381ea7 100644 --- a/tools/hf_transformers/convert_checkpoint.py +++ b/tools/hf_transformers/convert_checkpoint.py @@ -223,7 +223,7 @@ def main(): parser.add_argument( "--path_to_checkpoint", type=str, - help="Path to the checkpoint file (.zip archive or direct .pt file)", + help="Path to the `.pt` checkpoint file", ) parser.add_argument( "--output-dir", @@ -285,7 +285,6 @@ def main(): # Convert. print("Converting") output_state_dict = convert_megatron_checkpoint(args, input_state_dict, config) - # TODO: also set `scale_attn_weights` and `scale_attn_by_inverse_layer_idx` ? # Print the structure of converted state dict. if args.print_checkpoint_structure: @@ -313,7 +312,6 @@ def main(): GPT2LMHeadCustomModel.register_for_auto_class("AutoModelForCausalLM") hf_model = GPT2LMHeadCustomModel(config) hf_model.load_state_dict(output_state_dict) - hf_model.push_to_hub("custom_gpt2_mqa") hf_model.save_pretrained(args.output_dir) # Store the state_dict to file. 
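Because `GPT2CustomConfig` and `GPT2LMHeadCustomModel` are registered for auto-class before saving, a directory produced by this conversion can usually be loaded back through the `transformers` auto classes with `trust_remote_code=True`. A rough sketch, with a placeholder output directory:

```python
from transformers import AutoModelForCausalLM

ckpt_dir = "/checkpoint_dir/hf_checkpoints/iter_0010000"  # placeholder path
model = AutoModelForCausalLM.from_pretrained(ckpt_dir, trust_remote_code=True)
```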
diff --git a/tools/hf_transformers/push_checkpoints.py b/tools/hf_transformers/push_checkpoints.py new file mode 100644 index 0000000000..81f6d792c3 --- /dev/null +++ b/tools/hf_transformers/push_checkpoints.py @@ -0,0 +1,6 @@ +""" +Convert all intermediate checkpoints from a directory and push to a single repo on the hub. +Each intermediate checkpoint is associated a different branch corresponding to its iteration +""" + + From 2ceaf709548eeb1ba3ff9dae7569e9c424ac7e93 Mon Sep 17 00:00:00 2001 From: Raymond Li Date: Thu, 1 Dec 2022 21:54:23 -0500 Subject: [PATCH 058/144] add some documentation --- tools/hf_transformers/push_checkpoints.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tools/hf_transformers/push_checkpoints.py b/tools/hf_transformers/push_checkpoints.py index 81f6d792c3..1a35a5a1d5 100644 --- a/tools/hf_transformers/push_checkpoints.py +++ b/tools/hf_transformers/push_checkpoints.py @@ -1,6 +1,10 @@ """ -Convert all intermediate checkpoints from a directory and push to a single repo on the hub. -Each intermediate checkpoint is associated a different branch corresponding to its iteration +Arguments: +- `experiment_directory` (containing a list of iter_{} subdirectories) +- `repo_name`: name of the repo to push to. +Convert all intermediate checkpoints from a `experiment_directory` and push to a single repo `repo_name` on the hub. +Each intermediate checkpoint is associated a different branch (for example) corresponding to its iteration. +This script could use `{experiment_directory}/hf_checkpoints/iter_{}` as local path to save the converted checkpoints, and push those for example. """ From 66beabe5439fa9af503b19231dd66c3d44d0468f Mon Sep 17 00:00:00 2001 From: leandro Date: Fri, 2 Dec 2022 13:56:25 +0100 Subject: [PATCH 059/144] add push to hub logic --- tools/hf_transformers/push_checkpoints.py | 35 +++++++++++++++++------ 1 file changed, 27 insertions(+), 8 deletions(-) diff --git a/tools/hf_transformers/push_checkpoints.py b/tools/hf_transformers/push_checkpoints.py index 1a35a5a1d5..82b4d48fe0 100644 --- a/tools/hf_transformers/push_checkpoints.py +++ b/tools/hf_transformers/push_checkpoints.py @@ -1,10 +1,29 @@ -""" -Arguments: -- `experiment_directory` (containing a list of iter_{} subdirectories) -- `repo_name`: name of the repo to push to. -Convert all intermediate checkpoints from a `experiment_directory` and push to a single repo `repo_name` on the hub. -Each intermediate checkpoint is associated a different branch (for example) corresponding to its iteration. -This script could use `{experiment_directory}/hf_checkpoints/iter_{}` as local path to save the converted checkpoints, and push those for example. 
-""" +from huggingface_hub import Repository +from pathlib import Path +import subprocess +import argparse +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--save_dir", type=str, help="Path where repository is cloned to locally.") + parser.add_argument("--exp_dir", type=str, help="Path to experiment folder.") + parser.add_argument("--repo_name", type=str, help="Name of repository on the Hub in 'ORG/NAME' format.") + parser.add_argument("--branch_name", type=str, help="Name of branch in repository to save experiments.") + args = parser.parse_args() + + hf_repo = Repository(args.save_dir, clone_from=args.repo_name) + hf_repo.git_checkout(args.branch_name, create_branch_ok=True) + + all_ckpt_dir = Path(args.exp_dir) / "hf_checkpoints/" + ckpt_dirs = [x for x in all_ckpt_dir.iterdir() if x.name.startswith("iter_") and x.is_dir()] + # TODO: some sorting necessary? they `ckpt_dirs` should be in ascending order + + for ckpt_dir in ckpt_dirs: + file_path = next(ckpt_dir.glob('*.pt')) + # TODO: if we format convert_checkpoint.py such that the main logic is in a function with args instead of argparse we can avoid using the `suprocess` and import the function instead + subprocess.Popen(["python", "convert_checkpoint.py", "--path_to_checkpoint", file_path, "--output-dir", args.save_dir]) + hf_repo.push_to_hub(commit_message=f"{ckpt_dir.name}") + +if __name__ == "__main__": + main() \ No newline at end of file From de83476581493c21362275e9cb2b60e498e71fbc Mon Sep 17 00:00:00 2001 From: leandro Date: Fri, 2 Dec 2022 13:58:47 +0100 Subject: [PATCH 060/144] add docs --- tools/hf_transformers/push_checkpoints.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tools/hf_transformers/push_checkpoints.py b/tools/hf_transformers/push_checkpoints.py index 82b4d48fe0..4ffc3b71c6 100644 --- a/tools/hf_transformers/push_checkpoints.py +++ b/tools/hf_transformers/push_checkpoints.py @@ -4,6 +4,14 @@ import argparse +""" +Script to upload Megatron checkpoints to a HF repo on the Hub. + +The script clones/creates a repo on the Hub, checks out a branch `--branch_name`, +and converts each `iter_` checkpoint and saves it as a commit on that branch. +""" + + def main(): parser = argparse.ArgumentParser() parser.add_argument("--save_dir", type=str, help="Path where repository is cloned to locally.") From 1b7c96f4e6b14994255a9eb8f0dc7208723af146 Mon Sep 17 00:00:00 2001 From: Raymond Li Date: Fri, 2 Dec 2022 11:03:17 -0500 Subject: [PATCH 061/144] convert_checkpoint as function, push starting from last pushed iteration --- tools/hf_transformers/convert_checkpoint.py | 48 ++++++++++----------- tools/hf_transformers/push_checkpoints.py | 40 +++++++++++++---- 2 files changed, 56 insertions(+), 32 deletions(-) diff --git a/tools/hf_transformers/convert_checkpoint.py b/tools/hf_transformers/convert_checkpoint.py index 3264381ea7..08b73a61aa 100644 --- a/tools/hf_transformers/convert_checkpoint.py +++ b/tools/hf_transformers/convert_checkpoint.py @@ -12,7 +12,7 @@ #################################################################################################### -def convert_megatron_checkpoint(args, input_state_dict, config): +def convert_megatron_checkpoint(input_state_dict, config): # The converted output model. output_state_dict = {} @@ -216,28 +216,13 @@ def convert_megatron_checkpoint(args, input_state_dict, config): #################################################################################################### -def main(): - # Create the argument parser. 
- parser = argparse.ArgumentParser() - parser.add_argument("--print-checkpoint-structure", action="store_true") - parser.add_argument( - "--path_to_checkpoint", - type=str, - help="Path to the `.pt` checkpoint file", - ) - parser.add_argument( - "--output-dir", - type=str, - help="Ouptut directory where HF checkpoint will be written", - ) - args = parser.parse_args() - - os.makedirs(args.output_dir, exist_ok=True) +def main(path_to_checkpoint, output_dir, print_checkpoint_structure): + os.makedirs(output_dir, exist_ok=True) # Load the model. # the .zip is very optional, let's keep it for backward compatibility - print(f"Extracting PyTorch state dictionary from {args.path_to_checkpoint}") - input_state_dict = torch.load(args.path_to_checkpoint, map_location="cpu") + print(f"Extracting PyTorch state dictionary from {path_to_checkpoint}") + input_state_dict = torch.load(path_to_checkpoint, map_location="cpu") ds_args = input_state_dict.get("args", None) @@ -284,10 +269,10 @@ def main(): # Convert. print("Converting") - output_state_dict = convert_megatron_checkpoint(args, input_state_dict, config) + output_state_dict = convert_megatron_checkpoint(input_state_dict, config) # Print the structure of converted state dict. - if args.print_checkpoint_structure: + if print_checkpoint_structure: recursive_print(None, output_state_dict) # Add tokenizer class info to config @@ -312,7 +297,7 @@ def main(): GPT2LMHeadCustomModel.register_for_auto_class("AutoModelForCausalLM") hf_model = GPT2LMHeadCustomModel(config) hf_model.load_state_dict(output_state_dict) - hf_model.save_pretrained(args.output_dir) + hf_model.save_pretrained(output_dir) # Store the state_dict to file. # print(f'Saving checkpoint to "{output_checkpoint_file}"') @@ -320,4 +305,19 @@ def main(): if __name__ == "__main__": - main() + # Create the argument parser. + parser = argparse.ArgumentParser() + parser.add_argument("--print-checkpoint-structure", action="store_true") + parser.add_argument( + "--path_to_checkpoint", + type=str, + help="Path to the `.pt` checkpoint file", + ) + parser.add_argument( + "--output-dir", + type=str, + help="Ouptut directory where HF checkpoint will be written", + ) + args = parser.parse_args() + + main(args.path_to_checkpoint, args.output_dir, args.print_checkpoint_structure) diff --git a/tools/hf_transformers/push_checkpoints.py b/tools/hf_transformers/push_checkpoints.py index 4ffc3b71c6..e7ec5723cf 100644 --- a/tools/hf_transformers/push_checkpoints.py +++ b/tools/hf_transformers/push_checkpoints.py @@ -1,8 +1,12 @@ +import re from huggingface_hub import Repository from pathlib import Path import subprocess import argparse +import tools.hf_transformers.convert_checkpoint + + """ Script to upload Megatron checkpoints to a HF repo on the Hub. @@ -11,26 +15,46 @@ and converts each `iter_` checkpoint and saves it as a commit on that branch. """ +def get_iter_number(iter_dir: str): + m = re.match(r'iter_(\d+)', iter_dir) + if m is not None: + return int(m.group(1)) + else: + raise ValueError(f"Invalid directory name: {iter_dir}") + def main(): parser = argparse.ArgumentParser() - parser.add_argument("--save_dir", type=str, help="Path where repository is cloned to locally.") + parser.add_argument("--save_dir", type=str, default=None, help="Path where repository is cloned to locally. 
Will use {exp_dir}/hf_checkpoints if not provided") parser.add_argument("--exp_dir", type=str, help="Path to experiment folder.") parser.add_argument("--repo_name", type=str, help="Name of repository on the Hub in 'ORG/NAME' format.") parser.add_argument("--branch_name", type=str, help="Name of branch in repository to save experiments.") args = parser.parse_args() - hf_repo = Repository(args.save_dir, clone_from=args.repo_name) + all_ckpt_dir = Path(args.exp_dir) + save_dir = args.save_dir if args.save_dir is not None else all_ckpt_dir / "hf_checkpoints" + + hf_repo = Repository(save_dir, clone_from=args.repo_name) hf_repo.git_checkout(args.branch_name, create_branch_ok=True) + # Find last checkpoint that was uploaded + head_hash = hf_repo.git_head_hash() + commit_msg = subprocess.check_output(["git", "show", "-s", "--format=%B", head_hash], cwd=save_dir).decode() + try: + last_uploaded_iter = get_iter_number(commit_msg.strip()) + except ValueError: + last_uploaded_iter = -1 - all_ckpt_dir = Path(args.exp_dir) / "hf_checkpoints/" - ckpt_dirs = [x for x in all_ckpt_dir.iterdir() if x.name.startswith("iter_") and x.is_dir()] - # TODO: some sorting necessary? they `ckpt_dirs` should be in ascending order + # The checkpoint dirs should be in ascending iteration order, so that the last commit corresponds to the latest checkpoint + ckpt_dirs = sorted([x for x in all_ckpt_dir.iterdir() if x.name.startswith("iter_") and x.is_dir()]) for ckpt_dir in ckpt_dirs: - file_path = next(ckpt_dir.glob('*.pt')) - # TODO: if we format convert_checkpoint.py such that the main logic is in a function with args instead of argparse we can avoid using the `suprocess` and import the function instead - subprocess.Popen(["python", "convert_checkpoint.py", "--path_to_checkpoint", file_path, "--output-dir", args.save_dir]) + iter_number = get_iter_number(ckpt_dir.name) + if iter_number <= last_uploaded_iter: + print(f"Will skip iter: {iter_number}") + continue + # TODO: this only works for 1-way tensor/pipeline parallelism + file_path = next((ckpt_dir / "mp_rank_00").glob('*.pt')) + tools.hf_transformers.convert_checkpoint.main(path_to_checkpoint=file_path, output_dir=save_dir, print_checkpoint_structure=False) hf_repo.push_to_hub(commit_message=f"{ckpt_dir.name}") if __name__ == "__main__": From 5cb878fb7e16434eff03533327c0f376fd62ed2c Mon Sep 17 00:00:00 2001 From: Raymond Li Date: Fri, 2 Dec 2022 11:18:28 -0500 Subject: [PATCH 062/144] add iter_interval argument --- tools/hf_transformers/push_checkpoints.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/tools/hf_transformers/push_checkpoints.py b/tools/hf_transformers/push_checkpoints.py index e7ec5723cf..41719b6b79 100644 --- a/tools/hf_transformers/push_checkpoints.py +++ b/tools/hf_transformers/push_checkpoints.py @@ -29,6 +29,7 @@ def main(): parser.add_argument("--exp_dir", type=str, help="Path to experiment folder.") parser.add_argument("--repo_name", type=str, help="Name of repository on the Hub in 'ORG/NAME' format.") parser.add_argument("--branch_name", type=str, help="Name of branch in repository to save experiments.") + parser.add_argument("--iter_interval", type=int, default=1, help="Iteration number must be divisble by iter_interval in order to be pushed") args = parser.parse_args() all_ckpt_dir = Path(args.exp_dir) @@ -40,22 +41,24 @@ def main(): head_hash = hf_repo.git_head_hash() commit_msg = subprocess.check_output(["git", "show", "-s", "--format=%B", head_hash], cwd=save_dir).decode() try: - last_uploaded_iter 
= get_iter_number(commit_msg.strip()) + last_commit_iter = get_iter_number(commit_msg.strip()) + print(f"Last commit iteration: {last_commit_iter}") except ValueError: - last_uploaded_iter = -1 + last_commit_iter = -1 # The checkpoint dirs should be in ascending iteration order, so that the last commit corresponds to the latest checkpoint ckpt_dirs = sorted([x for x in all_ckpt_dir.iterdir() if x.name.startswith("iter_") and x.is_dir()]) for ckpt_dir in ckpt_dirs: iter_number = get_iter_number(ckpt_dir.name) - if iter_number <= last_uploaded_iter: - print(f"Will skip iter: {iter_number}") + if iter_number <= last_commit_iter: continue - # TODO: this only works for 1-way tensor/pipeline parallelism - file_path = next((ckpt_dir / "mp_rank_00").glob('*.pt')) - tools.hf_transformers.convert_checkpoint.main(path_to_checkpoint=file_path, output_dir=save_dir, print_checkpoint_structure=False) - hf_repo.push_to_hub(commit_message=f"{ckpt_dir.name}") + if iter_number % args.iter_interval == 0: + print(f"Will convert and push iteration {iter_number}") + # TODO: this only works for 1-way tensor/pipeline parallelism + file_path = next((ckpt_dir / "mp_rank_00").glob('*.pt')) + tools.hf_transformers.convert_checkpoint.main(path_to_checkpoint=file_path, output_dir=save_dir, print_checkpoint_structure=False) + hf_repo.push_to_hub(commit_message=f"{ckpt_dir.name}") if __name__ == "__main__": main() \ No newline at end of file From ab1c4cc66d8157042bd771c1d94f4d05f8e5254e Mon Sep 17 00:00:00 2001 From: Raymond Li Date: Thu, 8 Dec 2022 15:23:52 -0500 Subject: [PATCH 063/144] use relative imports in modeling file --- tools/hf_transformers/convert_checkpoint.py | 28 ++++++++++----------- tools/hf_transformers/modeling_gpt2_mq.py | 2 +- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/tools/hf_transformers/convert_checkpoint.py b/tools/hf_transformers/convert_checkpoint.py index 08b73a61aa..76670ee117 100644 --- a/tools/hf_transformers/convert_checkpoint.py +++ b/tools/hf_transformers/convert_checkpoint.py @@ -277,20 +277,20 @@ def main(path_to_checkpoint, output_dir, print_checkpoint_structure): # Add tokenizer class info to config # see https://github.com/huggingface/transformers/issues/13906) - if ds_args is not None: - tokenizer_type = ds_args.tokenizer_type - if tokenizer_type == "GPT2BPETokenizer": - tokenizer_model_name = "gpt2" - elif tokenizer_type == "PretrainedFromHF": - tokenizer_model_name = ds_args.tokenizer_name_or_path - else: - raise ValueError(f"Unrecognized tokenizer_type {tokenizer_type}") - else: - tokenizer_model_name = "gpt2" - - tokenizer = AutoTokenizer.from_pretrained(tokenizer_model_name) - tokenizer_class = type(tokenizer).__name__ - config.tokenizer_class = tokenizer_class + # if ds_args is not None: + # tokenizer_type = ds_args.tokenizer_type + # if tokenizer_type == "GPT2BPETokenizer": + # tokenizer_model_name = "gpt2" + # elif tokenizer_type == "PretrainedFromHF": + # tokenizer_model_name = ds_args.tokenizer_name_or_path + # else: + # raise ValueError(f"Unrecognized tokenizer_type {tokenizer_type}") + # else: + # tokenizer_model_name = "gpt2" + + # tokenizer = AutoTokenizer.from_pretrained(tokenizer_model_name) + # tokenizer_class = type(tokenizer).__name__ + # config.tokenizer_class = tokenizer_class # Save custom model GPT2CustomConfig.register_for_auto_class() diff --git a/tools/hf_transformers/modeling_gpt2_mq.py b/tools/hf_transformers/modeling_gpt2_mq.py index 2669601f77..c685656262 100644 --- a/tools/hf_transformers/modeling_gpt2_mq.py +++ 
b/tools/hf_transformers/modeling_gpt2_mq.py @@ -32,7 +32,7 @@ ) from transformers.utils.model_parallel_utils import assert_device_map, get_device_map from transformers.models.gpt2.modeling_gpt2 import GPT2Model, GPT2Block, GPT2PreTrainedModel, GPT2LMHeadModel -from tools.hf_transformers.configuration_gpt2_mq import GPT2CustomConfig, MULTI_QUERY, MULTI_HEAD +from .configuration_gpt2_mq import GPT2CustomConfig, MULTI_QUERY, MULTI_HEAD From 93461dc1994ef7426ef7aecd20dec568c57a5599 Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Fri, 9 Dec 2022 17:03:39 -0500 Subject: [PATCH 064/144] Fixes for MQA (#12) --- megatron/model/transformer.py | 6 ++++- megatron/optimizer/distrib_optimizer.py | 6 +++++ megatron/optimizer/optimizer.py | 34 ++++++++++++++++--------- 3 files changed, 33 insertions(+), 13 deletions(-) diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index 57d6d9fde2..207c44d081 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -608,8 +608,12 @@ def forward(self, hidden_states, attention_mask, key_layer, value_layer) = mpu.split_tensor_along_last_dim(mixed_x_layer, 3) elif self.attention_type == AttnType.self_attn and self.attention_head_type == 'multiquery': + kv_input=hidden_states + if get_args().sequence_parallel: + # The linear layer doesn't gather the sequence-parallel. + kv_input = mpu.gather_from_sequence_parallel_region(kv_input, tensor_parallel_output_grad=False) # Attention heads [sq, b, h] --> [sq, b, (2 * hn)] - mixed_kv_layer = self.key_value(hidden_states) + mixed_kv_layer = self.key_value(kv_input) # [sq, b, (2 * hn)] --> [sq, b, np (expanded), 2 * hn] # new_tensor_shape = mixed_kv_layer.size()[:-1] + \ diff --git a/megatron/optimizer/distrib_optimizer.py b/megatron/optimizer/distrib_optimizer.py index 331f7846cd..fa3712d914 100644 --- a/megatron/optimizer/distrib_optimizer.py +++ b/megatron/optimizer/distrib_optimizer.py @@ -541,6 +541,12 @@ def reduce_model_grads(self, args, timers): self.allreduce_embedding_grads(args) timers('backward-embedding-all-reduce').stop() + # All-reduce key-value grads if needed. + if args.attention_head_type == "multiquery": + timers('backward-key-value-all-reduce').start() + self.allreduce_key_value_grads(args) + timers('backward-key-value-all-reduce').stop() + # Reduce-scatter setup. timers('backward-params-all-reduce').start() data_parallel_rank = mpu.get_data_parallel_rank() diff --git a/megatron/optimizer/optimizer.py b/megatron/optimizer/optimizer.py index 6e83e6592d..32e5d1d690 100644 --- a/megatron/optimizer/optimizer.py +++ b/megatron/optimizer/optimizer.py @@ -267,18 +267,28 @@ def allreduce_embedding_grads(self, args): self.allreduce_position_embedding_grads(args) def allreduce_key_value_grads(self, args): - # TODO: models[0] ? - unwrapped_model = self.models[0] - unwrapped_model = unwrap_model( - unwrapped_model, (torchDDP, LocalDDP, Float16Module)) - for layer in unwrapped_model.language_model.encoder.layers: - kv_weight = layer.self_attention.key_value.weight - if args.DDP_impl == 'local': - grad = kv_weight.main_grad - else: - grad = kv_weight.grad - torch.distributed.all_reduce(grad, group=mpu.get_tensor_model_parallel_group()) - + """ + Reduce the gradients for the key_value weights and biases for multi-query attention. + Coalesce the bias grads to avoid too many small reductions, + but not the weight grads since it could cause memory issues. 
+ """ + grads=[] + for model_module in self.models: + unwrapped_model = unwrap_model( + model_module, (torchDDP, LocalDDP, Float16Module)) + for layer in unwrapped_model.language_model.encoder.layers: + kv_weight = layer.self_attention.key_value.weight + grad = kv_weight.main_grad if args.DDP_impl == 'local' else kv_weight.grad + torch.distributed.all_reduce(grad, group=mpu.get_tensor_model_parallel_group()) + kv_bias = layer.self_attention.key_value.bias + grads.append(kv_bias.main_grad if args.DDP_impl == 'local' else kv_bias.grad) + if len(grads)>0: + coalesced = _flatten_dense_tensors(grads) + torch.distributed.all_reduce( + coalesced, group=mpu.get_tensor_model_parallel_group()) + for buf, synced in zip(grads, _unflatten_dense_tensors( + coalesced, grads)): + buf.copy_(synced) def allreduce_layernorm_grads(self, args): """All-reduce layernorm grads (for sequence parallelism).""" From 63c6fbcc7dae7b526c42d04856eeb1f4675f9290 Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Fri, 9 Dec 2022 23:38:10 -0500 Subject: [PATCH 065/144] Run with toolkit --- megatron/arguments.py | 7 +++++ megatron/initialize.py | 67 +++++++++++++++++++++++++++++++++++++++++- 2 files changed, 73 insertions(+), 1 deletion(-) diff --git a/megatron/arguments.py b/megatron/arguments.py index 25c0995ebb..25ae8d0344 100644 --- a/megatron/arguments.py +++ b/megatron/arguments.py @@ -488,6 +488,10 @@ def _add_logging_args(parser): group.add_argument('--transformer-timers', action='store_true', help="If set, activate the timers within the transformer layers." "Only for debugging, as this slows down the model.") + group.add_argument('--structured-logs', action="store_true", + help='Add timestamp and worker name to stdout and stderr.') + group.add_argument('--structured-logs-dir', type=str, default=None, + help='Directory to save the logs.') return parser @@ -836,6 +840,9 @@ def _add_distributed_args(parser): 'affects the encoder embedding.)') group.add_argument('--use-distributed-optimizer', action='store_true', help='Use distributed optimizer.') + group.add_argument('--distributed-timeout', default=600, type=float, + help='Timeout for distributed operations, in seconds. ' + 'Should be at least as high as the dataset preprocessing ans checkpoint saving times.') return parser diff --git a/megatron/initialize.py b/megatron/initialize.py index 5104964e99..2e85861f53 100644 --- a/megatron/initialize.py +++ b/megatron/initialize.py @@ -15,6 +15,8 @@ """Megatron initialization.""" +import logging +import logging.config import random import os import time @@ -74,6 +76,7 @@ def finish_mpu_init(): args = get_args() # Pytorch distributed. _initialize_distributed() + _configure_logging() # Random seeds for reproducibility. 
if args.rank == 0: @@ -104,6 +107,51 @@ def finish_mpu_init(): return None +def _configure_logging(): + args=get_args() + if not args.structured_logs: + return + rank = torch.distributed.get_rank() + + logging_config = { + "version": 1, + "disable_existing_loggers": False, + "formatters": { + "default": { + "format": f"%(asctime)s [Rank {rank}]: %(message)s", + "use_colors": True, + } + }, + "handlers": { + "default": { + "level": "INFO", + "formatter": "default", + "class": "logging.StreamHandler", + "stream": "ext://sys.stdout", + } + }, + "loggers": {"default": {"level": "DEBUG", "handlers": ["default"]}}, + "root": {"handlers": ["default"], "level": "INFO"}, + } + if args.structured_logs_dir is not None: + log_dir=args.structured_logs_dir + os.makedirs(log_dir, exist_ok=True) + logging_config["handlers"]["file"] = { + "level": "INFO", + "formatter": "default", + "class": "logging.FileHandler", + "filename": os.path.join(log_dir, f"logs_rank_{rank}.txt"), + } + logging_config["root"]["handlers"].append("file") + logging_config["loggers"]["default"]["handlers"].append("file") + logging.config.dictConfig(logging_config) + + # Add these methods so that stdout can be redirected to logging. + logging.write = lambda msg: logging.info(msg) if msg != '\n' else None + logging.flush = lambda : None + + + def _compile_dependencies(): args = get_args() @@ -191,11 +239,28 @@ def _initialize_distributed(): torch.cuda.set_device(device) # Include this torch.distributed.init_process_group() code in the `else` branch because # we do not want to reinitialize if torch.distributed.is_initialized() returns True + if "AGENT_STORE_HOST" in os.environ: + # This allows creating the store on an external server. + # This is needed when the master host address is not known in advance. + # Does not handle torch elastic restarts. + store = torch.distributed.TCPStore( + host_name=os.environ["AGENT_STORE_HOST"], + port=int(os.environ["AGENT_STORE_PORT"]), + world_size=0, + is_master=False, + timeout=timedelta(seconds=float(os.environ["AGENT_STORE_TIMEOUT"])) + if "AGENT_STORE_TIMEOUT" in os.environ else None, + ) + else: + store=None + # Call the init process torch.distributed.init_process_group( backend=args.distributed_backend, world_size=args.world_size, rank=args.rank, - timeout=timedelta(minutes=10)) + timeout=timedelta(seconds=args.distributed_timeout), + store=store, + ) # Set the tensor model-parallel, pipeline model-parallel, and # data-parallel communicators. 
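The structured-logs setup added in the patch above boils down to a per-rank `logging.config.dictConfig` with a timestamped formatter and an optional per-rank file handler. The snippet below is a minimal standalone sketch of that idea, not the in-tree code path: `configure_rank_logging` is a hypothetical helper name, and the rank is read from the `RANK` environment variable instead of `torch.distributed.get_rank()`. The redirection of `sys.stdout`/`sys.stderr` into these handlers is added separately in the next patch below.

```
import logging
import logging.config
import os
from typing import Optional


def configure_rank_logging(rank: int, log_dir: Optional[str] = None) -> None:
    """Per-rank logging: timestamped stdout handler plus an optional per-rank log file."""
    config = {
        "version": 1,
        "disable_existing_loggers": False,
        "formatters": {
            "default": {"format": f"%(asctime)s [Rank {rank}]: %(message)s"},
        },
        "handlers": {
            "stdout": {
                "class": "logging.StreamHandler",
                "level": "INFO",
                "formatter": "default",
                "stream": "ext://sys.stdout",
            },
        },
        "root": {"level": "INFO", "handlers": ["stdout"]},
    }
    if log_dir is not None:
        # Mirror the --structured-logs-dir behaviour: one file per rank.
        os.makedirs(log_dir, exist_ok=True)
        config["handlers"]["file"] = {
            "class": "logging.FileHandler",
            "level": "INFO",
            "formatter": "default",
            "filename": os.path.join(log_dir, f"logs_rank_{rank}.txt"),
        }
        config["root"]["handlers"].append("file")
    logging.config.dictConfig(config)


if __name__ == "__main__":
    # Rank is taken from the environment here; the in-tree code asks torch.distributed instead.
    configure_rank_logging(int(os.environ.get("RANK", "0")))
    logging.info("structured logging ready")
```
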
From 92e2ca2ec19d97d1787977a4c13861d489143ac1 Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Tue, 13 Dec 2022 06:34:25 -0500 Subject: [PATCH 066/144] Redirect output to logs --- megatron/initialize.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/megatron/initialize.py b/megatron/initialize.py index 2e85861f53..99bb4afbd6 100644 --- a/megatron/initialize.py +++ b/megatron/initialize.py @@ -19,6 +19,7 @@ import logging.config import random import os +import sys import time import numpy as np @@ -150,6 +151,9 @@ def _configure_logging(): logging.write = lambda msg: logging.info(msg) if msg != '\n' else None logging.flush = lambda : None + sys.stdout=logging + sys.stderr=logging + def _compile_dependencies(): From e2c2c2bfa546323c2318f69e4747ca19346fcdb5 Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Tue, 13 Dec 2022 07:37:33 -0500 Subject: [PATCH 067/144] Remove store --- megatron/initialize.py | 19 +------------------ 1 file changed, 1 insertion(+), 18 deletions(-) diff --git a/megatron/initialize.py b/megatron/initialize.py index 99bb4afbd6..bcbca6baa1 100644 --- a/megatron/initialize.py +++ b/megatron/initialize.py @@ -243,28 +243,11 @@ def _initialize_distributed(): torch.cuda.set_device(device) # Include this torch.distributed.init_process_group() code in the `else` branch because # we do not want to reinitialize if torch.distributed.is_initialized() returns True - if "AGENT_STORE_HOST" in os.environ: - # This allows creating the store on an external server. - # This is needed when the master host address is not known in advance. - # Does not handle torch elastic restarts. - store = torch.distributed.TCPStore( - host_name=os.environ["AGENT_STORE_HOST"], - port=int(os.environ["AGENT_STORE_PORT"]), - world_size=0, - is_master=False, - timeout=timedelta(seconds=float(os.environ["AGENT_STORE_TIMEOUT"])) - if "AGENT_STORE_TIMEOUT" in os.environ else None, - ) - else: - store=None - # Call the init process torch.distributed.init_process_group( backend=args.distributed_backend, world_size=args.world_size, rank=args.rank, - timeout=timedelta(seconds=args.distributed_timeout), - store=store, - ) + timeout=timedelta(seconds=args.distributed_timeout)) # Set the tensor model-parallel, pipeline model-parallel, and # data-parallel communicators. 
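Patch 066 above routes `print` output through the logging handlers by attaching `write`/`flush` attributes to the `logging` module itself and assigning it to `sys.stdout`/`sys.stderr`. A small file-like wrapper achieves the same effect a bit more explicitly; the sketch below is only an illustrative alternative, not code from this repository, and `LoggingWriter` is a made-up name.

```
import logging
import sys


class LoggingWriter:
    """File-like shim that forwards writes to a logger, so print() output
    ends up in the same handlers as regular log records."""

    def __init__(self, logger: logging.Logger, level: int = logging.INFO):
        self.logger = logger
        self.level = level

    def write(self, message: str) -> None:
        # print() issues a separate write call for the trailing newline; skip it.
        if message != "\n":
            self.logger.log(self.level, message.rstrip("\n"))

    def flush(self) -> None:
        # Nothing is buffered here; the logging handlers flush themselves.
        pass


if __name__ == "__main__":
    # basicConfig's default handler writes to stderr, so redirecting stdout cannot recurse.
    logging.basicConfig(level=logging.INFO, format="%(asctime)s: %(message)s")
    sys.stdout = LoggingWriter(logging.getLogger("stdout"))
    print("this line now goes through the logging handlers")
```
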
From 60fbd1d67d2ae5be512683d5eafecf54a0505b94 Mon Sep 17 00:00:00 2001 From: Raymond Li Date: Thu, 15 Dec 2022 10:12:14 -0500 Subject: [PATCH 068/144] update readme --- tools/hf_transformers/README.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tools/hf_transformers/README.md b/tools/hf_transformers/README.md index 56e5efdc34..ca67a19d74 100644 --- a/tools/hf_transformers/README.md +++ b/tools/hf_transformers/README.md @@ -7,4 +7,9 @@ Convert a megatron checkpoint to a HF-transformer checkpoint that can be directl python -m tools.hf_transformers.convert_checkpoint --path_to_checkpoint /checkpoint_dir/iter_{num_iter}/mp_rank_00/model_optim_rng.pt --output-dir /checkpoint_dir/hf_checkpoints/iter_{num_iter} ``` +Convert all checkpoints and push them to the hub: +``` +python -m tools.hf_transformers.push_checkpoints --exp_dir /path/to/experiment --repo_name org/repo --branch_name main --iter_interval 20000" +``` + From 732396a8700685ceefd0debb4ffd71aaa75a211a Mon Sep 17 00:00:00 2001 From: Raymond Li Date: Tue, 24 Jan 2023 19:10:52 +0000 Subject: [PATCH 069/144] remove debug prints --- megatron/checkpointing.py | 1 - megatron/fused_kernels/tests/test_fused_kernels.py | 3 --- 2 files changed, 4 deletions(-) diff --git a/megatron/checkpointing.py b/megatron/checkpointing.py index 7d293ce8c8..5e63dfd63b 100644 --- a/megatron/checkpointing.py +++ b/megatron/checkpointing.py @@ -292,7 +292,6 @@ def save_checkpoint(iteration, model, optimizer, opt_param_scheduler): # Save model and optimizer together. state_dict = {**model_state_dict, **optim_state_dict} if state_dict: # only saves if populated (i.e., inherits conditions above) - print("Trying to save ...", flush=True) ensure_directory_exists(model_checkpoint_name) torch.save(state_dict, model_checkpoint_name) diff --git a/megatron/fused_kernels/tests/test_fused_kernels.py b/megatron/fused_kernels/tests/test_fused_kernels.py index f1a6a2a4a5..524ce6f0ea 100644 --- a/megatron/fused_kernels/tests/test_fused_kernels.py +++ b/megatron/fused_kernels/tests/test_fused_kernels.py @@ -34,7 +34,6 @@ def test_fused_softmax(): [test_text] * 4, return_tensors="pt", ) - print("tokens :", tokens["input_ids"].shape) embedding_output = bert.embeddings( input_ids=tokens["input_ids"].cuda(), @@ -133,7 +132,6 @@ def test_fused_upper_triangle_mask_softmax(): [test_text] * 4, return_tensors="pt", ) - print("tokens :", tokens["input_ids"].shape) attention_mask = tokens["attention_mask"].cuda() attention_mask = attention_mask.view(attention_mask.size(0), -1) @@ -234,7 +232,6 @@ def test_layer_norm(): [test_text] * 4, return_tensors="pt", ) - print("tokens :", tokens["input_ids"].shape) # [bsz, seq_len, d_model] embedding_output = ( From 9d80f8a2c60ef7c9a6845fa7e2e26cf38da9dc6d Mon Sep 17 00:00:00 2001 From: Raymond Li Date: Tue, 24 Jan 2023 19:12:51 +0000 Subject: [PATCH 070/144] more precise error for attention_type/head_type values --- megatron/model/transformer.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index 57d6d9fde2..8c124cba3b 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -522,8 +522,10 @@ def __init__(self, init_method, 2 * projection_size, gather_output=False, init_method=init_method) - else: + elif attention_type == AttnType.cross_attn and self.attention_head_type == 'multiquery': raise NotImplementedError("Multiquery attention not implemented for cross-attention.") + else: + raise ValueError(f"Invalid attention arguments: 
{attention_type}, {self.attention_head_type}") if self.attention_head_type == 'multihead': self.core_attention = CoreAttention(self.layer_number, From 7457e328b913e2be7c3d0ca3329f3ee54c227aeb Mon Sep 17 00:00:00 2001 From: Raymond Li Date: Tue, 24 Jan 2023 14:40:03 -0500 Subject: [PATCH 071/144] attention-head-type defaults to multihead again to avoid breaking previous configs --- megatron/arguments.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/arguments.py b/megatron/arguments.py index fa0a16cfc0..3aad6d23e4 100644 --- a/megatron/arguments.py +++ b/megatron/arguments.py @@ -405,7 +405,7 @@ def _add_network_size_args(parser): 'attention. This is set to ' ' args.hidden_size // args.num_attention_heads ' 'if not provided.') - group.add_argument('--attention-head-type', type=str, default=None, + group.add_argument('--attention-head-type', type=str, default='multihead', choices=['multihead', 'multiquery'], help='Type of attention heads. `multihead` is the standard multi-head attention.' '`multiquery` shares the values and keys across attention heads') From cdbcfc9203aaf32afdb7b8a852e8370d63e91bb2 Mon Sep 17 00:00:00 2001 From: Raymond Li Date: Mon, 30 Jan 2023 15:51:12 -0500 Subject: [PATCH 072/144] documentation on the --tokenizer-file argument --- megatron/arguments.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/arguments.py b/megatron/arguments.py index 3aad6d23e4..e274e25e6d 100644 --- a/megatron/arguments.py +++ b/megatron/arguments.py @@ -871,7 +871,7 @@ def _add_data_args(parser): group.add_argument('--merge-file', type=str, default=None, help='Path to the BPE merge file.') group.add_argument('--tokenizer-file', type=str, default=None, - help='Path to the tokenizer.json file.') + help='Path to the tokenizer.json file. Used for the TokenizerFromFile[...] tokenizers') group.add_argument('--vocab-extra-ids', type=int, default=0, help='Number of additional vocabulary tokens. 
' 'They are used for span masking in the T5 model') From 94306d15dca04f1837773883e8a6b956af8da8ea Mon Sep 17 00:00:00 2001 From: Raymond Li Date: Mon, 30 Jan 2023 15:53:29 -0500 Subject: [PATCH 073/144] add missing newlines --- tools/hf_transformers/modeling_gpt2_mq.py | 2 +- tools/hf_transformers/push_checkpoints.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/hf_transformers/modeling_gpt2_mq.py b/tools/hf_transformers/modeling_gpt2_mq.py index c685656262..0c56af9ef2 100644 --- a/tools/hf_transformers/modeling_gpt2_mq.py +++ b/tools/hf_transformers/modeling_gpt2_mq.py @@ -343,4 +343,4 @@ def __init__(self, config): self.device_map = None # Initialize weights and apply final processing - self.post_init() \ No newline at end of file + self.post_init() diff --git a/tools/hf_transformers/push_checkpoints.py b/tools/hf_transformers/push_checkpoints.py index 41719b6b79..865a4e1c24 100644 --- a/tools/hf_transformers/push_checkpoints.py +++ b/tools/hf_transformers/push_checkpoints.py @@ -61,4 +61,4 @@ def main(): hf_repo.push_to_hub(commit_message=f"{ckpt_dir.name}") if __name__ == "__main__": - main() \ No newline at end of file + main() From 506fbd450617b0bdf30e5b24c7480667fb0940e2 Mon Sep 17 00:00:00 2001 From: Raymond Li Date: Mon, 30 Jan 2023 16:00:42 -0500 Subject: [PATCH 074/144] revert barrier() to torch.distributed.barrier() --- megatron/checkpointing.py | 7 +++---- megatron/initialize.py | 11 +++++------ megatron/training.py | 13 ++++++------- megatron/utils.py | 13 +------------ 4 files changed, 15 insertions(+), 29 deletions(-) diff --git a/megatron/checkpointing.py b/megatron/checkpointing.py index 5e63dfd63b..c3359ed18c 100644 --- a/megatron/checkpointing.py +++ b/megatron/checkpointing.py @@ -28,7 +28,6 @@ from .utils import (unwrap_model, print_rank_0) from megatron.model.enums import PositionEmbeddingType -from megatron.utils import barrier _CHECKPOINT_VERSION = None @@ -297,7 +296,7 @@ def save_checkpoint(iteration, model, optimizer, opt_param_scheduler): # Wait so everyone is done (necessary) if torch.distributed.is_initialized(): - barrier() + torch.distributed.barrier() print_rank_0(' successfully saved checkpoint at iteration {:7d} to {}'.format( iteration, args.save)) @@ -310,7 +309,7 @@ def save_checkpoint(iteration, model, optimizer, opt_param_scheduler): # Wait so everyone is done (not necessary) if torch.distributed.is_initialized(): - barrier() + torch.distributed.barrier() def _transpose_first_dim(t, num_splits, num_splits_first, model): input_shape = t.size() @@ -679,7 +678,7 @@ def load_checkpoint(model, optimizer, opt_param_scheduler, load_arg='load', stri # Some utilities want to load a checkpoint without distributed being initialized if torch.distributed.is_initialized(): - barrier() + torch.distributed.barrier() print_rank_0(f' successfully loaded checkpoint from {load_dir} ' f'at iteration {iteration}') diff --git a/megatron/initialize.py b/megatron/initialize.py index df8738a7cd..7333c2e0e6 100644 --- a/megatron/initialize.py +++ b/megatron/initialize.py @@ -40,7 +40,6 @@ set_tensor_model_parallel_world_size) from megatron.model.transformer import bias_dropout_add_fused_train from megatron.model.fused_bias_gelu import bias_gelu -from megatron.utils import barrier def initialize_megatron(extra_args_provider=None, args_defaults={}, @@ -148,15 +147,15 @@ def _compile_dependencies(): start_time = time.time() print('> compiling and loading fused kernels ...', flush=True) fused_kernels.load(args) - barrier() + 
torch.distributed.barrier() else: - barrier() + torch.distributed.barrier() fused_kernels.load(args) # Simple barrier to make sure all ranks have passed the # compilation phase successfully before moving on to the # rest of the program. We think this might ensure that # the lock is released. - barrier() + torch.distributed.barrier() if torch.distributed.get_rank() == 0: print('>>> done with compiling and loading fused kernels. ' 'Compilation time: {:.3f} seconds'.format( @@ -214,9 +213,9 @@ def _init_autoresume(): """Set autoresume start time.""" autoresume = get_adlr_autoresume() if autoresume: - barrier() + torch.distributed.barrier() autoresume.init() - barrier() + torch.distributed.barrier() def _set_random_seed(seed_, data_parallel_random_init=False): diff --git a/megatron/training.py b/megatron/training.py index b7728cdc32..16d190472f 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -57,12 +57,11 @@ from megatron.schedules import get_forward_backward_func from megatron.utils import report_memory from megatron.model.vision.knn_monitor import compute_feature_bank -from megatron.utils import barrier def print_datetime(string): """Note that this call will sync across all ranks.""" - barrier() + torch.distributed.barrier() time_str = datetime.now().strftime('%Y-%m-%d %H:%M:%S') print_rank_0('[' + string + '] datetime: {} '.format(time_str)) @@ -381,10 +380,10 @@ def setup_model_and_optimizer(model_provider_func, timers = get_timers() # Extra barrier is added to make sure all ranks report the # max time. - barrier() + torch.distributed.barrier() timers('load-checkpoint').start() args.iteration = load_checkpoint(model, optimizer, opt_param_scheduler) - barrier() + torch.distributed.barrier() timers('load-checkpoint').stop() timers.log(['load-checkpoint']) # This is critical when only model is loaded. We should make sure @@ -673,10 +672,10 @@ def save_checkpoint_and_time(iteration, model, optimizer, opt_param_scheduler): timers = get_timers() # Extra barrier is added to make sure # all ranks report the max time. - barrier() + torch.distributed.barrier() timers('save-checkpoint').start() save_checkpoint(iteration, model, optimizer, opt_param_scheduler) - barrier() + torch.distributed.barrier() timers('save-checkpoint').stop() timers.log(['save-checkpoint']) @@ -783,7 +782,7 @@ def train(forward_step_func, model, optimizer, opt_param_scheduler, if not saved_checkpoint: save_checkpoint_and_time(iteration, model, optimizer, opt_param_scheduler) - barrier() + torch.distributed.barrier() print_datetime('exiting program at iteration {}'.format(iteration)) sys.exit() diff --git a/megatron/utils.py b/megatron/utils.py index 7ec209431a..02956070c4 100644 --- a/megatron/utils.py +++ b/megatron/utils.py @@ -133,7 +133,7 @@ def check_adlr_autoresume_termination(iteration, model, args = get_args() autoresume = get_adlr_autoresume() # Add barrier to ensure consistnecy. 
- barrier() + torch.distributed.barrier() if autoresume.termination_requested(): if args.save: save_checkpoint(iteration, model, optimizer, opt_param_scheduler) @@ -223,14 +223,3 @@ def print_rank_last(message): print(message, flush=True) else: print(message, flush=True) - - -def barrier(): - args = get_args() - - opts = BarrierOptions() - opts.device_ids = [args.local_rank] - - group = GroupMember.WORLD - work = group.barrier(opts=opts) - work.wait() From d47f6238a786f15465d7a24e666b7f29d38f86f6 Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Wed, 8 Feb 2023 16:31:19 -0500 Subject: [PATCH 075/144] Remove hf transformers tools --- tools/hf_transformers/README.md | 15 - tools/hf_transformers/__init__.py | 0 .../hf_transformers/configuration_gpt2_mq.py | 201 ---------- tools/hf_transformers/convert_checkpoint.py | 323 ---------------- tools/hf_transformers/modeling_gpt2_mq.py | 346 ------------------ tools/hf_transformers/push_checkpoints.py | 64 ---- 6 files changed, 949 deletions(-) delete mode 100644 tools/hf_transformers/README.md delete mode 100644 tools/hf_transformers/__init__.py delete mode 100644 tools/hf_transformers/configuration_gpt2_mq.py delete mode 100644 tools/hf_transformers/convert_checkpoint.py delete mode 100644 tools/hf_transformers/modeling_gpt2_mq.py delete mode 100644 tools/hf_transformers/push_checkpoints.py diff --git a/tools/hf_transformers/README.md b/tools/hf_transformers/README.md deleted file mode 100644 index ca67a19d74..0000000000 --- a/tools/hf_transformers/README.md +++ /dev/null @@ -1,15 +0,0 @@ -# Conversion of Megatron checkpoints to HF transformers -These scripts support MQA. -Only supports 1-way tensor/pipeline parallelism for now (use `checkpoint_util` to merge checkpoints if needed). - -Convert a megatron checkpoint to a HF-transformer checkpoint that can be directly pushed to the hub: -``` -python -m tools.hf_transformers.convert_checkpoint --path_to_checkpoint /checkpoint_dir/iter_{num_iter}/mp_rank_00/model_optim_rng.pt --output-dir /checkpoint_dir/hf_checkpoints/iter_{num_iter} -``` - -Convert all checkpoints and push them to the hub: -``` -python -m tools.hf_transformers.push_checkpoints --exp_dir /path/to/experiment --repo_name org/repo --branch_name main --iter_interval 20000" -``` - - diff --git a/tools/hf_transformers/__init__.py b/tools/hf_transformers/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/tools/hf_transformers/configuration_gpt2_mq.py b/tools/hf_transformers/configuration_gpt2_mq.py deleted file mode 100644 index 1a71e73915..0000000000 --- a/tools/hf_transformers/configuration_gpt2_mq.py +++ /dev/null @@ -1,201 +0,0 @@ -# coding=utf-8 -# Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team. -# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-""" Custom GPT-2 configuration""" -from collections import OrderedDict -from typing import Any, List, Mapping, Optional -from enum import Enum - -from transformers import PreTrainedTokenizer, TensorType, is_torch_available - -from transformers.configuration_utils import PretrainedConfig -from transformers.onnx import OnnxConfigWithPast, PatchingSpec -from transformers.utils import logging - - -logger = logging.get_logger(__name__) - -GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP = { - "gpt2": "https://huggingface.co/gpt2/resolve/main/config.json", - "gpt2-medium": "https://huggingface.co/gpt2-medium/resolve/main/config.json", - "gpt2-large": "https://huggingface.co/gpt2-large/resolve/main/config.json", - "gpt2-xl": "https://huggingface.co/gpt2-xl/resolve/main/config.json", - "distilgpt2": "https://huggingface.co/distilgpt2/resolve/main/config.json", -} - -MULTI_HEAD = "multihead" -MULTI_QUERY = "multiquery" - - -class GPT2CustomConfig(PretrainedConfig): - """ - This is the configuration class to store the configuration of a [`GPT2Model`] or a [`TFGPT2Model`]. It is used to - instantiate a GPT-2 model according to the specified arguments, defining the model architecture. Instantiating a - configuration with the defaults will yield a similar configuration to that of the GPT-2 - [gpt2](https://huggingface.co/gpt2) architecture. - - Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the - documentation from [`PretrainedConfig`] for more information. - - - Args: - vocab_size (`int`, *optional*, defaults to 50257): - Vocabulary size of the GPT-2 model. Defines the number of different tokens that can be represented by the - `inputs_ids` passed when calling [`GPT2Model`] or [`TFGPT2Model`]. - n_positions (`int`, *optional*, defaults to 1024): - The maximum sequence length that this model might ever be used with. Typically set this to something large - just in case (e.g., 512 or 1024 or 2048). - n_embd (`int`, *optional*, defaults to 768): - Dimensionality of the embeddings and hidden states. - n_layer (`int`, *optional*, defaults to 12): - Number of hidden layers in the Transformer encoder. - n_head (`int`, *optional*, defaults to 12): - Number of attention heads for each attention layer in the Transformer encoder. - n_inner (`int`, *optional*, defaults to None): - Dimensionality of the inner feed-forward layers. `None` will set it to 4 times n_embd - activation_function (`str`, *optional*, defaults to `"gelu"`): - Activation function, to be selected in the list `["relu", "silu", "gelu", "tanh", "gelu_new"]`. - resid_pdrop (`float`, *optional*, defaults to 0.1): - The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. - embd_pdrop (`int`, *optional*, defaults to 0.1): - The dropout ratio for the embeddings. - attn_pdrop (`float`, *optional*, defaults to 0.1): - The dropout ratio for the attention. - layer_norm_epsilon (`float`, *optional*, defaults to 1e-5): - The epsilon to use in the layer normalization layers. - initializer_range (`float`, *optional*, defaults to 0.02): - The standard deviation of the truncated_normal_initializer for initializing all weight matrices. - summary_type (`string`, *optional*, defaults to `"cls_index"`): - Argument used when doing sequence summary, used in the models [`GPT2DoubleHeadsModel`] and - [`TFGPT2DoubleHeadsModel`]. - - Has to be one of the following options: - - - `"last"`: Take the last token hidden state (like XLNet). 
- - `"first"`: Take the first token hidden state (like BERT). - - `"mean"`: Take the mean of all tokens hidden states. - - `"cls_index"`: Supply a Tensor of classification token position (like GPT/GPT-2). - - `"attn"`: Not implemented now, use multi-head attention. - summary_use_proj (`bool`, *optional*, defaults to `True`): - Argument used when doing sequence summary, used in the models [`GPT2DoubleHeadsModel`] and - [`TFGPT2DoubleHeadsModel`]. - - Whether or not to add a projection after the vector extraction. - summary_activation (`str`, *optional*): - Argument used when doing sequence summary. Used in for the multiple choice head in - [`GPT2DoubleHeadsModel`]. - - Pass `"tanh"` for a tanh activation to the output, any other value will result in no activation. - summary_proj_to_labels (`bool`, *optional*, defaults to `True`): - Argument used when doing sequence summary, used in the models [`GPT2DoubleHeadsModel`] and - [`TFGPT2DoubleHeadsModel`]. - - Whether the projection outputs should have `config.num_labels` or `config.hidden_size` classes. - summary_first_dropout (`float`, *optional*, defaults to 0.1): - Argument used when doing sequence summary, used in the models [`GPT2DoubleHeadsModel`] and - [`TFGPT2DoubleHeadsModel`]. - - The dropout ratio to be used after the projection and activation. - scale_attn_weights (`bool`, *optional*, defaults to `True`): - Scale attention weights by dividing by sqrt(head_dim).. - use_cache (`bool`, *optional*, defaults to `True`): - Whether or not the model should return the last key/values attentions (not used by all models). - scale_attn_by_inverse_layer_idx (`bool`, *optional*, defaults to `False`): - Whether to additionally scale attention weights by `1 / layer_idx + 1`. - reorder_and_upcast_attn (`bool`, *optional*, defaults to `False`): - Whether to scale keys (K) prior to computing attention (dot-product) and upcast attention - dot-product/softmax to float() when training with mixed precision. 
- - Example: - - ```python - >>> from transformers import GPT2Config, GPT2Model - - >>> # Initializing a GPT2 configuration - >>> configuration = GPT2Config() - - >>> # Initializing a model (with random weights) from the configuration - >>> model = GPT2Model(configuration) - - >>> # Accessing the model configuration - >>> configuration = model.config - ```""" - - model_type = "gpt2" - keys_to_ignore_at_inference = ["past_key_values"] - attribute_map = { - "hidden_size": "n_embd", - "max_position_embeddings": "n_positions", - "num_attention_heads": "n_head", - "num_hidden_layers": "n_layer", - } - - def __init__( - self, - vocab_size=50257, - n_positions=1024, - n_embd=768, - n_layer=12, - n_head=12, - n_inner=None, - activation_function="gelu_new", - resid_pdrop=0.1, - embd_pdrop=0.1, - attn_pdrop=0.1, - layer_norm_epsilon=1e-5, - initializer_range=0.02, - summary_type="cls_index", - summary_use_proj=True, - summary_activation=None, - summary_proj_to_labels=True, - summary_first_dropout=0.1, - scale_attn_weights=True, - use_cache=True, - bos_token_id=50256, - eos_token_id=50256, - scale_attn_by_inverse_layer_idx=False, - reorder_and_upcast_attn=False, - attention_head_type=MULTI_HEAD, - **kwargs, - ): - self.vocab_size = vocab_size - self.n_positions = n_positions - self.n_embd = n_embd - self.n_layer = n_layer - self.n_head = n_head - self.n_inner = n_inner - self.activation_function = activation_function - self.resid_pdrop = resid_pdrop - self.embd_pdrop = embd_pdrop - self.attn_pdrop = attn_pdrop - self.layer_norm_epsilon = layer_norm_epsilon - self.initializer_range = initializer_range - self.summary_type = summary_type - self.summary_use_proj = summary_use_proj - self.summary_activation = summary_activation - self.summary_first_dropout = summary_first_dropout - self.summary_proj_to_labels = summary_proj_to_labels - self.scale_attn_weights = scale_attn_weights - self.use_cache = use_cache - self.scale_attn_by_inverse_layer_idx = scale_attn_by_inverse_layer_idx - self.reorder_and_upcast_attn = reorder_and_upcast_attn - self.attention_head_type = attention_head_type - # assert attention_head_type in [AttentionType.MULTI_HEAD, AttentionType.MULTI_QUERY] - assert attention_head_type in [MULTI_HEAD, MULTI_QUERY] - - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id - - super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) diff --git a/tools/hf_transformers/convert_checkpoint.py b/tools/hf_transformers/convert_checkpoint.py deleted file mode 100644 index 76670ee117..0000000000 --- a/tools/hf_transformers/convert_checkpoint.py +++ /dev/null @@ -1,323 +0,0 @@ -import argparse -import os -import re -import torch - -from transformers import AutoTokenizer -from transformers.models.megatron_gpt2.convert_megatron_gpt2_checkpoint import fix_query_key_value_ordering, recursive_print -from tools.hf_transformers.configuration_gpt2_mq import GPT2CustomConfig -from tools.hf_transformers.modeling_gpt2_mq import GPT2LMHeadCustomModel - - -#################################################################################################### - - -def convert_megatron_checkpoint(input_state_dict, config): - # The converted output model. 
- output_state_dict = {} - - # old versions did not store training args - ds_args = input_state_dict.get("args", None) - if ds_args is not None: - # do not make the user write a config file when the exact dimensions/sizes are already in the checkpoint - # from pprint import pprint - # pprint(vars(ds_args)) - - config.vocab_size = ds_args.padded_vocab_size - config.n_positions = ds_args.max_position_embeddings - config.n_embd = ds_args.hidden_size - config.n_layer = ds_args.num_layers - config.n_head = ds_args.num_attention_heads - config.n_inner = ds_args.ffn_hidden_size - config.attention_head_type = ds_args.attention_head_type - # also set `scale_attn_weights` and `scale_attn_by_inverse_layer_idx` ? - # Uncommenting the next line makes the converted model output different logits. - # config.scale_attn_by_inverse_layer_idx = ds_args.apply_query_key_layer_scaling - # pprint(config) - - # The number of heads. - heads = config.n_head - # The hidden_size per head. - hidden_size_per_head = config.n_embd // config.n_head - # Megatron-LM checkpoint version - if "checkpoint_version" in input_state_dict.keys(): - checkpoint_version = input_state_dict["checkpoint_version"] - else: - checkpoint_version = 0.0 - - # The model. - model = input_state_dict["model"] - # The language model. - lm = model["language_model"] - # The embeddings. - embeddings = lm["embedding"] - - # The word embeddings. - word_embeddings = embeddings["word_embeddings"]["weight"] - # Truncate the embedding table to vocab_size rows. - word_embeddings = word_embeddings[: config.vocab_size, :] - output_state_dict["transformer.wte.weight"] = word_embeddings - - # The position embeddings. - pos_embeddings = embeddings["position_embeddings"]["weight"] - # Read the causal mask dimension (seqlen). [max_sequence_length, hidden_size] - n_positions = pos_embeddings.size(0) - if n_positions != config.n_positions: - raise ValueError( - f"pos_embeddings.max_sequence_length={n_positions} and config.n_positions={config.n_positions} don't match" - ) - # Store the position embeddings. - output_state_dict["transformer.wpe.weight"] = pos_embeddings - - # The transformer. - transformer = lm["transformer"] if "transformer" in lm.keys() else lm["encoder"] - - # The regex to extract layer names. - layer_re = re.compile("layers\.(\d+)\.([a-z0-9_.]+)\.([a-z]+)") - - # The simple map of names for "automated" rules. - megatron_to_transformers = { - "attention.dense": ".attn.c_proj.", - "self_attention.dense": ".attn.c_proj.", - "mlp.dense_h_to_4h": ".mlp.c_fc.", - "mlp.dense_4h_to_h": ".mlp.c_proj.", - } - - # Extract the layers. - for key, val in transformer.items(): - # Match the name. - m = layer_re.match(key) - - # Stop if that's not a layer - if m is None: - break - - # The index of the layer. - layer_idx = int(m.group(1)) - # The name of the operation. - op_name = m.group(2) - # Is it a weight or a bias? - weight_or_bias = m.group(3) - - # The name of the layer. - layer_name = f"transformer.h.{layer_idx}" - - # For layernorm(s), simply store the layer norm. - if op_name.endswith("layernorm"): - - ln_name = "ln_1" if op_name.startswith("input") else "ln_2" - output_state_dict[layer_name + "." + ln_name + "." + weight_or_bias] = val - - # Transpose the QKV matrix. - elif ( - op_name == "attention.query_key_value" or op_name == "self_attention.query_key_value" - ) and weight_or_bias == "weight": - - # Insert a tensor of 1x1xDxD bias. 
- causal_mask = torch.tril(torch.ones((n_positions, n_positions), dtype=torch.float16)).view( - 1, 1, n_positions, n_positions - ) - output_state_dict[layer_name + ".attn.bias"] = causal_mask - - # Insert a "dummy" tensor for masked_bias. - masked_bias = torch.tensor(-1e4, dtype=torch.float16) - output_state_dict[layer_name + ".attn.masked_bias"] = masked_bias - - out_val = fix_query_key_value_ordering(val, checkpoint_version, 3, heads, hidden_size_per_head) - # Megatron stores (3*D) x D but transformers-GPT2 expects D x 3*D. - out_val = out_val.transpose(0, 1).contiguous() - # Store. - output_state_dict[layer_name + ".attn.c_attn.weight"] = out_val - - # Tranpose the Q matrix (for MQA) - elif ( - op_name == "self_attention.query" - ) and weight_or_bias == "weight": - # Insert a tensor of 1x1xDxD bias. - causal_mask = torch.tril(torch.ones((n_positions, n_positions), dtype=torch.float16)).view( - 1, 1, n_positions, n_positions - ) - output_state_dict[layer_name + ".attn.bias"] = causal_mask - - # Insert a "dummy" tensor for masked_bias. - masked_bias = torch.tensor(-1e4, dtype=torch.float16) - output_state_dict[layer_name + ".attn.masked_bias"] = masked_bias - - out_val = fix_query_key_value_ordering(val, checkpoint_version, 1, heads, hidden_size_per_head) - # Megatron stores (out x in) but transformers-GPT2 expects (in x out). - out_val = out_val.transpose(0, 1).contiguous() - # Store. - output_state_dict[layer_name + ".attn.q_attn.weight"] = out_val - - # Tranpose the KV matrix (for MQA) - elif ( - op_name == "self_attention.key_value" - ) and weight_or_bias == "weight": - # Key-values are shared across heads - out_val = fix_query_key_value_ordering(val, checkpoint_version, 2, 1, hidden_size_per_head) - # Megatron stores (out x in) but transformers-GPT2 expects (in x out). - out_val = out_val.transpose(0, 1).contiguous() - # Store. - output_state_dict[layer_name + ".attn.kv_attn.weight"] = out_val - - # Transpose the bias. - elif ( - op_name == "attention.query_key_value" or op_name == "self_attention.query_key_value" - ) and weight_or_bias == "bias": - - out_val = fix_query_key_value_ordering(val, checkpoint_version, 3, heads, hidden_size_per_head) - # Store. No change of shape. - output_state_dict[layer_name + ".attn.c_attn.bias"] = out_val - - # Transpose the Q bias (MQA) - elif ( - op_name == "self_attention.query" - ) and weight_or_bias == "bias": - - out_val = fix_query_key_value_ordering(val, checkpoint_version, 1, heads, hidden_size_per_head) - # Store. No change of shape. - output_state_dict[layer_name + ".attn.q_attn.bias"] = out_val - - # Transpose the KV bias (MQA) - elif ( - op_name == "self_attention.key_value" - ) and weight_or_bias == "bias": - - out_val = fix_query_key_value_ordering(val, checkpoint_version, 2, 1, hidden_size_per_head) - # Store. No change of shape. - output_state_dict[layer_name + ".attn.kv_attn.bias"] = out_val - - # Transpose the weights. - elif weight_or_bias == "weight": - - out_name = megatron_to_transformers[op_name] - output_state_dict[layer_name + out_name + "weight"] = val.transpose(0, 1) - - # Copy the bias. - elif weight_or_bias == "bias": - - out_name = megatron_to_transformers[op_name] - output_state_dict[layer_name + out_name + "bias"] = val - - # DEBUG. - assert config.n_layer == layer_idx + 1 - - # The final layernorm. 
- output_state_dict["transformer.ln_f.weight"] = transformer["final_layernorm.weight"] - output_state_dict["transformer.ln_f.bias"] = transformer["final_layernorm.bias"] - - # For LM head, transformers' wants the matrix to weight embeddings. - output_state_dict["lm_head.weight"] = word_embeddings - - # It should be done! - return output_state_dict - - -#################################################################################################### - - -def main(path_to_checkpoint, output_dir, print_checkpoint_structure): - os.makedirs(output_dir, exist_ok=True) - - # Load the model. - # the .zip is very optional, let's keep it for backward compatibility - print(f"Extracting PyTorch state dictionary from {path_to_checkpoint}") - input_state_dict = torch.load(path_to_checkpoint, map_location="cpu") - - ds_args = input_state_dict.get("args", None) - - # Read the config, or default to the model released by NVIDIA. - - if ds_args is not None: - if ds_args.bias_gelu_fusion: - activation_function = "gelu_fast" - elif ds_args.openai_gelu: - activation_function = "gelu_new" - else: - activation_function = "gelu" - else: - # in the very early days this used to be "gelu_new" - activation_function = "gelu_new" - - # Spell out all parameters in case the defaults change. - config = GPT2CustomConfig( - vocab_size=50257, - n_positions=1024, - n_embd=1024, - n_layer=24, - n_head=16, - n_inner=4096, - activation_function=activation_function, - resid_pdrop=0.1, - embd_pdrop=0.1, - attn_pdrop=0.1, - layer_norm_epsilon=1e-5, - initializer_range=0.02, - summary_type="cls_index", - summary_use_proj=True, - summary_activation=None, - summary_proj_to_labels=True, - summary_first_dropout=0.1, - scale_attn_weights=True, - use_cache=True, - bos_token_id=50256, - eos_token_id=50256, - ) - # TODO: also set bos and eos? - - config.architectures = ["GPT2LMHeadCustomModel"] - - # Convert. - print("Converting") - output_state_dict = convert_megatron_checkpoint(input_state_dict, config) - - # Print the structure of converted state dict. - if print_checkpoint_structure: - recursive_print(None, output_state_dict) - - # Add tokenizer class info to config - # see https://github.com/huggingface/transformers/issues/13906) - # if ds_args is not None: - # tokenizer_type = ds_args.tokenizer_type - # if tokenizer_type == "GPT2BPETokenizer": - # tokenizer_model_name = "gpt2" - # elif tokenizer_type == "PretrainedFromHF": - # tokenizer_model_name = ds_args.tokenizer_name_or_path - # else: - # raise ValueError(f"Unrecognized tokenizer_type {tokenizer_type}") - # else: - # tokenizer_model_name = "gpt2" - - # tokenizer = AutoTokenizer.from_pretrained(tokenizer_model_name) - # tokenizer_class = type(tokenizer).__name__ - # config.tokenizer_class = tokenizer_class - - # Save custom model - GPT2CustomConfig.register_for_auto_class() - GPT2LMHeadCustomModel.register_for_auto_class("AutoModelForCausalLM") - hf_model = GPT2LMHeadCustomModel(config) - hf_model.load_state_dict(output_state_dict) - hf_model.save_pretrained(output_dir) - - # Store the state_dict to file. - # print(f'Saving checkpoint to "{output_checkpoint_file}"') - # torch.save(output_state_dict, output_checkpoint_file) - - -if __name__ == "__main__": - # Create the argument parser. 
- parser = argparse.ArgumentParser() - parser.add_argument("--print-checkpoint-structure", action="store_true") - parser.add_argument( - "--path_to_checkpoint", - type=str, - help="Path to the `.pt` checkpoint file", - ) - parser.add_argument( - "--output-dir", - type=str, - help="Ouptut directory where HF checkpoint will be written", - ) - args = parser.parse_args() - - main(args.path_to_checkpoint, args.output_dir, args.print_checkpoint_structure) diff --git a/tools/hf_transformers/modeling_gpt2_mq.py b/tools/hf_transformers/modeling_gpt2_mq.py deleted file mode 100644 index 0c56af9ef2..0000000000 --- a/tools/hf_transformers/modeling_gpt2_mq.py +++ /dev/null @@ -1,346 +0,0 @@ -"""PyTorch OpenAI GPT-2 model modified with MultiQuery attention""" - - -import math -import os -from dataclasses import dataclass -from typing import Optional, Tuple, Union - -import torch -import torch.utils.checkpoint -from torch import nn -from torch.cuda.amp import autocast -from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss - -from transformers.activations import ACT2FN -from transformers.modeling_outputs import ( - BaseModelOutputWithPastAndCrossAttentions, - CausalLMOutputWithCrossAttentions, - SequenceClassifierOutputWithPast, - TokenClassifierOutput, -) -from transformers.modeling_utils import PreTrainedModel, SequenceSummary -from transformers.pytorch_utils import Conv1D, find_pruneable_heads_and_indices, prune_conv1d_layer - -from transformers.utils import ( - ModelOutput, - add_code_sample_docstrings, - add_start_docstrings, - add_start_docstrings_to_model_forward, - logging, - replace_return_docstrings, -) -from transformers.utils.model_parallel_utils import assert_device_map, get_device_map -from transformers.models.gpt2.modeling_gpt2 import GPT2Model, GPT2Block, GPT2PreTrainedModel, GPT2LMHeadModel -from .configuration_gpt2_mq import GPT2CustomConfig, MULTI_QUERY, MULTI_HEAD - - - -class GPT2MQAttention(nn.Module): - def __init__(self, config, is_cross_attention=False, layer_idx=None): - super().__init__() - assert config.attention_head_type == MULTI_QUERY - - max_positions = config.max_position_embeddings - self.register_buffer( - "bias", - torch.tril(torch.ones((max_positions, max_positions), dtype=torch.uint8)).view( - 1, 1, max_positions, max_positions - ), - ) - self.register_buffer("masked_bias", torch.tensor(-1e4)) - - self.embed_dim = config.hidden_size - self.num_heads = config.num_attention_heads - self.head_dim = self.embed_dim // self.num_heads - self.split_size = self.embed_dim - if self.head_dim * self.num_heads != self.embed_dim: - raise ValueError( - f"`embed_dim` must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" - f" {self.num_heads})." 
- ) - - self.scale_attn_weights = config.scale_attn_weights - if is_cross_attention: - raise NotImplementedError("Cross-attention not implemented for MQA") - self.is_cross_attention = is_cross_attention - - # Layer-wise attention scaling, reordering, and upcasting - self.scale_attn_by_inverse_layer_idx = config.scale_attn_by_inverse_layer_idx - self.layer_idx = layer_idx - self.reorder_and_upcast_attn = config.reorder_and_upcast_attn - - if self.is_cross_attention: - self.c_attn = Conv1D(2 * self.embed_dim, self.embed_dim) - self.q_attn = Conv1D(self.embed_dim, self.embed_dim) - else: - # self.c_attn = Conv1D(3 * self.embed_dim, self.embed_dim) - self.q_attn = Conv1D(self.embed_dim, self.embed_dim) - # Keys and values are shared across heads - self.kv_attn = Conv1D(2 * self.head_dim, self.embed_dim) - self.c_proj = Conv1D(self.embed_dim, self.embed_dim) - - self.attn_dropout = nn.Dropout(config.attn_pdrop) - self.resid_dropout = nn.Dropout(config.resid_pdrop) - - self.pruned_heads = set() - - def prune_heads(self, heads): - if len(heads) == 0: - return - heads, index = find_pruneable_heads_and_indices(heads, self.num_heads, self.head_dim, self.pruned_heads) - index_attn = torch.cat([index, index + self.split_size, index + (2 * self.split_size)]) - - # Prune conv1d layers - self.c_attn = prune_conv1d_layer(self.c_attn, index_attn, dim=1) - self.c_proj = prune_conv1d_layer(self.c_proj, index, dim=0) - - # Update hyper params - self.split_size = (self.split_size // self.num_heads) * (self.num_heads - len(heads)) - self.num_heads = self.num_heads - len(heads) - self.pruned_heads = self.pruned_heads.union(heads) - - def _attn(self, query, key, value, attention_mask=None, head_mask=None): - # query: (b, num_heads * sq, head_dim) - # key: (b, head_dim, sk) - # value: (b, sk, head_dim) - batch_size = query.size(0) - query_length = query.size(1) // self.num_heads - key_length = key.size(2) - # (b, num_heads * sq, head_dim) x (b, head_dim, sk) -> (b, num_heads * sq, sk) - attn_weights = torch.bmm(query, key) - # -> (b, num_heads, sq, sk) - attn_weights = attn_weights.view(batch_size, self.num_heads, query_length, key_length) - - if self.scale_attn_weights: - attn_weights = attn_weights / torch.tensor( - value.size(-1) ** 0.5, dtype=attn_weights.dtype, device=attn_weights.device - ) - - # Layer-wise attention scaling - if self.scale_attn_by_inverse_layer_idx: - attn_weights = attn_weights / float(self.layer_idx + 1) - - if not self.is_cross_attention: - # if only "normal" attention layer implements causal mask - causal_mask = self.bias[:, :, key_length - query_length : key_length, :key_length].to(torch.bool) - mask_value = torch.finfo(attn_weights.dtype).min - # Need to be a tensor, otherwise we get error: `RuntimeError: expected scalar type float but found double`. 
- # Need to be on the same device, otherwise `RuntimeError: ..., x and y to be on the same device` - mask_value = torch.tensor(mask_value, dtype=attn_weights.dtype).to(attn_weights.device) - attn_weights = torch.where(causal_mask, attn_weights, mask_value) - - if attention_mask is not None: - # Apply the attention mask - attn_weights = attn_weights + attention_mask - - attn_weights = nn.functional.softmax(attn_weights, dim=-1) - - # Downcast (if necessary) back to V's dtype (if in mixed-precision) -- No-Op otherwise - attn_weights = attn_weights.type(value.dtype) - attn_weights = self.attn_dropout(attn_weights) - - # Mask heads if we want to - if head_mask is not None: - attn_weights = attn_weights * head_mask - - # (b, num_heads, sq, sk) -> (b, num_heads * sq, sk) - _attn_weights = attn_weights.view(batch_size, self.num_heads * query_length, key_length) - # (b, num_heads * sq, sk) x (b, sk, head_dim) -> (b, num_heads * sq, head_dim) - attn_output = torch.bmm(_attn_weights, value) - attn_output = attn_output.view(batch_size, self.num_heads, query_length, self.head_dim) - - return attn_output, attn_weights - - def _upcast_and_reordered_attn(self, query, key, value, attention_mask=None, head_mask=None): - # Use `torch.baddbmm` (a bit more efficient w/ alpha param for scaling -- from Megatron-LM) - bsz, num_heads, q_seq_len, dk = query.size() - _, _, k_seq_len, _ = key.size() - - # Preallocate attn_weights for `baddbmm` - attn_weights = torch.empty(bsz * num_heads, q_seq_len, k_seq_len, dtype=torch.float32, device=query.device) - - # Compute Scale Factor - scale_factor = 1.0 - if self.scale_attn_weights: - scale_factor /= float(value.size(-1)) ** 0.5 - - if self.scale_attn_by_inverse_layer_idx: - scale_factor /= float(self.layer_idx + 1) - - # Upcast (turn off autocast) and reorder (Scale K by 1 / root(dk)) - with autocast(enabled=False): - q, k = query.reshape(-1, q_seq_len, dk), key.transpose(-1, -2).reshape(-1, dk, k_seq_len) - attn_weights = torch.baddbmm(attn_weights, q.float(), k.float(), beta=0, alpha=scale_factor) - attn_weights = attn_weights.reshape(bsz, num_heads, q_seq_len, k_seq_len) - - if not self.is_cross_attention: - # if only "normal" attention layer implements causal mask - query_length, key_length = query.size(-2), key.size(-2) - causal_mask = self.bias[:, :, key_length - query_length : key_length, :key_length].bool() - mask_value = torch.finfo(attn_weights.dtype).min - # Need to be a tensor, otherwise we get error: `RuntimeError: expected scalar type float but found double`. 
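# Standalone illustrative sketch (sizes made up, not taken from the patch) of the
# shape bookkeeping the multi-query `_attn` above relies on. In multi-query
# attention every query head is separate, but one key/value head of size
# `head_dim` is shared, so K and V carry no head dimension and a single batched
# matmul covers all heads by folding the head dimension into the query's
# sequence dimension. Causal masking and dropout are omitted here.
import torch

batch, num_heads, head_dim, q_len, k_len = 2, 16, 64, 8, 8

query = torch.randn(batch, num_heads * q_len, head_dim)   # per-head queries, folded
key   = torch.randn(batch, head_dim, k_len)               # one shared key head
value = torch.randn(batch, k_len, head_dim)               # one shared value head

scores = torch.bmm(query, key)                            # (b, num_heads * sq, sk)
scores = scores.view(batch, num_heads, q_len, k_len) / head_dim ** 0.5
probs = torch.softmax(scores, dim=-1)

context = torch.bmm(probs.view(batch, num_heads * q_len, k_len), value)
context = context.view(batch, num_heads, q_len, head_dim)
print(context.shape)                                      # torch.Size([2, 16, 8, 64])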
- # Need to be on the same device, otherwise `RuntimeError: ..., x and y to be on the same device` - mask_value = torch.tensor(mask_value, dtype=attn_weights.dtype).to(attn_weights.device) - attn_weights = torch.where(causal_mask, attn_weights, mask_value) - - if attention_mask is not None: - # Apply the attention mask - attn_weights = attn_weights + attention_mask - - attn_weights = nn.functional.softmax(attn_weights, dim=-1) - - # Downcast (if necessary) back to V's dtype (if in mixed-precision) -- No-Op if otherwise - if attn_weights.dtype != torch.float32: - raise RuntimeError("Error with upcasting, attn_weights does not have dtype torch.float32") - attn_weights = attn_weights.type(value.dtype) - attn_weights = self.attn_dropout(attn_weights) - - # Mask heads if we want to - if head_mask is not None: - attn_weights = attn_weights * head_mask - - attn_output = torch.matmul(attn_weights, value) - - return attn_output, attn_weights - - def _split_heads(self, tensor, num_heads, attn_head_size): - """ - Splits hidden_size dim into attn_head_size and num_heads - """ - new_shape = tensor.size()[:-1] + (num_heads, attn_head_size) - tensor = tensor.view(new_shape) - return tensor.permute(0, 2, 1, 3) # (batch, head, seq_length, head_features) - - def _merge_heads(self, tensor, num_heads, attn_head_size): - """ - Merges attn_head_size dim and num_attn_heads dim into hidden_size - """ - tensor = tensor.permute(0, 2, 1, 3).contiguous() - new_shape = tensor.size()[:-2] + (num_heads * attn_head_size,) - return tensor.view(new_shape) - - def forward( - self, - hidden_states: Optional[Tuple[torch.FloatTensor]], - layer_past: Optional[Tuple[torch.Tensor]] = None, - attention_mask: Optional[torch.FloatTensor] = None, - head_mask: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.Tensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, - use_cache: Optional[bool] = False, - output_attentions: Optional[bool] = False, - ) -> Tuple[Union[torch.Tensor, Tuple[torch.Tensor]], ...]: - if encoder_hidden_states is not None: - raise NotImplementedError("Cross-attention not implemented for MQA") - if not hasattr(self, "q_attn"): - raise ValueError( - "If class is used as cross attention, the weights `q_attn` have to be defined. " - "Please make sure to instantiate class with `GPT2Attention(..., is_cross_attention=True)`." 
- ) - - query = self.q_attn(hidden_states) - key, value = self.c_attn(encoder_hidden_states).split(self.split_size, dim=2) - attention_mask = encoder_attention_mask - else: - query = self.q_attn(hidden_states) - key, value = self.kv_attn(hidden_states).split(self.head_dim, dim=2) - - - batch_size, seq_length = query.shape[:2] - # (query_length, batch, num_heads, head_dim) - # (batch, num_heads * query_length, head_dim)\ - - # (batch, query_length, hidden_size) -> (batch, num_heads, query_length, head_dim) - query = query.view(batch_size, seq_length, self.num_heads, self.head_dim).permute([0, 2, 1, 3]) - # -> (batch, num_heads * query_length, head_dim) - query = query.reshape(batch_size, self.num_heads * seq_length, self.head_dim) - - # (batch, query_length, hidden_size) -> (batch, query_length * num_heads, head_dim) - # query = query.view( - # batch_size, seq_length, self.num_heads, self.head_dim, - # ).reshape( - # batch_size, seq_length * self.num_heads, self.head_dim - # ) - key = key.permute(0, 2, 1) # (batch_size, head_dim, seq_length) - # value (batch_size, seq_length, head_dim) - - if layer_past is not None: - past_key, past_value = layer_past - # Concatenate on sequence dimension - key = torch.cat((past_key, key), dim=-1) - value = torch.cat((past_value, value), dim=-2) - - if use_cache is True: - present = (key, value) - else: - present = None - - if self.reorder_and_upcast_attn: - raise NotImplementedError("Reorder and upcast attention not implemented for MQA") - attn_output, attn_weights = self._upcast_and_reordered_attn(query, key, value, attention_mask, head_mask) - else: - attn_output, attn_weights = self._attn(query, key, value, attention_mask, head_mask) - - attn_output = self._merge_heads(attn_output, self.num_heads, self.head_dim) - attn_output = self.c_proj(attn_output) - attn_output = self.resid_dropout(attn_output) - - outputs = (attn_output, present) - if output_attentions: - outputs += (attn_weights,) - - return outputs # a, present, (attentions) - - -# inherit from gpt_modeling.py, and override `attn` module -class GPT2CustomBlock(GPT2Block): - - def __init__(self, config: GPT2CustomConfig, layer_idx=None): - super().__init__(config, layer_idx) - # Override attention module if using multiquery - if config.attention_head_type == MULTI_QUERY: - self.attn = GPT2MQAttention(config, layer_idx=layer_idx) - if config.add_cross_attention: - raise NotImplementedError("Cross-attention not implemented for MQA") - - -# inherit from gpt_modeling.py and override `__init__` method -class GPT2CustomModel(GPT2Model): - config_class = GPT2CustomConfig - - def __init__(self, config): - GPT2PreTrainedModel.__init__(self, config) - - self.embed_dim = config.hidden_size - - self.wte = nn.Embedding(config.vocab_size, self.embed_dim) - self.wpe = nn.Embedding(config.max_position_embeddings, self.embed_dim) - - self.drop = nn.Dropout(config.embd_pdrop) - self.h = nn.ModuleList([GPT2CustomBlock(config, layer_idx=i) for i in range(config.num_hidden_layers)]) - self.ln_f = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon) - - # Model parallel - self.model_parallel = False - self.device_map = None - self.gradient_checkpointing = False - - # Initialize weights and apply final processing - self.post_init() - - -class GPT2LMHeadCustomModel(GPT2LMHeadModel): - config_class = GPT2CustomConfig - - def __init__(self, config): - GPT2PreTrainedModel.__init__(self, config) - self.transformer = GPT2CustomModel(config) - self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False) - - # 
Model parallel - self.model_parallel = False - self.device_map = None - - # Initialize weights and apply final processing - self.post_init() diff --git a/tools/hf_transformers/push_checkpoints.py b/tools/hf_transformers/push_checkpoints.py deleted file mode 100644 index 865a4e1c24..0000000000 --- a/tools/hf_transformers/push_checkpoints.py +++ /dev/null @@ -1,64 +0,0 @@ -import re -from huggingface_hub import Repository -from pathlib import Path -import subprocess -import argparse - -import tools.hf_transformers.convert_checkpoint - - - -""" -Script to upload Megatron checkpoints to a HF repo on the Hub. - -The script clones/creates a repo on the Hub, checks out a branch `--branch_name`, -and converts each `iter_` checkpoint and saves it as a commit on that branch. -""" - -def get_iter_number(iter_dir: str): - m = re.match(r'iter_(\d+)', iter_dir) - if m is not None: - return int(m.group(1)) - else: - raise ValueError(f"Invalid directory name: {iter_dir}") - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument("--save_dir", type=str, default=None, help="Path where repository is cloned to locally. Will use {exp_dir}/hf_checkpoints if not provided") - parser.add_argument("--exp_dir", type=str, help="Path to experiment folder.") - parser.add_argument("--repo_name", type=str, help="Name of repository on the Hub in 'ORG/NAME' format.") - parser.add_argument("--branch_name", type=str, help="Name of branch in repository to save experiments.") - parser.add_argument("--iter_interval", type=int, default=1, help="Iteration number must be divisble by iter_interval in order to be pushed") - args = parser.parse_args() - - all_ckpt_dir = Path(args.exp_dir) - save_dir = args.save_dir if args.save_dir is not None else all_ckpt_dir / "hf_checkpoints" - - hf_repo = Repository(save_dir, clone_from=args.repo_name) - hf_repo.git_checkout(args.branch_name, create_branch_ok=True) - # Find last checkpoint that was uploaded - head_hash = hf_repo.git_head_hash() - commit_msg = subprocess.check_output(["git", "show", "-s", "--format=%B", head_hash], cwd=save_dir).decode() - try: - last_commit_iter = get_iter_number(commit_msg.strip()) - print(f"Last commit iteration: {last_commit_iter}") - except ValueError: - last_commit_iter = -1 - - # The checkpoint dirs should be in ascending iteration order, so that the last commit corresponds to the latest checkpoint - ckpt_dirs = sorted([x for x in all_ckpt_dir.iterdir() if x.name.startswith("iter_") and x.is_dir()]) - - for ckpt_dir in ckpt_dirs: - iter_number = get_iter_number(ckpt_dir.name) - if iter_number <= last_commit_iter: - continue - if iter_number % args.iter_interval == 0: - print(f"Will convert and push iteration {iter_number}") - # TODO: this only works for 1-way tensor/pipeline parallelism - file_path = next((ckpt_dir / "mp_rank_00").glob('*.pt')) - tools.hf_transformers.convert_checkpoint.main(path_to_checkpoint=file_path, output_dir=save_dir, print_checkpoint_structure=False) - hf_repo.push_to_hub(commit_message=f"{ckpt_dir.name}") - -if __name__ == "__main__": - main() From c4468364e4225b32adb648d7918a3e645ff9e409 Mon Sep 17 00:00:00 2001 From: Raymond Li Date: Mon, 6 Mar 2023 11:32:19 -0500 Subject: [PATCH 076/144] add santacoder example script --- examples/pretrain_gpt_1B_santacoder.sh | 61 ++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) create mode 100644 examples/pretrain_gpt_1B_santacoder.sh diff --git a/examples/pretrain_gpt_1B_santacoder.sh b/examples/pretrain_gpt_1B_santacoder.sh new file mode 100644 index 
0000000000..dfb754429c --- /dev/null +++ b/examples/pretrain_gpt_1B_santacoder.sh @@ -0,0 +1,61 @@ +#! /bin/bash + +set -u # stop on unset variables + +# Runs the SantaCoder 1B model + +GPUS_PER_NODE=8 +MASTER_ADDR=${MASTER_NODE} # Adjust +MASTER_PORT=6000 +NNODES=12 # Adjust +# NODE_RANK=0 # Adjust +WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) + +DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT" + +CHECKPOINT_PATH=/my/experiment/path # Adjust: Directory to store the checkpoints +DATA_PATH=/preprocessed/data/path # Adjust: Prefix of the preprocessed dataset. +TOKENIZER_FILE=/tokenizer/path # Adjust + +GPT_ARGS="\ + --tensor-model-parallel-size 1 \ + --pipeline-model-parallel-size 1 \ + --recompute-granularity full \ + --recompute-method uniform \ +--num-layers 24 \ +--hidden-size 2048 \ +--num-attention-heads 16 \ +--attention-head-type multiquery \ +--init-method-std 0.022 \ + --seq-length 2048 \ + --max-position-embeddings 2048 \ +--attention-dropout 0.1 \ +--hidden-dropout 0.1 \ + --micro-batch-size 2 \ + --global-batch-size 192 \ +--lr 0.0002 \ +--train-iters 3000 \ +--lr-decay-iters 600000 \ +--lr-decay-style cosine \ +--lr-warmup-iters 175 \ +--weight-decay .1 \ +--adam-beta2 .95 \ +--clip-grad 1.0 \ +--fp16 \ + --log-interval 10 \ + --save-interval 4000 \ + --eval-interval 200 \ + --eval-iters 10 \ +" + +TENSORBOARD_ARGS="--tensorboard-dir ${CHECKPOINT_PATH}/tensorboard" + +torchrun $DISTRIBUTED_ARGS \ + pretrain_gpt.py \ + $GPT_ARGS \ + --tokenizer-type TokenizerFromFileWithFIM \ + --tokenizer-file $TOKENIZER_FILE \ + --save $CHECKPOINT_PATH \ + --load $CHECKPOINT_PATH \ + --data-path $DATA_PATH \ + $TENSORBOARD_ARGS \ No newline at end of file From afd079ae03323276a7be133dc89a2496abab4c27 Mon Sep 17 00:00:00 2001 From: Raymond Li Date: Tue, 7 Mar 2023 09:15:08 -0500 Subject: [PATCH 077/144] update arguments in example srcipt --- examples/pretrain_gpt_1B_santacoder.sh | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/examples/pretrain_gpt_1B_santacoder.sh b/examples/pretrain_gpt_1B_santacoder.sh index dfb754429c..7602fcc335 100644 --- a/examples/pretrain_gpt_1B_santacoder.sh +++ b/examples/pretrain_gpt_1B_santacoder.sh @@ -20,8 +20,7 @@ TOKENIZER_FILE=/tokenizer/path # Adjust GPT_ARGS="\ --tensor-model-parallel-size 1 \ --pipeline-model-parallel-size 1 \ - --recompute-granularity full \ - --recompute-method uniform \ + --recompute-activations \ --num-layers 24 \ --hidden-size 2048 \ --num-attention-heads 16 \ @@ -37,7 +36,7 @@ GPT_ARGS="\ --train-iters 3000 \ --lr-decay-iters 600000 \ --lr-decay-style cosine \ ---lr-warmup-iters 175 \ +--lr-warmup-fraction 0.02 \ --weight-decay .1 \ --adam-beta2 .95 \ --clip-grad 1.0 \ @@ -46,6 +45,8 @@ GPT_ARGS="\ --save-interval 4000 \ --eval-interval 200 \ --eval-iters 10 \ +--initial-loss-scale 65536 \ +--fim-rate 0.5 \ " TENSORBOARD_ARGS="--tensorboard-dir ${CHECKPOINT_PATH}/tensorboard" From a16826e6e65e31e9f692cc63737edc9c7c39ba2c Mon Sep 17 00:00:00 2001 From: Raymond Li Date: Thu, 9 Mar 2023 16:32:47 -0500 Subject: [PATCH 078/144] add multi-validation for gpt training --- examples/pretrain_gpt_multilingual.sh | 65 +++++++++++ megatron/arguments.py | 123 ++++++++++++++++++++ megatron/data/dataset_utils.py | 27 ++++- megatron/data/gpt_dataset.py | 161 ++++++++++++++++++++------ megatron/training.py | 123 +++++++++++++------- pretrain_gpt.py | 50 ++++++-- 6 files changed, 461 insertions(+), 88 deletions(-) create mode 100644 
examples/pretrain_gpt_multilingual.sh diff --git a/examples/pretrain_gpt_multilingual.sh b/examples/pretrain_gpt_multilingual.sh new file mode 100644 index 0000000000..5edebe770d --- /dev/null +++ b/examples/pretrain_gpt_multilingual.sh @@ -0,0 +1,65 @@ +#! /bin/bash + +# Runs the "345M" parameter model + +GPUS_PER_NODE=8 +MASTER_ADDR=localhost +MASTER_PORT=6000 +NNODES=1 # Adjust +NODE_RANK=0 # Adjust +WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) + +DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT" + +# paths to multilingual preprocessed datasets +DATA_PATH_EN=_text_document +DATA_PATH_AR=_text_document +DATA_PATH_KR=_text_document +DATA_PATH_JP=_text_document + +CHECKPOINT_PATH= + + +torchrun $DISTRIBUTED_ARGS \ + pretrain_gpt.py \ + --num-layers 24 \ + --hidden-size 1024 \ + --num-attention-heads 16 \ + --micro-batch-size 4 \ + --global-batch-size 8 \ + --seq-length 1024 \ + --max-position-embeddings 1024 \ + --train-iters 1000 \ + --lr-decay-iters 320000 \ + --save $CHECKPOINT_PATH \ + --load $CHECKPOINT_PATH \ + --train-weighted-split-paths "TRAIN: 0.3 0:0.6 $DATA_EN 1 0:0.6 $DATA_AR 1 0:0.6 $DATA_KR 1 0:0.6 $DATA_JP" \ + --valid-weighted-split-paths \ + "VALID_EN: 1 0.6:0.8 $DATA_EN" \ + "VALID_AR: 1 0.6:0.8 $DATA_AR" \ + "VALID_JP: 1 0.6:0.8 $DATA_KR" \ + "VALID_KR: 1 0.6:0.8 $DATA_JP" \ + "VALID_EN-AR-JP-KR_BALANCED: 1 0.6:0.8 $DATA_EN, 1 0.6:0.8 $DATA_AR, 1 0.6:0.8 $DATA_JP, 1 0.6:0.8 $DATA_KR" \ + --test-weighted-split-paths \ + "TEST_EN: 1 0.8:1 $DATA_EN" \ + "TEST_AR: 1 0.8:1 $DATA_AR" \ + "TEST_JP: 1 0.8:1 $DATA_JP" \ + "TEST_KR: 1 0.8:1 $DATA_KR" \ + "TEST_EN-AR-JP-KR_BALANCED: 1 0.8:1 $DATA_EN, 1 0.8:1 $DATA_AR, 1 0.8:1 $DATA_JP, 1 0.8:1 $DATA_KR" \ + --vocab-file gpt2-vocab.json \ + --merge-file gpt2-merges.txt \ + --data-impl mmap \ + --split 949,50,1 \ + --distributed-backend nccl \ + --lr 0.00015 \ + --min-lr 1.0e-5 \ + --lr-decay-style cosine \ + --weight-decay 1e-2 \ + --clip-grad 1.0 \ + --lr-warmup-fraction .01 \ + --checkpoint-activations \ + --log-interval 100 \ + --save-interval 10000 \ + --eval-interval 1000 \ + --eval-iters 10 \ + --fp16 diff --git a/megatron/arguments.py b/megatron/arguments.py index 1bff85c0a9..057e626a8c 100644 --- a/megatron/arguments.py +++ b/megatron/arguments.py @@ -17,6 +17,7 @@ import argparse import os +import re import torch @@ -100,6 +101,30 @@ def validate_args(args, defaults={}): ' to be less than pipeline model parallel size ({})'.format( args.pipeline_model_parallel_size) + # --data-path and --train-weighted-splits-paths + message = "Data loading Mode 1: --data-path and --split "\ + "and Mode 2: --(train|valid|test)-weighted-split-paths"\ + "are mutually exclusive i.e. cannot be set together." 
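# Standalone sketch of how one dataset-group string in the
# "GIVEN_NAME: WEIGHT START:END PATH, WEIGHT START:END PATH" format breaks down
# into name / weights / splits / paths. It only roughly mirrors the
# `parse_data_paths` action defined below; the paths here are made up.
group = "VALID_EN-AR_BALANCED: 1 0.6:0.8 /data/en_text_document, 1 0.6:0.8 /data/ar_text_document"

name, spec = group.split(":", 1)
entries = [d.split() for d in spec.split(",")]   # [[weight, start:end, path], ...]
weights = [float(w) for w, _, _ in entries]
splits  = [rng for _, rng, _ in entries]
paths   = [p for _, _, p in entries]

print(name)      # VALID_EN-AR_BALANCED
print(weights)   # [1.0, 1.0]
print(splits)    # ['0.6:0.8', '0.6:0.8']
print(paths)     # ['/data/en_text_document', '/data/ar_text_document']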
+ + if args.data_path: + assert args.train_weighted_split_paths is None, message + setattr(args, "valid_weighted_split_names", None) + setattr(args, "valid_weighted_split_weights", None) + setattr(args, "valid_weighted_split_splits", None) + + setattr(args, "test_weighted_split_names", None) + setattr(args, "test_weighted_split_weights", None) + setattr(args, "test_weighted_split_splits", None) + + # args.split default value in the args is None it is set here in order + # to check that it does not to overlap with the 2nd mode of data loading + if args.split is None: + args.split = "969, 30, 1" + + if args.train_weighted_split_paths or args.valid_weighted_split_paths or \ + args.test_weighted_split_paths: + assert args.data_path is None and args.split is None, message + # Deprecated arguments assert args.batch_size is None, '--batch-size argument is no longer ' \ 'valid, use --micro-batch-size instead' @@ -863,6 +888,7 @@ def _add_validation_args(parser): def _add_data_args(parser): group = parser.add_argument_group(title='data and dataloader') + # option 1 for data loading (mutually exclusive with option2) group.add_argument('--data-path', nargs='*', default=None, help='Path to the training dataset. Accepted format:' '1) a single data path, 2) multiple datasets in the' @@ -873,6 +899,103 @@ def _add_data_args(parser): ' validation, and test split. For example the split ' '`90,5,5` will use 90%% of data for training, 5%% for ' 'validation and 5%% for test.') + # option 2 for data loading (mutually exclusive with option1) + # see https://github.com/bigscience-workshop/Megatron-DeepSpeed/pull/97/files + + # helper class to parse the --xxx-weighted-split-paths + # note here two args are set: extra valid dataset paths and names + class parse_data_paths(argparse.Action): + def __call__(self, parser, args, values, option_string=None): + + if option_string == "--train-weighted-split-paths": + assert len(values) == 1, 'Only 1 dataset group is allowed to' + 'be passed for the argument --train-weighted-split-paths' + + # make sure string given in the correct format + err_message = 'Each data group should be input on the following format' + '"GIVEN_NAME WEIGHT1 START:END PATH1, WEIGHT2 START:END PATH2"' + 'where START < END' + for v in values: + # each prefix consists several datasets separated by commas + prefix = ":".join(v.split(":")[1:]) # remove GIVEN_NAME + datasets = prefix.split(",") + # check if each dataset is formatted like `WEIGHT START:END PATH` + for d in datasets: + assert len(d.split()) == 3, err_message + start, end = d.split()[1].split(":") + assert float(start) < float(end), err_message + + names = [v.split(":")[0] for v in values] + + prefixes = [":".join(v.split(":")[1:]).strip() for v in values] + weights = [[d.split()[0] for d in p.split(",")] for p in prefixes] + splits = [[d.split()[1] for d in p.split(",")] for p in prefixes] + paths = [[d.split()[2] for d in p.split(",")] for p in prefixes] + + # # to keep consistency with Option 1 of data loading (through --data-path) + # # paths will contain strings on the following form + # # "WEIGHTS1 PATH1 WEIGHTS2 PATH2 WEIGHTS3 PATH3" for each dataset group + # # while data will be parsed in additional arguments below + # paths_option1_style = [] + # for p, w in zip(paths, weights): + # paths_option1_style.append(" ".join([f"{w_i} {p_i}" for p_i, w_i in zip(p,w)])) + # setattr(args, self.dest, paths_option1_style) + setattr(args, self.dest, paths) + setattr(args, self.dest.replace("paths", "weights"), weights) + setattr(args, 
self.dest.replace("paths", "splits"), splits) + setattr(args, self.dest.replace("paths","names"), names) + + + group.add_argument('--train-weighted-split-paths', nargs='*', default=None, + help='Weights, splits and paths to groups of datasets' + 'Accepted format: ONE dataset groups could be' + 'submitted in the following form between double quotes' + '"GIVEN_NAME WEIGHT1 START:END PATH1, WEIGHT2 START:END PATH2"' + 'e.g.: "NAME_ABC: 0.6 0:0.6 A, 0.3 0:1 B, 0.1 0:1 C" ' + 'WEIGHT is used to up and down sample each dataset A,B,C in the group' + 'START:END indicates the split portion of the dataset', + action=parse_data_paths) + + group.add_argument('--valid-weighted-split-paths', nargs='*', default=None, + help='Weights, splits and paths to groups of datasets' + 'Accepted format: one or many dataset groups could be' + 'submitted in the following form each between double quotes' + '"GIVEN_NAME WEIGHT1 START:END PATH1, WEIGHT2 START:END PATH2"' + 'e.g.: "NAME_ABC: 0.6 0.6:0.8 A, 0.3 0:1 B, 0.1 0:1 C" ' + '"NAME_CDE: 0.6 0.6:0.8 C, 0.3 0:1 D, 0.1 0:1 E" ' + 'validation will be run on each of those groups independently', + action=parse_data_paths) + + group.add_argument('--test-weighted-split-paths', nargs='*', default=None, + help='Weights, splits and paths to groups of datasets' + 'Accepted format: one or many dataset groups could be' + 'submitted in the following form each between double quotes' + '"GIVEN_NAME WEIGHT1 START:END PATH1, WEIGHT2 START:END PATH2"' + 'e.g.: "NAME_ABC: 0.6 0.6:0.8 A, 0.3 0:1 B, 0.1 0:1 C" ' + '"NAME_CDE: 0.6 0.6:0.8 C, 0.3 0:1 D, 0.1 0:1 E" ' + 'test will be run on each of those groups independently', + action=parse_data_paths) + + class parse_data_paths_path(argparse.Action): + def __call__(self, parser, args, values, option_string=None): + expected_option_strings = ["--train-weighted-split-paths-path", "--valid-weighted-split-paths-path", "--test-weighted-split-paths-path"] + assert option_string in expected_option_strings, f"Expected {option_string} to be in {expected_option_strings}" + + with open(values, "r") as fi: + lines = fi.readlines() + assert len(lines) == 1, f"Got multiple lines {len(lines)} instead of 1 expected" + assert lines[0][-2:] == "\"\n" and lines[0][0] == "\"", f"Invalid input format, got {lines}" + values = lines[0][1:-2].split("\" \"") + weighted_split_paths_dest = re.sub(r"_path$", "", self.dest) + weighted_split_paths_option = re.sub(r"-path$", "", self.option_strings[0]) + setattr(args, weighted_split_paths_dest, values) + parse_data_paths(option_strings=[weighted_split_paths_option], dest=weighted_split_paths_dest)(parser, args, values, option_string=weighted_split_paths_option) + + # option 2-bis: load x-weighted-split-paths from a file in case this argument is very long + group.add_argument('--train-weighted-split-paths-path', type=str, action=parse_data_paths_path ,default=None) + group.add_argument('--valid-weighted-split-paths-path', type=str, action=parse_data_paths_path, default=None) + group.add_argument('--test-weighted-split-paths-path', type=str, action=parse_data_paths_path, default=None) + group.add_argument('--vocab-file', type=str, default=None, help='Path to the vocab file.') group.add_argument('--merge-file', type=str, default=None, diff --git a/megatron/data/dataset_utils.py b/megatron/data/dataset_utils.py index 426e965c85..e6a27e6bea 100644 --- a/megatron/data/dataset_utils.py +++ b/megatron/data/dataset_utils.py @@ -41,8 +41,7 @@ DSET_TYPES = [DSET_TYPE_BERT, DSET_TYPE_ICT, DSET_TYPE_T5] -def 
get_datasets_weights_and_num_samples(data_prefix, - train_valid_test_num_samples): +def analyze_data_prefix(data_prefix): # The data prefix should be in the format of: # weight-1, data-prefix-1, weight-2, data-prefix-2, .. @@ -59,10 +58,16 @@ def get_datasets_weights_and_num_samples(data_prefix, weight_sum += weight assert weight_sum > 0.0 weights = [weight / weight_sum for weight in weights] + return prefixes, weights + + +def get_datasets_weights_and_num_samples(data_prefix, + train_valid_test_num_samples): - # Add 0.5% (the 1.005 factor) so in case the bleding dataset does + # Add 0.5% (the 1.005 factor) so in case the blending dataset does # not uniformly distribute the number of samples, we still have # samples left to feed to the network. + prefixes, weights = analyze_data_prefix(data_prefix) datasets_train_valid_test_num_samples = [] for weight in weights: datasets_train_valid_test_num_samples.append( @@ -603,6 +608,22 @@ def get_indexed_dataset_(data_prefix, data_impl, skip_warmup): return indexed_dataset +def get_split_by_range_(range_string, size): + """ Get dataset splits based on a range: + range_string is in the form START%:END% for e.g. 0.2:0.8 + outputs an array of two values [start_index, end_index] + """ + # some checks that range is given in the correct form + splits = [float(i) for i in range_string.split(":")] + assert len(splits) == 2, "splits should be passed as start:end" + assert splits[0] <= 1 and splits[1] <= 1 + splits_sum = sum(splits) + assert splits_sum > 0.0 + splits_index = [round(s * float(size)) for s in splits] + assert len(splits_index) == 2 + return splits_index + + def get_train_valid_test_split_(splits_string, size): """ Get dataset splits from comma or '/' separated string list.""" diff --git a/megatron/data/gpt_dataset.py b/megatron/data/gpt_dataset.py index 02bfad8142..e401d417f3 100644 --- a/megatron/data/gpt_dataset.py +++ b/megatron/data/gpt_dataset.py @@ -25,7 +25,7 @@ from megatron import mpu, print_rank_0, get_args, get_tokenizer from megatron.data.blendable_dataset import BlendableDataset from megatron.data.dataset_utils import get_datasets_weights_and_num_samples -from megatron.data.dataset_utils import get_train_valid_test_split_ +from megatron.data.dataset_utils import get_train_valid_test_split_, get_split_by_range_ from megatron.data.indexed_dataset import make_dataset as make_indexed_dataset from megatron.tokenizer.tokenizer import FIM_MIDDLE, FIM_PAD, FIM_PREFIX, FIM_SUFFIX @@ -37,46 +37,134 @@ def build_train_valid_test_datasets(data_prefix, data_impl, splits_string, # Single dataset. if len(data_prefix) == 1: - return _build_train_valid_test_datasets(data_prefix[0], + all_train_datasets, all_valid_datasets, all_test_datasets = _build_train_valid_test_datasets(data_prefix[0], data_impl, splits_string, train_valid_test_num_samples, seq_length, seed, skip_warmup) + # Blending dataset. + else: + + output = get_datasets_weights_and_num_samples(data_prefix, + train_valid_test_num_samples) + prefixes, weights, datasets_train_valid_test_num_samples = output + + # Build individual datasets. 
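# Worked example (made-up numbers) for the two helpers introduced above:
# get_split_by_range_("0.6:0.8", size) maps a fractional range onto document
# indices, and get_datasets_weights_and_num_samples pads each per-dataset
# sample budget by 0.5% (the 1.005 factor) so blending never exhausts a
# component dataset. These are simplified re-statements, not the library code.
import math

def split_by_range(range_string, size):
    start, end = (float(x) for x in range_string.split(":"))
    assert 0.0 <= start < end <= 1.0
    return [round(start * size), round(end * size)]

print(split_by_range("0.6:0.8", 10_000))       # [6000, 8000]

weights = [0.7, 0.3]                            # already normalized
train_valid_test_num_samples = [100_000, 2_000, 200]
budgets = [[int(math.ceil(n * w * 1.005)) for n in train_valid_test_num_samples]
           for w in weights]
print(budgets)   # about [[70350, 1407, 141], [30150, 603, 61]]; the ceil may add one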
+ train_datasets = [] + valid_datasets = [] + test_datasets = [] + for i in range(len(prefixes)): + train_ds, valid_ds, test_ds = _build_train_valid_test_datasets( + prefixes[i], data_impl, splits_string, + datasets_train_valid_test_num_samples[i], + seq_length, seed, skip_warmup) + if train_ds: + train_datasets.append(train_ds) + if valid_ds: + valid_datasets.append(valid_ds) + if test_ds: + test_datasets.append(test_ds) + + all_train_datasets = BlendableDataset(train_datasets, weights) \ + if train_datasets else None + all_valid_datasets = BlendableDataset(valid_datasets, weights) \ + if valid_datasets else None + all_test_datasets = BlendableDataset(test_datasets, weights) \ + if test_datasets else None + + return all_train_datasets, all_valid_datasets, all_test_datasets + + +def build_dataset_group(dataset_group_name, paths, weights, splits, data_impl, + train_valid_test_num_samples, + seq_length, seed, skip_warmup, train_valid_test): + ''' + Build a single dataset group corresponding to Option 2 of data loading see arguments.py + a dataset group is passed on the following form + GIVEN_NAME WEIGHT1 START:END PATH1, WEIGHT2 START:END PATH2, WEIGHT2 START:END PATH2 + or alternatively + GIVEN_NAME PATH1 # for a single dataset to be used fully + ''' + + assert train_valid_test in ["train","valid","test"] + # Single dataset. + if len(paths) == 1: + dataset = _build_single_datasets(paths[0], + splits[0], + data_impl, + train_valid_test_num_samples, + seq_length, seed, skip_warmup, + dataset_group_name, train_valid_test) + return dataset # Blending dataset. - # Parse the values. - output = get_datasets_weights_and_num_samples(data_prefix, - train_valid_test_num_samples) - prefixes, weights, datasets_train_valid_test_num_samples = output - - # Build individual datasets. - train_datasets = [] - valid_datasets = [] - test_datasets = [] - for i in range(len(prefixes)): - train_ds, valid_ds, test_ds = _build_train_valid_test_datasets( - prefixes[i], data_impl, splits_string, - datasets_train_valid_test_num_samples[i], - seq_length, seed, skip_warmup) - if train_ds: - train_datasets.append(train_ds) - if valid_ds: - valid_datasets.append(valid_ds) - if test_ds: - test_datasets.append(test_ds) - - # Blend. - blending_train_dataset = None - if train_datasets: - blending_train_dataset = BlendableDataset(train_datasets, weights) - blending_valid_dataset = None - if valid_datasets: - blending_valid_dataset = BlendableDataset(valid_datasets, weights) - blending_test_dataset = None - if test_datasets: - blending_test_dataset = BlendableDataset(test_datasets, weights) - - return (blending_train_dataset, blending_valid_dataset, - blending_test_dataset) + else: + + data_prefix = [] + # data_prefix is on the shape: + # ["WEIGHT1", "PATH1", "WEIGHT2", "PATH2", "WEIGHT3", "PATH3"] + for w,p in zip(weights, paths): + data_prefix += [w,p] + + output = get_datasets_weights_and_num_samples(data_prefix, + train_valid_test_num_samples) + prefixes, weights, datasets_train_valid_test_num_samples = output + + # Build individual datasets. 
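# Minimal sketch (illustrative only) of what the BlendableDataset weighting
# built here amounts to: each requested sample is routed to one component
# dataset in proportion to its normalized weight. The real implementation
# precomputes a deterministic blending index rather than sampling ad hoc.
import numpy as np

weights = np.array([0.7, 0.3])
rng = np.random.RandomState(1234)
routing = rng.choice(len(weights), size=1_000, p=weights)
print(np.bincount(routing) / len(routing))      # roughly [0.7, 0.3]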
+ datasets = [] + for i in range(len(prefixes)): + ds = _build_single_datasets(prefixes[i], + splits[i], + data_impl, + datasets_train_valid_test_num_samples[i], + seq_length, + seed, skip_warmup, + dataset_group_name, train_valid_test) + + datasets.append(ds) + all_datasets = BlendableDataset(datasets, weights) + + return all_datasets + +def _build_single_datasets(data_prefix, range_string, data_impl, train_valid_test_num_samples, + seq_length, seed, skip_warmup, dataset_group_name, train_valid_test): + """Build a single dataset""" + + assert train_valid_test in ["train","valid","test"] + index = ["train","valid","test"].index(train_valid_test) + + # Indexed dataset. + indexed_dataset = get_indexed_dataset_(data_prefix, + data_impl, + skip_warmup) + + total_num_of_documents = indexed_dataset.sizes.shape[0] + # this corresponds to option2 for data loading on the form + # WEIGHT1 START:END PATH1, WEIGHT2 START:END PATH2, WEIGHT3 START:END PATH3 + # splits here is an array of size 2 [start_index, end_index] + splits = get_split_by_range_(range_string=range_string, size=total_num_of_documents) + + # Print stats about the splits. + print_rank_0(' > dataset split:') + + print_rank_0(' {}:'.format(dataset_group_name)) + print_rank_0(' document indices in [{}, {}) total of {} ' + 'documents'.format(splits[0], splits[1], + splits[1] - splits[0])) + + def build_dataset(name): + dataset = None + if splits[1] > splits[0]: + documents = np.arange(start=splits[0], stop=splits[1], + step=1, dtype=np.int32) + dataset = GPTDataset(name, data_prefix, + documents, indexed_dataset, + train_valid_test_num_samples[index], + seq_length, seed) + return dataset + + dataset = build_dataset(dataset_group_name) + + return dataset def _build_train_valid_test_datasets(data_prefix, data_impl, splits_string, @@ -90,6 +178,7 @@ def _build_train_valid_test_datasets(data_prefix, data_impl, splits_string, skip_warmup) total_num_of_documents = indexed_dataset.sizes.shape[0] + # splits here is an array of size 4 [train_start_index, valid_start_index, test_start_index, test_end_index] splits = get_train_valid_test_split_(splits_string, total_num_of_documents) # Print stats about the splits. diff --git a/megatron/training.py b/megatron/training.py index 16d190472f..953831d470 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -57,6 +57,7 @@ from megatron.schedules import get_forward_backward_func from megatron.utils import report_memory from megatron.model.vision.knn_monitor import compute_feature_bank +from megatron.data.dataset_utils import analyze_data_prefix def print_datetime(string): @@ -162,11 +163,13 @@ def pretrain(train_valid_test_dataset_provider, print_datetime('after training is done') if args.do_valid: - prefix = 'the end of training for val data' - evaluate_and_print_results(prefix, forward_step_func, - valid_data_iterator, model, - iteration, process_non_loss_data_func, - False) + names = args.valid_weighted_split_names + names = names if names is not None else ['valid'] * len(valid_data_iterator) + for iterator, name in zip(valid_data_iterator, names): + prefix = 'the end of training for val data' + evaluate_and_print_results(prefix, forward_step_func, + iterator, model, + iteration, process_non_loss_data_func, False, data_group_name=name) if args.save and iteration != 0: save_checkpoint(iteration, model, optimizer, opt_param_scheduler) @@ -174,10 +177,12 @@ def pretrain(train_valid_test_dataset_provider, if args.do_test: # Run on test data. 
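# Sketch of the per-group evaluation convention used above: when no group
# names were supplied, every iterator falls back to the generic name 'valid',
# and the name feeds the tensorboard prefix used in evaluate_and_print_results
# further down. The iterators here are empty placeholders.
valid_data_iterators = [iter([]), iter([]), iter([])]
names = None                                    # e.g. args.valid_weighted_split_names
names = names if names is not None else ['valid'] * len(valid_data_iterators)

for iterator, name in zip(valid_data_iterators, names):
    print(f"lm-loss-validation/{name}")         # prints 'lm-loss-validation/valid' x3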
prefix = 'the end of training for test data' - evaluate_and_print_results(prefix, forward_step_func, - test_data_iterator, model, - 0, process_non_loss_data_func, - True) + names = args.test_weighted_split_names + names = names if names is not None else ['test'] * len(test_data_iterator) + for iterator, name in zip(test_data_iterator, names): + evaluate_and_print_results(prefix, forward_step_func, + iterator, model, + 0, process_non_loss_data_func, True, data_group_name=name) def update_train_iters(args): @@ -741,10 +746,12 @@ def train(forward_step_func, model, optimizer, opt_param_scheduler, if args.eval_interval and iteration % args.eval_interval == 0 and \ args.do_valid: prefix = 'iteration {}'.format(iteration) - evaluate_and_print_results(prefix, forward_step_func, - valid_data_iterator, model, - iteration, process_non_loss_data_func, - False) + names = args.valid_weighted_split_names + names = names if names is not None else ['valid'] * len(valid_data_iterator) + for iterator, name in zip(valid_data_iterator, names): + evaluate_and_print_results(prefix, forward_step_func, + iterator, model, + iteration, process_non_loss_data_func, False, data_group_name=name) # Checkpointing saved_checkpoint = False @@ -852,36 +859,40 @@ def evaluate(forward_step_func, def evaluate_and_print_results(prefix, forward_step_func, data_iterator, model, iteration, process_non_loss_data_func, - verbose=False): + verbose=False, data_group_name=None): """Helper function to evaluate and dump results on screen.""" args = get_args() writer = get_tensorboard_writer() + ds_name = data_group_name + # print corresponding dataset name (used for multiple validation datasets) + tf_plot_prefix = f"lm-loss-validation/{ds_name}" if ds_name else "lm-loss-validation" total_loss_dict, collected_non_loss_data = evaluate( forward_step_func, data_iterator, model, process_non_loss_data_func, verbose) - string = ' validation loss at {} | '.format(prefix) + string = '{} loss at {} | '.format(ds_name, prefix) if ds_name is not None\ + else 'validation loss at {} | '.format(prefix) for key in total_loss_dict: string += '{} value: {:.6E} | '.format(key, total_loss_dict[key].item()) ppl = math.exp(min(20, total_loss_dict[key].item())) string += '{} PPL: {:.6E} | '.format(key, ppl) if writer: - writer.add_scalar('{} validation'.format(key), + writer.add_scalar(f'{tf_plot_prefix}/{key} validation', total_loss_dict[key].item(), iteration) - writer.add_scalar('{} validation vs samples'.format(key), + writer.add_scalar(f'{tf_plot_prefix}/{key} validation vs samples', total_loss_dict[key].item(), args.consumed_train_samples) if args.log_validation_ppl_to_tensorboard: - writer.add_scalar('{} validation ppl'.format(key), ppl, + writer.add_scalar(f'{tf_plot_prefix}/{key} validation ppl', ppl, iteration) - writer.add_scalar('{} validation ppl vs samples'.format(key), + writer.add_scalar(f'{tf_plot_prefix}/{key} validation ppl vs samples', ppl, args.consumed_train_samples) # Weights and biases reporting if is_last_rank() and args.wandb_project_name: metrics = { - '{} validation'.format(key): total_loss_dict[key].item() for key in total_loss_dict + f'{tf_plot_prefix}/{key} validation': total_loss_dict[key].item() for key in total_loss_dict } wandb.log(metrics, step=iteration) @@ -904,7 +915,7 @@ def build_train_valid_test_data_iterators( """XXX""" args = get_args() - (train_dataloader, valid_dataloader, test_dataloader) = (None, None, None) + (train_dataloader, valid_dataloaders, test_dataloaders) = (None, None, None) print_rank_0('> building 
train, validation, and test datasets ...') @@ -940,21 +951,47 @@ def build_train_valid_test_data_iterators( # Build the datasets. train_ds, valid_ds, test_ds = build_train_valid_test_datasets_provider( train_val_test_num_samples) + # if dataloading option is not 2 convert to list to allow + # same interface for multiple data groups + # for validation and testing in option 2 + if type(train_ds) != list and train_ds is not None: + train_ds = [train_ds] + if type(valid_ds) != list and valid_ds is not None: + valid_ds = [valid_ds] + if type(test_ds) != list and test_ds is not None: + test_ds = [test_ds] # Build dataloders. + assert len(train_ds) == 1, "only one training dataset group is allowed" + + # train_dataloader is a single item while valid_dataloaders + # and test_dataloaders are arrays train_dataloader = build_pretraining_data_loader( - train_ds, args.consumed_train_samples) - valid_dataloader = build_pretraining_data_loader( - valid_ds, args.consumed_valid_samples) - test_dataloader = build_pretraining_data_loader(test_ds, 0) + train_ds[0], args.consumed_train_samples) + + # We collapse None and empty list as both should mean we don't run validation + # args.consumed_valid_samples accumulates the sum of valid steps for every dataset, which are all equal + # + # XXX: we get a deadlock in the dataloader on multi-dataset eval, after the first dataset, + # possibly due to this bug in pytorch https://github.com/pytorch/pytorch/pull/25158. Using + # num_workers=0 to work around it - the training can't use that since it impacts throughput + # by a few percent + valid_dataloaders = [build_pretraining_data_loader(d, args.consumed_valid_samples // len(valid_ds), num_workers=args.valid_num_workers) + for d in valid_ds] \ + if valid_ds is not None else [] + # We collapse None and empty list as both should mean we don't run test + test_dataloaders = [build_pretraining_data_loader(d, 0) for d in test_ds] \ + if test_ds is not None else [] # Flags to know if we need to do training/validation/testing. do_train = train_dataloader is not None and args.train_iters > 0 - do_valid = valid_dataloader is not None and args.eval_iters > 0 - do_test = test_dataloader is not None and args.eval_iters > 0 + # Need to broadcast num_tokens and num_type_tokens. - flags = torch.cuda.LongTensor( - [int(do_train), int(do_valid), int(do_test)]) + flags = torch.cuda.LongTensor([ + int(do_train), + len(valid_dataloaders) if args.eval_iters > 0 else 0, # eval_iters == 0 is equivalent to having no validation + len(test_dataloaders) if args.eval_iters > 0 else 0, # eval_iters == 0 is equivalent to having no test + ]) else: flags = torch.cuda.LongTensor([0, 0, 0]) @@ -963,8 +1000,12 @@ def build_train_valid_test_data_iterators( mpu.get_tensor_model_parallel_src_rank(), group=mpu.get_tensor_model_parallel_group()) args.do_train = flags[0].item() - args.do_valid = flags[1].item() - args.do_test = flags[2].item() + num_valid_ds = flags[1].item() + num_test_ds = flags[2].item() + assert num_test_ds >= 0 + assert num_valid_ds >= 0 + args.do_valid = num_valid_ds > 0 + args.do_test = num_test_ds > 0 # Build iterators. 
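# Sketch of the flags handshake just above: one rank encodes do_train plus the
# number of validation/test dataloader groups as integers, broadcasts the
# tensor, and every rank decodes the same booleans and counts. The broadcast is
# guarded here so the snippet also runs without a distributed process group
# (the real code uses a CUDA tensor and the tensor-model-parallel source rank).
import torch

do_train, num_valid_groups, num_test_groups = True, 5, 5
flags = torch.LongTensor([int(do_train), num_valid_groups, num_test_groups])

if torch.distributed.is_available() and torch.distributed.is_initialized():
    torch.distributed.broadcast(flags, src=0)

do_train = bool(flags[0].item())
do_valid = flags[1].item() > 0
do_test = flags[2].item() > 0
print(do_train, do_valid, do_test)              # True True True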
dl_type = args.dataloader_type @@ -976,16 +1017,18 @@ def build_train_valid_test_data_iterators( else: train_data_iterator = None - if valid_dataloader is not None: - valid_data_iterator = iter(valid_dataloader) if dl_type == 'single' \ - else iter(cyclic_iter(valid_dataloader)) + if valid_dataloaders is not None: + valid_data_iterators = [iter(vdl) if dl_type == 'single' \ + else iter(cyclic_iter(valid_dataloaders)) + for vdl in valid_dataloaders] else: - valid_data_iterator = None + valid_data_iterators = [None] * num_valid_ds - if test_dataloader is not None: - test_data_iterator = iter(test_dataloader) if dl_type == 'single' \ - else iter(cyclic_iter(test_dataloader)) + if test_dataloaders is not None: + test_data_iterators = [iter(tdl) if dl_type == 'single' \ + else iter(cyclic_iter(test_dataloaders)) + for tdl in test_dataloaders] else: - test_data_iterator = None + test_data_iterators = [None] * num_test_ds - return train_data_iterator, valid_data_iterator, test_data_iterator + return train_data_iterator, valid_data_iterators, test_data_iterators diff --git a/pretrain_gpt.py b/pretrain_gpt.py index b6d09a8da5..67f8e836cd 100644 --- a/pretrain_gpt.py +++ b/pretrain_gpt.py @@ -22,7 +22,7 @@ from megatron import get_timers from megatron import get_tokenizer from megatron import mpu -from megatron.data.gpt_dataset import build_train_valid_test_datasets +from megatron.data.gpt_dataset import build_train_valid_test_datasets, build_dataset_group from megatron.model import GPTModel, ModelType from megatron.training import pretrain from megatron.utils import get_ltor_masks_and_position_ids @@ -103,17 +103,49 @@ def forward_step(data_iterator, model): def train_valid_test_datasets_provider(train_val_test_num_samples): """Build train, valid, and test datasets.""" args = get_args() + train_ds, valid_ds, test_ds = None, None, None print_rank_0('> building train, validation, and test datasets ' 'for GPT ...') - train_ds, valid_ds, test_ds = build_train_valid_test_datasets( - data_prefix=args.data_path, - data_impl=args.data_impl, - splits_string=args.split, - train_valid_test_num_samples=train_val_test_num_samples, - seq_length=args.seq_length, - seed=args.seed, - skip_warmup=(not args.mmap_warmup)) + # Option 1 of data loading using --data-path + if args.data_path: + train_ds, valid_ds, test_ds = build_train_valid_test_datasets( + data_prefix=args.data_path, + data_impl=args.data_impl, + splits_string=args.split, + train_valid_test_num_samples=train_val_test_num_samples, + seq_length=args.seq_length, + seed=args.seed, + skip_warmup=(not args.mmap_warmup)) + # Option 2 of data loading using --(train|valid|test)-weighted-split-paths + elif args.train_weighted_split_paths: + assigned_train_valid_test = [] + if args.train_weighted_split_paths is not None: + train_ds = [] + assigned_train_valid_test.append("train") + if args.valid_weighted_split_paths is not None: + valid_ds = [] + assigned_train_valid_test.append("valid") + if args.test_weighted_split_paths is not None: + test_ds = [] + assigned_train_valid_test.append("test") + + for s in assigned_train_valid_test: + data_groups = zip(eval(f"args.{s}_weighted_split_paths"), + eval(f"args.{s}_weighted_split_weights"), + eval(f"args.{s}_weighted_split_splits"), + eval(f"args.{s}_weighted_split_names")) + for paths, weights, splits, name in data_groups: + d = build_dataset_group(name, paths, weights, splits, + args.data_impl, + train_val_test_num_samples, + args.seq_length, args.seed, + (not args.mmap_warmup), + train_valid_test=s) + 
eval(f"{s}_ds").append(d) + else: + raise NotImplementedError("No dataloading argument passed") + print_rank_0("> finished creating GPT datasets ...") return train_ds, valid_ds, test_ds From c73ff5c30e48f475ab62e1f74bfa50f55379bb24 Mon Sep 17 00:00:00 2001 From: Raymond Li Date: Thu, 9 Mar 2023 22:38:41 +0000 Subject: [PATCH 079/144] add subset argument to preprocessing --- tools/preprocess_data.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/tools/preprocess_data.py b/tools/preprocess_data.py index 5810d29297..2355023852 100644 --- a/tools/preprocess_data.py +++ b/tools/preprocess_data.py @@ -105,6 +105,8 @@ def get_args(): group = parser.add_argument_group(title='input data') group.add_argument('--input', type=str, required=True, help='Path to input JSON') + group.add_argument('--subset', type=str, default=None, + help='Subset argument when loading input data from a HuggingFace dataset') group.add_argument('--json-keys', nargs='+', default=['text'], help='space separate listed of keys to extract from json') group.add_argument('--split-sentences', action='store_true', @@ -170,12 +172,17 @@ def main(): if args.input.endswith(".jsonl"): print("Input is a jsonl file") + assert args.subset is None, f"subset argument set to: {args.subset}, but loading a jsonl file." fin = open(args.input, 'r', encoding='utf-8') encoded_docs = pool.imap(encoder.encode, fin, args.chunk_size) #encoded_docs = map(encoder.encode, fin) else: + # NOTE: this is not recommended for datasets larger than 40-50GB, as iterating through a dataset can be slow. + # Somehow, it seems faster to first dump the dataset to a jsonl file: ds.to_json() and then process the jsonl file. + # NOTE: this will be even slower if the dataset has large objects in other columns. + # In this case, it is recommended to dump as json only the required key: ds = ds.remove_columns(...) then to_json() print("Input is not a jsonl file, will try to load from HF datasets") - ds = load_dataset(args.input, use_auth_token=True, streaming=True, split="train") + ds = load_dataset(args.input, use_auth_token=True, streaming=True, split="train", data_dir=args.subset) encoded_docs = pool.imap(encoder.encode_hf, ds, args.chunk_size) level = "document" From 1d7768aac69debbd6a7e95e51526935e1f929e9b Mon Sep 17 00:00:00 2001 From: Raymond Li Date: Thu, 9 Mar 2023 18:43:52 -0500 Subject: [PATCH 080/144] add valid-num-workers argument --- megatron/arguments.py | 4 +++- megatron/data/data_samplers.py | 5 +++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/megatron/arguments.py b/megatron/arguments.py index 057e626a8c..9b6a90321d 100644 --- a/megatron/arguments.py +++ b/megatron/arguments.py @@ -894,7 +894,7 @@ def _add_data_args(parser): '1) a single data path, 2) multiple datasets in the' 'form: dataset1-weight dataset1-path dataset2-weight ' 'dataset2-path ...') - group.add_argument('--split', type=str, default='969, 30, 1', + group.add_argument('--split', type=str, default=None, help='Comma-separated list of proportions for training,' ' validation, and test split. 
For example the split ' '`90,5,5` will use 90%% of data for training, 5%% for ' @@ -1026,6 +1026,8 @@ def __call__(self, parser, args, values, option_string=None): help='Warm up mmap files.') group.add_argument('--num-workers', type=int, default=2, help="Dataloader number of workers.") + group.add_argument('--valid-num-workers', type=int, default=2, + help="Dataloader number of workers for validation.") group.add_argument('--tokenizer-type', type=str, default=None, choices=['BertWordPieceLowerCase', diff --git a/megatron/data/data_samplers.py b/megatron/data/data_samplers.py index 2efef42bf4..91de3ac224 100644 --- a/megatron/data/data_samplers.py +++ b/megatron/data/data_samplers.py @@ -24,7 +24,7 @@ from megatron import mpu -def build_pretraining_data_loader(dataset, consumed_samples): +def build_pretraining_data_loader(dataset, consumed_samples, num_workers=None): """Buld dataloader given an input dataset.""" if dataset is None: @@ -52,10 +52,11 @@ def build_pretraining_data_loader(dataset, consumed_samples): raise Exception('{} dataloader type is not supported.'.format( args.dataloader_type)) + num_workers = args.num_workers if num_workers is None else num_workers # Torch dataloader. return torch.utils.data.DataLoader(dataset, batch_sampler=batch_sampler, - num_workers=args.num_workers, + num_workers=num_workers, pin_memory=True) class MegatronPretrainingSampler: From 30328216c26e833f5094a8e21bbdecb0ea667250 Mon Sep 17 00:00:00 2001 From: Raymond Li Date: Thu, 9 Mar 2023 19:47:01 -0500 Subject: [PATCH 081/144] change fim special tokens to use underscore --- megatron/tokenizer/tokenizer.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/megatron/tokenizer/tokenizer.py b/megatron/tokenizer/tokenizer.py index f9cad7b642..87aeb512f4 100644 --- a/megatron/tokenizer/tokenizer.py +++ b/megatron/tokenizer/tokenizer.py @@ -23,10 +23,10 @@ from .gpt2_tokenization import GPT2Tokenizer -FIM_PREFIX = "" -FIM_MIDDLE = "" -FIM_SUFFIX = "" -FIM_PAD = "" +FIM_PREFIX = "" +FIM_MIDDLE = "" +FIM_SUFFIX = "" +FIM_PAD = "" EOD = "<|endoftext|>" From 497832129b9ddc5a971ca3b5d70a4084eea51310 Mon Sep 17 00:00:00 2001 From: Raymond Li Date: Thu, 9 Mar 2023 20:16:34 -0500 Subject: [PATCH 082/144] log tflops --- megatron/training.py | 27 ++++++++++++++++----------- megatron/utils.py | 26 ++++++++++++++++++++++++++ 2 files changed, 42 insertions(+), 11 deletions(-) diff --git a/megatron/training.py b/megatron/training.py index 953831d470..4c3c5fff05 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -50,7 +50,7 @@ from megatron.initialize import set_jit_fusion_options from megatron.optimizer_param_scheduler import OptimizerParamScheduler from megatron.model import DistributedDataParallel as LocalDDP -from megatron.utils import check_adlr_autoresume_termination +from megatron.utils import check_adlr_autoresume_termination, get_tflops from megatron.utils import unwrap_model from megatron.data.data_samplers import build_pretraining_data_loader from megatron.utils import calc_params_l2_norm @@ -615,24 +615,17 @@ def add_to_logging(name): iteration, ) - # Weights and biases reporting - if (iteration % args.log_interval == 0) and is_last_rank() and args.wandb_project_name: - metrics = { - 'learning-rate': learning_rate, - 'samples': args.consumed_train_samples, - 'loss-scale': loss_scale, - 'grad-norm': grad_norm, - **loss_dict - } - wandb.log(metrics, step=iteration) if iteration % args.log_interval == 0: elapsed_time = timers('interval-time').elapsed() elapsed_time_per_iteration = 
elapsed_time / total_iterations + tflops = get_tflops(batch_size, elapsed_time_per_iteration) if writer: if args.log_timers_to_tensorboard: writer.add_scalar('iteration-time', elapsed_time_per_iteration, iteration) + writer.add_scalar('TFLOPs per gpu (estimated)', + tflops, iteration) log_string = ' iteration {:8d}/{:8d} |'.format( iteration, args.train_iters) log_string += ' consumed samples: {:12d} |'.format( @@ -660,6 +653,7 @@ def add_to_logging(name): total_loss_dict[skipped_iters_key]) log_string += ' number of nan iterations: {:3d} |'.format( total_loss_dict[nan_iters_key]) + log_string += ' TFLOPs: {:.2f} |'.format(tflops) total_loss_dict[advanced_iters_key] = 0 total_loss_dict[skipped_iters_key] = 0 total_loss_dict[nan_iters_key] = 0 @@ -670,6 +664,17 @@ def add_to_logging(name): report_memory_flag = False timers.log(timers_to_log, normalizer=args.log_interval) + # Weights and biases reporting + if (iteration % args.log_interval == 0) and is_last_rank() and args.wandb_project_name: + metrics = { + 'learning-rate': learning_rate, + 'samples': args.consumed_train_samples, + 'loss-scale': loss_scale, + 'grad-norm': grad_norm, + 'tflops': tflops, + **loss_dict + } + wandb.log(metrics, step=iteration) return report_memory_flag diff --git a/megatron/utils.py b/megatron/utils.py index 02956070c4..01012e9efd 100644 --- a/megatron/utils.py +++ b/megatron/utils.py @@ -223,3 +223,29 @@ def print_rank_last(message): print(message, flush=True) else: print(message, flush=True) + + +def get_tflops(batch_size, elapsed_time_per_iteration): + """Get tflop/s/GPU from global-batch-size and elapsed-time""" + args = get_args() + seq_len = args.seq_length + hidden_size = args.hidden_size + num_layers = args.num_layers + vocab_size = args.padded_vocab_size + + # Compute throughput. + samples_per_sec = batch_size / elapsed_time_per_iteration + tokens_per_sec = samples_per_sec * seq_len + + # General TFLOPs formula (borrowed from Equation 3 in Section 5.1 of + # https://arxiv.org/pdf/2104.04473.pdf). + # The factor of 4 is when used with activation check-pointing, + # otherwise it will be 3, but for 200B model, activation check-pointing will always be on. + checkpoint_activations_factor = 4 if args.recompute_granularity == 'full' else 3 + # GLU activations double the hidden states in the upscaling feed-forward in each transformer layer + # This leads to 16bsh^2 instead of 8bsh^2 per first feed-forward layer in MLP, thus we increase the coefficient by 8. + # Refer to https://github.com/bigscience-workshop/Megatron-DeepSpeed/pull/283#issue-1260805063 for more details. + coefficient = 32 if args.glu_activation else 24 + flops_per_iteration = (coefficient * checkpoint_activations_factor * batch_size * seq_len * num_layers * (hidden_size**2)) * (1. + (seq_len / (6. * hidden_size)) + (vocab_size / (16. 
* num_layers * hidden_size))) + tflops = flops_per_iteration / (elapsed_time_per_iteration * args.world_size * (10**12)) + return tflops From 3a6286ba11181899cccfb11d2e508eca9fd15bea Mon Sep 17 00:00:00 2001 From: Raymond Li Date: Fri, 10 Mar 2023 02:15:05 -0500 Subject: [PATCH 083/144] make assert less strict for very small datasets (typically when one epoch is less than 5 samples) --- megatron/data/gpt_dataset.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/megatron/data/gpt_dataset.py b/megatron/data/gpt_dataset.py index e401d417f3..3495713ed5 100644 --- a/megatron/data/gpt_dataset.py +++ b/megatron/data/gpt_dataset.py @@ -385,7 +385,10 @@ def _build_index_mappings(name, data_prefix, documents, sizes, assert last_epoch_num_samples >= 0, \ 'last epoch number of samples should be non-negative.' num_samples_per_epoch = (tokens_per_epoch - 1) // seq_length - assert last_epoch_num_samples < (num_samples_per_epoch + 1), \ + # For very small datasets, `last_epoch_num_samples` can be equal to + # (num_samples_per_epoch + 1). + # TODO: check that this is not problematic indeed + assert last_epoch_num_samples <= (num_samples_per_epoch + 1), \ 'last epoch number of samples exceeded max value.' # If we have less than 80% of the samples for the last epoch, # seperate out the epoch and treat it differently. From a950409b833a3378367b0a7e70f08f430ffd8049 Mon Sep 17 00:00:00 2001 From: Raymond Li Date: Fri, 10 Mar 2023 03:00:49 -0500 Subject: [PATCH 084/144] fix fim for new tokenizer --- megatron/data/gpt_dataset.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/megatron/data/gpt_dataset.py b/megatron/data/gpt_dataset.py index 3495713ed5..e9d85997fc 100644 --- a/megatron/data/gpt_dataset.py +++ b/megatron/data/gpt_dataset.py @@ -294,7 +294,10 @@ def __getitem__(self, idx): assert (fim_rate <= 1 and fim_rate >= 0), "FIM rate must be a probability 0 <= rate <= 1" eod = self.tokenizer.eod - pad = self.tokenizer.special_tokens[FIM_PAD] + try: + pad = self.tokenizer.special_tokens[FIM_PAD] + except KeyError: + pad = self.tokenizer.vocab[FIM_PAD] segment_breaks = np.argwhere(sample == eod) # split sample by document @@ -587,8 +590,10 @@ def permute(sample, np_rng, args, tokenizer, truncate_or_pad=True): """ fim_rate = args.fim_rate - suffix_tok_id, prefix_tok_id, middle_tok_id, pad_tok_id = (tokenizer.special_tokens[tok] for tok in [FIM_SUFFIX, FIM_PREFIX, FIM_MIDDLE, FIM_PAD]) - + try: + suffix_tok_id, prefix_tok_id, middle_tok_id, pad_tok_id = (tokenizer.special_tokens[tok] for tok in [FIM_SUFFIX, FIM_PREFIX, FIM_MIDDLE, FIM_PAD]) + except KeyError: + suffix_tok_id, prefix_tok_id, middle_tok_id, pad_tok_id = (tokenizer.vocab[tok] for tok in [FIM_SUFFIX, FIM_PREFIX, FIM_MIDDLE, FIM_PAD]) if np_rng.binomial(1, fim_rate): # sample bernoulli dist contents = tokenizer.detokenize(sample) From 294ef35f2f23017a6aa7c0103f6f08d98601867e Mon Sep 17 00:00:00 2001 From: Raymond Li Date: Fri, 10 Mar 2023 03:12:46 -0500 Subject: [PATCH 085/144] fix fim --- megatron/data/gpt_dataset.py | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/megatron/data/gpt_dataset.py b/megatron/data/gpt_dataset.py index e9d85997fc..7ff95c661e 100644 --- a/megatron/data/gpt_dataset.py +++ b/megatron/data/gpt_dataset.py @@ -247,6 +247,11 @@ def __init__(self, name, data_prefix, documents, indexed_dataset, self.args = get_args() self.tokenizer = get_tokenizer() self.np_rng = np.random.RandomState(seed=seed) # rng state for FIM + + try: + 
self.suffix_tok_id, self.prefix_tok_id, self.middle_tok_id, self.pad_tok_id = (self.tokenizer.special_tokens[tok] for tok in [FIM_SUFFIX, FIM_PREFIX, FIM_MIDDLE, FIM_PAD]) + except KeyError: + self.suffix_tok_id, self.prefix_tok_id, self.middle_tok_id, self.pad_tok_id = (self.tokenizer.vocab[tok] for tok in [FIM_SUFFIX, FIM_PREFIX, FIM_MIDDLE, FIM_PAD]) def __len__(self): # -1 is due to data structure used to retieve the index: @@ -294,11 +299,6 @@ def __getitem__(self, idx): assert (fim_rate <= 1 and fim_rate >= 0), "FIM rate must be a probability 0 <= rate <= 1" eod = self.tokenizer.eod - try: - pad = self.tokenizer.special_tokens[FIM_PAD] - except KeyError: - pad = self.tokenizer.vocab[FIM_PAD] - segment_breaks = np.argwhere(sample == eod) # split sample by document if segment_breaks.shape != (0, 1): # then there is an EOD token in this example @@ -309,25 +309,28 @@ def __getitem__(self, idx): if loc - curr_start_position > 0: # permute {prefix, suffix, middle} or {suffix, prefix, middle} permuted, self.np_rng = \ - permute(sample[curr_start_position:loc], self.np_rng, self.args, self.tokenizer, truncate_or_pad=False) + permute(sample[curr_start_position:loc], self.np_rng, self.args, self.tokenizer, truncate_or_pad=False, + suffix_tok_id=self.suffix_tok_id, prefix_tok_id=self.prefix_tok_id, middle_tok_id=self.middle_tok_id, pad_tok_id=self.pad_tok_id) new_samples += [permuted, [eod]] curr_start_position = loc + 1 # jump over the EOD token # Permute the segment after the last EOD permuted, self.np_rng = \ - permute(sample[curr_start_position:], self.np_rng, self.args, self.tokenizer, truncate_or_pad=False) + permute(sample[curr_start_position:], self.np_rng, self.args, self.tokenizer, truncate_or_pad=False, + suffix_tok_id=self.suffix_tok_id, prefix_tok_id=self.prefix_tok_id, middle_tok_id=self.middle_tok_id, pad_tok_id=self.pad_tok_id) new_samples.append(permuted) sample = np.concatenate(new_samples) else: - sample, self.np_rng = permute(sample, self.np_rng, self.args, self.tokenizer, truncate_or_pad=False) + sample, self.np_rng = permute(sample, self.np_rng, self.args, self.tokenizer, truncate_or_pad=False, + suffix_tok_id=self.suffix_tok_id, prefix_tok_id=self.prefix_tok_id, middle_tok_id=self.middle_tok_id, pad_tok_id=self.pad_tok_id) # Truncate or pad sequence to max-length diff = sample.shape[0] - sample_len if diff > 0: # too long sample = sample[:sample_len] elif diff < 0: # too short - sample = np.concatenate([sample, np.full((-1 * diff), pad)]) + sample = np.concatenate([sample, np.full((-1 * diff), self.pad_tok_id)]) assert sample.shape[0] == sample_len # end FIM-specific code @@ -583,17 +586,14 @@ def _build_shuffle_idx(num_samples, total_size, np_rng): # From https://github.com/EleutherAI/gpt-neox/blob/FIM-clean/megatron/data/gpt2_dataset.py#L339 -def permute(sample, np_rng, args, tokenizer, truncate_or_pad=True): +def permute(sample, np_rng, args, tokenizer, truncate_or_pad=True, + suffix_tok_id=None, prefix_tok_id=None, middle_tok_id=None, pad_tok_id=None): """ Take in a sample (np array w/ size (0,chunklength)) and perform a FIM transformation on it. Maintain the same sample length (if transform creates a few extra tokens, drop them). 
""" fim_rate = args.fim_rate - try: - suffix_tok_id, prefix_tok_id, middle_tok_id, pad_tok_id = (tokenizer.special_tokens[tok] for tok in [FIM_SUFFIX, FIM_PREFIX, FIM_MIDDLE, FIM_PAD]) - except KeyError: - suffix_tok_id, prefix_tok_id, middle_tok_id, pad_tok_id = (tokenizer.vocab[tok] for tok in [FIM_SUFFIX, FIM_PREFIX, FIM_MIDDLE, FIM_PAD]) if np_rng.binomial(1, fim_rate): # sample bernoulli dist contents = tokenizer.detokenize(sample) From 9ce961178d4dbf920ac9ed0d5443e9ebff0bd6da Mon Sep 17 00:00:00 2001 From: Raymond Li Date: Fri, 10 Mar 2023 10:17:00 -0500 Subject: [PATCH 086/144] more explicit error when trying to create empty splits --- megatron/data/gpt_dataset.py | 4 ++++ pretrain_gpt.py | 2 ++ 2 files changed, 6 insertions(+) diff --git a/megatron/data/gpt_dataset.py b/megatron/data/gpt_dataset.py index 7ff95c661e..8b218ba50f 100644 --- a/megatron/data/gpt_dataset.py +++ b/megatron/data/gpt_dataset.py @@ -120,6 +120,10 @@ def build_dataset_group(dataset_group_name, paths, weights, splits, data_impl, seed, skip_warmup, dataset_group_name, train_valid_test) + # ds can be none if the dataset is so small that not a single document + # is present in the split. + assert ds is not None, \ + f"Got an empty split when trying to create dataset: {prefixes[i], splits[i]}" datasets.append(ds) all_datasets = BlendableDataset(datasets, weights) diff --git a/pretrain_gpt.py b/pretrain_gpt.py index 67f8e836cd..f01635b210 100644 --- a/pretrain_gpt.py +++ b/pretrain_gpt.py @@ -142,6 +142,8 @@ def train_valid_test_datasets_provider(train_val_test_num_samples): args.seq_length, args.seed, (not args.mmap_warmup), train_valid_test=s) + assert d is not None, \ + f"Got an empty split when trying to create dataset: {paths, weights, splits, name}" eval(f"{s}_ds").append(d) else: raise NotImplementedError("No dataloading argument passed") From 654d0d8ab41e7a1edcd431d8f0f30aa79a5e486c Mon Sep 17 00:00:00 2001 From: RaymondLi0 Date: Tue, 21 Mar 2023 18:25:31 +0100 Subject: [PATCH 087/144] add multi-validation for gpt training (#32) * add multi-validation for gpt training * add valid-num-workers argument --- examples/pretrain_gpt_multilingual.sh | 65 +++++++++++ megatron/arguments.py | 127 +++++++++++++++++++- megatron/data/data_samplers.py | 5 +- megatron/data/dataset_utils.py | 27 ++++- megatron/data/gpt_dataset.py | 161 ++++++++++++++++++++------ megatron/training.py | 123 +++++++++++++------- pretrain_gpt.py | 50 ++++++-- 7 files changed, 467 insertions(+), 91 deletions(-) create mode 100644 examples/pretrain_gpt_multilingual.sh diff --git a/examples/pretrain_gpt_multilingual.sh b/examples/pretrain_gpt_multilingual.sh new file mode 100644 index 0000000000..5edebe770d --- /dev/null +++ b/examples/pretrain_gpt_multilingual.sh @@ -0,0 +1,65 @@ +#! 
/bin/bash + +# Runs the "345M" parameter model + +GPUS_PER_NODE=8 +MASTER_ADDR=localhost +MASTER_PORT=6000 +NNODES=1 # Adjust +NODE_RANK=0 # Adjust +WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) + +DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT" + +# paths to multilingual preprocessed datasets +DATA_PATH_EN=_text_document +DATA_PATH_AR=_text_document +DATA_PATH_KR=_text_document +DATA_PATH_JP=_text_document + +CHECKPOINT_PATH= + + +torchrun $DISTRIBUTED_ARGS \ + pretrain_gpt.py \ + --num-layers 24 \ + --hidden-size 1024 \ + --num-attention-heads 16 \ + --micro-batch-size 4 \ + --global-batch-size 8 \ + --seq-length 1024 \ + --max-position-embeddings 1024 \ + --train-iters 1000 \ + --lr-decay-iters 320000 \ + --save $CHECKPOINT_PATH \ + --load $CHECKPOINT_PATH \ + --train-weighted-split-paths "TRAIN: 0.3 0:0.6 $DATA_EN 1 0:0.6 $DATA_AR 1 0:0.6 $DATA_KR 1 0:0.6 $DATA_JP" \ + --valid-weighted-split-paths \ + "VALID_EN: 1 0.6:0.8 $DATA_EN" \ + "VALID_AR: 1 0.6:0.8 $DATA_AR" \ + "VALID_JP: 1 0.6:0.8 $DATA_KR" \ + "VALID_KR: 1 0.6:0.8 $DATA_JP" \ + "VALID_EN-AR-JP-KR_BALANCED: 1 0.6:0.8 $DATA_EN, 1 0.6:0.8 $DATA_AR, 1 0.6:0.8 $DATA_JP, 1 0.6:0.8 $DATA_KR" \ + --test-weighted-split-paths \ + "TEST_EN: 1 0.8:1 $DATA_EN" \ + "TEST_AR: 1 0.8:1 $DATA_AR" \ + "TEST_JP: 1 0.8:1 $DATA_JP" \ + "TEST_KR: 1 0.8:1 $DATA_KR" \ + "TEST_EN-AR-JP-KR_BALANCED: 1 0.8:1 $DATA_EN, 1 0.8:1 $DATA_AR, 1 0.8:1 $DATA_JP, 1 0.8:1 $DATA_KR" \ + --vocab-file gpt2-vocab.json \ + --merge-file gpt2-merges.txt \ + --data-impl mmap \ + --split 949,50,1 \ + --distributed-backend nccl \ + --lr 0.00015 \ + --min-lr 1.0e-5 \ + --lr-decay-style cosine \ + --weight-decay 1e-2 \ + --clip-grad 1.0 \ + --lr-warmup-fraction .01 \ + --checkpoint-activations \ + --log-interval 100 \ + --save-interval 10000 \ + --eval-interval 1000 \ + --eval-iters 10 \ + --fp16 diff --git a/megatron/arguments.py b/megatron/arguments.py index 1bff85c0a9..9b6a90321d 100644 --- a/megatron/arguments.py +++ b/megatron/arguments.py @@ -17,6 +17,7 @@ import argparse import os +import re import torch @@ -100,6 +101,30 @@ def validate_args(args, defaults={}): ' to be less than pipeline model parallel size ({})'.format( args.pipeline_model_parallel_size) + # --data-path and --train-weighted-splits-paths + message = "Data loading Mode 1: --data-path and --split "\ + "and Mode 2: --(train|valid|test)-weighted-split-paths"\ + "are mutually exclusive i.e. cannot be set together." 
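For reference, each dataset-group string passed to `--train/valid/test-weighted-split-paths` follows the format `GIVEN_NAME: WEIGHT START:END PATH, ...`. A minimal standalone sketch of how such a string decomposes, mirroring the `parse_data_paths` argparse action introduced below (the real action keeps weights as strings, validates `START < END`, and normalizes weights later; the dataset paths here are hypothetical):

```python
def parse_dataset_group(group: str):
    """Split 'NAME: W1 START:END PATH1, W2 START:END PATH2' into its parts."""
    name, _, rest = group.partition(":")
    entries = [d.split() for d in rest.split(",")]
    weights = [float(w) for w, _, _ in entries]
    split_ranges = [s for _, s, _ in entries]
    paths = [p for _, _, p in entries]
    return name, weights, split_ranges, paths

name, weights, split_ranges, paths = parse_dataset_group(
    "VALID_EN-AR_BALANCED: 1 0.6:0.8 /data/en_text_document, 1 0.6:0.8 /data/ar_text_document")
print(name)          # VALID_EN-AR_BALANCED
print(weights)       # [1.0, 1.0]
print(split_ranges)  # ['0.6:0.8', '0.6:0.8']
print(paths)         # ['/data/en_text_document', '/data/ar_text_document']
```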
+ + if args.data_path: + assert args.train_weighted_split_paths is None, message + setattr(args, "valid_weighted_split_names", None) + setattr(args, "valid_weighted_split_weights", None) + setattr(args, "valid_weighted_split_splits", None) + + setattr(args, "test_weighted_split_names", None) + setattr(args, "test_weighted_split_weights", None) + setattr(args, "test_weighted_split_splits", None) + + # args.split default value in the args is None it is set here in order + # to check that it does not to overlap with the 2nd mode of data loading + if args.split is None: + args.split = "969, 30, 1" + + if args.train_weighted_split_paths or args.valid_weighted_split_paths or \ + args.test_weighted_split_paths: + assert args.data_path is None and args.split is None, message + # Deprecated arguments assert args.batch_size is None, '--batch-size argument is no longer ' \ 'valid, use --micro-batch-size instead' @@ -863,16 +888,114 @@ def _add_validation_args(parser): def _add_data_args(parser): group = parser.add_argument_group(title='data and dataloader') + # option 1 for data loading (mutually exclusive with option2) group.add_argument('--data-path', nargs='*', default=None, help='Path to the training dataset. Accepted format:' '1) a single data path, 2) multiple datasets in the' 'form: dataset1-weight dataset1-path dataset2-weight ' 'dataset2-path ...') - group.add_argument('--split', type=str, default='969, 30, 1', + group.add_argument('--split', type=str, default=None, help='Comma-separated list of proportions for training,' ' validation, and test split. For example the split ' '`90,5,5` will use 90%% of data for training, 5%% for ' 'validation and 5%% for test.') + # option 2 for data loading (mutually exclusive with option1) + # see https://github.com/bigscience-workshop/Megatron-DeepSpeed/pull/97/files + + # helper class to parse the --xxx-weighted-split-paths + # note here two args are set: extra valid dataset paths and names + class parse_data_paths(argparse.Action): + def __call__(self, parser, args, values, option_string=None): + + if option_string == "--train-weighted-split-paths": + assert len(values) == 1, 'Only 1 dataset group is allowed to' + 'be passed for the argument --train-weighted-split-paths' + + # make sure string given in the correct format + err_message = 'Each data group should be input on the following format' + '"GIVEN_NAME WEIGHT1 START:END PATH1, WEIGHT2 START:END PATH2"' + 'where START < END' + for v in values: + # each prefix consists several datasets separated by commas + prefix = ":".join(v.split(":")[1:]) # remove GIVEN_NAME + datasets = prefix.split(",") + # check if each dataset is formatted like `WEIGHT START:END PATH` + for d in datasets: + assert len(d.split()) == 3, err_message + start, end = d.split()[1].split(":") + assert float(start) < float(end), err_message + + names = [v.split(":")[0] for v in values] + + prefixes = [":".join(v.split(":")[1:]).strip() for v in values] + weights = [[d.split()[0] for d in p.split(",")] for p in prefixes] + splits = [[d.split()[1] for d in p.split(",")] for p in prefixes] + paths = [[d.split()[2] for d in p.split(",")] for p in prefixes] + + # # to keep consistency with Option 1 of data loading (through --data-path) + # # paths will contain strings on the following form + # # "WEIGHTS1 PATH1 WEIGHTS2 PATH2 WEIGHTS3 PATH3" for each dataset group + # # while data will be parsed in additional arguments below + # paths_option1_style = [] + # for p, w in zip(paths, weights): + # paths_option1_style.append(" ".join([f"{w_i} 
{p_i}" for p_i, w_i in zip(p,w)])) + # setattr(args, self.dest, paths_option1_style) + setattr(args, self.dest, paths) + setattr(args, self.dest.replace("paths", "weights"), weights) + setattr(args, self.dest.replace("paths", "splits"), splits) + setattr(args, self.dest.replace("paths","names"), names) + + + group.add_argument('--train-weighted-split-paths', nargs='*', default=None, + help='Weights, splits and paths to groups of datasets' + 'Accepted format: ONE dataset groups could be' + 'submitted in the following form between double quotes' + '"GIVEN_NAME WEIGHT1 START:END PATH1, WEIGHT2 START:END PATH2"' + 'e.g.: "NAME_ABC: 0.6 0:0.6 A, 0.3 0:1 B, 0.1 0:1 C" ' + 'WEIGHT is used to up and down sample each dataset A,B,C in the group' + 'START:END indicates the split portion of the dataset', + action=parse_data_paths) + + group.add_argument('--valid-weighted-split-paths', nargs='*', default=None, + help='Weights, splits and paths to groups of datasets' + 'Accepted format: one or many dataset groups could be' + 'submitted in the following form each between double quotes' + '"GIVEN_NAME WEIGHT1 START:END PATH1, WEIGHT2 START:END PATH2"' + 'e.g.: "NAME_ABC: 0.6 0.6:0.8 A, 0.3 0:1 B, 0.1 0:1 C" ' + '"NAME_CDE: 0.6 0.6:0.8 C, 0.3 0:1 D, 0.1 0:1 E" ' + 'validation will be run on each of those groups independently', + action=parse_data_paths) + + group.add_argument('--test-weighted-split-paths', nargs='*', default=None, + help='Weights, splits and paths to groups of datasets' + 'Accepted format: one or many dataset groups could be' + 'submitted in the following form each between double quotes' + '"GIVEN_NAME WEIGHT1 START:END PATH1, WEIGHT2 START:END PATH2"' + 'e.g.: "NAME_ABC: 0.6 0.6:0.8 A, 0.3 0:1 B, 0.1 0:1 C" ' + '"NAME_CDE: 0.6 0.6:0.8 C, 0.3 0:1 D, 0.1 0:1 E" ' + 'test will be run on each of those groups independently', + action=parse_data_paths) + + class parse_data_paths_path(argparse.Action): + def __call__(self, parser, args, values, option_string=None): + expected_option_strings = ["--train-weighted-split-paths-path", "--valid-weighted-split-paths-path", "--test-weighted-split-paths-path"] + assert option_string in expected_option_strings, f"Expected {option_string} to be in {expected_option_strings}" + + with open(values, "r") as fi: + lines = fi.readlines() + assert len(lines) == 1, f"Got multiple lines {len(lines)} instead of 1 expected" + assert lines[0][-2:] == "\"\n" and lines[0][0] == "\"", f"Invalid input format, got {lines}" + values = lines[0][1:-2].split("\" \"") + weighted_split_paths_dest = re.sub(r"_path$", "", self.dest) + weighted_split_paths_option = re.sub(r"-path$", "", self.option_strings[0]) + setattr(args, weighted_split_paths_dest, values) + parse_data_paths(option_strings=[weighted_split_paths_option], dest=weighted_split_paths_dest)(parser, args, values, option_string=weighted_split_paths_option) + + # option 2-bis: load x-weighted-split-paths from a file in case this argument is very long + group.add_argument('--train-weighted-split-paths-path', type=str, action=parse_data_paths_path ,default=None) + group.add_argument('--valid-weighted-split-paths-path', type=str, action=parse_data_paths_path, default=None) + group.add_argument('--test-weighted-split-paths-path', type=str, action=parse_data_paths_path, default=None) + group.add_argument('--vocab-file', type=str, default=None, help='Path to the vocab file.') group.add_argument('--merge-file', type=str, default=None, @@ -903,6 +1026,8 @@ def _add_data_args(parser): help='Warm up mmap files.') 
group.add_argument('--num-workers', type=int, default=2, help="Dataloader number of workers.") + group.add_argument('--valid-num-workers', type=int, default=2, + help="Dataloader number of workers for validation.") group.add_argument('--tokenizer-type', type=str, default=None, choices=['BertWordPieceLowerCase', diff --git a/megatron/data/data_samplers.py b/megatron/data/data_samplers.py index 2efef42bf4..91de3ac224 100644 --- a/megatron/data/data_samplers.py +++ b/megatron/data/data_samplers.py @@ -24,7 +24,7 @@ from megatron import mpu -def build_pretraining_data_loader(dataset, consumed_samples): +def build_pretraining_data_loader(dataset, consumed_samples, num_workers=None): """Buld dataloader given an input dataset.""" if dataset is None: @@ -52,10 +52,11 @@ def build_pretraining_data_loader(dataset, consumed_samples): raise Exception('{} dataloader type is not supported.'.format( args.dataloader_type)) + num_workers = args.num_workers if num_workers is None else num_workers # Torch dataloader. return torch.utils.data.DataLoader(dataset, batch_sampler=batch_sampler, - num_workers=args.num_workers, + num_workers=num_workers, pin_memory=True) class MegatronPretrainingSampler: diff --git a/megatron/data/dataset_utils.py b/megatron/data/dataset_utils.py index 426e965c85..e6a27e6bea 100644 --- a/megatron/data/dataset_utils.py +++ b/megatron/data/dataset_utils.py @@ -41,8 +41,7 @@ DSET_TYPES = [DSET_TYPE_BERT, DSET_TYPE_ICT, DSET_TYPE_T5] -def get_datasets_weights_and_num_samples(data_prefix, - train_valid_test_num_samples): +def analyze_data_prefix(data_prefix): # The data prefix should be in the format of: # weight-1, data-prefix-1, weight-2, data-prefix-2, .. @@ -59,10 +58,16 @@ def get_datasets_weights_and_num_samples(data_prefix, weight_sum += weight assert weight_sum > 0.0 weights = [weight / weight_sum for weight in weights] + return prefixes, weights + + +def get_datasets_weights_and_num_samples(data_prefix, + train_valid_test_num_samples): - # Add 0.5% (the 1.005 factor) so in case the bleding dataset does + # Add 0.5% (the 1.005 factor) so in case the blending dataset does # not uniformly distribute the number of samples, we still have # samples left to feed to the network. + prefixes, weights = analyze_data_prefix(data_prefix) datasets_train_valid_test_num_samples = [] for weight in weights: datasets_train_valid_test_num_samples.append( @@ -603,6 +608,22 @@ def get_indexed_dataset_(data_prefix, data_impl, skip_warmup): return indexed_dataset +def get_split_by_range_(range_string, size): + """ Get dataset splits based on a range: + range_string is in the form START%:END% for e.g. 
0.2:0.8 + outputs an array of two values [start_index, end_index] + """ + # some checks that range is given in the correct form + splits = [float(i) for i in range_string.split(":")] + assert len(splits) == 2, "splits should be passed as start:end" + assert splits[0] <= 1 and splits[1] <= 1 + splits_sum = sum(splits) + assert splits_sum > 0.0 + splits_index = [round(s * float(size)) for s in splits] + assert len(splits_index) == 2 + return splits_index + + def get_train_valid_test_split_(splits_string, size): """ Get dataset splits from comma or '/' separated string list.""" diff --git a/megatron/data/gpt_dataset.py b/megatron/data/gpt_dataset.py index 02bfad8142..e401d417f3 100644 --- a/megatron/data/gpt_dataset.py +++ b/megatron/data/gpt_dataset.py @@ -25,7 +25,7 @@ from megatron import mpu, print_rank_0, get_args, get_tokenizer from megatron.data.blendable_dataset import BlendableDataset from megatron.data.dataset_utils import get_datasets_weights_and_num_samples -from megatron.data.dataset_utils import get_train_valid_test_split_ +from megatron.data.dataset_utils import get_train_valid_test_split_, get_split_by_range_ from megatron.data.indexed_dataset import make_dataset as make_indexed_dataset from megatron.tokenizer.tokenizer import FIM_MIDDLE, FIM_PAD, FIM_PREFIX, FIM_SUFFIX @@ -37,46 +37,134 @@ def build_train_valid_test_datasets(data_prefix, data_impl, splits_string, # Single dataset. if len(data_prefix) == 1: - return _build_train_valid_test_datasets(data_prefix[0], + all_train_datasets, all_valid_datasets, all_test_datasets = _build_train_valid_test_datasets(data_prefix[0], data_impl, splits_string, train_valid_test_num_samples, seq_length, seed, skip_warmup) + # Blending dataset. + else: + + output = get_datasets_weights_and_num_samples(data_prefix, + train_valid_test_num_samples) + prefixes, weights, datasets_train_valid_test_num_samples = output + + # Build individual datasets. + train_datasets = [] + valid_datasets = [] + test_datasets = [] + for i in range(len(prefixes)): + train_ds, valid_ds, test_ds = _build_train_valid_test_datasets( + prefixes[i], data_impl, splits_string, + datasets_train_valid_test_num_samples[i], + seq_length, seed, skip_warmup) + if train_ds: + train_datasets.append(train_ds) + if valid_ds: + valid_datasets.append(valid_ds) + if test_ds: + test_datasets.append(test_ds) + + all_train_datasets = BlendableDataset(train_datasets, weights) \ + if train_datasets else None + all_valid_datasets = BlendableDataset(valid_datasets, weights) \ + if valid_datasets else None + all_test_datasets = BlendableDataset(test_datasets, weights) \ + if test_datasets else None + + return all_train_datasets, all_valid_datasets, all_test_datasets + + +def build_dataset_group(dataset_group_name, paths, weights, splits, data_impl, + train_valid_test_num_samples, + seq_length, seed, skip_warmup, train_valid_test): + ''' + Build a single dataset group corresponding to Option 2 of data loading see arguments.py + a dataset group is passed on the following form + GIVEN_NAME WEIGHT1 START:END PATH1, WEIGHT2 START:END PATH2, WEIGHT2 START:END PATH2 + or alternatively + GIVEN_NAME PATH1 # for a single dataset to be used fully + ''' + + assert train_valid_test in ["train","valid","test"] + # Single dataset. + if len(paths) == 1: + dataset = _build_single_datasets(paths[0], + splits[0], + data_impl, + train_valid_test_num_samples, + seq_length, seed, skip_warmup, + dataset_group_name, train_valid_test) + return dataset # Blending dataset. - # Parse the values. 
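As a quick worked example of `get_split_by_range_` above (with hypothetical numbers): a group entry such as `1 0.6:0.8 PATH` over an indexed dataset of 10,000 documents selects the document index range [6000, 8000):

```python
# Hypothetical: 10,000 documents in the indexed dataset, split range "0.6:0.8".
size = 10_000
start, end = (round(float(s) * size) for s in "0.6:0.8".split(":"))
print(start, end)  # 6000 8000 -> documents 6000..7999 belong to this group's slice
```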
- output = get_datasets_weights_and_num_samples(data_prefix, - train_valid_test_num_samples) - prefixes, weights, datasets_train_valid_test_num_samples = output - - # Build individual datasets. - train_datasets = [] - valid_datasets = [] - test_datasets = [] - for i in range(len(prefixes)): - train_ds, valid_ds, test_ds = _build_train_valid_test_datasets( - prefixes[i], data_impl, splits_string, - datasets_train_valid_test_num_samples[i], - seq_length, seed, skip_warmup) - if train_ds: - train_datasets.append(train_ds) - if valid_ds: - valid_datasets.append(valid_ds) - if test_ds: - test_datasets.append(test_ds) - - # Blend. - blending_train_dataset = None - if train_datasets: - blending_train_dataset = BlendableDataset(train_datasets, weights) - blending_valid_dataset = None - if valid_datasets: - blending_valid_dataset = BlendableDataset(valid_datasets, weights) - blending_test_dataset = None - if test_datasets: - blending_test_dataset = BlendableDataset(test_datasets, weights) - - return (blending_train_dataset, blending_valid_dataset, - blending_test_dataset) + else: + + data_prefix = [] + # data_prefix is on the shape: + # ["WEIGHT1", "PATH1", "WEIGHT2", "PATH2", "WEIGHT3", "PATH3"] + for w,p in zip(weights, paths): + data_prefix += [w,p] + + output = get_datasets_weights_and_num_samples(data_prefix, + train_valid_test_num_samples) + prefixes, weights, datasets_train_valid_test_num_samples = output + + # Build individual datasets. + datasets = [] + for i in range(len(prefixes)): + ds = _build_single_datasets(prefixes[i], + splits[i], + data_impl, + datasets_train_valid_test_num_samples[i], + seq_length, + seed, skip_warmup, + dataset_group_name, train_valid_test) + + datasets.append(ds) + all_datasets = BlendableDataset(datasets, weights) + + return all_datasets + +def _build_single_datasets(data_prefix, range_string, data_impl, train_valid_test_num_samples, + seq_length, seed, skip_warmup, dataset_group_name, train_valid_test): + """Build a single dataset""" + + assert train_valid_test in ["train","valid","test"] + index = ["train","valid","test"].index(train_valid_test) + + # Indexed dataset. + indexed_dataset = get_indexed_dataset_(data_prefix, + data_impl, + skip_warmup) + + total_num_of_documents = indexed_dataset.sizes.shape[0] + # this corresponds to option2 for data loading on the form + # WEIGHT1 START:END PATH1, WEIGHT2 START:END PATH2, WEIGHT3 START:END PATH3 + # splits here is an array of size 2 [start_index, end_index] + splits = get_split_by_range_(range_string=range_string, size=total_num_of_documents) + + # Print stats about the splits. 
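The blending path relies on `analyze_data_prefix` / `get_datasets_weights_and_num_samples` shown earlier: raw weights are normalized to sum to one, and each dataset's sample target is padded by roughly 0.5% so the blend never runs short. A small sketch with made-up dataset names and counts:

```python
import math

# Hypothetical: two datasets weighted 2:1, and 90,000 requested training samples.
raw = [("ds_a", 2.0), ("ds_b", 1.0)]
total = sum(w for _, w in raw)
weights = {name: w / total for name, w in raw}              # normalized to sum to 1
requested = 90_000
per_dataset = {name: int(math.ceil(requested * w * 1.005))  # ~0.5% head room
               for name, w in weights.items()}
print(per_dataset)  # {'ds_a': 60300, 'ds_b': 30150}
```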
+ print_rank_0(' > dataset split:') + + print_rank_0(' {}:'.format(dataset_group_name)) + print_rank_0(' document indices in [{}, {}) total of {} ' + 'documents'.format(splits[0], splits[1], + splits[1] - splits[0])) + + def build_dataset(name): + dataset = None + if splits[1] > splits[0]: + documents = np.arange(start=splits[0], stop=splits[1], + step=1, dtype=np.int32) + dataset = GPTDataset(name, data_prefix, + documents, indexed_dataset, + train_valid_test_num_samples[index], + seq_length, seed) + return dataset + + dataset = build_dataset(dataset_group_name) + + return dataset def _build_train_valid_test_datasets(data_prefix, data_impl, splits_string, @@ -90,6 +178,7 @@ def _build_train_valid_test_datasets(data_prefix, data_impl, splits_string, skip_warmup) total_num_of_documents = indexed_dataset.sizes.shape[0] + # splits here is an array of size 4 [train_start_index, valid_start_index, test_start_index, test_end_index] splits = get_train_valid_test_split_(splits_string, total_num_of_documents) # Print stats about the splits. diff --git a/megatron/training.py b/megatron/training.py index 16d190472f..953831d470 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -57,6 +57,7 @@ from megatron.schedules import get_forward_backward_func from megatron.utils import report_memory from megatron.model.vision.knn_monitor import compute_feature_bank +from megatron.data.dataset_utils import analyze_data_prefix def print_datetime(string): @@ -162,11 +163,13 @@ def pretrain(train_valid_test_dataset_provider, print_datetime('after training is done') if args.do_valid: - prefix = 'the end of training for val data' - evaluate_and_print_results(prefix, forward_step_func, - valid_data_iterator, model, - iteration, process_non_loss_data_func, - False) + names = args.valid_weighted_split_names + names = names if names is not None else ['valid'] * len(valid_data_iterator) + for iterator, name in zip(valid_data_iterator, names): + prefix = 'the end of training for val data' + evaluate_and_print_results(prefix, forward_step_func, + iterator, model, + iteration, process_non_loss_data_func, False, data_group_name=name) if args.save and iteration != 0: save_checkpoint(iteration, model, optimizer, opt_param_scheduler) @@ -174,10 +177,12 @@ def pretrain(train_valid_test_dataset_provider, if args.do_test: # Run on test data. 
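Continuing the worked example (hypothetical numbers): the [start, end) pair computed for a group's split range becomes an explicit document-index array before it is handed to `GPTDataset`, which then builds its index mappings over that slice only:

```python
import numpy as np

# Hypothetical: split "0.6:0.8" over 10,000 documents -> indices [6000, 8000).
splits = [6000, 8000]
documents = np.arange(start=splits[0], stop=splits[1], step=1, dtype=np.int32)
print(len(documents), documents[0], documents[-1])  # 2000 6000 7999
```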
prefix = 'the end of training for test data' - evaluate_and_print_results(prefix, forward_step_func, - test_data_iterator, model, - 0, process_non_loss_data_func, - True) + names = args.test_weighted_split_names + names = names if names is not None else ['test'] * len(test_data_iterator) + for iterator, name in zip(test_data_iterator, names): + evaluate_and_print_results(prefix, forward_step_func, + iterator, model, + 0, process_non_loss_data_func, True, data_group_name=name) def update_train_iters(args): @@ -741,10 +746,12 @@ def train(forward_step_func, model, optimizer, opt_param_scheduler, if args.eval_interval and iteration % args.eval_interval == 0 and \ args.do_valid: prefix = 'iteration {}'.format(iteration) - evaluate_and_print_results(prefix, forward_step_func, - valid_data_iterator, model, - iteration, process_non_loss_data_func, - False) + names = args.valid_weighted_split_names + names = names if names is not None else ['valid'] * len(valid_data_iterator) + for iterator, name in zip(valid_data_iterator, names): + evaluate_and_print_results(prefix, forward_step_func, + iterator, model, + iteration, process_non_loss_data_func, False, data_group_name=name) # Checkpointing saved_checkpoint = False @@ -852,36 +859,40 @@ def evaluate(forward_step_func, def evaluate_and_print_results(prefix, forward_step_func, data_iterator, model, iteration, process_non_loss_data_func, - verbose=False): + verbose=False, data_group_name=None): """Helper function to evaluate and dump results on screen.""" args = get_args() writer = get_tensorboard_writer() + ds_name = data_group_name + # print corresponding dataset name (used for multiple validation datasets) + tf_plot_prefix = f"lm-loss-validation/{ds_name}" if ds_name else "lm-loss-validation" total_loss_dict, collected_non_loss_data = evaluate( forward_step_func, data_iterator, model, process_non_loss_data_func, verbose) - string = ' validation loss at {} | '.format(prefix) + string = '{} loss at {} | '.format(ds_name, prefix) if ds_name is not None\ + else 'validation loss at {} | '.format(prefix) for key in total_loss_dict: string += '{} value: {:.6E} | '.format(key, total_loss_dict[key].item()) ppl = math.exp(min(20, total_loss_dict[key].item())) string += '{} PPL: {:.6E} | '.format(key, ppl) if writer: - writer.add_scalar('{} validation'.format(key), + writer.add_scalar(f'{tf_plot_prefix}/{key} validation', total_loss_dict[key].item(), iteration) - writer.add_scalar('{} validation vs samples'.format(key), + writer.add_scalar(f'{tf_plot_prefix}/{key} validation vs samples', total_loss_dict[key].item(), args.consumed_train_samples) if args.log_validation_ppl_to_tensorboard: - writer.add_scalar('{} validation ppl'.format(key), ppl, + writer.add_scalar(f'{tf_plot_prefix}/{key} validation ppl', ppl, iteration) - writer.add_scalar('{} validation ppl vs samples'.format(key), + writer.add_scalar(f'{tf_plot_prefix}/{key} validation ppl vs samples', ppl, args.consumed_train_samples) # Weights and biases reporting if is_last_rank() and args.wandb_project_name: metrics = { - '{} validation'.format(key): total_loss_dict[key].item() for key in total_loss_dict + f'{tf_plot_prefix}/{key} validation': total_loss_dict[key].item() for key in total_loss_dict } wandb.log(metrics, step=iteration) @@ -904,7 +915,7 @@ def build_train_valid_test_data_iterators( """XXX""" args = get_args() - (train_dataloader, valid_dataloader, test_dataloader) = (None, None, None) + (train_dataloader, valid_dataloaders, test_dataloaders) = (None, None, None) print_rank_0('> building 
train, validation, and test datasets ...') @@ -940,21 +951,47 @@ def build_train_valid_test_data_iterators( # Build the datasets. train_ds, valid_ds, test_ds = build_train_valid_test_datasets_provider( train_val_test_num_samples) + # if dataloading option is not 2 convert to list to allow + # same interface for multiple data groups + # for validation and testing in option 2 + if type(train_ds) != list and train_ds is not None: + train_ds = [train_ds] + if type(valid_ds) != list and valid_ds is not None: + valid_ds = [valid_ds] + if type(test_ds) != list and test_ds is not None: + test_ds = [test_ds] # Build dataloders. + assert len(train_ds) == 1, "only one training dataset group is allowed" + + # train_dataloader is a single item while valid_dataloaders + # and test_dataloaders are arrays train_dataloader = build_pretraining_data_loader( - train_ds, args.consumed_train_samples) - valid_dataloader = build_pretraining_data_loader( - valid_ds, args.consumed_valid_samples) - test_dataloader = build_pretraining_data_loader(test_ds, 0) + train_ds[0], args.consumed_train_samples) + + # We collapse None and empty list as both should mean we don't run validation + # args.consumed_valid_samples accumulates the sum of valid steps for every dataset, which are all equal + # + # XXX: we get a deadlock in the dataloader on multi-dataset eval, after the first dataset, + # possibly due to this bug in pytorch https://github.com/pytorch/pytorch/pull/25158. Using + # num_workers=0 to work around it - the training can't use that since it impacts throughput + # by a few percent + valid_dataloaders = [build_pretraining_data_loader(d, args.consumed_valid_samples // len(valid_ds), num_workers=args.valid_num_workers) + for d in valid_ds] \ + if valid_ds is not None else [] + # We collapse None and empty list as both should mean we don't run test + test_dataloaders = [build_pretraining_data_loader(d, 0) for d in test_ds] \ + if test_ds is not None else [] # Flags to know if we need to do training/validation/testing. do_train = train_dataloader is not None and args.train_iters > 0 - do_valid = valid_dataloader is not None and args.eval_iters > 0 - do_test = test_dataloader is not None and args.eval_iters > 0 + # Need to broadcast num_tokens and num_type_tokens. - flags = torch.cuda.LongTensor( - [int(do_train), int(do_valid), int(do_test)]) + flags = torch.cuda.LongTensor([ + int(do_train), + len(valid_dataloaders) if args.eval_iters > 0 else 0, # eval_iters == 0 is equivalent to having no validation + len(test_dataloaders) if args.eval_iters > 0 else 0, # eval_iters == 0 is equivalent to having no test + ]) else: flags = torch.cuda.LongTensor([0, 0, 0]) @@ -963,8 +1000,12 @@ def build_train_valid_test_data_iterators( mpu.get_tensor_model_parallel_src_rank(), group=mpu.get_tensor_model_parallel_group()) args.do_train = flags[0].item() - args.do_valid = flags[1].item() - args.do_test = flags[2].item() + num_valid_ds = flags[1].item() + num_test_ds = flags[2].item() + assert num_test_ds >= 0 + assert num_valid_ds >= 0 + args.do_valid = num_valid_ds > 0 + args.do_test = num_test_ds > 0 # Build iterators. 
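To make the per-group reporting concrete, a sketch (with a hypothetical group name and loss key) of the TensorBoard and W&B tags produced by the `tf_plot_prefix` logic in `evaluate_and_print_results` above:

```python
# Hypothetical validation group and loss key.
ds_name, key = "VALID_EN", "lm loss"
tf_plot_prefix = f"lm-loss-validation/{ds_name}" if ds_name else "lm-loss-validation"

print(f"{tf_plot_prefix}/{key} validation")             # lm-loss-validation/VALID_EN/lm loss validation
print(f"{tf_plot_prefix}/{key} validation vs samples")  # lm-loss-validation/VALID_EN/lm loss validation vs samples
print(f"{tf_plot_prefix}/{key} validation ppl")         # lm-loss-validation/VALID_EN/lm loss validation ppl
```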
dl_type = args.dataloader_type @@ -976,16 +1017,18 @@ def build_train_valid_test_data_iterators( else: train_data_iterator = None - if valid_dataloader is not None: - valid_data_iterator = iter(valid_dataloader) if dl_type == 'single' \ - else iter(cyclic_iter(valid_dataloader)) + if valid_dataloaders is not None: + valid_data_iterators = [iter(vdl) if dl_type == 'single' \ + else iter(cyclic_iter(valid_dataloaders)) + for vdl in valid_dataloaders] else: - valid_data_iterator = None + valid_data_iterators = [None] * num_valid_ds - if test_dataloader is not None: - test_data_iterator = iter(test_dataloader) if dl_type == 'single' \ - else iter(cyclic_iter(test_dataloader)) + if test_dataloaders is not None: + test_data_iterators = [iter(tdl) if dl_type == 'single' \ + else iter(cyclic_iter(test_dataloaders)) + for tdl in test_dataloaders] else: - test_data_iterator = None + test_data_iterators = [None] * num_test_ds - return train_data_iterator, valid_data_iterator, test_data_iterator + return train_data_iterator, valid_data_iterators, test_data_iterators diff --git a/pretrain_gpt.py b/pretrain_gpt.py index b6d09a8da5..67f8e836cd 100644 --- a/pretrain_gpt.py +++ b/pretrain_gpt.py @@ -22,7 +22,7 @@ from megatron import get_timers from megatron import get_tokenizer from megatron import mpu -from megatron.data.gpt_dataset import build_train_valid_test_datasets +from megatron.data.gpt_dataset import build_train_valid_test_datasets, build_dataset_group from megatron.model import GPTModel, ModelType from megatron.training import pretrain from megatron.utils import get_ltor_masks_and_position_ids @@ -103,17 +103,49 @@ def forward_step(data_iterator, model): def train_valid_test_datasets_provider(train_val_test_num_samples): """Build train, valid, and test datasets.""" args = get_args() + train_ds, valid_ds, test_ds = None, None, None print_rank_0('> building train, validation, and test datasets ' 'for GPT ...') - train_ds, valid_ds, test_ds = build_train_valid_test_datasets( - data_prefix=args.data_path, - data_impl=args.data_impl, - splits_string=args.split, - train_valid_test_num_samples=train_val_test_num_samples, - seq_length=args.seq_length, - seed=args.seed, - skip_warmup=(not args.mmap_warmup)) + # Option 1 of data loading using --data-path + if args.data_path: + train_ds, valid_ds, test_ds = build_train_valid_test_datasets( + data_prefix=args.data_path, + data_impl=args.data_impl, + splits_string=args.split, + train_valid_test_num_samples=train_val_test_num_samples, + seq_length=args.seq_length, + seed=args.seed, + skip_warmup=(not args.mmap_warmup)) + # Option 2 of data loading using --(train|valid|test)-weighted-split-paths + elif args.train_weighted_split_paths: + assigned_train_valid_test = [] + if args.train_weighted_split_paths is not None: + train_ds = [] + assigned_train_valid_test.append("train") + if args.valid_weighted_split_paths is not None: + valid_ds = [] + assigned_train_valid_test.append("valid") + if args.test_weighted_split_paths is not None: + test_ds = [] + assigned_train_valid_test.append("test") + + for s in assigned_train_valid_test: + data_groups = zip(eval(f"args.{s}_weighted_split_paths"), + eval(f"args.{s}_weighted_split_weights"), + eval(f"args.{s}_weighted_split_splits"), + eval(f"args.{s}_weighted_split_names")) + for paths, weights, splits, name in data_groups: + d = build_dataset_group(name, paths, weights, splits, + args.data_impl, + train_val_test_num_samples, + args.seq_length, args.seed, + (not args.mmap_warmup), + train_valid_test=s) + 
eval(f"{s}_ds").append(d) + else: + raise NotImplementedError("No dataloading argument passed") + print_rank_0("> finished creating GPT datasets ...") return train_ds, valid_ds, test_ds From 042a09149eb07b00f3b2317dd20cf296ab3ae0fb Mon Sep 17 00:00:00 2001 From: RaymondLi0 Date: Tue, 21 Mar 2023 18:44:33 +0100 Subject: [PATCH 088/144] Take MQA into account in flops formula, fix glu-activation factor Co-authored-by: Nouamane Tazi --- megatron/utils.py | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/megatron/utils.py b/megatron/utils.py index 01012e9efd..5266d780c3 100644 --- a/megatron/utils.py +++ b/megatron/utils.py @@ -242,10 +242,27 @@ def get_tflops(batch_size, elapsed_time_per_iteration): # The factor of 4 is when used with activation check-pointing, # otherwise it will be 3, but for 200B model, activation check-pointing will always be on. checkpoint_activations_factor = 4 if args.recompute_granularity == 'full' else 3 + coefficient_h_squared = 24 # GLU activations double the hidden states in the upscaling feed-forward in each transformer layer - # This leads to 16bsh^2 instead of 8bsh^2 per first feed-forward layer in MLP, thus we increase the coefficient by 8. + # This leads to 16bsh^2 instead of 8bsh^2 per first feed-forward layer in MLP, thus we increase the coefficient_h_squared by 8. # Refer to https://github.com/bigscience-workshop/Megatron-DeepSpeed/pull/283#issue-1260805063 for more details. - coefficient = 32 if args.glu_activation else 24 - flops_per_iteration = (coefficient * checkpoint_activations_factor * batch_size * seq_len * num_layers * (hidden_size**2)) * (1. + (seq_len / (6. * hidden_size)) + (vocab_size / (16. * num_layers * hidden_size))) + if args.glu_activation : + coefficient_h_squared += 8 + + # In MultiQuery attention, keys and values are shared across heads + # qkv projection: 6Bsh^2 -> 4Bsh^2 + 4Bsd_kv + # The formula in https://arxiv.org/pdf/2104.04473.pdf becomes: + # 4 * (20 Bsh^2 + 4Bsd_kv + 4Bs^2h) = 4*20*Bsh^2 (1 + (d_kv+s)/5h) + if args.attention_head_type == 'multiquery': + coefficient_h_squared -= 4 # We substract 4 because of shared kv projection + + flops_per_iteration = (coefficient_h_squared * checkpoint_activations_factor * batch_size * seq_len * num_layers * (hidden_size**2)) + flops_per_iteration += (4 * checkpoint_activations_factor * batch_size * seq_len * num_layers * (hidden_size**2)) * (seq_len / hidden_size) + flops_per_iteration += (6 * batch_size * seq_len * num_layers * (hidden_size**2)) * (vocab_size / (num_layers * hidden_size)) + + if args.attention_head_type == 'multiquery': + d_kv = args.kv_channels + flops_per_iteration += (4 * checkpoint_activations_factor * batch_size * seq_len * num_layers * (hidden_size**2)) * (d_kv / hidden_size) # TODO: maybe tp_size factor missing here + tflops = flops_per_iteration / (elapsed_time_per_iteration * args.world_size * (10**12)) return tflops From b18ecf6b332c67e88a20b017a0714172702229b5 Mon Sep 17 00:00:00 2001 From: Raymond Li Date: Tue, 21 Mar 2023 13:59:26 -0400 Subject: [PATCH 089/144] adjust formula in comments --- megatron/utils.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/megatron/utils.py b/megatron/utils.py index 5266d780c3..d115f815a4 100644 --- a/megatron/utils.py +++ b/megatron/utils.py @@ -250,14 +250,17 @@ def get_tflops(batch_size, elapsed_time_per_iteration): coefficient_h_squared += 8 # In MultiQuery attention, keys and values are shared across heads - # qkv projection: 6Bsh^2 -> 4Bsh^2 + 
4Bsd_kv + # qkv projection: 6Bsh^2 -> 2Bsh^2 + 4Bshd_kv # The formula in https://arxiv.org/pdf/2104.04473.pdf becomes: - # 4 * (20 Bsh^2 + 4Bsd_kv + 4Bs^2h) = 4*20*Bsh^2 (1 + (d_kv+s)/5h) + # 4 * (20 Bsh^2 + 4Bshd_kv + 4Bs^2h) = 4*20*Bsh^2 (1 + (d_kv+s)/5h) if args.attention_head_type == 'multiquery': coefficient_h_squared -= 4 # We substract 4 because of shared kv projection - flops_per_iteration = (coefficient_h_squared * checkpoint_activations_factor * batch_size * seq_len * num_layers * (hidden_size**2)) - flops_per_iteration += (4 * checkpoint_activations_factor * batch_size * seq_len * num_layers * (hidden_size**2)) * (seq_len / hidden_size) + # Feed-forward and projections + flops_per_iteration = (coefficient_h_squared * checkpoint_activations_factor * batch_size * seq_len * num_layers * (hidden_size**2)) + # Attention-matrix computation + flops_per_iteration += (4 * checkpoint_activations_factor * batch_size * seq_len * num_layers * (hidden_size**2)) * (seq_len / hidden_size) + # LM-head flops_per_iteration += (6 * batch_size * seq_len * num_layers * (hidden_size**2)) * (vocab_size / (num_layers * hidden_size)) if args.attention_head_type == 'multiquery': From 659295a34081266ca0fb8dba99efd9e4da3eb212 Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Tue, 21 Mar 2023 19:04:17 -0400 Subject: [PATCH 090/144] Kv grad allreduce v2 (#39) Co-authored-by: thomasw21 <24695242+thomasw21@users.noreply.github.com> --- megatron/model/transformer.py | 16 +++++++++++++--- megatron/optimizer/distrib_optimizer.py | 6 +++++- megatron/optimizer/optimizer.py | 9 +++++++-- 3 files changed, 25 insertions(+), 6 deletions(-) diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index 61b943263a..e625874d4f 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -611,12 +611,22 @@ def forward(self, hidden_states, attention_mask, value_layer) = mpu.split_tensor_along_last_dim(mixed_x_layer, 3) elif self.attention_type == AttnType.self_attn and self.attention_head_type == 'multiquery': kv_input=hidden_states - if get_args().sequence_parallel: - # The linear layer doesn't gather the sequence-parallel. - kv_input = mpu.gather_from_sequence_parallel_region(kv_input, tensor_parallel_output_grad=False) # Attention heads [sq, b, h] --> [sq, b, (2 * hn)] mixed_kv_layer = self.key_value(kv_input) + # Reduce the KV gradients in the tensor-parallel direction. + # This is different from multi-head attention which reduces the KV input, + # because the sum over attn heads happens in the attn weight gradient instead of the KV layer: + # A [b, n * sq, sk] = Q [b, n * sq, hn] x K^T [b, hn, sk] + # G_K [b, sk, hn] = G_A [b, sk, n * sq] x Q [b, n * sq, hn] + # = sum_p (G_Ap [b, sk, np * sq] x Q_p [b, np * sq, hn]) + if get_args().sequence_parallel: + # We switch to the tensor parallel regime here instead of at the KV input + # so that the KV layer is done in parallel instead of just duplicated. 
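Putting the two patches above together, a standalone restatement of the per-iteration FLOPs count assembled in `get_tflops` (divide by elapsed time, world size and 1e12 to recover the reported TFLOPs); the example sizes below are hypothetical:

```python
def flops_per_iteration_sketch(B, s, L, h, V, d_kv,
                               recompute_full=True, glu=False, multiquery=False):
    caf = 4 if recompute_full else 3     # activation-checkpointing factor
    coeff = 24                           # dense projections + MLP, in units of caf*B*s*L*h^2
    if glu:
        coeff += 8                       # GLU doubles the first MLP projection
    if multiquery:
        coeff -= 4                       # shared KV projection replaces per-head K and V
    flops = coeff * caf * B * s * L * h**2                  # feed-forward and projections
    flops += 4 * caf * B * s * L * h**2 * (s / h)           # attention-matrix computation
    flops += 6 * B * s * L * h**2 * (V / (L * h))           # LM head
    if multiquery:
        flops += 4 * caf * B * s * L * h**2 * (d_kv / h)    # KV projection onto d_kv channels
    return flops

# Hypothetical config: global batch 256, sequence 2048, 24 layers, hidden 2048.
print(flops_per_iteration_sketch(B=256, s=2048, L=24, h=2048, V=49152, d_kv=128,
                                 recompute_full=False, multiquery=True) / 1e15, "PFLOPs/iteration")
```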
+ mixed_kv_layer = mpu.gather_from_sequence_parallel_region(mixed_kv_layer, tensor_parallel_output_grad=True) + else: + mixed_kv_layer = mpu.copy_to_tensor_model_parallel_region(mixed_kv_layer) + # [sq, b, (2 * hn)] --> [sq, b, np (expanded), 2 * hn] # new_tensor_shape = mixed_kv_layer.size()[:-1] + \ # (self.num_attention_heads_per_partition, diff --git a/megatron/optimizer/distrib_optimizer.py b/megatron/optimizer/distrib_optimizer.py index fa3712d914..ee266175c2 100644 --- a/megatron/optimizer/distrib_optimizer.py +++ b/megatron/optimizer/distrib_optimizer.py @@ -542,7 +542,11 @@ def reduce_model_grads(self, args, timers): timers('backward-embedding-all-reduce').stop() # All-reduce key-value grads if needed. - if args.attention_head_type == "multiquery": + if ( + args.attention_head_type == "multiquery" + and mpu.get_tensor_model_parallel_world_size() > 1 + and args.sequence_parallel + ): timers('backward-key-value-all-reduce').start() self.allreduce_key_value_grads(args) timers('backward-key-value-all-reduce').stop() diff --git a/megatron/optimizer/optimizer.py b/megatron/optimizer/optimizer.py index 32e5d1d690..efa1bd36f8 100644 --- a/megatron/optimizer/optimizer.py +++ b/megatron/optimizer/optimizer.py @@ -268,7 +268,8 @@ def allreduce_embedding_grads(self, args): def allreduce_key_value_grads(self, args): """ - Reduce the gradients for the key_value weights and biases for multi-query attention. + Reduce the gradients for the key_value weights and biases for multi-query attention + with sequence parallelism. Coalesce the bias grads to avoid too many small reductions, but not the weight grads since it could cause memory issues. """ @@ -334,7 +335,11 @@ def reduce_model_grads(self, args, timers): timers('backward-embedding-all-reduce').stop() # All-reduce key-value grads if needed. - if args.attention_head_type == "multiquery": + if ( + args.attention_head_type == "multiquery" + and mpu.get_tensor_model_parallel_world_size() > 1 + and args.sequence_parallel + ): timers('backward-key-value-all-reduce').start() self.allreduce_key_value_grads(args) timers('backward-key-value-all-reduce').stop() From bd12802633202eee33d667793c1e6f87cff59c0e Mon Sep 17 00:00:00 2001 From: Raymond Li Date: Tue, 21 Mar 2023 23:39:57 -0400 Subject: [PATCH 091/144] support mqa in checkpoint-merging tools --- megatron/arguments.py | 2 +- tools/checkpoint_loader_megatron.py | 24 ++++++++++++++++++++---- tools/checkpoint_saver_megatron.py | 23 +++++++++++++++++++---- 3 files changed, 40 insertions(+), 9 deletions(-) diff --git a/megatron/arguments.py b/megatron/arguments.py index 9b6a90321d..a6e21b8831 100644 --- a/megatron/arguments.py +++ b/megatron/arguments.py @@ -430,7 +430,7 @@ def _add_network_size_args(parser): 'attention. This is set to ' ' args.hidden_size // args.num_attention_heads ' 'if not provided.') - group.add_argument('--attention-head-type', type=str, default='multihead', + group.add_argument('--attention-head-type', type=str, default=None, choices=['multihead', 'multiquery'], help='Type of attention heads. `multihead` is the standard multi-head attention.' 
'`multiquery` shares the values and keys across attention heads') diff --git a/tools/checkpoint_loader_megatron.py b/tools/checkpoint_loader_megatron.py index 64dfd8be79..5d6849f730 100644 --- a/tools/checkpoint_loader_megatron.py +++ b/tools/checkpoint_loader_megatron.py @@ -78,6 +78,7 @@ def check_for_arg(arg_name): check_for_arg('iteration') check_for_arg('bert_binary_head') check_for_arg('params_dtype') + check_for_arg('attention_head_type') # Determine how to make our models if args.model_type == 'GPT': @@ -147,6 +148,7 @@ def get_models(count, dtype, pre_process, post_process): # metadata md = types.SimpleNamespace() md.model_type = args.model_type + md.attention_head_type = margs.attention_head_type md.num_layers = margs.num_layers md.hidden_size = margs.hidden_size md.seq_length = margs.seq_length @@ -202,26 +204,40 @@ def queue_put(name, msg): message["post layernorm weight"] = layer.post_attention_layernorm.weight.data message["post layernorm bias"] = layer.post_attention_layernorm.bias.data message["mlp l1 bias"] = layer.mlp.dense_4h_to_h.bias.data + if margs.attention_head_type == "multiquery": + # MQA: kv is shared across tp-ranks + message["kv weight"] = layer.self_attention.key_value.weight.data + message["kv bias"] = layer.self_attention.key_value.bias.data # Grab all parallel tensors for this layer qkv_weight = [] qkv_bias = [] + q_weight = [] + q_bias = [] dense_weight = [] mlp_l0_weight = [] mlp_l0_bias = [] mlp_l1_weight = [] for tp_rank, model in enumerate(models): layer = model.language_model.encoder.layers[layer_num] - qkv_weight.append(layer.self_attention.query_key_value.weight.data) - qkv_bias.append(layer.self_attention.query_key_value.bias.data) + if margs.attention_head_type == "multihead": + qkv_weight.append(layer.self_attention.query_key_value.weight.data) + qkv_bias.append(layer.self_attention.query_key_value.bias.data) + elif margs.attention_head_type == "multiquery": + q_weight.append(layer.self_attention.query.weight.data) + q_bias.append(layer.self_attention.query.bias.data) dense_weight.append(layer.self_attention.dense.weight.data) mlp_l0_weight.append(layer.mlp.dense_h_to_4h.weight.data) mlp_l0_bias.append(layer.mlp.dense_h_to_4h.bias.data) mlp_l1_weight.append(layer.mlp.dense_4h_to_h.weight.data) # concat them - message["qkv weight"] = torch.cat(qkv_weight, dim=0) - message["qkv bias"] = torch.cat(qkv_bias, dim=0) + if margs.attention_head_type == "multihead": + message["qkv weight"] = torch.cat(qkv_weight, dim=0) + message["qkv bias"] = torch.cat(qkv_bias, dim=0) + elif margs.attention_head_type == "multiquery": + message["q weight"] = torch.cat(q_weight, dim=0) + message["q bias"] = torch.cat(q_bias, dim=0) message["dense weight"] = torch.cat(dense_weight, dim=1) message["mlp l0 weight"] = torch.cat(mlp_l0_weight, dim=0) message["mlp l0 bias"] = torch.cat(mlp_l0_bias, dim=0) diff --git a/tools/checkpoint_saver_megatron.py b/tools/checkpoint_saver_megatron.py index 2695a00ac8..95b34a01d1 100644 --- a/tools/checkpoint_saver_megatron.py +++ b/tools/checkpoint_saver_megatron.py @@ -95,6 +95,7 @@ def check_message(msg): '--seq-length', str(md.seq_length), '--num-attention-heads', str(md.num_attention_heads), '--max-position-embeddings', str(md.max_position_embeddings), + '--attention-head-type', str(md.attention_head_type), '--tokenizer-type', str(md.tokenizer_type), '--tensor-model-parallel-size', str(args.target_tensor_parallel_size), '--pipeline-model-parallel-size', str(args.target_pipeline_parallel_size), @@ -225,10 +226,17 @@ def 
get_models(count, dtype, pre_process, post_process): post_layernorm_weight = msg.pop("post layernorm weight") post_layernorm_bias = msg.pop("post layernorm bias") mlp_l1_bias = msg.pop("mlp l1 bias") + if margs.attention_head_type == "multiquery": + kv_weight = msg.pop("kv weight") + kv_bias = msg.pop("kv bias") # Split up the parallel tensors - qkv_weight = torch.chunk(msg.pop("qkv weight"), args.target_tensor_parallel_size, dim=0) - qkv_bias = torch.chunk(msg.pop("qkv bias"), args.target_tensor_parallel_size, dim=0) + if margs.attention_head_type == "multihead": + qkv_weight = torch.chunk(msg.pop("qkv weight"), args.target_tensor_parallel_size, dim=0) + qkv_bias = torch.chunk(msg.pop("qkv bias"), args.target_tensor_parallel_size, dim=0) + elif margs.attention_head_type == "multiquery": + q_weight = torch.chunk(msg.pop("q weight"), args.target_tensor_parallel_size, dim=0) + q_bias = torch.chunk(msg.pop("q bias"), args.target_tensor_parallel_size, dim=0) dense_weight = torch.chunk(msg.pop("dense weight"), args.target_tensor_parallel_size, dim=1) mlp_l0_weight = torch.chunk(msg.pop("mlp l0 weight"), args.target_tensor_parallel_size, dim=0) mlp_l0_bias = torch.chunk(msg.pop("mlp l0 bias"), args.target_tensor_parallel_size, dim=0) @@ -239,8 +247,15 @@ def get_models(count, dtype, pre_process, post_process): l = models[tp_rank].language_model.encoder.layers[layer] l.input_layernorm.weight.data.copy_(input_layernorm_weight) l.input_layernorm.bias.data.copy_(input_layernorm_bias) - l.self_attention.query_key_value.weight.data.copy_(qkv_weight[tp_rank]) - l.self_attention.query_key_value.bias.data.copy_(qkv_bias[tp_rank]) + if margs.attention_head_type == "multihead": + l.self_attention.query_key_value.weight.data.copy_(qkv_weight[tp_rank]) + l.self_attention.query_key_value.bias.data.copy_(qkv_bias[tp_rank]) + elif margs.attention_head_type == "multiquery": + # MQA: key-value are shared across tp-ranks + l.self_attention.key_value.weight.data.copy_(kv_weight) + l.self_attention.key_value.bias.data.copy_(kv_bias) + l.self_attention.query.weight.data.copy_(q_weight[tp_rank]) + l.self_attention.query.bias.data.copy_(q_bias[tp_rank]) l.self_attention.dense.weight.data.copy_(dense_weight[tp_rank]) l.self_attention.dense.bias.data.copy_(dense_bias) l.post_attention_layernorm.weight.data.copy_(post_layernorm_weight) From 7d5154fdcb8876abccdcae46989e9a9955ee8513 Mon Sep 17 00:00:00 2001 From: Raymond Li Date: Wed, 22 Mar 2023 14:43:36 -0400 Subject: [PATCH 092/144] add flash-attn --- megatron/arguments.py | 3 + megatron/model/transformer.py | 100 ++++++++++++++++++++++++++++++++-- 2 files changed, 97 insertions(+), 6 deletions(-) diff --git a/megatron/arguments.py b/megatron/arguments.py index 9b6a90321d..73e33f51cb 100644 --- a/megatron/arguments.py +++ b/megatron/arguments.py @@ -648,6 +648,9 @@ def _add_training_args(parser): group.add_argument('--no-bias-dropout-fusion', action='store_false', help='Disable bias and dropout fusion.', dest='bias_dropout_fusion') + group.add_argument('--use-flash-attn', action='store_true', + help='use FlashAttention implementation of attention. 
' + 'https://arxiv.org/abs/2205.14135') group.add_argument('--optimizer', type=str, default='adam', choices=['adam', 'sgd'], help='Optimizer function') diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index e625874d4f..a9d6e31f6d 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -38,6 +38,16 @@ torch._C._jit_override_can_fuse_on_cpu(True) torch._C._jit_override_can_fuse_on_gpu(True) +try: + from einops import rearrange +except ImportError: + rearrange = None + +try: + from flash_attn.flash_attn_interface import flash_attn_unpadded_func +except ImportError: + flash_attn_unpadded_func = None + """ We use the following notation throughout this file: h: hidden size @@ -459,6 +469,48 @@ def forward(self, query_layer, key_layer, value_layer, attention_mask, alibi): return context_layer +class FlashSelfAttention(torch.nn.Module): + """Implement the scaled dot product attention with softmax. + Arguments + --------- + softmax_scale: The temperature to use for the softmax attention. + (default: 1/sqrt(d_keys) where d_keys is computed at + runtime) + attention_dropout: The dropout rate to apply to the attention + (default: 0.0) + """ + def __init__(self, causal=False, softmax_scale=None, attention_dropout=0.0, + device=None, dtype=None): + super().__init__() + assert flash_attn_unpadded_func is not None, ('Please install FlashAttention first, ' + 'e.g., with pip install flash-attn') + assert rearrange is not None, 'Please install einops first, e.g., with pip install einops' + self.causal = causal + self.softmax_scale = softmax_scale + self.dropout_p = attention_dropout + + def forward(self, q, k, v): + """Implements the multihead softmax attention. + Arguments + --------- + q, k, v: The tensor containing the query, key, and value. (B, S, H, D) + """ + assert q.dtype in [torch.float16, torch.bfloat16] + assert q.is_cuda + batch_size, seqlen = q.shape[0], q.shape[1] + q, k, v = [rearrange(x, 'b s ... -> (b s) ...') for x in [q, k, v]] + max_s = seqlen + cu_seqlens = torch.arange(0, (batch_size + 1) * seqlen, step=seqlen, dtype=torch.int32, + device=q.device) + output = flash_attn_unpadded_func( + q, k, v, cu_seqlens, cu_seqlens, max_s, max_s, + self.dropout_p if self.training else 0.0, + softmax_scale=self.softmax_scale, causal=self.causal + ) + output = rearrange(output, '(b s) ... -> b s ...', b=batch_size) + return output + + class ParallelAttention(MegatronModule): """Parallel self-attention layer abstract class. 
@@ -477,6 +529,20 @@ def __init__(self, init_method, self.attn_mask_type = attn_mask_type self.params_dtype = args.params_dtype self.attention_head_type = args.attention_head_type + self.sequence_parallel = args.sequence_parallel + + self.use_flash_attn = args.use_flash_attn + if self.use_flash_attn: + if flash_attn_unpadded_func is None: + raise ImportError('FlashAttention is not installed, please install with ' + 'pip install flash-attn') + assert attention_type == AttnType.self_attn, ('FlashAttention code path only supports ' + 'self-attention for now') + assert self.attn_mask_type == AttnMaskType.causal, ('FlashAttention code path only ' + 'supports causal mask for now') + # TODO: add assert that we are not using alibi + if rearrange is None: + raise ImportError('einops is not installed, please install with pip install einops') projection_size = args.kv_channels * args.num_attention_heads @@ -533,6 +599,11 @@ def __init__(self, init_method, else: self.core_attention = MultiQueryCoreAttention(self.layer_number, self.attn_mask_type) self.checkpoint_core_attention = args.recompute_granularity == 'selective' + + if self.use_flash_attn: + self.core_attention_flash = FlashSelfAttention( + causal=True, attention_dropout=args.attention_dropout + ) # Output. self.dense = mpu.RowParallelLinear( @@ -699,13 +770,30 @@ def forward(self, hidden_states, attention_mask, # ================================== # core attention computation # ================================== - - if self.checkpoint_core_attention: - context_layer = self._checkpointed_attention_forward( - query_layer, key_layer, value_layer, attention_mask, alibi) + if not self.use_flash_attn: + if self.checkpoint_core_attention: + context_layer = self._checkpointed_attention_forward( + query_layer, key_layer, value_layer, attention_mask, alibi) + else: + context_layer = self.core_attention( + query_layer, key_layer, value_layer, attention_mask, alibi) else: - context_layer = self.core_attention( - query_layer, key_layer, value_layer, attention_mask, alibi) + + if self.attention_head_type == "multiquery": + sq, b, np, hn = query_layer.size() + # Expand kv to be compatible with flash-attn implementation + # [sq, b, 1, hn] -> [sq, b, np, hn] + key_layer = key_layer.expand((sq, b, np, hn)) + value_layer = value_layer.expand((sq, b, np, hn)) + q, k, v = [rearrange(x, 's b ... -> b s ...').contiguous() + for x in (query_layer, key_layer, value_layer)] + if not self.sequence_parallel: + with mpu.get_cuda_rng_tracker().fork(): + context_layer = self.core_attention_flash(q, k, v) + else: + context_layer = self.core_attention_flash(q, k, v) + context_layer = rearrange(context_layer, 'b s h d -> s b (h d)').contiguous() + # ================= # Output. 
[sq, b, h] From 118f0a8ae9663f01e33217a475e2dccf2e780507 Mon Sep 17 00:00:00 2001 From: Raymond Li Date: Thu, 23 Mar 2023 11:05:42 -0400 Subject: [PATCH 093/144] flash-attn: assert that alibi is not used --- megatron/model/transformer.py | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index a9d6e31f6d..77abb0cdf1 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -20,7 +20,7 @@ import torch.nn.functional as F from torch import nn -from megatron import get_timers, get_args, get_global_memory_buffer +from megatron import get_timers, get_args, get_global_memory_buffer, print_rank_0 from megatron import mpu from .module import MegatronModule from megatron.model.enums import AttnMaskType, ModelType, LayerType, AttnType, PositionEmbeddingType @@ -532,17 +532,6 @@ def __init__(self, init_method, self.sequence_parallel = args.sequence_parallel self.use_flash_attn = args.use_flash_attn - if self.use_flash_attn: - if flash_attn_unpadded_func is None: - raise ImportError('FlashAttention is not installed, please install with ' - 'pip install flash-attn') - assert attention_type == AttnType.self_attn, ('FlashAttention code path only supports ' - 'self-attention for now') - assert self.attn_mask_type == AttnMaskType.causal, ('FlashAttention code path only ' - 'supports causal mask for now') - # TODO: add assert that we are not using alibi - if rearrange is None: - raise ImportError('einops is not installed, please install with pip install einops') projection_size = args.kv_channels * args.num_attention_heads @@ -601,6 +590,20 @@ def __init__(self, init_method, self.checkpoint_core_attention = args.recompute_granularity == 'selective' if self.use_flash_attn: + if flash_attn_unpadded_func is None: + raise ImportError('FlashAttention is not installed, please install with ' + 'pip install flash-attn') + assert attention_type == AttnType.self_attn, ('FlashAttention code path only supports ' + 'self-attention for now') + assert self.attn_mask_type == AttnMaskType.causal, ('FlashAttention code path only ' + 'supports causal mask for now') + assert args.position_embedding_type != PositionEmbeddingType.alibi, \ + ('FlashAttention does not support alibi positional embeddings yet') + if rearrange is None: + raise ImportError('einops is not installed, please install with pip install einops') + + if self.checkpoint_core_attention: + print_rank_0(" Warning, using selective recomputation with flash-attn: this is not implemented and will have no effect") self.core_attention_flash = FlashSelfAttention( causal=True, attention_dropout=args.attention_dropout ) From d50a89b2ce4637ec91071af717adbd5e89d0cba9 Mon Sep 17 00:00:00 2001 From: Raymond Li Date: Thu, 23 Mar 2023 11:08:20 -0400 Subject: [PATCH 094/144] fix import --- megatron/model/transformer.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index 77abb0cdf1..063527a14a 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -20,8 +20,9 @@ import torch.nn.functional as F from torch import nn -from megatron import get_timers, get_args, get_global_memory_buffer, print_rank_0 +from megatron import get_timers, get_args, get_global_memory_buffer from megatron import mpu +from megatron.utils import print_rank_0 from .module import MegatronModule from megatron.model.enums import AttnMaskType, ModelType, LayerType, AttnType, 
PositionEmbeddingType from .fused_layer_norm import MixedFusedLayerNorm as LayerNorm From 61fe86d08a3924704b8ef9820f6cf97361a3b443 Mon Sep 17 00:00:00 2001 From: Raymond Li Date: Thu, 23 Mar 2023 11:22:06 -0400 Subject: [PATCH 095/144] update readme --- README.md | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/README.md b/README.md index 2a898d87c2..bb94fe16da 100644 --- a/README.md +++ b/README.md @@ -333,6 +333,18 @@ Theoretical memory savings vary depending on the combination of the model's para | bf16 param, fp32 grads | 18 | 6 + 12/d | | fp32 param, fp32 grads | 16 | 8 + 8/d | +## FlashAttention + +Usage: `--use-flash-attn`. Support attention head dimensions at most 128. + +[FlashAttention](https://github.com/HazyResearch/flash-attention) is a fast and +memory-efficient algorithm to compute exact attention. It speeds up model +training and reduces memory requirement. + +To install FlashAttention: +```sh +pip install flash-attn +``` ## GPT-3 Example From f5019c884465cbf05a76c7952ec28d3ec9ccf2eb Mon Sep 17 00:00:00 2001 From: Raymond Li Date: Fri, 24 Mar 2023 14:58:32 -0400 Subject: [PATCH 096/144] raise if using flash-attn with selective recomputation, swap if/else --- megatron/model/transformer.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index 063527a14a..ae8e43d853 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -604,7 +604,7 @@ def __init__(self, init_method, raise ImportError('einops is not installed, please install with pip install einops') if self.checkpoint_core_attention: - print_rank_0(" Warning, using selective recomputation with flash-attn: this is not implemented and will have no effect") + raise NotImplementedError("Using selective recomputation with flash-attn: this is not implemented.") self.core_attention_flash = FlashSelfAttention( causal=True, attention_dropout=args.attention_dropout ) @@ -774,15 +774,7 @@ def forward(self, hidden_states, attention_mask, # ================================== # core attention computation # ================================== - if not self.use_flash_attn: - if self.checkpoint_core_attention: - context_layer = self._checkpointed_attention_forward( - query_layer, key_layer, value_layer, attention_mask, alibi) - else: - context_layer = self.core_attention( - query_layer, key_layer, value_layer, attention_mask, alibi) - else: - + if self.use_flash_attn: if self.attention_head_type == "multiquery": sq, b, np, hn = query_layer.size() # Expand kv to be compatible with flash-attn implementation @@ -791,13 +783,21 @@ def forward(self, hidden_states, attention_mask, value_layer = value_layer.expand((sq, b, np, hn)) q, k, v = [rearrange(x, 's b ... -> b s ...').contiguous() for x in (query_layer, key_layer, value_layer)] - if not self.sequence_parallel: + if self.sequence_parallel: + context_layer = self.core_attention_flash(q, k, v) + else: with mpu.get_cuda_rng_tracker().fork(): context_layer = self.core_attention_flash(q, k, v) - else: - context_layer = self.core_attention_flash(q, k, v) context_layer = rearrange(context_layer, 'b s h d -> s b (h d)').contiguous() + else: + if self.checkpoint_core_attention: + context_layer = self._checkpointed_attention_forward( + query_layer, key_layer, value_layer, attention_mask, alibi) + else: + context_layer = self.core_attention( + query_layer, key_layer, value_layer, attention_mask, alibi) + # ================= # Output. 
[sq, b, h] From 0ff574679723ddf31eb2c87cc8377f0be1a76ca2 Mon Sep 17 00:00:00 2001 From: Raymond Li Date: Fri, 24 Mar 2023 18:36:40 -0400 Subject: [PATCH 097/144] change back to warning --- megatron/model/transformer.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index ae8e43d853..2e437a901a 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -604,7 +604,8 @@ def __init__(self, init_method, raise ImportError('einops is not installed, please install with pip install einops') if self.checkpoint_core_attention: - raise NotImplementedError("Using selective recomputation with flash-attn: this is not implemented.") + print_rank_0(" Warning, using selective recomputation with flash-attn: this is already handled in the " + "flash-attn library and has no effect.") self.core_attention_flash = FlashSelfAttention( causal=True, attention_dropout=args.attention_dropout ) From 36d04356120284f4444462c08de6f6e5fe4ef66b Mon Sep 17 00:00:00 2001 From: leandro von werra Date: Wed, 29 Mar 2023 10:12:13 +0000 Subject: [PATCH 098/144] add token/s/gpu to wandb --- megatron/training.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/megatron/training.py b/megatron/training.py index 4c3c5fff05..14cdbb6ea5 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -619,6 +619,10 @@ def add_to_logging(name): if iteration % args.log_interval == 0: elapsed_time = timers('interval-time').elapsed() elapsed_time_per_iteration = elapsed_time / total_iterations + + num_gpus = args.data_parallel_size * args.tensor_model_parallel_size * args.pipeline_model_parallel_size + tokens_per_sec_per_gpu = (args.seq_length * batch_size) / num_gpus / elapsed_time_per_iteration + tflops = get_tflops(batch_size, elapsed_time_per_iteration) if writer: if args.log_timers_to_tensorboard: @@ -672,6 +676,7 @@ def add_to_logging(name): 'loss-scale': loss_scale, 'grad-norm': grad_norm, 'tflops': tflops, + 'tokens-per-second-per-gpu': tokens_per_sec_per_gpu, **loss_dict } wandb.log(metrics, step=iteration) From b691302988026188c5ac94b7a4aada05fef2c3c6 Mon Sep 17 00:00:00 2001 From: leandro von werra Date: Wed, 29 Mar 2023 10:32:07 +0000 Subject: [PATCH 099/144] fix distributed optimizer --- megatron/training.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/megatron/training.py b/megatron/training.py index 4c3c5fff05..e186ce715f 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -391,9 +391,6 @@ def setup_model_and_optimizer(model_provider_func, torch.distributed.barrier() timers('load-checkpoint').stop() timers.log(['load-checkpoint']) - # This is critical when only model is loaded. We should make sure - # main parameters are also updated. 
- optimizer.reload_model_params() else: args.iteration = 0 From a8e64f6f79cd5ca31db5f11336af78baeaa5282c Mon Sep 17 00:00:00 2001 From: Raymond Li Date: Mon, 3 Apr 2023 13:26:31 -0400 Subject: [PATCH 100/144] support checkpoints with distrib optimizer in checkpoint-util --- megatron/checkpointing.py | 17 ++++++++++------- tools/checkpoint_loader_megatron.py | 3 +++ tools/checkpoint_util.py | 3 +++ 3 files changed, 16 insertions(+), 7 deletions(-) diff --git a/megatron/checkpointing.py b/megatron/checkpointing.py index c3359ed18c..7d245d42fb 100644 --- a/megatron/checkpointing.py +++ b/megatron/checkpointing.py @@ -92,7 +92,7 @@ def ensure_directory_exists(filename): def get_checkpoint_names(checkpoints_path, iteration, use_distributed_optimizer, release=False, - pipeline_parallel=None, tensor_rank=None, pipeline_rank=None): + pipeline_parallel=None, tensor_rank=None, pipeline_rank=None, only_model=False): """Determine the directory name for this rank's checkpoint.""" if release: directory = 'release' @@ -119,8 +119,9 @@ def get_checkpoint_names(checkpoints_path, iteration, use_distributed_optimizer, if use_distributed_optimizer: model_name = os.path.join(common_path, "model_rng.pt") + data_parallel_rank = 0 if only_model else mpu.get_data_parallel_rank() optim_name = os.path.join( - common_path + "_%03d" % mpu.get_data_parallel_rank(), + common_path + "_%03d" % data_parallel_rank, "optim.pt") else: model_name = optim_name = os.path.join(common_path, "model_optim_rng.pt") @@ -139,14 +140,14 @@ def find_checkpoint_rank_0(checkpoints_path, iteration, use_distributed_optimize # Look for checkpoint with no pipelining filenames = get_checkpoint_names(checkpoints_path, iteration, use_distributed_optimizer, release, pipeline_parallel=False, - tensor_rank=0, pipeline_rank=0) + tensor_rank=0, pipeline_rank=0, only_model=True) if os.path.isfile(filenames[0]): return filenames # Look for checkpoint with pipelining filenames = get_checkpoint_names(checkpoints_path, iteration, use_distributed_optimizer, release, pipeline_parallel=True, - tensor_rank=0, pipeline_rank=0) + tensor_rank=0, pipeline_rank=0, only_model=True) if os.path.isfile(filenames[0]): return filenames @@ -379,10 +380,11 @@ def fix_query_key_value_ordering(model, checkpoint_version): print_rank_0(" succesfully fixed query-key-values ordering for" " checkpoint version {}".format(checkpoint_version)) -def _load_base_checkpoint(load_dir, use_distributed_optimizer, rank0=False, iteration=None, release=None): +def _load_base_checkpoint(load_dir, use_distributed_optimizer, rank0=False, iteration=None, release=None, no_load_optim=False): """ Load the base state_dict from the given directory If rank0 is true, just loads rank 0 checkpoint, ignoring arguments. + If rank0 is true or no_load_optim is true, we do not care about the optimizer, only the model checkpoint. """ # Read the tracker file and set the iteration. 
@@ -408,7 +410,7 @@ def _load_base_checkpoint(load_dir, use_distributed_optimizer, rank0=False, iter release) else: checkpoint_names = get_checkpoint_names(load_dir, iteration, use_distributed_optimizer, - release) + release, only_model=no_load_optim) if release: print_rank_0(f' loading release checkpoint from {load_dir}') else: @@ -572,7 +574,8 @@ def load_checkpoint(model, optimizer, opt_param_scheduler, load_arg='load', stri use_distributed_optimizer=args.use_distributed_optimizer, rank0=False, iteration=iteration, - release=release) + release=release, + no_load_optim=args.no_load_optim) if model_state_dict is None: return 0 diff --git a/tools/checkpoint_loader_megatron.py b/tools/checkpoint_loader_megatron.py index 5d6849f730..bb701e1572 100644 --- a/tools/checkpoint_loader_megatron.py +++ b/tools/checkpoint_loader_megatron.py @@ -50,6 +50,9 @@ def _load_checkpoint(queue, args): '--no-initialization', '--load', args.load_dir ] + if args.use_distributed_optimizer: + sys.argv.append("--use-distributed-optimizer") + margs = parse_args() margs = load_args_from_checkpoint(margs) diff --git a/tools/checkpoint_util.py b/tools/checkpoint_util.py index 628ce47c62..94f2fec3d8 100644 --- a/tools/checkpoint_util.py +++ b/tools/checkpoint_util.py @@ -124,6 +124,9 @@ def main(): parser.add_argument('--no-checking', action='store_false', help='Do not perform checking on the name and ordering of weights', dest='checking') + + parser.add_argument('--use-distributed-optimizer', action='store_true', + help='Loaded checkpoint uses distributed optimizer.') known_args, _ = parser.parse_known_args() loader = load_plugin('loader', known_args.loader) From 57f21b7919a1959deee8019cee8678beaf23a50e Mon Sep 17 00:00:00 2001 From: Raymond Li Date: Mon, 3 Apr 2023 15:49:41 -0400 Subject: [PATCH 101/144] don't load optimizer instead of arbitrarily loading dp-rank 0 --- megatron/checkpointing.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/megatron/checkpointing.py b/megatron/checkpointing.py index 7d245d42fb..7e5b5ea485 100644 --- a/megatron/checkpointing.py +++ b/megatron/checkpointing.py @@ -119,9 +119,8 @@ def get_checkpoint_names(checkpoints_path, iteration, use_distributed_optimizer, if use_distributed_optimizer: model_name = os.path.join(common_path, "model_rng.pt") - data_parallel_rank = 0 if only_model else mpu.get_data_parallel_rank() - optim_name = os.path.join( - common_path + "_%03d" % data_parallel_rank, + optim_name = None if only_model else os.path.join( + common_path + "_%03d" % mpu.get_data_parallel_rank(), "optim.pt") else: model_name = optim_name = os.path.join(common_path, "model_optim_rng.pt") @@ -421,7 +420,9 @@ def _load_base_checkpoint(load_dir, use_distributed_optimizer, rank0=False, iter # Load the checkpoint. 
try: model_state_dict = torch.load(model_checkpoint_name, map_location='cpu') - if use_distributed_optimizer: + if rank0 or no_load_optim: + optim_state_dict = None + elif use_distributed_optimizer: optim_state_dict = torch.load(optim_checkpoint_name, map_location='cpu') else: optim_state_dict = model_state_dict From 22b86119ef3d42879ac949cdf1a37056b0156049 Mon Sep 17 00:00:00 2001 From: loubnabnl Date: Fri, 14 Apr 2023 10:07:04 +0000 Subject: [PATCH 102/144] add bigcode model slurm script --- examples/pretrain_bigcode_model.slurm | 140 ++++++++++++++++++++++++++ 1 file changed, 140 insertions(+) create mode 100644 examples/pretrain_bigcode_model.slurm diff --git a/examples/pretrain_bigcode_model.slurm b/examples/pretrain_bigcode_model.slurm new file mode 100644 index 0000000000..bca3671203 --- /dev/null +++ b/examples/pretrain_bigcode_model.slurm @@ -0,0 +1,140 @@ +#!/bin/bash +#SBATCH --job-name=bigcode-training +#SBATCH --nodes=64 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=96 +#SBATCH --gres=gpu:8 +#SBATCH --exclusive +#SBATCH --partition=production-cluster +#SBATCH --output=/fsx/bigcode/bigcode-training/logs/run-%x-%j.out + +set -x -e +source /admin/home/loubna/.bashrc + +conda activate megatron + +echo "START TIME: $(date)" + +# File Path setup +SCRIPT_REPO=/fsx/loubna/code/Megatron-LM +pushd $SCRIPT_REPO + +LOG_PATH=$SCRIPT_REPO/main_log.txt + +# Training setup +GPUS_PER_NODE=8 +MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1) +MASTER_PORT=6000 +NNODES=$SLURM_NNODES +NODE_RANK=$SLURM_PROCID +WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) + +# File path setup +CHECKPOINT_PATH=/fsx/bigcode/experiments/pretraining/6672 +TOKENIZER_FILE=/fsx/loubna/data/tokenizer/tokenizer-the-stack-march-sample-v3-no-prefix-spaces/tokenizer.json +WEIGHTS_TRAIN=/fsx/loubna/code/bigcode-data-mix/data/train_data_paths.txt.tmp +WEIGHTS_VALID=/fsx/loubna/code/bigcode-data-mix/data/valid_data_paths.txt.tmp + +mkdir -p $CHECKPOINT_PATH/tensorboard + +GPT_ARGS="\ + --tensor-model-parallel-size 4 \ + --pipeline-model-parallel-size 4 \ + --sequence-parallel \ + --num-layers 40 \ + --hidden-size 6144 \ + --num-attention-heads 48 \ + --attention-head-type multiquery \ + --init-method-std 0.01275 \ + --seq-length 8192 \ + --max-position-embeddings 8192 \ + --attention-dropout 0.1 \ + --hidden-dropout 0.1 \ + --micro-batch-size 1 \ + --global-batch-size 512 \ + --lr 0.0003 \ + --min-lr 0.00003 \ + --train-iters 250000 \ + --lr-decay-iters 250000 \ + --lr-decay-style cosine \ + --lr-warmup-iters 2000 \ + --weight-decay .1 \ + --adam-beta2 .95 \ + --clip-grad 1.0 \ + --bf16 \ + --use-flash-attn \ + --fim-rate 0.5 \ + --log-interval 10 \ + --save-interval 2500 \ + --eval-interval 2500 \ + --eval-iters 2 \ + --use-distributed-optimizer \ + --valid-num-workers 0 \ +" + +TENSORBOARD_ARGS="--tensorboard-dir ${CHECKPOINT_PATH}/tensorboard" + +CMD=" \ + /fsx/loubna/code/Megatron-LM/pretrain_gpt.py \ + $GPT_ARGS \ + --tokenizer-type TokenizerFromFile \ + --tokenizer-file $TOKENIZER_FILE \ + --save $CHECKPOINT_PATH \ + --load $CHECKPOINT_PATH \ + --train-weighted-split-paths-path $WEIGHTS_TRAIN \ + --valid-weighted-split-paths-path $WEIGHTS_VALID \ + --structured-logs \ + --structured-logs-dir $CHECKPOINT_PATH/logs \ + $TENSORBOARD_ARGS \ + --wandb-entity-name loubnabnl \ + --wandb-project-name bigcode-pretraining \ + " + +export LAUNCHER="python -u -m torch.distributed.run \ + --nproc_per_node $GPUS_PER_NODE \ + --nnodes $NNODES \ + --rdzv_endpoint $MASTER_ADDR:$MASTER_PORT \ + --rdzv_backend c10d \ + 
--max_restarts 0 \ + --tee 3 \ + " + +echo $CMD + +# hide duplicated errors using this hack - will be properly fixed in pt-1.12 +# export TORCHELASTIC_ERROR_FILE=/tmp/torch-elastic-error.json + +# force crashing on nccl issues like hanging broadcast +export NCCL_ASYNC_ERROR_HANDLING=1 +# export NCCL_DEBUG=INFO +# export NCCL_DEBUG_SUBSYS=COLL +# export NCCL_SOCKET_NTHREADS=1 +# export NCCL_NSOCKS_PERTHREAD=1 +# export CUDA_LAUNCH_BLOCKING=1 + +# AWS specific +export NCCL_PROTO=simple +export RDMAV_FORK_SAFE=1 +export FI_EFA_FORK_SAFE=1 +export FI_EFA_USE_DEVICE_RDMA=1 +export FI_PROVIDER=efa +export FI_LOG_LEVEL=1 +export NCCL_IB_DISABLE=1 +export NCCL_SOCKET_IFNAME=ens + +export CUDA_HOME=/usr/local/cuda-11.6 + +# srun error handling: +# --wait=60: wait 60 sec after the first task terminates before terminating all remaining tasks +# --kill-on-bad-exit=1: terminate a step if any task exits with a non-zero exit code +SRUN_ARGS=" \ + --wait=60 \ + --kill-on-bad-exit=1 \ + " + +# py-spy top -s -i -n -- $LAUNCHER --node_rank $SLURM_PROCID --role $SLURMD_NODENAME: $CMD +clear; srun $SRUN_ARGS --jobid $SLURM_JOB_ID bash -c "$LAUNCHER --node_rank \$SLURM_PROCID --role \$SLURMD_NODENAME: $CMD" 2>&1 | tee $LOG_PATH + +rm -rf $CHECKPOINT_PATH + +echo "END TIME: $(date)" \ No newline at end of file From c988cf251fab7846df6aab93d242deb833586575 Mon Sep 17 00:00:00 2001 From: Loubna Ben Allal <44069155+loubnabnl@users.noreply.github.com> Date: Fri, 12 May 2023 17:41:15 +0200 Subject: [PATCH 103/144] Update slurm script --- examples/pretrain_bigcode_model.slurm | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/examples/pretrain_bigcode_model.slurm b/examples/pretrain_bigcode_model.slurm index bca3671203..b9f9f19cdd 100644 --- a/examples/pretrain_bigcode_model.slurm +++ b/examples/pretrain_bigcode_model.slurm @@ -135,6 +135,4 @@ SRUN_ARGS=" \ # py-spy top -s -i -n -- $LAUNCHER --node_rank $SLURM_PROCID --role $SLURMD_NODENAME: $CMD clear; srun $SRUN_ARGS --jobid $SLURM_JOB_ID bash -c "$LAUNCHER --node_rank \$SLURM_PROCID --role \$SLURMD_NODENAME: $CMD" 2>&1 | tee $LOG_PATH -rm -rf $CHECKPOINT_PATH - -echo "END TIME: $(date)" \ No newline at end of file +echo "END TIME: $(date)" From 0048491b1c06893d63bdf179e7e0f14e92423424 Mon Sep 17 00:00:00 2001 From: Leandro von Werra Date: Tue, 16 May 2023 19:57:29 +0200 Subject: [PATCH 104/144] Finetune StarCoder Megatron --- examples/finetune_bigcode_model.slurm | 144 ++++++++++++++++++++++++++ 1 file changed, 144 insertions(+) create mode 100644 examples/finetune_bigcode_model.slurm diff --git a/examples/finetune_bigcode_model.slurm b/examples/finetune_bigcode_model.slurm new file mode 100644 index 0000000000..603b7e24fa --- /dev/null +++ b/examples/finetune_bigcode_model.slurm @@ -0,0 +1,144 @@ +#!/bin/bash +#SBATCH --job-name=starcoderpy +#SBATCH --nodes=64 +#SBATCH --ntasks-per-node=1 +#SBATCH --exclusive +#SBATCH --gres=gpu:8 +#SBATCH --partition=production-cluster +#SBATCH --output=/fsx/leandro/logs/starcoderpy/bcs-%x-%j.out + +set -x -e +source /admin/home/leandro/.bashrc + +conda activate megatron + +echo "START TIME: $(date)" + +# File Path setup +SCRIPT_REPO=/fsx/leandro/git/Megatron-LM-BC +pushd $SCRIPT_REPO + +LOG_PATH=$SCRIPT_REPO/main_log.txt + +# Training setup +GPUS_PER_NODE=8 +MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1) +MASTER_PORT=6000 +NNODES=$SLURM_NNODES +NODE_RANK=$SLURM_PROCID +WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) + +# File path setup +STARCODER_PATH=/fsx/boomcode/starcoder/ 
+CHECKPOINT_PATH=/fsx/boomcode/starcoderpy/$SLURM_JOB_ID +TOKENIZER_FILE=/fsx/boomcode/tokenizer-starcoder/tokenizer.json +WEIGHTS_TRAIN=/fsx/boomcode/datamix_python/train_data_paths.txt.tmp +WEIGHTS_VALID=/fsx/boomcode/datamix_python/valid_data_paths.txt.tmp +DATA_PATH=/fsx/boomcode/tokenized/python/ +mkdir -p $CHECKPOINT_PATH/tensorboard + +GPT_ARGS="\ + --tensor-model-parallel-size 4 \ + --pipeline-model-parallel-size 4 \ + --sequence-parallel \ + --num-layers 40 \ + --hidden-size 6144 \ + --num-attention-heads 48 \ + --attention-head-type multiquery \ + --init-method-std 0.01275 \ + --seq-length 8192 \ + --max-position-embeddings 8192 \ + --attention-dropout 0.1 \ + --hidden-dropout 0.1 \ + --micro-batch-size 1 \ + --global-batch-size 512 \ + --lr 0.00005 \ + --min-lr 0.000005 \ + --train-iters 258500 \ + --lr-decay-iters 8500 \ + --lr-decay-style cosine \ + --lr-warmup-iters 500 \ + --weight-decay .1 \ + --adam-beta2 .95 \ + --clip-grad 1.0 \ + --bf16 \ + --use-flash-attn \ + --fim-rate 0.5 \ + --log-interval 10 \ + --save-interval 2500 \ + --eval-interval 100 \ + --eval-iters 10 \ + --valid-num-workers 0 \ + --override-opt_param-scheduler \ + --no-load-optim \ + --no-load-rng \ + --finetune \ +" + +# --dataloader-type cyclic\ +TENSORBOARD_ARGS="--tensorboard-dir ${CHECKPOINT_PATH}/tensorboard" + +CMD=" \ + $SCRIPT_REPO/pretrain_gpt.py \ + $GPT_ARGS \ + --tokenizer-type TokenizerFromFile \ + --tokenizer-file $TOKENIZER_FILE \ + --save $CHECKPOINT_PATH \ + --load $STARCODER_PATH \ + --train-weighted-split-paths-path $WEIGHTS_TRAIN \ + --valid-weighted-split-paths-path $WEIGHTS_VALID \ + --structured-logs \ + --structured-logs-dir $CHECKPOINT_PATH/logs \ + $TENSORBOARD_ARGS \ + --wandb-entity-name lvwerra \ + --wandb-project-name starcoder-py \ + " + +# --data-path $DATA_PATH\gpt2-preprocessed_content_document + +export LAUNCHER="python -u -m torch.distributed.run \ + --nproc_per_node $GPUS_PER_NODE \ + --nnodes $NNODES \ + --rdzv_endpoint $MASTER_ADDR:$MASTER_PORT \ + --rdzv_backend c10d \ + --max_restarts 0 \ + --tee 3 \ + " + +echo $CMD + +# hide duplicated errors using this hack - will be properly fixed in pt-1.12 +# export TORCHELASTIC_ERROR_FILE=/tmp/torch-elastic-error.json + +# force crashing on nccl issues like hanging broadcast +export NCCL_ASYNC_ERROR_HANDLING=1 +# export NCCL_DEBUG=INFO +# export NCCL_DEBUG_SUBSYS=COLL +# export NCCL_SOCKET_NTHREADS=1 +# export NCCL_NSOCKS_PERTHREAD=1 +# export CUDA_LAUNCH_BLOCKING=1 + +# AWS specific +export NCCL_PROTO=simple +export RDMAV_FORK_SAFE=1 +export FI_EFA_FORK_SAFE=1 +export FI_EFA_USE_DEVICE_RDMA=1 +export FI_PROVIDER=efa +export FI_LOG_LEVEL=1 +export NCCL_IB_DISABLE=1 +export NCCL_SOCKET_IFNAME=ens + +export CUDA_HOME=/usr/local/cuda-11.6 + +# srun error handling: +# --wait=60: wait 60 sec after the first task terminates before terminating all remaining tasks +# --kill-on-bad-exit=1: terminate a step if any task exits with a non-zero exit code +SRUN_ARGS=" \ + --wait=60 \ + --kill-on-bad-exit=1 \ + " + +# py-spy top -s -i -n -- $LAUNCHER --node_rank $SLURM_PROCID --role $SLURMD_NODENAME: $CMD +clear; srun $SRUN_ARGS --jobid $SLURM_JOB_ID bash -c "$LAUNCHER --node_rank \$SLURM_PROCID --role \$SLURMD_NODENAME: $CMD" 2>&1 | tee $LOG_PATH + +echo "END TIME: $(date)" From 893aaa5e14df115c6844c29f91e4bec5c76ef81e Mon Sep 17 00:00:00 2001 From: mayank31398 Date: Mon, 22 May 2023 13:55:56 +0530 Subject: [PATCH 105/144] assert Flash Attention doesn't get arbitrary mask --- megatron/arguments.py | 4 ++++ 1 file changed, 4 insertions(+) 
diff --git a/megatron/arguments.py b/megatron/arguments.py index 8af1a9b60a..39d0c8f825 100644 --- a/megatron/arguments.py +++ b/megatron/arguments.py @@ -378,6 +378,10 @@ def validate_args(args, defaults={}): if args.sequence_parallel: args.async_tensor_model_parallel_allreduce = False + if args.use_flash_attn: + assert not args.reset_attention_mask, \ + "Flash Attention doesn't support arbitrary attention masks. Please turn off reset-attention-mask" + _print_args(args) return args From d06e73733105e798125ce6620b8e9b27f42b8c2b Mon Sep 17 00:00:00 2001 From: mayank31398 Date: Mon, 22 May 2023 14:22:22 +0530 Subject: [PATCH 106/144] fix dtypes for new numpy versions --- megatron/data/indexed_dataset.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/megatron/data/indexed_dataset.py b/megatron/data/indexed_dataset.py index 2f6e1b845c..9ab814eeff 100644 --- a/megatron/data/indexed_dataset.py +++ b/megatron/data/indexed_dataset.py @@ -95,8 +95,8 @@ def write_longs(f, a): 3: np.int16, 4: np.int32, 5: np.int64, - 6: np.float, - 7: np.double, + 6: np.float32, + 7: np.float64, 8: np.uint16 } @@ -268,8 +268,8 @@ class IndexedDatasetBuilder(object): np.int16: 2, np.int32: 4, np.int64: 8, - np.float: 4, - np.double: 8 + np.float32: 4, + np.float64: 8 } def __init__(self, out_file, dtype=np.int32): From 0e2415aacf9867f45dc1c5d68b032b528380ac74 Mon Sep 17 00:00:00 2001 From: mayank31398 Date: Mon, 22 May 2023 15:13:26 +0530 Subject: [PATCH 107/144] fused layer norm --- examples/debug.sh | 60 +++++++++++++++++++ megatron/arguments.py | 2 + megatron/fused_kernels/__init__.py | 4 +- .../fused_kernels/tests/test_fused_kernels.py | 12 +++- megatron/model/fused_layer_norm.py | 2 +- tools/checkpoint_loader_megatron.py | 2 + tools/checkpoint_saver_megatron.py | 2 + 7 files changed, 81 insertions(+), 3 deletions(-) create mode 100644 examples/debug.sh diff --git a/examples/debug.sh b/examples/debug.sh new file mode 100644 index 0000000000..8aedab0acf --- /dev/null +++ b/examples/debug.sh @@ -0,0 +1,60 @@ +export NCCL_SOCKET_IFNAME="ib,bond" +export NCCL_IB_CUDA_SUPPORT=1 + +MASTER_ADDR=$(echo ${LSB_MCPU_HOSTS} | tr ' ' '\n' | head -n 1) +MASTER_PORT=5${LSB_JOBID: -5:-1} +NNODES=$(echo ${LSB_MCPU_HOSTS} | tr ' ' '\n' | sed 'n; d' | wc -w) +GPUS_PER_NODE=$(echo $CUDA_VISIBLE_DEVICES | tr ',' '\n' | wc -w) +NODE_RANK=$(($(echo ${LSB_MCPU_HOSTS} | tr ' ' '\n' | sed 'n; d' | grep -n -m1 $HOSTNAME | cut -d':' -f1)-1)) + + + +DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT" + +CHECKPOINT_PATH=checkpoints # Adjust: Directory to store the checkpoints +DATA_PATH=data/debug_text_document # Adjust: Prefix of the preprocessed dataset. 
+TOKENIZER_FILE=tokenizer.json # Adjust + +GPT_ARGS="\ +--tensor-model-parallel-size 1 \ +--pipeline-model-parallel-size 1 \ +--recompute-activations \ +--num-layers 24 \ +--hidden-size 2048 \ +--num-attention-heads 16 \ +--attention-head-type multiquery \ +--init-method-std 0.022 \ +--seq-length 2048 \ +--max-position-embeddings 2048 \ +--attention-dropout 0.1 \ +--hidden-dropout 0.1 \ +--micro-batch-size 2 \ +--global-batch-size 192 \ +--lr 0.0002 \ +--train-iters 3000 \ +--lr-decay-iters 600000 \ +--lr-decay-style cosine \ +--lr-warmup-fraction 0.02 \ +--weight-decay .1 \ +--adam-beta2 .95 \ +--clip-grad 1.0 \ +--fp16 \ +--log-interval 10 \ +--save-interval 4000 \ +--eval-interval 200 \ +--eval-iters 10 \ +--initial-loss-scale 65536 \ +--fim-rate 0.5 \ +" + +TENSORBOARD_ARGS="--tensorboard-dir ${CHECKPOINT_PATH}/tensorboard" + +torchrun $DISTRIBUTED_ARGS \ + pretrain_gpt.py \ + $GPT_ARGS \ + --tokenizer-type TokenizerFromFile \ + --tokenizer-file $TOKENIZER_FILE \ + --save $CHECKPOINT_PATH \ + --load $CHECKPOINT_PATH \ + --data-path $DATA_PATH \ + $TENSORBOARD_ARGS diff --git a/megatron/arguments.py b/megatron/arguments.py index 8af1a9b60a..5732b4e5be 100644 --- a/megatron/arguments.py +++ b/megatron/arguments.py @@ -637,6 +637,8 @@ def _add_training_args(parser): 'training if SIGTERM is received') group.add_argument('--tensorboard-dir', type=str, default=None, help='Write TensorBoard logs to this directory.') + group.add_argument("--use-kernels-from-apex", action="store_true", + help="use Apex kernels instead of Megatron") group.add_argument('--no-masked-softmax-fusion', action='store_false', help='Disable fusion of query_key_value scaling, ' diff --git a/megatron/fused_kernels/__init__.py b/megatron/fused_kernels/__init__.py index 6a44db2282..51da7b1f0b 100644 --- a/megatron/fused_kernels/__init__.py +++ b/megatron/fused_kernels/__init__.py @@ -24,6 +24,8 @@ def load(args): + if args.use_kernels_from_apex: + return # Check if cuda 11 is installed for compute capability 8.0 cc_flag = [] @@ -88,7 +90,7 @@ def _cpp_extention_load_helper(name, sources, extra_cuda_flags): sources=[srcpath / 'layer_norm_cuda.cpp', srcpath / 'layer_norm_cuda_kernel.cu'] fused_mix_prec_layer_norm_cuda = _cpp_extention_load_helper( - "fused_mix_prec_layer_norm_cuda", sources, extra_cuda_flags) + "fused_layer_norm_cuda", sources, extra_cuda_flags) # ================================= # Fused gradient accumulation to weight gradient computation of linear layer diff --git a/megatron/fused_kernels/tests/test_fused_kernels.py b/megatron/fused_kernels/tests/test_fused_kernels.py index 524ce6f0ea..4f49207dd0 100644 --- a/megatron/fused_kernels/tests/test_fused_kernels.py +++ b/megatron/fused_kernels/tests/test_fused_kernels.py @@ -11,7 +11,7 @@ def test_load_fused_kernels(): try: - import fused_mix_prec_layer_norm_cuda + import fused_layer_norm_cuda import scaled_masked_softmax_cuda import scaled_upper_triang_masked_softmax_cuda import torch @@ -279,6 +279,13 @@ def test_layer_norm(): ) +class DummyArgs: + rank: int = 0 + masked_softmax_fusion: bool = True + gradient_accumulation_fusion: bool = True + use_kernels_from_apex: bool = True + + if __name__ == "__main__": try: from transformers import BertTokenizer, GPT2Tokenizer @@ -294,6 +301,9 @@ def test_layer_norm(): print("\n[Fail] Please install `transformers` package to test fused kernels\n") exit(-1) + from megatron.fused_kernels import load + load(DummyArgs()) + test_load_fused_kernels() test_fused_softmax() test_fused_upper_triangle_mask_softmax() diff --git 
a/megatron/model/fused_layer_norm.py b/megatron/model/fused_layer_norm.py index 53f3fd516a..92c7e9fe3e 100644 --- a/megatron/model/fused_layer_norm.py +++ b/megatron/model/fused_layer_norm.py @@ -76,7 +76,7 @@ def __init__(self, normalized_shape, eps=1e-5, global fused_mix_prec_layer_norm_cuda fused_mix_prec_layer_norm_cuda = importlib.import_module( - "fused_mix_prec_layer_norm_cuda") + "fused_layer_norm_cuda") # List of hiddens sizes supported in the persistent layer norm kernel # If the hidden size is not supported, fall back to the non-persistent diff --git a/tools/checkpoint_loader_megatron.py b/tools/checkpoint_loader_megatron.py index bb701e1572..fde13da560 100644 --- a/tools/checkpoint_loader_megatron.py +++ b/tools/checkpoint_loader_megatron.py @@ -15,6 +15,8 @@ def add_arguments(parser): 'trim padding from the embedding table.') group.add_argument('--megatron-path', type=str, default=None, help='Base directory of deepspeed repository') + group.add_argument("--use-kernels-from-apex", action="store_true", + help="use Apex kernels instead of Megatron") def _load_checkpoint(queue, args): diff --git a/tools/checkpoint_saver_megatron.py b/tools/checkpoint_saver_megatron.py index 95b34a01d1..dcc032925e 100644 --- a/tools/checkpoint_saver_megatron.py +++ b/tools/checkpoint_saver_megatron.py @@ -18,6 +18,8 @@ def add_arguments(parser): group.add_argument('--target-pipeline-parallel-size', type=int, help='Target tensor model parallel size, default to the pipeline parall size ' 'in the input checkpoint if provided by the loader, otherwise to 1') + group.add_argument("--use-kernels-from-apex", action="store_true", + help="use Apex kernels instead of Megatron") def save_checkpoint(queue, args): From 041b7330687ca54d42991c51d698d9046daf6ea5 Mon Sep 17 00:00:00 2001 From: mayank31398 Date: Mon, 22 May 2023 17:48:19 +0530 Subject: [PATCH 108/144] move cuda kernels --- megatron/fused_kernels/{ => cuda}/__init__.py | 0 megatron/fused_kernels/{ => cuda}/compat.h | 0 .../{ => cuda}/fused_weight_gradient_dense.cpp | 0 .../fused_kernels/{ => cuda}/fused_weight_gradient_dense.cu | 0 megatron/fused_kernels/{ => cuda}/layer_norm_cuda.cpp | 0 megatron/fused_kernels/{ => cuda}/layer_norm_cuda_kernel.cu | 0 megatron/fused_kernels/{ => cuda}/scaled_masked_softmax.cpp | 0 megatron/fused_kernels/{ => cuda}/scaled_masked_softmax.h | 0 .../fused_kernels/{ => cuda}/scaled_masked_softmax_cuda.cu | 0 megatron/fused_kernels/{ => cuda}/scaled_softmax.cpp | 0 megatron/fused_kernels/{ => cuda}/scaled_softmax_cuda.cu | 0 .../{ => cuda}/scaled_upper_triang_masked_softmax.cpp | 0 .../{ => cuda}/scaled_upper_triang_masked_softmax.h | 0 .../{ => cuda}/scaled_upper_triang_masked_softmax_cuda.cu | 0 megatron/fused_kernels/{ => cuda}/type_shim.h | 0 megatron/fused_kernels/tests/test_fused_kernels.py | 2 +- megatron/initialize.py | 6 +++--- tools/checkpoint_loader_megatron.py | 6 ++++-- tools/checkpoint_saver_megatron.py | 6 ++++-- 19 files changed, 12 insertions(+), 8 deletions(-) rename megatron/fused_kernels/{ => cuda}/__init__.py (100%) rename megatron/fused_kernels/{ => cuda}/compat.h (100%) rename megatron/fused_kernels/{ => cuda}/fused_weight_gradient_dense.cpp (100%) rename megatron/fused_kernels/{ => cuda}/fused_weight_gradient_dense.cu (100%) rename megatron/fused_kernels/{ => cuda}/layer_norm_cuda.cpp (100%) rename megatron/fused_kernels/{ => cuda}/layer_norm_cuda_kernel.cu (100%) rename megatron/fused_kernels/{ => cuda}/scaled_masked_softmax.cpp (100%) rename megatron/fused_kernels/{ => cuda}/scaled_masked_softmax.h 
(100%) rename megatron/fused_kernels/{ => cuda}/scaled_masked_softmax_cuda.cu (100%) rename megatron/fused_kernels/{ => cuda}/scaled_softmax.cpp (100%) rename megatron/fused_kernels/{ => cuda}/scaled_softmax_cuda.cu (100%) rename megatron/fused_kernels/{ => cuda}/scaled_upper_triang_masked_softmax.cpp (100%) rename megatron/fused_kernels/{ => cuda}/scaled_upper_triang_masked_softmax.h (100%) rename megatron/fused_kernels/{ => cuda}/scaled_upper_triang_masked_softmax_cuda.cu (100%) rename megatron/fused_kernels/{ => cuda}/type_shim.h (100%) diff --git a/megatron/fused_kernels/__init__.py b/megatron/fused_kernels/cuda/__init__.py similarity index 100% rename from megatron/fused_kernels/__init__.py rename to megatron/fused_kernels/cuda/__init__.py diff --git a/megatron/fused_kernels/compat.h b/megatron/fused_kernels/cuda/compat.h similarity index 100% rename from megatron/fused_kernels/compat.h rename to megatron/fused_kernels/cuda/compat.h diff --git a/megatron/fused_kernels/fused_weight_gradient_dense.cpp b/megatron/fused_kernels/cuda/fused_weight_gradient_dense.cpp similarity index 100% rename from megatron/fused_kernels/fused_weight_gradient_dense.cpp rename to megatron/fused_kernels/cuda/fused_weight_gradient_dense.cpp diff --git a/megatron/fused_kernels/fused_weight_gradient_dense.cu b/megatron/fused_kernels/cuda/fused_weight_gradient_dense.cu similarity index 100% rename from megatron/fused_kernels/fused_weight_gradient_dense.cu rename to megatron/fused_kernels/cuda/fused_weight_gradient_dense.cu diff --git a/megatron/fused_kernels/layer_norm_cuda.cpp b/megatron/fused_kernels/cuda/layer_norm_cuda.cpp similarity index 100% rename from megatron/fused_kernels/layer_norm_cuda.cpp rename to megatron/fused_kernels/cuda/layer_norm_cuda.cpp diff --git a/megatron/fused_kernels/layer_norm_cuda_kernel.cu b/megatron/fused_kernels/cuda/layer_norm_cuda_kernel.cu similarity index 100% rename from megatron/fused_kernels/layer_norm_cuda_kernel.cu rename to megatron/fused_kernels/cuda/layer_norm_cuda_kernel.cu diff --git a/megatron/fused_kernels/scaled_masked_softmax.cpp b/megatron/fused_kernels/cuda/scaled_masked_softmax.cpp similarity index 100% rename from megatron/fused_kernels/scaled_masked_softmax.cpp rename to megatron/fused_kernels/cuda/scaled_masked_softmax.cpp diff --git a/megatron/fused_kernels/scaled_masked_softmax.h b/megatron/fused_kernels/cuda/scaled_masked_softmax.h similarity index 100% rename from megatron/fused_kernels/scaled_masked_softmax.h rename to megatron/fused_kernels/cuda/scaled_masked_softmax.h diff --git a/megatron/fused_kernels/scaled_masked_softmax_cuda.cu b/megatron/fused_kernels/cuda/scaled_masked_softmax_cuda.cu similarity index 100% rename from megatron/fused_kernels/scaled_masked_softmax_cuda.cu rename to megatron/fused_kernels/cuda/scaled_masked_softmax_cuda.cu diff --git a/megatron/fused_kernels/scaled_softmax.cpp b/megatron/fused_kernels/cuda/scaled_softmax.cpp similarity index 100% rename from megatron/fused_kernels/scaled_softmax.cpp rename to megatron/fused_kernels/cuda/scaled_softmax.cpp diff --git a/megatron/fused_kernels/scaled_softmax_cuda.cu b/megatron/fused_kernels/cuda/scaled_softmax_cuda.cu similarity index 100% rename from megatron/fused_kernels/scaled_softmax_cuda.cu rename to megatron/fused_kernels/cuda/scaled_softmax_cuda.cu diff --git a/megatron/fused_kernels/scaled_upper_triang_masked_softmax.cpp b/megatron/fused_kernels/cuda/scaled_upper_triang_masked_softmax.cpp similarity index 100% rename from 
megatron/fused_kernels/scaled_upper_triang_masked_softmax.cpp rename to megatron/fused_kernels/cuda/scaled_upper_triang_masked_softmax.cpp diff --git a/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h b/megatron/fused_kernels/cuda/scaled_upper_triang_masked_softmax.h similarity index 100% rename from megatron/fused_kernels/scaled_upper_triang_masked_softmax.h rename to megatron/fused_kernels/cuda/scaled_upper_triang_masked_softmax.h diff --git a/megatron/fused_kernels/scaled_upper_triang_masked_softmax_cuda.cu b/megatron/fused_kernels/cuda/scaled_upper_triang_masked_softmax_cuda.cu similarity index 100% rename from megatron/fused_kernels/scaled_upper_triang_masked_softmax_cuda.cu rename to megatron/fused_kernels/cuda/scaled_upper_triang_masked_softmax_cuda.cu diff --git a/megatron/fused_kernels/type_shim.h b/megatron/fused_kernels/cuda/type_shim.h similarity index 100% rename from megatron/fused_kernels/type_shim.h rename to megatron/fused_kernels/cuda/type_shim.h diff --git a/megatron/fused_kernels/tests/test_fused_kernels.py b/megatron/fused_kernels/tests/test_fused_kernels.py index 4f49207dd0..95ac16568f 100644 --- a/megatron/fused_kernels/tests/test_fused_kernels.py +++ b/megatron/fused_kernels/tests/test_fused_kernels.py @@ -301,7 +301,7 @@ class DummyArgs: print("\n[Fail] Please install `transformers` package to test fused kernels\n") exit(-1) - from megatron.fused_kernels import load + from megatron.fused_kernels.cuda import load load(DummyArgs()) test_load_fused_kernels() diff --git a/megatron/initialize.py b/megatron/initialize.py index 3170ba6236..5ec87e4158 100644 --- a/megatron/initialize.py +++ b/megatron/initialize.py @@ -31,7 +31,7 @@ except ModuleNotFoundError: print('Wandb import failed', flush=True) -from megatron import fused_kernels +from megatron.fused_kernels import cuda from megatron import get_adlr_autoresume from megatron import get_args from megatron import get_tensorboard_writer @@ -198,11 +198,11 @@ def _compile_dependencies(): if torch.distributed.get_rank() == 0: start_time = time.time() print('> compiling and loading fused kernels ...', flush=True) - fused_kernels.load(args) + cuda.load(args) torch.distributed.barrier() else: torch.distributed.barrier() - fused_kernels.load(args) + cuda.load(args) # Simple barrier to make sure all ranks have passed the # compilation phase successfully before moving on to the # rest of the program. We think this might ensure that diff --git a/tools/checkpoint_loader_megatron.py b/tools/checkpoint_loader_megatron.py index fde13da560..c67cf565a4 100644 --- a/tools/checkpoint_loader_megatron.py +++ b/tools/checkpoint_loader_megatron.py @@ -5,6 +5,8 @@ import torch +from megatron.fused_kernels import cuda + def add_arguments(parser): group = parser.add_argument_group(title='Megatron loader') @@ -32,7 +34,7 @@ def _load_checkpoint(queue, args): from megatron.global_vars import set_args, set_global_variables from megatron.checkpointing import load_args_from_checkpoint, load_checkpoint from megatron.model import ModelType, module - from megatron import mpu, fused_kernels + from megatron import mpu except ModuleNotFoundError: print("Unable to import Megatron, please specify the path to Megatron using --megatron-path. 
Exiting.") queue.put("exit") @@ -131,7 +133,7 @@ def get_models(count, dtype, pre_process, post_process): set_global_variables(margs) mpu.initialize.set_tensor_model_parallel_world_size(margs.tensor_model_parallel_size) mpu.initialize.set_pipeline_model_parallel_world_size(margs.pipeline_model_parallel_size) - fused_kernels.load(margs) + cuda.load(margs) # Get true (non-padded) vocab size if args.true_vocab_size is not None: diff --git a/tools/checkpoint_saver_megatron.py b/tools/checkpoint_saver_megatron.py index dcc032925e..0a370befc1 100644 --- a/tools/checkpoint_saver_megatron.py +++ b/tools/checkpoint_saver_megatron.py @@ -6,6 +6,8 @@ import torch +from megatron.fused_kernels import cuda + def add_arguments(parser): group = parser.add_argument_group(title='Megatron saver') @@ -36,7 +38,7 @@ def save_checkpoint(queue, args): from megatron.global_vars import set_global_variables, get_args from megatron.model import ModelType from megatron.tokenizer.tokenizer import _vocab_size_with_padding - from megatron import mpu, fused_kernels + from megatron import mpu except ModuleNotFoundError: print("Unable to import Megatron, please specify the path to Megatron using --megatron-path. Exiting.") exit(1) @@ -159,7 +161,7 @@ def get_models(count, dtype, pre_process, post_process): mpu.initialize.set_pipeline_model_parallel_world_size(args.target_pipeline_parallel_size) mpu.initialize.set_tensor_model_parallel_rank(0) mpu.initialize.set_pipeline_model_parallel_rank(0) - fused_kernels.load(margs) + cuda.load(margs) # Embeddings #----------- From 28780a79619df9768c255e9e6b49e5ecac523dec Mon Sep 17 00:00:00 2001 From: mayank31398 Date: Mon, 22 May 2023 18:29:07 +0530 Subject: [PATCH 109/144] add rocm --- megatron/fused_kernels/__init__.py | 10 + megatron/fused_kernels/cuda/__init__.py | 13 +- megatron/fused_kernels/rocm/__init__.py | 80 ++ megatron/fused_kernels/rocm/compat.h | 31 + .../fused_kernels/rocm/layer_norm_cuda.cpp | 201 ++++ .../rocm/layer_norm_cuda_kernel.cu | 866 ++++++++++++++++++ .../rocm/scaled_masked_softmax.cpp | 77 ++ .../rocm/scaled_masked_softmax.h | 492 ++++++++++ .../rocm/scaled_masked_softmax_cuda.cu | 114 +++ .../scaled_upper_triang_masked_softmax.cpp | 72 ++ .../rocm/scaled_upper_triang_masked_softmax.h | 511 +++++++++++ ...scaled_upper_triang_masked_softmax_cuda.cu | 100 ++ megatron/fused_kernels/rocm/type_shim.h | 91 ++ megatron/fused_kernels/utils.py | 9 + 14 files changed, 2655 insertions(+), 12 deletions(-) create mode 100644 megatron/fused_kernels/__init__.py create mode 100644 megatron/fused_kernels/rocm/__init__.py create mode 100644 megatron/fused_kernels/rocm/compat.h create mode 100644 megatron/fused_kernels/rocm/layer_norm_cuda.cpp create mode 100644 megatron/fused_kernels/rocm/layer_norm_cuda_kernel.cu create mode 100644 megatron/fused_kernels/rocm/scaled_masked_softmax.cpp create mode 100644 megatron/fused_kernels/rocm/scaled_masked_softmax.h create mode 100644 megatron/fused_kernels/rocm/scaled_masked_softmax_cuda.cu create mode 100644 megatron/fused_kernels/rocm/scaled_upper_triang_masked_softmax.cpp create mode 100644 megatron/fused_kernels/rocm/scaled_upper_triang_masked_softmax.h create mode 100644 megatron/fused_kernels/rocm/scaled_upper_triang_masked_softmax_cuda.cu create mode 100644 megatron/fused_kernels/rocm/type_shim.h create mode 100644 megatron/fused_kernels/utils.py diff --git a/megatron/fused_kernels/__init__.py b/megatron/fused_kernels/__init__.py new file mode 100644 index 0000000000..8f097e7d7d --- /dev/null +++ 
b/megatron/fused_kernels/__init__.py @@ -0,0 +1,10 @@ +def load(args): + if args.use_kernels_from_apex: + return + + if args.device == "cuda": + from megatron.fused_kernels.cuda import load as load_kernels + elif args.device == "rocm": + from megatron.fused_kernels.rocm import load as load_kernels + + load_kernels(args) diff --git a/megatron/fused_kernels/cuda/__init__.py b/megatron/fused_kernels/cuda/__init__.py index 51da7b1f0b..5e81da7c6f 100644 --- a/megatron/fused_kernels/cuda/__init__.py +++ b/megatron/fused_kernels/cuda/__init__.py @@ -13,20 +13,17 @@ # See the License for the specific language governing permissions and # limitations under the License. -import os import pathlib import subprocess from torch.utils import cpp_extension +from megatron.fused_kernels.utils import _create_build_dir # Do not override TORCH_CUDA_ARCH_LIST to allow for pre-compilation in Dockerfile # os.environ["TORCH_CUDA_ARCH_LIST"] = "" def load(args): - if args.use_kernels_from_apex: - return - # Check if cuda 11 is installed for compute capability 8.0 cc_flag = [] _, bare_metal_major, _ = _get_cuda_bare_metal_version( @@ -113,11 +110,3 @@ def _get_cuda_bare_metal_version(cuda_dir): bare_metal_minor = release[1][0] return raw_output, bare_metal_major, bare_metal_minor - - -def _create_build_dir(buildpath): - try: - os.mkdir(buildpath) - except OSError: - if not os.path.isdir(buildpath): - print(f"Creation of the build directory {buildpath} failed") diff --git a/megatron/fused_kernels/rocm/__init__.py b/megatron/fused_kernels/rocm/__init__.py new file mode 100644 index 0000000000..311eb5a902 --- /dev/null +++ b/megatron/fused_kernels/rocm/__init__.py @@ -0,0 +1,80 @@ +# coding=utf-8 +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import pathlib +from torch.utils import cpp_extension +from megatron.fused_kernels.utils import _create_build_dir + + +# Setting this param to a list has a problem of generating different +# compilation commands (with diferent order of architectures) and +# leading to recompilation of fused kernels. Set it to empty string +# to avoid recompilation and assign arch flags explicity in +# extra_cuda_cflags below +os.environ["TORCH_CUDA_ARCH_LIST"] = "" + + +def load(args): + + # Build path + srcpath = pathlib.Path(__file__).parent.absolute() + buildpath = srcpath / 'build' + _create_build_dir(buildpath) + + # Helper function to build the kernels. + def _cpp_extention_load_helper(name, sources, extra_cuda_flags, extra_include_paths): + return cpp_extension.load( + name=name, + sources=sources, + build_directory=buildpath, + extra_cflags=['-O3'], + extra_cuda_cflags=['-O3'] + extra_cuda_flags, + extra_include_paths=extra_include_paths, + verbose=(args.rank == 0) + ) + + # ============== + # Fused softmax. 
+ # ============== + + extra_include_paths=[os.path.abspath(srcpath)] + + if args.masked_softmax_fusion: + extra_cuda_flags = ['-D__HIP_NO_HALF_OPERATORS__=1', '-D__HIP_NO_HALF_CONVERSIONS__=1'] + + # Upper triangular softmax. + sources=[srcpath / 'scaled_upper_triang_masked_softmax.cpp', + srcpath / 'scaled_upper_triang_masked_softmax_cuda.cu'] + scaled_upper_triang_masked_softmax_cuda = _cpp_extention_load_helper( + "scaled_upper_triang_masked_softmax_cuda", + sources, extra_cuda_flags, extra_include_paths) + + # Masked softmax. + sources=[srcpath / 'scaled_masked_softmax.cpp', + srcpath / 'scaled_masked_softmax_cuda.cu'] + scaled_masked_softmax_cuda = _cpp_extention_load_helper( + "scaled_masked_softmax_cuda", sources, extra_cuda_flags, extra_include_paths) + + # ================================= + # Mixed precision fused layer norm. + # ================================= + + extra_cuda_flags = [] + + sources=[srcpath / 'layer_norm_cuda.cpp', + srcpath / 'layer_norm_cuda_kernel.cu'] + fused_mix_prec_layer_norm_cuda = _cpp_extention_load_helper( + "fused_layer_norm_cuda", sources, extra_cuda_flags, extra_include_paths) diff --git a/megatron/fused_kernels/rocm/compat.h b/megatron/fused_kernels/rocm/compat.h new file mode 100644 index 0000000000..92e7eb7723 --- /dev/null +++ b/megatron/fused_kernels/rocm/compat.h @@ -0,0 +1,31 @@ +/* coding=utf-8 + * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/*This code is copied fron NVIDIA apex: + * https://github.com/NVIDIA/apex + * with minor changes. */ + + + +#ifndef TORCH_CHECK +#define TORCH_CHECK AT_CHECK +#endif + +#ifdef VERSION_GE_1_3 +#define DATA_PTR data_ptr +#else +#define DATA_PTR data +#endif diff --git a/megatron/fused_kernels/rocm/layer_norm_cuda.cpp b/megatron/fused_kernels/rocm/layer_norm_cuda.cpp new file mode 100644 index 0000000000..8f28e7b4ad --- /dev/null +++ b/megatron/fused_kernels/rocm/layer_norm_cuda.cpp @@ -0,0 +1,201 @@ +/* coding=utf-8 + * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/*This code is copied fron NVIDIA apex: + * https://github.com/NVIDIA/apex + * with minor changes. 
*/ + +#include +#include +#include +#include "compat.h" + +namespace { + +void compute_n1_n2( + at::Tensor input, + at::IntArrayRef normalized_shape, + int& n1, + int& n2) { + int idiff = input.ndimension() - normalized_shape.size(); + n2 = 1; + for (int i = 0; i < (int)normalized_shape.size(); ++i) { + assert( input.sizes()[i+idiff] == normalized_shape[i] ); + n2 *= normalized_shape[i]; + } + n1 = 1; + for (int i = 0; i < idiff; ++i) { + n1 *= input.sizes()[i]; + } +} + +void check_args( + at::IntArrayRef normalized_shape, + at::Tensor gamma, + at::Tensor beta + ) +{ + TORCH_CHECK(!gamma.defined() || gamma.sizes().equals(normalized_shape)); + TORCH_CHECK(!beta.defined() || beta.sizes().equals(normalized_shape)); +} + +void check_args( + at::Tensor input, + at::IntArrayRef normalized_shape, + int& n1, + int& n2 + ) +{ + int64_t normalized_ndim = normalized_shape.size(); + + if (normalized_ndim < 1) { + std::stringstream ss; + ss << "Expected normalized_shape to be at least 1-dimensional, i.e., " + << "containing at least one element, but got normalized_shape=" + << normalized_shape; + throw std::runtime_error(ss.str()); + } + + auto input_shape = input.sizes(); + auto input_ndim = input.dim(); + + if (input_ndim < normalized_ndim || + !input_shape.slice(input_ndim - normalized_ndim).equals(normalized_shape)) { + std::stringstream ss; + ss << "Given normalized_shape=" << normalized_shape + << ", expected input with shape [*"; + for (auto size : normalized_shape) { + ss << ", " << size; + } + ss << "], but got input of size" << input_shape; + throw std::runtime_error(ss.str()); + } + + compute_n1_n2(input,normalized_shape,n1,n2); +} + + +void check_args( + at::Tensor input, + at::IntArrayRef normalized_shape, + at::Tensor gamma, + at::Tensor beta, + int& n1, + int& n2 + ) +{ + check_args(input,normalized_shape,n1,n2); + check_args(normalized_shape,gamma,beta); +} +} + +void cuda_layer_norm( + at::Tensor* output, + at::Tensor* mean, + at::Tensor* invvar, + at::Tensor* input, + int n1, + int n2, + at::IntArrayRef normalized_shape, + at::Tensor* gamma, + at::Tensor* beta, + double epsilon); + +#define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x " must be a CUDA tensor") +#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous") +#define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x) + +std::vector layer_norm_affine( + at::Tensor input, + at::IntArrayRef normalized_shape, + at::Tensor gamma, + at::Tensor beta, + double epsilon) { + + CHECK_INPUT(input); + CHECK_INPUT(gamma); + CHECK_INPUT(beta); + int n1, n2; + check_args(input, normalized_shape, gamma, beta, n1, n2); + + at::Tensor output = at::empty_like( + input, gamma.options().dtype(gamma.scalar_type())); + at::Tensor mean = at::empty( + {n1}, input.options().dtype(at::ScalarType::Float)); + at::Tensor invvar = at::empty_like(mean); + + cuda_layer_norm(&output, &mean, &invvar, &input, n1, n2, + normalized_shape, &gamma, &beta, epsilon); + + return {output, mean, invvar}; + +} + + +void cuda_layer_norm_gradient( + at::Tensor* dout, + at::Tensor* mean, + at::Tensor* invvar, + at::Tensor* input, + int n1, + int n2, + at::IntArrayRef normalized_shape, + at::Tensor* gamma, + at::Tensor* beta, + double epsilon, + at::Tensor* grad_input, + at::Tensor* grad_gamma, + at::Tensor* grad_beta + ); + +std::vector layer_norm_gradient_affine( + at::Tensor dout, + at::Tensor mean, + at::Tensor invvar, + at::Tensor input, + at::IntArrayRef normalized_shape, + at::Tensor gamma, + at::Tensor beta, + double epsilon) { + + 
CHECK_INPUT(dout); + CHECK_INPUT(mean); + CHECK_INPUT(invvar); + CHECK_INPUT(input); + CHECK_INPUT(gamma); + CHECK_INPUT(beta); + int n1, n2; + check_args(input, normalized_shape, gamma, beta, n1, n2); + + at::Tensor grad_input = at::empty_like(input); + at::Tensor grad_gamma = at::empty_like(gamma); + at::Tensor grad_beta = at::empty_like(beta); + + cuda_layer_norm_gradient(&dout, &mean, &invvar, &input, n1, n2, + normalized_shape, &gamma, &beta, epsilon, + &grad_input, &grad_gamma, &grad_beta); + + return {grad_input, grad_gamma, grad_beta}; + +} + + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + m.def("forward_affine", &layer_norm_affine, + "LayerNorm forward (CUDA)"); + m.def("backward_affine", &layer_norm_gradient_affine, + "LayerNorm backward (CUDA)"); +} diff --git a/megatron/fused_kernels/rocm/layer_norm_cuda_kernel.cu b/megatron/fused_kernels/rocm/layer_norm_cuda_kernel.cu new file mode 100644 index 0000000000..8a07806b13 --- /dev/null +++ b/megatron/fused_kernels/rocm/layer_norm_cuda_kernel.cu @@ -0,0 +1,866 @@ +/* coding=utf-8 + * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/*This code is copied fron NVIDIA apex: + * https://github.com/NVIDIA/apex + * with minor changes. */ + +#include "ATen/ATen.h" +#include "ATen/AccumulateType.h" +#include "ATen/cuda/CUDAContext.h" +#include "ATen/cuda/DeviceUtils.cuh" + +#include +#include + +#include "type_shim.h" + +template __device__ +void cuWelfordOnlineSum( + const U curr, + U& mu, + U& sigma2, + U& count) +{ + count = count + U(1); + U delta = curr - mu; + U lmean = mu + delta / count; + mu = lmean; + U delta2 = curr - lmean; + sigma2 = sigma2 + delta * delta2; +} + +template __device__ +void cuChanOnlineSum( + const U muB, + const U sigma2B, + const U countB, + U& mu, + U& sigma2, + U& count) +{ + U delta = muB - mu; + U nA = count; + U nB = countB; + count = count + countB; + U nX = count; + if (nX > U(0)) { + nA = nA / nX; + nB = nB / nX; + mu = nA*mu + nB*muB; + sigma2 = sigma2 + sigma2B + delta * delta * nA * nB * nX; + } else { + mu = U(0); + sigma2 = U(0); + } +} + +template __device__ +void cuWelfordMuSigma2( + const T* __restrict__ vals, + const int n1, + const int n2, + const int i1, + U& mu, + U& sigma2, + U* buf, + const int GPU_WARP_SIZE) +{ + // Assumptions: + // 1) blockDim.x == warpSize + // 2) Tensor is contiguous + // 3) 2*blockDim.y*sizeof(U)+blockDim.y*sizeof(int) shared memory available. 
+ // + // compute variance and mean over n2 + U count = U(0); + mu= U(0); + sigma2 = U(0); + if (i1 < n1) { + // one warp normalizes one n1 index, + // synchronization is implicit + // initialize with standard Welford algorithm + const int numx = blockDim.x * blockDim.y; + const int thrx = threadIdx.x + threadIdx.y * blockDim.x; + const T* lvals = vals + i1*n2; + int l = 4*thrx; + for (; l+3 < n2; l+=4*numx) { + for (int k = 0; k < 4; ++k) { + U curr = static_cast(lvals[l+k]); + cuWelfordOnlineSum(curr,mu,sigma2,count); + } + } + for (; l < n2; ++l) { + U curr = static_cast(lvals[l]); + cuWelfordOnlineSum(curr,mu,sigma2,count); + } + // intra-warp reductions + for (int stride = GPU_WARP_SIZE / 2; stride > 0; stride /= 2) { + U sigma2B = WARP_SHFL_DOWN(sigma2, stride); + U muB = WARP_SHFL_DOWN(mu, stride); + U countB = WARP_SHFL_DOWN(count, stride); + cuChanOnlineSum(muB, sigma2B, countB, mu, sigma2, count); + } + // threadIdx.x == 0 has correct values for each warp + // inter-warp reductions + if (blockDim.y > 1) { + U* ubuf = (U*)buf; + U* ibuf = (U*)(ubuf + blockDim.y); + for (int offset = blockDim.y/2; offset > 0; offset /= 2) { + // upper half of warps write to shared + if (threadIdx.x == 0 && threadIdx.y >= offset && threadIdx.y < 2*offset) { + const int wrt_y = threadIdx.y - offset; + ubuf[2*wrt_y] = mu; + ubuf[2*wrt_y+1] = sigma2; + ibuf[wrt_y] = count; + } + __syncthreads(); + // lower half merges + if (threadIdx.x == 0 && threadIdx.y < offset) { + U muB = ubuf[2*threadIdx.y]; + U sigma2B = ubuf[2*threadIdx.y+1]; + U countB = ibuf[threadIdx.y]; + cuChanOnlineSum(muB,sigma2B,countB,mu,sigma2,count); + } + __syncthreads(); + } + // threadIdx.x = 0 && threadIdx.y == 0 only thread that has correct values + if (threadIdx.x == 0 && threadIdx.y == 0) { + ubuf[0] = mu; + ubuf[1] = sigma2; + } + __syncthreads(); + mu = ubuf[0]; + sigma2 = ubuf[1]/U(n2); + // don't care about final value of count, we know count == n2 + } else { + mu = WARP_SHFL(mu, 0); + sigma2 = WARP_SHFL(sigma2/U(n2), 0); + } + } +} + +template<> __device__ +void cuWelfordMuSigma2( + const at::Half* __restrict__ vals, + const int n1, + const int n2, + const int i1, + float& mu, + float& sigma2, + float* buf, + const int GPU_WARP_SIZE) +{ + // Assumptions: + // 1) blockDim.x == warpSize + // 2) Tensor is contiguous + // 3) 2*blockDim.y*sizeof(U)+blockDim.y*sizeof(int) shared memory available. + // + // compute variance and mean over n2 + float count = 0.0f; + mu= float(0); + sigma2 = float(0); + if (i1 < n1) { + // one warp normalizes one n1 index, + // synchronization is implicit + // initialize with standard Welford algorithm + const int numx = blockDim.x * blockDim.y; + const int thrx = threadIdx.x + threadIdx.y * blockDim.x; + const at::Half* lvals = vals + i1*n2; + int l = 8*thrx; + if ((((size_t)lvals)&3) != 0) { + // 16 bit alignment + // first thread consumes first point + if (thrx == 0) { + float curr = static_cast(lvals[0]); + cuWelfordOnlineSum(curr,mu,sigma2,count); + } + ++l; + } + // at this point, lvals[l] are 32 bit aligned for all threads. 
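// For reference only: a standalone sketch of the traversal pattern used by
// this fp16 specialization. One leading element is peeled off whenever the
// half pointer is only 2-byte aligned, after which the remaining elements can
// be consumed in aligned pairs (half2 loads in the kernel). The function and
// parameter names are illustrative; 'update' stands for any accumulator step,
// for example a Welford update applied to the converted value.
#include <cstdint>

template <typename Accum, typename Fn>
void visit_fp16_as_pairs(const uint16_t* vals, int n, Accum& acc, Fn update) {
    int l = 0;
    if (n > 0 && (reinterpret_cast<uintptr_t>(vals) & 3u) != 0) {
        update(acc, vals[0]);            // peel the 2-byte-aligned first element
        ++l;
    }
    for (; l + 1 < n; l += 2) {          // vals + l is now 4-byte aligned
        update(acc, vals[l]);            // one aligned pair per iteration
        update(acc, vals[l + 1]);
    }
    for (; l < n; ++l) {                 // scalar tail
        update(acc, vals[l]);
    }
}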
+ for (; l+7 < n2; l+=8*numx) { + for (int k = 0; k < 8; k+=2) { + float2 curr = __half22float2(*((__half2*)(lvals+l+k))); + cuWelfordOnlineSum(curr.x,mu,sigma2,count); + cuWelfordOnlineSum(curr.y,mu,sigma2,count); + } + } + for (; l < n2; ++l) { + float curr = static_cast(lvals[l]); + cuWelfordOnlineSum(curr,mu,sigma2,count); + } + // intra-warp reductions + for (int stride = GPU_WARP_SIZE / 2; stride > 0; stride /= 2) { + float sigma2B = WARP_SHFL_DOWN(sigma2, stride); + float muB = WARP_SHFL_DOWN(mu, stride); + float countB = WARP_SHFL_DOWN(count, stride); + cuChanOnlineSum(muB, sigma2B, countB, mu, sigma2, count); + } + // threadIdx.x == 0 has correct values for each warp + // inter-warp reductions + if (blockDim.y > 1) { + float* ubuf = (float*)buf; + float* ibuf = (float*)(ubuf + blockDim.y); + for (int offset = blockDim.y/2; offset > 0; offset /= 2) { + // upper half of warps write to shared + if (threadIdx.x == 0 && threadIdx.y >= offset && threadIdx.y < 2*offset) { + const int wrt_y = threadIdx.y - offset; + ubuf[2*wrt_y] = mu; + ubuf[2*wrt_y+1] = sigma2; + ibuf[wrt_y] = count; + } + __syncthreads(); + // lower half merges + if (threadIdx.x == 0 && threadIdx.y < offset) { + float muB = ubuf[2*threadIdx.y]; + float sigma2B = ubuf[2*threadIdx.y+1]; + float countB = ibuf[threadIdx.y]; + cuChanOnlineSum(muB,sigma2B,countB,mu,sigma2,count); + } + __syncthreads(); + } + // threadIdx.x = 0 && threadIdx.y == 0 only thread that has correct values + if (threadIdx.x == 0 && threadIdx.y == 0) { + ubuf[0] = mu; + ubuf[1] = sigma2; + } + __syncthreads(); + mu = ubuf[0]; + sigma2 = ubuf[1]/float(n2); + // don't care about final value of count, we know count == n2 + } else { + mu = WARP_SHFL(mu, 0); + sigma2 = WARP_SHFL(sigma2/float(n2), 0); + } + } +} +#ifndef __HIP_PLATFORM_HCC__ +template U rsqrt(U v) { +#else +template __device__ U rsqrt(U v) { +#endif + return U(1) / sqrt(v); +} +#ifndef __HIP_PLATFORM_HCC__ +template<> float rsqrt(float v) { +#else +template<> __device__ float rsqrt(float v) { +#endif + return rsqrtf(v); +} +#ifndef __HIP_PLATFORM_HCC__ +template<> double rsqrt(double v) { +#else +template<> __device__ double rsqrt(double v) { +#endif + return rsqrt(v); +} + +namespace { +// This is the un-specialized struct. Note that we prevent instantiation of this +// struct by putting an undefined symbol in the function body so it won't compile. 
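// For reference only: the workaround described in the comment above is the
// usual idiom for templated dynamic shared memory. Declaring extern __shared__
// buffers of different element types from a single template triggers
// redeclaration conflicts, so the primary template is left without a
// definition and each element type gets its own specialization with a uniquely
// named buffer (float, below). A minimal sketch of the same pattern, using an
// illustrative name and a double specialization so it does not collide with
// the definitions in this file:
template <typename T>
struct SharedMemorySketch;               // primary template: intentionally undefined

template <>
struct SharedMemorySketch<double> {
    __device__ double* getPointer() {
        extern __shared__ double s_double_sketch[];   // one named buffer per type
        return s_double_sketch;
    }
};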
+// template +// struct SharedMemory +// { +// // Ensure that we won't compile any un-specialized types +// __device__ T *getPointer() +// { +// extern __device__ void error(void); +// error(); +// return NULL; +// } +// }; +// https://github.com/NVIDIA/apex/issues/246 +template +struct SharedMemory; + +template <> +struct SharedMemory +{ + __device__ float *getPointer() + { + extern __shared__ float s_float[]; + return s_float; + } +}; + +} + +template __global__ +void cuApplyLayerNorm( + V* __restrict__ output_vals, + U* __restrict__ mean, + U* __restrict__ invvar, + const T* __restrict__ vals, + const int n1, + const int n2, + const U epsilon, + const V* __restrict__ gamma, + const V* __restrict__ beta, + const int GPU_WARP_SIZE + ) +{ + // Assumptions: + // 1) blockDim.x == warpSize + // 2) Tensors are contiguous + // +#ifndef __HIP_PLATFORM_HCC__ + for (auto i1=blockIdx.y; i1 < n1; i1 += gridDim.y) { +#else + for (int i1=blockIdx.y; i1 < n1; i1 += gridDim.y) { +#endif + SharedMemory shared; + U* buf = shared.getPointer(); + U mu,sigma2; + cuWelfordMuSigma2(vals,n1,n2,i1,mu,sigma2,buf,GPU_WARP_SIZE); + const T* lvals = vals + i1*n2; + V* ovals = output_vals + i1*n2; + U c_invvar = rsqrt(sigma2 + epsilon); + const int numx = blockDim.x * blockDim.y; + const int thrx = threadIdx.x + threadIdx.y * blockDim.x; + if (gamma != NULL && beta != NULL) { + for (int i = thrx; i < n2; i+=numx) { + U curr = static_cast(lvals[i]); + ovals[i] = gamma[i] * static_cast(c_invvar * (curr - mu)) + beta[i]; + } + } else { + for (int i = thrx; i < n2; i+=numx) { + U curr = static_cast(lvals[i]); + ovals[i] = static_cast(c_invvar * (curr - mu)); + } + } + if (threadIdx.x == 0 && threadIdx.y == 0) { + mean[i1] = mu; + invvar[i1] = c_invvar; + } + __syncthreads(); + } +} + +template __device__ +void cuLoadWriteStridedInputs( + const int i1_block, + const int thr_load_row_off, + const int thr_load_col_off, + const int i2_off, + const int row_stride, + U* warp_buf1, + U* warp_buf2, + const T* input, + const V* dout, + const int i1_end, + const int n2, + const U* __restrict__ mean, + const U* __restrict__ invvar + ) +{ + int i1 = i1_block+thr_load_row_off; + if (i1 < i1_end) { + U curr_mean = mean[i1]; + U curr_invvar = invvar[i1]; + for (int k = 0; k < blockDim.y; ++k) { + int i2 = i2_off + k; + int load_idx = i1*n2+i2; + int write_idx = thr_load_row_off*row_stride+thr_load_col_off+k; + if (i2(input[load_idx]); + U curr_dout = static_cast(dout[load_idx]); + warp_buf1[write_idx] = curr_dout; + warp_buf2[write_idx] = curr_dout * (curr_input - curr_mean) * curr_invvar; + } else { + warp_buf1[write_idx] = U(0); + warp_buf2[write_idx] = U(0); + } + } + } else { + for (int k = 0; k < blockDim.y; ++k) { + int write_idx = thr_load_row_off*row_stride+thr_load_col_off+k; + warp_buf1[write_idx] = U(0); + warp_buf2[write_idx] = U(0); + } + } +} + +template __device__ +void cuLoadAddStridedInputs( + const int i1_block, + const int thr_load_row_off, + const int thr_load_col_off, + const int i2_off, + const int row_stride, + U* warp_buf1, + U* warp_buf2, + const T* input, + const V* dout, + const int i1_end, + const int n2, + const U* __restrict__ mean, + const U* __restrict__ invvar + ) +{ + int i1 = i1_block+thr_load_row_off; + if (i1 < i1_end) { + U curr_mean = mean[i1]; + U curr_invvar = invvar[i1]; + for (int k = 0; k < blockDim.y; ++k) { + int i2 = i2_off + k; + int load_idx = i1*n2+i2; + int write_idx = thr_load_row_off*row_stride+thr_load_col_off+k; + if (i2(input[load_idx]); + U curr_dout = 
static_cast(dout[load_idx]); + warp_buf1[write_idx] += curr_dout; + warp_buf2[write_idx] += curr_dout * (curr_input - curr_mean) * curr_invvar; + } + } + } +} + +template __global__ +void cuComputePartGradGammaBeta( + const V* __restrict__ dout, + const T* __restrict__ input, + const int n1, + const int n2, + const U* __restrict__ mean, + const U* __restrict__ invvar, + U epsilon, + U* part_grad_gamma, + U* part_grad_beta) +{ + const int numsegs_n1 = (n1+blockDim.y*blockDim.y-1) / (blockDim.y*blockDim.y); + const int segs_per_block = (numsegs_n1 + gridDim.y - 1) / gridDim.y; + const int i1_beg = blockIdx.y * segs_per_block * blockDim.y*blockDim.y; + const int i1_beg_plus_one = (blockIdx.y+1) * segs_per_block * blockDim.y*blockDim.y; + const int i1_end = i1_beg_plus_one < n1 ? i1_beg_plus_one : n1; + const int row_stride = blockDim.x+1; + const int thr_load_col_off = (threadIdx.x*blockDim.y)&(blockDim.x-1); + const int thr_load_row_off = (threadIdx.x*blockDim.y)/blockDim.x + threadIdx.y*blockDim.y; + const int i2_off = blockIdx.x * blockDim.x + thr_load_col_off; + SharedMemory shared; + U* buf = shared.getPointer(); // buf has at least blockDim.x * blockDim.y * blockDim.y + (blockDim.y - 1)*(blockDim.x/blockDim.y) elements + U* warp_buf1 = (U*)buf; + U* warp_buf2 = warp_buf1 + blockDim.y * blockDim.y * row_stride; + // compute partial sums from strided inputs + // do this to increase number of loads in flight + cuLoadWriteStridedInputs(i1_beg,thr_load_row_off,thr_load_col_off,i2_off,row_stride,warp_buf1,warp_buf2,input,dout,i1_end,n2,mean,invvar); + for (int i1_block = i1_beg+blockDim.y*blockDim.y; i1_block < i1_end; i1_block+=blockDim.y*blockDim.y) { + cuLoadAddStridedInputs(i1_block,thr_load_row_off,thr_load_col_off,i2_off,row_stride,warp_buf1,warp_buf2,input,dout,i1_end,n2,mean,invvar); + } + __syncthreads(); + // inter-warp reductions + // sum within each warp + U acc1 = U(0); + U acc2 = U(0); + for (int k = 0; k < blockDim.y; ++k) { + int row1 = threadIdx.y + k*blockDim.y; + int idx1 = row1*row_stride + threadIdx.x; + acc1 += warp_buf1[idx1]; + acc2 += warp_buf2[idx1]; + } + warp_buf1[threadIdx.y*row_stride+threadIdx.x] = acc1; + warp_buf2[threadIdx.y*row_stride+threadIdx.x] = acc2; + __syncthreads(); + // sum all warps + for (int offset = blockDim.y/2; offset > 1; offset /= 2) { + if (threadIdx.y < offset) { + int row1 = threadIdx.y; + int row2 = threadIdx.y + offset; + int idx1 = row1*row_stride + threadIdx.x; + int idx2 = row2*row_stride + threadIdx.x; + warp_buf1[idx1] += warp_buf1[idx2]; + warp_buf2[idx1] += warp_buf2[idx2]; + } + __syncthreads(); + } + int i2 = blockIdx.x * blockDim.x + threadIdx.x; + if (threadIdx.y == 0 && i2 < n2) { + int row1 = threadIdx.y; + int row2 = threadIdx.y + 1; + int idx1 = row1*row_stride + threadIdx.x; + int idx2 = row2*row_stride + threadIdx.x; + part_grad_beta[blockIdx.y*n2+i2] = warp_buf1[idx1] + warp_buf1[idx2]; + part_grad_gamma[blockIdx.y*n2+i2] = warp_buf2[idx1] + warp_buf2[idx2]; + } +} + +template __global__ +void cuComputeGradGammaBeta( + const U* part_grad_gamma, + const U* part_grad_beta, + const int part_size, + const int n1, + const int n2, + V* grad_gamma, + V* grad_beta) +{ + // sum partial gradients for gamma and beta + SharedMemory shared; + U* buf = shared.getPointer(); + int i2 = blockIdx.x * blockDim.x + threadIdx.x; + if (i2 < n2) { + // each warp does sequential reductions until reduced part_size is num_warps + int num_warp_reductions = part_size / blockDim.y; + U sum_gamma = U(0); + U sum_beta = U(0); + const U* 
part_grad_gamma_ptr = part_grad_gamma + threadIdx.y * num_warp_reductions * n2 + i2; + const U* part_grad_beta_ptr = part_grad_beta + threadIdx.y * num_warp_reductions * n2 + i2; + for (int warp_offset = 0; warp_offset < num_warp_reductions; ++warp_offset) { + sum_gamma += part_grad_gamma_ptr[warp_offset*n2]; + sum_beta += part_grad_beta_ptr[warp_offset*n2]; + } + // inter-warp reductions + const int nbsize3 = blockDim.x * blockDim.y / 2; + for (int offset = blockDim.y/2; offset >= 1; offset /= 2) { + // top half write to shared memory + if (threadIdx.y >= offset && threadIdx.y < 2*offset) { + const int write_idx = (threadIdx.y - offset) * blockDim.x + threadIdx.x; + buf[write_idx] = sum_gamma; + buf[write_idx+nbsize3] = sum_beta; + } + __syncthreads(); + // bottom half sums + if (threadIdx.y < offset) { + const int read_idx = threadIdx.y * blockDim.x + threadIdx.x; + sum_gamma += buf[read_idx]; + sum_beta += buf[read_idx+nbsize3]; + } + __syncthreads(); + } + // write out fully summed gradients + if (threadIdx.y == 0) { + grad_gamma[i2] = sum_gamma; + grad_beta[i2] = sum_beta; + } + } +} + +template __global__ +void cuComputeGradInput( + const V* __restrict__ dout, + const T* __restrict__ input, + const int n1, + const int n2, + const U* __restrict__ mean, + const U* __restrict__ invvar, + U epsilon, + const V* gamma, + T* grad_input) +{ +#ifndef __HIP_PLATFORM_HCC__ + for (auto i1=blockIdx.y; i1 < n1; i1 += gridDim.y) { +#else + for (int i1=blockIdx.y; i1 < n1; i1 += gridDim.y) { +#endif + U sum_loss1 = U(0); + U sum_loss2 = U(0); + const U c_mean = mean[i1]; + const U c_invvar = invvar[i1]; + const T* k_input = input + i1*n2; + const V* k_dout = dout + i1*n2; + const int numx = blockDim.x * blockDim.y; + const int thrx = threadIdx.x + threadIdx.y * blockDim.x; + if (gamma != NULL) { + int l = 4*thrx; + for (; l+3 < n2; l+=4*numx) { + for (int k = 0; k < 4; ++k) { + const U c_h = static_cast(k_input[l+k]); + const U c_loss = static_cast(k_dout[l+k]); + sum_loss1 += c_loss * gamma[l+k]; + sum_loss2 += c_loss * gamma[l+k] * (c_h - c_mean) * c_invvar; + } + } + for (; l < n2; ++l) { + const U c_h = static_cast(k_input[l]); + const U c_loss = static_cast(k_dout[l]); + sum_loss1 += c_loss * gamma[l]; + sum_loss2 += c_loss * gamma[l] * (c_h - c_mean) * c_invvar; + } + } else { + int l = 4*thrx; + for (; l+3 < n2; l+=4*numx) { + for (int k = 0; k < 4; ++k) { + const U c_h = static_cast(k_input[l+k]); + const U c_loss = static_cast(k_dout[l+k]); + sum_loss1 += c_loss; + sum_loss2 += c_loss * (c_h - c_mean) * c_invvar; + } + } + for (; l < n2; ++l) { + const U c_h = static_cast(k_input[l]); + const U c_loss = static_cast(k_dout[l]); + sum_loss1 += c_loss; + sum_loss2 += c_loss * (c_h - c_mean) * c_invvar; + } + } + // intra-warp reductions + for (int mask = blockDim.x/2; mask > 0; mask /= 2) { + sum_loss1 += WARP_SHFL_XOR(sum_loss1, mask); + sum_loss2 += WARP_SHFL_XOR(sum_loss2, mask); + } + // inter-warp reductions + if (blockDim.y > 1) { + SharedMemory shared; + U* buf = shared.getPointer(); + for (int offset = blockDim.y/2; offset > 0; offset /= 2) { + // upper half of warps write to shared + if (threadIdx.y >= offset && threadIdx.y < 2*offset) { + const int wrt_i = (threadIdx.y - offset) * blockDim.x + threadIdx.x; + buf[2*wrt_i] = sum_loss1; + buf[2*wrt_i+1] = sum_loss2; + } + __syncthreads(); + // lower half merges + if (threadIdx.y < offset) { + const int read_i = threadIdx.y * blockDim.x + threadIdx.x; + sum_loss1 += buf[2*read_i]; + sum_loss2 += buf[2*read_i+1]; + } + __syncthreads(); + 
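// For reference only: a standalone CPU sketch of the gradients this backward
// pass produces. dgamma and dbeta are the column-wise sums computed above by
// cuComputePartGradGammaBeta / cuComputeGradGammaBeta; dx follows the same
// algebra as f_grad_input below, i.e. (N*dy*gamma - sum1 - xhat*sum2) * invvar / N.
// Names are illustrative and shapes follow the (n1, n2) flattening used in this file.
#include <vector>

inline void layer_norm_backward_ref(const std::vector<float>& dy,      // n1 * n2
                                    const std::vector<float>& x,       // n1 * n2
                                    const std::vector<float>& mean,    // n1
                                    const std::vector<float>& invvar,  // n1
                                    const std::vector<float>& gamma,   // n2
                                    int n1, int n2,
                                    std::vector<float>& dx,
                                    std::vector<float>& dgamma,
                                    std::vector<float>& dbeta) {
    dx.assign((size_t)n1 * n2, 0.f);
    dgamma.assign(n2, 0.f);
    dbeta.assign(n2, 0.f);
    for (int i = 0; i < n1; ++i) {
        float mu = mean[i], rstd = invvar[i];
        float sum1 = 0.f, sum2 = 0.f;   // sum(dy*gamma) and sum(dy*gamma*xhat)
        for (int j = 0; j < n2; ++j) {
            float xhat = (x[i * n2 + j] - mu) * rstd;
            float g = dy[i * n2 + j] * gamma[j];
            sum1 += g;
            sum2 += g * xhat;
            dgamma[j] += dy[i * n2 + j] * xhat;   // reduced over rows
            dbeta[j]  += dy[i * n2 + j];
        }
        for (int j = 0; j < n2; ++j) {
            float xhat = (x[i * n2 + j] - mu) * rstd;
            float g = dy[i * n2 + j] * gamma[j];
            dx[i * n2 + j] = (n2 * g - sum1 - xhat * sum2) * rstd / n2;
        }
    }
}
// The kernels above split the dgamma/dbeta row reduction into per-block partial
// sums (cuComputePartGradGammaBeta) followed by a second reduction
// (cuComputeGradGammaBeta), so that many blocks can work on disjoint row ranges.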
} + if (threadIdx.y == 0) { + buf[2*threadIdx.x] = sum_loss1; + buf[2*threadIdx.x+1] = sum_loss2; + } + __syncthreads(); + if (threadIdx.y !=0) { + sum_loss1 = buf[2*threadIdx.x]; + sum_loss2 = buf[2*threadIdx.x+1]; + } + } + // all threads now have the two sums over l + U fH = (U)n2; + U term1 = (U(1) / fH) * c_invvar; + T* k_grad_input = grad_input + i1*n2; + if (gamma != NULL) { + for (int l = thrx; l < n2; l+=numx) { + const U c_h = static_cast(k_input[l]); + const U c_loss = static_cast(k_dout[l]); + U f_grad_input = fH * c_loss * gamma[l]; + f_grad_input -= sum_loss1; + f_grad_input -= (c_h - c_mean) * c_invvar * sum_loss2; + f_grad_input *= term1; + k_grad_input[l] = static_cast(f_grad_input); + } + } else { + for (int l = thrx; l < n2; l+=numx) { + const U c_h = static_cast(k_input[l]); + const U c_loss = static_cast(k_dout[l]); + U f_grad_input = fH * c_loss; + f_grad_input -= sum_loss1; + f_grad_input -= (c_h - c_mean) * c_invvar * sum_loss2; + f_grad_input *= term1; + k_grad_input[l] = static_cast(f_grad_input); + } + } + // prevent race where buf is written again before reads are done + __syncthreads(); + } +} + + + + +template +void HostApplyLayerNorm( + V* output, + U* mean, + U* invvar, + const T* input, + int n1, + int n2, + double epsilon, + const V* gamma, + const V* beta + ) +{ + auto stream = at::cuda::getCurrentCUDAStream().stream(); + const int warp_size = at::cuda::warp_size(); + dim3 threads(warp_size,4,1); +#ifndef __HIP_PLATFORM_HCC__ + threads.y = 1; +#endif + const uint64_t maxGridY = + at::cuda::getCurrentDeviceProperties()->maxGridSize[1]; + const dim3 blocks(1, std::min((uint64_t)n1, maxGridY), 1); + int nshared = + threads.y > 1 ? + threads.y*sizeof(U)+(threads.y/2)*sizeof(U) : + 0; + cuApplyLayerNorm<<>>( + output, + mean, + invvar, + input, + n1,n2, + U(epsilon), + gamma, + beta, + warp_size); +} + + +void cuda_layer_norm( + at::Tensor* output, + at::Tensor* mean, + at::Tensor* invvar, + at::Tensor* input, + int n1, + int n2, + #ifdef VERSION_GE_1_1 + at::IntArrayRef normalized_shape, + #else + at::IntList normalized_shape, + #endif + at::Tensor* gamma, + at::Tensor* beta, + double epsilon) +{ + using namespace at; + DISPATCH_FLOAT_HALF_AND_BFLOAT_INOUT_TYPES( + input->scalar_type(), output->scalar_type(), "cuda_layer_norm_kernel", + HostApplyLayerNorm( + output->DATA_PTR(), + mean->DATA_PTR(), + invvar->DATA_PTR(), + input->DATA_PTR(), + n1,n2, + epsilon, + gamma != NULL ? gamma->DATA_PTR() : NULL, + beta != NULL ? beta->DATA_PTR() : NULL); + ) +} + + +template +void HostLayerNormGradient( + const V* dout, + const U* mean, + const U* invvar, + at::Tensor* input, + int n1, + int n2, + const V* gamma, + const V* beta, + double epsilon, + T* grad_input, + V* grad_gamma, + V* grad_beta + ) +{ + auto stream = at::cuda::getCurrentCUDAStream().stream(); + const int warp_size = at::cuda::warp_size(); + + if (gamma != NULL && beta != NULL) { + // compute grad_gamma(j) and grad_beta(j) +#ifndef __HIP_PLATFORM_HCC__ + const int part_size = warp_size; +#else + const int part_size = 16; +#endif + const dim3 threads2(warp_size,4,1); + const dim3 blocks2((n2+threads2.x-1)/threads2.x,part_size,1); + const int nshared2_a = 2 * sizeof(U) * threads2.y * threads2.y * + (threads2.x + 1); + const int nshared2_b = threads2.x * threads2.y * sizeof(U); + const int nshared2 = nshared2_a > nshared2_b ? 
nshared2_a : nshared2_b; + at::Tensor part_grad_gamma = at::empty( + {part_size,n2}, input->options().dtype(at::ScalarType::Float)); + at::Tensor part_grad_beta = at::empty_like(part_grad_gamma); + cuComputePartGradGammaBeta<<>>( + dout, + input->DATA_PTR(), + n1,n2, + mean, + invvar, + U(epsilon), + part_grad_gamma.DATA_PTR(), + part_grad_beta.DATA_PTR()); + + const dim3 threads3(warp_size,8,1); + const dim3 blocks3((n2+threads2.x-1)/threads2.x,1,1); + const int nshared3 = threads3.x * threads3.y * sizeof(U); + cuComputeGradGammaBeta<<>>( + part_grad_gamma.DATA_PTR(), + part_grad_beta.DATA_PTR(), + part_size, + n1,n2, + grad_gamma, + grad_beta); + } + + // compute grad_input + const uint64_t maxGridY = + at::cuda::getCurrentDeviceProperties()->maxGridSize[1]; + const dim3 blocks1(1, std::min((uint64_t)n1, maxGridY), 1); + dim3 threads1(warp_size,4,1); +#ifndef __HIP_PLATFORM_HCC__ + threads1.y = 2; +#endif + int nshared = + threads1.y > 1 ? + threads1.y*threads1.x*sizeof(U) : + 0; + cuComputeGradInput<<>>( + dout, + input->DATA_PTR(), + n1,n2, + mean, + invvar, + U(epsilon), + gamma, + grad_input); +} + + +void cuda_layer_norm_gradient( + at::Tensor* dout, + at::Tensor* mean, + at::Tensor* invvar, + at::Tensor* input, + int n1, + int n2, + #ifdef VERSION_GE_1_1 + at::IntArrayRef normalized_shape, + #else + at::IntList normalized_shape, + #endif + at::Tensor* gamma, + at::Tensor* beta, + double epsilon, + at::Tensor* grad_input, + at::Tensor* grad_gamma, + at::Tensor* grad_beta) +{ + using namespace at; + DISPATCH_FLOAT_HALF_AND_BFLOAT_INOUT_TYPES( + input->scalar_type(), gamma->scalar_type(), + "cuda_layer_norm_gradient_kernel", + HostLayerNormGradient( + dout->DATA_PTR(), + mean->DATA_PTR(), + invvar->DATA_PTR(), + input, + n1,n2, + // TMJ pass NULL argument for gamma, beta, grad_gamma and grad_beta + // if gamma Tensor is NULL on input. + gamma != NULL ? gamma->DATA_PTR() : NULL, + gamma != NULL ? beta->DATA_PTR() : NULL, + epsilon, + grad_input->DATA_PTR(), + gamma != NULL ? grad_gamma->DATA_PTR() : NULL, + gamma != NULL ? grad_beta->DATA_PTR() : NULL); + ) +} diff --git a/megatron/fused_kernels/rocm/scaled_masked_softmax.cpp b/megatron/fused_kernels/rocm/scaled_masked_softmax.cpp new file mode 100644 index 0000000000..d5334710cf --- /dev/null +++ b/megatron/fused_kernels/rocm/scaled_masked_softmax.cpp @@ -0,0 +1,77 @@ +/* coding=utf-8 + * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include + +namespace multihead_attn { +namespace fused_softmax { +namespace scaled_masked_softmax { + +torch::Tensor fwd_cuda( + torch::Tensor const& input, + torch::Tensor const& mask, + float scale_factor); + +torch::Tensor bwd_cuda( + torch::Tensor const& output_grads, + torch::Tensor const& softmax_results, + float scale_factor); + +torch::Tensor fwd( + torch::Tensor const& input, + torch::Tensor const& mask, + float scale_factor) { + AT_ASSERTM(input.dim() == 4, "expected 4D tensor"); + AT_ASSERTM((input.scalar_type() == at::ScalarType::Half) || + (input.scalar_type() == at::ScalarType::BFloat16), + "Only fp16 and bf16 are supported"); + AT_ASSERTM(mask.dim() == 4, "expected 4D tensor"); + + return fwd_cuda(input, mask, scale_factor); +} + +torch::Tensor bwd( + torch::Tensor const& output_grads, + torch::Tensor const& softmax_results, + float scale_factor) { + + AT_ASSERTM(output_grads.dim() == 4, "expected 3D tensor"); + AT_ASSERTM(softmax_results.dim() == 4, "expected 3D tensor"); + + AT_ASSERTM((output_grads.scalar_type() == at::ScalarType::Half) || + (output_grads.scalar_type() == at::ScalarType::BFloat16), + "Only fp16 and bf16 are supported"); + AT_ASSERTM((softmax_results.scalar_type() == at::ScalarType::Half) || + (softmax_results.scalar_type() == at::ScalarType::BFloat16), + "Only fp16 and bf16 are supported"); + + return bwd_cuda(output_grads, softmax_results, scale_factor); +} + +} // end namespace scaled_masked_softmax +} // end namespace fused_softmax +} // end namespace multihead_attn + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + m.def("forward", + &multihead_attn::fused_softmax::scaled_masked_softmax::fwd, + "Self Multihead Attention scaled, time masked softmax -- Forward."); + m.def("backward", + &multihead_attn::fused_softmax::scaled_masked_softmax::bwd, + "Self Multihead Attention scaled, time masked softmax -- Backward."); +} diff --git a/megatron/fused_kernels/rocm/scaled_masked_softmax.h b/megatron/fused_kernels/rocm/scaled_masked_softmax.h new file mode 100644 index 0000000000..78e97e4ec6 --- /dev/null +++ b/megatron/fused_kernels/rocm/scaled_masked_softmax.h @@ -0,0 +1,492 @@ +/* coding=utf-8 + * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +namespace { + +template +__device__ __inline__ void copy_vector(Datatype *dst, const Datatype *src); + +template <> +__device__ __inline__ void copy_vector(c10::BFloat16 *dst, const c10::BFloat16 *src) { *dst = *src; } + +template <> +__device__ __inline__ void copy_vector(c10::BFloat16 *dst, const c10::BFloat16 *src) { *((float2*) dst) = *((float2*) src); } + +template <> +__device__ __inline__ void copy_vector(c10::Half *dst, const c10::Half *src) { *dst = *src; } + +template <> +__device__ __inline__ void copy_vector(c10::Half *dst, const c10::Half *src) { *((float2*) dst) = *((float2*) src); } + +template <> +__device__ __inline__ void copy_vector(uint8_t *dst, const uint8_t *src) { *dst = *src; } + +template <> +__device__ __inline__ void copy_vector(uint8_t *dst, const uint8_t *src) {*((half2*) dst) = *((half2*) src); } + +int log2_ceil(int value) { + int log2_value = 0; + while ((1 << log2_value) < value) ++log2_value; + return log2_value; +} + +template +struct Add { + __device__ __forceinline__ T operator()(T a, T b) const { + return a + b; + } +}; + +template +struct Max { + __device__ __forceinline__ T operator()(T a, T b) const { + return a < b ? b : a; + } +}; + +template +__device__ __forceinline__ T WARP_SHFL_XOR_NATIVE(T value, int laneMask, int width = warpSize, unsigned int mask = 0xffffffff) +{ +#if CUDA_VERSION >= 9000 + return __shfl_xor_sync(mask, value, laneMask, width); +#else + return __shfl_xor(value, laneMask, width); +#endif +} + +template class ReduceOp> +__device__ __forceinline__ void warp_reduce(acc_t* sum) { + ReduceOp r; + #pragma unroll + for (int offset = WARP_SIZE / 2; offset > 0; offset /= 2) { + #pragma unroll + for (int i = 0; i < WARP_BATCH; ++i) { + acc_t b = WARP_SHFL_XOR_NATIVE(sum[i], offset, WARP_SIZE); + sum[i] = r(sum[i], b); + } + } +} + +/* + * Extended softmax (from native aten pytorch) with following additional features + * 1) input scaling + * 2) Explicit masking + */ +template +__global__ void scaled_masked_softmax_warp_forward( + output_t *dst, + const input_t *src, + const uint8_t *mask, + const acc_t scale, + int micro_batch_size, + int element_count, + int pad_batches) +{ + // WARP_SIZE and WARP_BATCH must match the return values batches_per_warp and + // warp_size of method warp_softmax_forward_kernel. + constexpr int next_power_of_two = 1 << log2_elements; + constexpr int WARP_SIZE = (next_power_of_two < C10_WARP_SIZE) ? next_power_of_two : C10_WARP_SIZE; + constexpr int WARP_ITERATIONS = next_power_of_two / WARP_SIZE; + constexpr int WARP_BATCH = (next_power_of_two <= 128) ? 2 : 1; + constexpr int ELEMENTS_PER_LDG_STG = 4; + + // blockDim/threadIdx = (WARP_SIZE, WARPS_PER_BLOCK, ) + // gridDim/blockIdx = (seq_len, attn_heads, batches) + int first_batch = (blockDim.y * (blockIdx.x + gridDim.x * (blockIdx.y + gridDim.y * blockIdx.z))+ threadIdx.y) * WARP_BATCH; + int pad_first_batch = 0; + if (pad_batches != 1) { // bert style + pad_first_batch = (blockDim.y * (blockIdx.x + gridDim.x * blockIdx.z) + threadIdx.y) * WARP_BATCH; + } else { // gpt2 style + pad_first_batch = (blockDim.y * blockIdx.x + threadIdx.y) * WARP_BATCH; + } + + // micro_batch_size might not be a multiple of WARP_BATCH. Check how + // many batches have to computed within this WARP. + int local_batches = micro_batch_size - first_batch; + if (local_batches > WARP_BATCH) + local_batches = WARP_BATCH; + + // there might be multiple batches per warp. 
compute the index within the batch + int local_idx = threadIdx.x; + + src += first_batch * element_count + ELEMENTS_PER_LDG_STG * local_idx; + dst += first_batch * element_count + ELEMENTS_PER_LDG_STG * local_idx; + mask += pad_first_batch * element_count + ELEMENTS_PER_LDG_STG * local_idx; + + // load data from global memory + acc_t elements[WARP_BATCH][WARP_ITERATIONS]; + input_t temp_data[ELEMENTS_PER_LDG_STG]; + uint8_t temp_mask[ELEMENTS_PER_LDG_STG]; + #pragma unroll + for (int i = 0; i < WARP_BATCH; ++i) { + int batch_element_count = (i >= local_batches) ? 0 : element_count; + + #pragma unroll + for (int it = 0; it < WARP_ITERATIONS; it+=ELEMENTS_PER_LDG_STG) { + int element_index = ELEMENTS_PER_LDG_STG * local_idx + it * WARP_SIZE; + + if (element_index < batch_element_count) { + int itr_idx = i*element_count+it*WARP_SIZE; + copy_vector(temp_data, src + itr_idx); + copy_vector(temp_mask, mask + itr_idx); + + #pragma unroll + for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) { + if (temp_mask[element] != 1) { + elements[i][it + element] = (acc_t)temp_data[element] * scale; + } else { + elements[i][it + element] = -10000.0; + } + } + } else { + #pragma unroll + for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) { + elements[i][it + element] = -std::numeric_limits::infinity(); + } + } + } + } + + // compute max_value + acc_t max_value[WARP_BATCH]; + #pragma unroll + for (int i = 0; i < WARP_BATCH; ++i) { + max_value[i] = elements[i][0]; + #pragma unroll + for (int it = 1; it < WARP_ITERATIONS; ++it) { + max_value[i] = (max_value[i] > elements[i][it]) ? max_value[i] : elements[i][it]; + } + } + warp_reduce(max_value); + + acc_t sum[WARP_BATCH] { 0.0f }; + #pragma unroll + for (int i = 0; i < WARP_BATCH; ++i) { + #pragma unroll + for (int it = 0; it < WARP_ITERATIONS; ++it) { + elements[i][it] = std::exp((elements[i][it] - max_value[i])); + sum[i] += elements[i][it]; + } + } + warp_reduce(sum); + + // store result + output_t out[ELEMENTS_PER_LDG_STG]; + #pragma unroll + for (int i = 0; i < WARP_BATCH; ++i) { + if (i >= local_batches) + break; + #pragma unroll + for (int it = 0; it < WARP_ITERATIONS; it+=ELEMENTS_PER_LDG_STG) { + int element_index = ELEMENTS_PER_LDG_STG * local_idx + it * WARP_SIZE; + if (element_index < element_count) { + #pragma unroll + for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) { + out[element] = elements[i][it + element] / sum[i]; + } + copy_vector(dst + i * element_count + it * WARP_SIZE, out); + } else { + break; + } + } + } +} + +template +__global__ void scaled_masked_softmax_warp_backward( + output_t *gradInput, + input_t *grad, + const input_t *output, + acc_t scale, + int micro_batch_size, + int element_count) +{ + // WARP_SIZE and WARP_BATCH must match the return values batches_per_warp and + // warp_size of method warp_softmax_backward_kernel. + constexpr int next_power_of_two = 1 << log2_elements; + constexpr int WARP_SIZE = (next_power_of_two < C10_WARP_SIZE) ? next_power_of_two : C10_WARP_SIZE; + constexpr int WARP_ITERATIONS = next_power_of_two / WARP_SIZE; + constexpr int WARP_BATCH = (next_power_of_two <= 128) ? 2 : 1; + constexpr int ELEMENTS_PER_LDG_STG = 4; + + // blockDim/threadIdx = (WARP_SIZE, WARPS_PER_BLOCK, ) + // gridDim/blockIdx = (seq_len, attn_heads, batches) + int first_batch = (blockDim.y * blockIdx.x + threadIdx.y) * WARP_BATCH; + + // micro_batch_size might not be a multiple of WARP_BATCH. Check how + // many batches have to computed within this WARP. 
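// For reference only: a standalone CPU sketch of one row (key_seq_len elements)
// of the scaled, masked softmax pair implemented by the forward kernel above
// and this backward kernel. Masked positions (mask == 1) are filled with -10000
// before a numerically stable softmax; the backward computes
// dx = scale * (dy * y - y * sum(dy * y)). Names are illustrative.
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <limits>
#include <vector>

inline std::vector<float> masked_softmax_fwd_ref(const std::vector<float>& x,
                                                 const std::vector<uint8_t>& mask,
                                                 float scale) {
    std::vector<float> y(x.size());
    float mx = -std::numeric_limits<float>::infinity();
    for (size_t j = 0; j < x.size(); ++j) {
        y[j] = mask[j] ? -10000.0f : x[j] * scale;   // mask==1 hides the position
        mx = std::max(mx, y[j]);
    }
    float sum = 0.f;
    for (float& v : y) { v = std::exp(v - mx); sum += v; }
    for (float& v : y) v /= sum;
    return y;
}

inline std::vector<float> masked_softmax_bwd_ref(const std::vector<float>& dy,
                                                 const std::vector<float>& y,
                                                 float scale) {
    float dot = 0.f;                                  // sum(dy * y), as in sum[] here
    for (size_t j = 0; j < y.size(); ++j) dot += dy[j] * y[j];
    std::vector<float> dx(y.size());
    for (size_t j = 0; j < y.size(); ++j)
        dx[j] = scale * (dy[j] * y[j] - y[j] * dot);
    return dx;
}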
+ int local_batches = micro_batch_size - first_batch; + if (local_batches > WARP_BATCH) + local_batches = WARP_BATCH; + + // there might be multiple batches per warp. compute the index within the batch + int local_idx = threadIdx.x; + + // the first element to process by the current thread + int thread_offset = first_batch * element_count + ELEMENTS_PER_LDG_STG * local_idx; + grad += thread_offset; + output += thread_offset; + gradInput += thread_offset; + + // load data from global memory + acc_t grad_reg[WARP_BATCH][WARP_ITERATIONS] { 0.0f }; + acc_t output_reg[WARP_BATCH][WARP_ITERATIONS] { 0.0f }; + input_t temp_grad[ELEMENTS_PER_LDG_STG]; + input_t temp_output[ELEMENTS_PER_LDG_STG]; + #pragma unroll + for (int i = 0; i < WARP_BATCH; ++i) { + int batch_element_count = (i >= local_batches) ? 0 : element_count; + + #pragma unroll + for (int it = 0; it < WARP_ITERATIONS; it+=ELEMENTS_PER_LDG_STG) { + int element_index = ELEMENTS_PER_LDG_STG * local_idx + it * WARP_SIZE; + if (element_index < batch_element_count) { + copy_vector(temp_grad, grad + i * element_count + it * WARP_SIZE); + copy_vector(temp_output, output + i * element_count + it * WARP_SIZE); + + #pragma unroll + for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) { + output_reg[i][it + element] = (acc_t)temp_output[element]; + } + #pragma unroll + for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) { + grad_reg[i][it + element] = (acc_t)temp_grad[element] * output_reg[i][it + element]; + } + } + } + } + + acc_t sum[WARP_BATCH]; + #pragma unroll + for (int i = 0; i < WARP_BATCH; ++i) { + sum[i] = grad_reg[i][0]; + #pragma unroll + for (int it = 1; it < WARP_ITERATIONS; ++it) { + sum[i] += grad_reg[i][it]; + } + } + warp_reduce(sum); + + // store result + #pragma unroll + for (int i = 0; i < WARP_BATCH; ++i) { + if (i >= local_batches) + break; + #pragma unroll + for (int it = 0; it < WARP_ITERATIONS; it+=ELEMENTS_PER_LDG_STG) { + int element_index = ELEMENTS_PER_LDG_STG * local_idx + it * WARP_SIZE; + if (element_index < element_count) { + // compute gradients + output_t out[ELEMENTS_PER_LDG_STG]; + #pragma unroll + for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) { + out[element] = (output_t)(scale * (grad_reg[i][it + element] - output_reg[i][it + element] * sum[i])); + } + copy_vector(gradInput + i * element_count + it * WARP_SIZE, out); + } + } + } +} + +} // end of anonymous namespace + +template +void dispatch_scaled_masked_softmax_forward( + output_t *dst, + const input_t *src, + const uint8_t *mask, + const input_t scale, + int query_seq_len, + int key_seq_len, + int batches, + int attn_heads, + int pad_batches) +{ + TORCH_INTERNAL_ASSERT(key_seq_len >= 0 && key_seq_len <= 2048 ); + if (key_seq_len == 0) { + return; + } else { + int log2_elements = log2_ceil(key_seq_len); + const int next_power_of_two = 1 << log2_elements; + int batch_count = batches * attn_heads * query_seq_len; + + // This value must match the WARP_SIZE constexpr value computed inside softmax_warp_forward. + int warp_size = (next_power_of_two < C10_WARP_SIZE) ? next_power_of_two : C10_WARP_SIZE; + + // This value must match the WARP_BATCH constexpr value computed inside softmax_warp_forward. + int batches_per_warp = (next_power_of_two <= 128) ? 
2 : 1; + + // use 128 threads per block to maximimize gpu utilization + constexpr int threads_per_block = 128; + + int warps_per_block = (threads_per_block / warp_size); + int batches_per_block = warps_per_block * batches_per_warp; + TORCH_INTERNAL_ASSERT(query_seq_len%batches_per_block == 0); + dim3 blocks(query_seq_len/batches_per_block, attn_heads, batches); + dim3 threads(warp_size, warps_per_block, 1); + // Launch code would be more elegant if C++ supported FOR CONSTEXPR + switch (log2_elements) { + case 0: // 1 + scaled_masked_softmax_warp_forward + <<>>(dst, src, mask, scale, batch_count, key_seq_len, pad_batches); + break; + case 1: // 2 + scaled_masked_softmax_warp_forward + <<>>(dst, src, mask, scale, batch_count, key_seq_len, pad_batches); + break; + case 2: // 4 + scaled_masked_softmax_warp_forward + <<>>(dst, src, mask, scale, batch_count, key_seq_len, pad_batches); + break; + case 3: // 8 + scaled_masked_softmax_warp_forward + <<>>(dst, src, mask, scale, batch_count, key_seq_len, pad_batches); + break; + case 4: // 16 + scaled_masked_softmax_warp_forward + <<>>(dst, src, mask, scale, batch_count, key_seq_len, pad_batches); + break; + case 5: // 32 + scaled_masked_softmax_warp_forward + <<>>(dst, src, mask, scale, batch_count, key_seq_len, pad_batches); + break; + case 6: // 64 + scaled_masked_softmax_warp_forward + <<>>(dst, src, mask, scale, batch_count, key_seq_len, pad_batches); + break; + case 7: // 128 + scaled_masked_softmax_warp_forward + <<>>(dst, src, mask, scale, batch_count, key_seq_len, pad_batches); + break; + case 8: // 256 + scaled_masked_softmax_warp_forward + <<>>(dst, src, mask, scale, batch_count, key_seq_len, pad_batches); + break; + case 9: // 512 + scaled_masked_softmax_warp_forward + <<>>(dst, src, mask, scale, batch_count, key_seq_len, pad_batches); + break; + case 10: // 1024 + scaled_masked_softmax_warp_forward + <<>>(dst, src, mask, scale, batch_count, key_seq_len, pad_batches); + break; + case 11: // 2048 + scaled_masked_softmax_warp_forward + <<>>(dst, src, mask, scale, batch_count, key_seq_len, pad_batches); + break; + default: + break; + } + } +} + +template +void dispatch_scaled_masked_softmax_backward( + output_t *grad_input, + input_t *grad, + const input_t *output, + const acc_t scale, + int query_seq_len, + int key_seq_len, + int batches, + int attn_heads) +{ + TORCH_INTERNAL_ASSERT( key_seq_len >= 0 && key_seq_len <= 2048 ); + if (key_seq_len == 0) { + return; + } else { + int log2_elements = log2_ceil(key_seq_len); + const int next_power_of_two = 1 << log2_elements; + int batch_count = batches * attn_heads * query_seq_len; + + // This value must match the WARP_SIZE constexpr value computed inside softmax_warp_backward. + int warp_size = (next_power_of_two < C10_WARP_SIZE) ? next_power_of_two : C10_WARP_SIZE; + + // This value must match the WARP_BATCH constexpr value computed inside softmax_warp_backward. + int batches_per_warp = (next_power_of_two <= 128) ? 
2 : 1; + + // use 128 threads per block to maximimize gpu utilization + constexpr int threads_per_block = 128; + + int warps_per_block = (threads_per_block / warp_size); + int batches_per_block = warps_per_block * batches_per_warp; + int blocks = batch_count/batches_per_block; + dim3 threads(warp_size, warps_per_block, 1); + // Launch code would be more elegant if C++ supported FOR CONSTEXPR + switch (log2_elements) { + case 0: // 1 + scaled_masked_softmax_warp_backward + <<>>(grad_input, grad, output, scale, batch_count, key_seq_len); + break; + case 1: // 2 + scaled_masked_softmax_warp_backward + <<>>(grad_input, grad, output, scale, batch_count, key_seq_len); + break; + case 2: // 4 + scaled_masked_softmax_warp_backward + <<>>(grad_input, grad, output, scale, batch_count, key_seq_len); + break; + case 3: // 8 + scaled_masked_softmax_warp_backward + <<>>(grad_input, grad, output, scale, batch_count, key_seq_len); + break; + case 4: // 16 + scaled_masked_softmax_warp_backward + <<>>(grad_input, grad, output, scale, batch_count, key_seq_len); + break; + case 5: // 32 + scaled_masked_softmax_warp_backward + <<>>(grad_input, grad, output, scale, batch_count, key_seq_len); + break; + case 6: // 64 + scaled_masked_softmax_warp_backward + <<>>(grad_input, grad, output, scale, batch_count, key_seq_len); + break; + case 7: // 128 + scaled_masked_softmax_warp_backward + <<>>(grad_input, grad, output, scale, batch_count, key_seq_len); + break; + case 8: // 256 + scaled_masked_softmax_warp_backward + <<>>(grad_input, grad, output, scale, batch_count, key_seq_len); + break; + case 9: // 512 + scaled_masked_softmax_warp_backward + <<>>(grad_input, grad, output, scale, batch_count, key_seq_len); + break; + case 10: // 1024 + scaled_masked_softmax_warp_backward + <<>>(grad_input, grad, output, scale, batch_count, key_seq_len); + break; + case 11: // 2048 + scaled_masked_softmax_warp_backward + <<>>(grad_input, grad, output, scale, batch_count, key_seq_len); + break; + default: + break; + } + } +} diff --git a/megatron/fused_kernels/rocm/scaled_masked_softmax_cuda.cu b/megatron/fused_kernels/rocm/scaled_masked_softmax_cuda.cu new file mode 100644 index 0000000000..c034dc3ad7 --- /dev/null +++ b/megatron/fused_kernels/rocm/scaled_masked_softmax_cuda.cu @@ -0,0 +1,114 @@ +/* coding=utf-8 + * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#ifndef __HIP_PLATFORM_HCC__ +#include +#endif +#include +#include +#include "scaled_masked_softmax.h" +#include "type_shim.h" + +namespace multihead_attn { +namespace fused_softmax { +namespace scaled_masked_softmax { + +torch::Tensor fwd_cuda( + torch::Tensor const& input, + torch::Tensor const& mask, + float scale_factor) +{ + // input is a 4d tensor with dimensions [batches, attn_heads, seq_len, seq_len] + const int batches = input.size(0); + const int pad_batches = mask.size(0); + const int attn_heads = input.size(1); + const int query_seq_len = input.size(2); + const int key_seq_len = input.size(3); + TORCH_INTERNAL_ASSERT(key_seq_len <= 2048); + TORCH_INTERNAL_ASSERT(query_seq_len > 1); + TORCH_INTERNAL_ASSERT(pad_batches == 1 || pad_batches == batches); + TORCH_INTERNAL_ASSERT(mask.size(1) == 1); + TORCH_INTERNAL_ASSERT(mask.size(2) == query_seq_len); + TORCH_INTERNAL_ASSERT(mask.size(3) == key_seq_len); + + // Output + auto act_options = input.options().requires_grad(false); + torch::Tensor softmax_results = + torch::empty({batches, attn_heads, query_seq_len, key_seq_len}, act_options); + + // Softmax Intermediate Result Ptr + void* input_ptr = static_cast(input.data_ptr()); + void* mask_ptr = static_cast(mask.data_ptr()); + void* softmax_results_ptr = static_cast(softmax_results.data_ptr()); + + DISPATCH_HALF_AND_BFLOAT( + input.scalar_type(), + "dispatch_scaled_masked_softmax_forward", + dispatch_scaled_masked_softmax_forward( + reinterpret_cast(softmax_results_ptr), + reinterpret_cast(input_ptr), + reinterpret_cast(mask_ptr), + scale_factor, + query_seq_len, + key_seq_len, + batches, + attn_heads, + pad_batches); + ); + return softmax_results; +} + +torch::Tensor bwd_cuda( + torch::Tensor const& output_grads_, + torch::Tensor const& softmax_results_, + float scale_factor) { + + auto output_grads = output_grads_.contiguous(); + auto softmax_results = softmax_results_.contiguous(); + + //output grads is a 4d tensor with dimensions [batches, attn_heads, seq_len, seq_len] + const int batches = output_grads.size(0); + const int attn_heads = output_grads.size(1); + const int query_seq_len = output_grads.size(2); + const int key_seq_len = output_grads.size(3); + + void* output_grads_ptr = static_cast(output_grads.data_ptr()); + + //Softmax Grad + DISPATCH_HALF_AND_BFLOAT( + output_grads_.scalar_type(), + "dispatch_scaled_masked_softmax_backward", + dispatch_scaled_masked_softmax_backward( + reinterpret_cast(output_grads_ptr), + reinterpret_cast(output_grads_ptr), + reinterpret_cast(softmax_results.data_ptr()), + scale_factor, + query_seq_len, + key_seq_len, + batches, + attn_heads); + ); + + //backward pass is completely in-place + return output_grads; +} +} +} +} diff --git a/megatron/fused_kernels/rocm/scaled_upper_triang_masked_softmax.cpp b/megatron/fused_kernels/rocm/scaled_upper_triang_masked_softmax.cpp new file mode 100644 index 0000000000..ea283588db --- /dev/null +++ b/megatron/fused_kernels/rocm/scaled_upper_triang_masked_softmax.cpp @@ -0,0 +1,72 @@ +/* coding=utf-8 + * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +namespace multihead_attn { +namespace fused_softmax { +namespace scaled_upper_triang_masked_softmax { + +torch::Tensor fwd_cuda( + torch::Tensor const& input, + float scale_factor); + +torch::Tensor bwd_cuda( + torch::Tensor const& output_grads, + torch::Tensor const& softmax_results, + float scale_factor); + +torch::Tensor fwd(torch::Tensor const& input, float scale_factor) { + AT_ASSERTM(input.dim() == 3, "expected 3D tensor"); + AT_ASSERTM((input.scalar_type() == at::ScalarType::Half) || + (input.scalar_type() == at::ScalarType::BFloat16), + "Only fp16 and bf16 are supported"); + + return fwd_cuda(input, scale_factor); +} + +torch::Tensor bwd( + torch::Tensor const& output_grads, + torch::Tensor const& softmax_results, + float scale_factor) { + + AT_ASSERTM(output_grads.dim() == 3, "expected 3D tensor"); + AT_ASSERTM(softmax_results.dim() == 3, "expected 3D tensor"); + + AT_ASSERTM((output_grads.scalar_type() == at::ScalarType::Half) || + (output_grads.scalar_type() == at::ScalarType::BFloat16), + "Only fp16 and bf16 are supported"); + AT_ASSERTM((softmax_results.scalar_type() == at::ScalarType::Half) || + (softmax_results.scalar_type() == at::ScalarType::BFloat16), + "Only fp16 and bf16 are supported"); + + return bwd_cuda(output_grads, softmax_results, scale_factor); +} + +} // end namespace scaled_upper_triang_masked_softmax +} // end namespace fused_softmax +} // end namespace multihead_attn + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + m.def("forward", + &multihead_attn::fused_softmax::scaled_upper_triang_masked_softmax::fwd, + "Self Multihead Attention scaled, time masked softmax -- Forward."); + m.def("backward", + &multihead_attn::fused_softmax::scaled_upper_triang_masked_softmax::bwd, + "Self Multihead Attention scaled, time masked softmax -- Backward."); +} diff --git a/megatron/fused_kernels/rocm/scaled_upper_triang_masked_softmax.h b/megatron/fused_kernels/rocm/scaled_upper_triang_masked_softmax.h new file mode 100644 index 0000000000..addca0a0a3 --- /dev/null +++ b/megatron/fused_kernels/rocm/scaled_upper_triang_masked_softmax.h @@ -0,0 +1,511 @@ +/* coding=utf-8 + * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include +#include +#include +#include +#include + +namespace { + +template +__device__ __inline__ void copy_vector(Datatype *dst, const Datatype *src); + +template <> +__device__ __inline__ void copy_vector(c10::BFloat16 *dst, const c10::BFloat16 *src) { *dst = *src; } + +template <> +__device__ __inline__ void copy_vector(c10::BFloat16 *dst, const c10::BFloat16 *src) { *((float2*) dst) = *((float2*) src); } + +template <> +__device__ __inline__ void copy_vector(c10::Half *dst, const c10::Half *src) { *dst = *src; } + +template <> +__device__ __inline__ void copy_vector(c10::Half *dst, const c10::Half *src) { *((float2*) dst) = *((float2*) src); } + +template <> +__device__ __inline__ void copy_vector(uint8_t *dst, const uint8_t *src) { *dst = *src; } + +template <> +__device__ __inline__ void copy_vector(uint8_t *dst, const uint8_t *src) {*((half2*) dst) = *((half2*) src); } + +template +__device__ __inline__ void copy_zero_vector(Datatype *dst); + +template <> +__device__ __inline__ void copy_zero_vector(c10::BFloat16 *dst) { *dst = 0.0; } + +template <> +__device__ __inline__ void copy_zero_vector(c10::BFloat16 *dst) { *((float2*) dst) = make_float2(0.0f, 0.0f); } + +template <> +__device__ __inline__ void copy_zero_vector(c10::Half *dst) { *dst = 0.0; } + +template <> +__device__ __inline__ void copy_zero_vector(c10::Half *dst) { *((float2*) dst) = make_float2(0.0f, 0.0f); } + + +int log2_ceil(int value) { + int log2_value = 0; + while ((1 << log2_value) < value) ++log2_value; + return log2_value; +} + +template +struct Add { + __device__ __forceinline__ T operator()(T a, T b) const { + return a + b; + } +}; + +template +struct Max { + __device__ __forceinline__ T operator()(T a, T b) const { + return a < b ? b : a; + } +}; + +template +__device__ __forceinline__ T WARP_SHFL_XOR_NATIVE(T value, int laneMask, int width = warpSize, unsigned int mask = 0xffffffff) +{ +#if CUDA_VERSION >= 9000 + return __shfl_xor_sync(mask, value, laneMask, width); +#else + return __shfl_xor(value, laneMask, width); +#endif +} + +template class ReduceOp> +__device__ __forceinline__ void warp_reduce(acc_t* sum) { + ReduceOp r; + #pragma unroll + for (int offset = WARP_SIZE / 2; offset > 0; offset /= 2) { + #pragma unroll + for (int i = 0; i < WARP_BATCH; ++i) { + acc_t b = WARP_SHFL_XOR_NATIVE(sum[i], offset, WARP_SIZE); + sum[i] = r(sum[i], b); + } + } +} + +/* + * Extended softmax (from native aten pytorch) with following additional features + * 1) input scaling + * 2) Implicit time (diagonal masking) + */ +template +__global__ void scaled_upper_triang_masked_softmax_warp_forward( + output_t *dst, + const input_t *src, + const acc_t scale, + int micro_batch_size, + int stride, + int element_count) +{ + // WARP_SIZE and WARP_BATCH must match the return values batches_per_warp and + // warp_size of method warp_softmax_forward_kernel. + constexpr int next_power_of_two = 1 << log2_elements; + constexpr int WARP_SIZE = (next_power_of_two < C10_WARP_SIZE) ? next_power_of_two : C10_WARP_SIZE; + constexpr int WARP_ITERATIONS = next_power_of_two / WARP_SIZE; + constexpr int WARP_BATCH = (next_power_of_two <= 128) ? 2 : 1; + constexpr int ELEMENTS_PER_LDG_STG = 4; + + int first_batch = (blockDim.y * blockIdx.y + threadIdx.y) * gridDim.x * WARP_BATCH + blockIdx.x; + int local_seq = blockIdx.x + 1; + int warp_iteration_limit = (local_seq + ELEMENTS_PER_LDG_STG * WARP_SIZE - 1)/ WARP_SIZE; + + // micro_batch_size might not be a multiple of WARP_BATCH. 
Check how + // many batches have to computed within this WARP. + int local_batches = micro_batch_size - first_batch; + if (local_batches > WARP_BATCH) + local_batches = WARP_BATCH; + + // there might be multiple batches per warp. compute the index within the batch + int local_idx = threadIdx.x; + + src += first_batch * stride + ELEMENTS_PER_LDG_STG * local_idx; + dst += first_batch * stride + ELEMENTS_PER_LDG_STG * local_idx; + + // load data from global memory + acc_t elements[WARP_BATCH][WARP_ITERATIONS]; + input_t temp_data[ELEMENTS_PER_LDG_STG]; + #pragma unroll + for (int i = 0; i < WARP_BATCH; ++i) { + int batch_element_count = (i >= local_batches) ? 0 : local_seq; + + #pragma unroll + for (int it = 0; it < WARP_ITERATIONS; it+=ELEMENTS_PER_LDG_STG) { + int element_index = ELEMENTS_PER_LDG_STG * local_idx + it * WARP_SIZE; + + if (element_index < batch_element_count) { + copy_vector(temp_data, src + i*element_count*stride + it*WARP_SIZE); + + #pragma unroll + for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) { + if ((element_index + element) < batch_element_count) { + elements[i][it+element] = (acc_t)temp_data[element] * scale; + } else { + elements[i][it + element] = -std::numeric_limits::infinity(); + } + } + } else { + #pragma unroll + for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) { + elements[i][it + element] = -std::numeric_limits::infinity(); + } + } + } + } + + // compute max_value + acc_t max_value[WARP_BATCH]; + #pragma unroll + for (int i = 0; i < WARP_BATCH; ++i) { + max_value[i] = elements[i][0]; + #pragma unroll + for (int it = 1; it < WARP_ITERATIONS; ++it) { + max_value[i] = (max_value[i] > elements[i][it]) ? max_value[i] : elements[i][it]; + } + } + warp_reduce(max_value); + + acc_t sum[WARP_BATCH] { 0.0f }; + #pragma unroll + for (int i = 0; i < WARP_BATCH; ++i) { + #pragma unroll + for (int it = 0; it < WARP_ITERATIONS; ++it) { + if (it < warp_iteration_limit) { + elements[i][it] = std::exp((elements[i][it] - max_value[i])); + sum[i] += elements[i][it]; + } + } + } + warp_reduce(sum); + + // store result + output_t out[ELEMENTS_PER_LDG_STG]; + #pragma unroll + for (int i = 0; i < WARP_BATCH; ++i) { + if (i >= local_batches) + break; + #pragma unroll + for (int it = 0; it < WARP_ITERATIONS; it+=ELEMENTS_PER_LDG_STG) { + int element_index = ELEMENTS_PER_LDG_STG * local_idx + it * WARP_SIZE; + + if (element_index < local_seq) { + + #pragma unroll + for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) { + if (element_index + element < local_seq) { + out[element] = elements[i][it + element] / sum[i]; + } else { + out[element] = 0; + } + } + copy_vector(dst + i * element_count * stride + it * WARP_SIZE, out); + } else if (element_index < element_count) { + copy_zero_vector(dst + i * element_count * stride + it * WARP_SIZE); + } else { + break; + } + } + } +} + +template +__global__ void scaled_upper_triang_masked_softmax_warp_backward( + output_t *gradInput, + input_t *grad, + const input_t *output, + acc_t scale, + int micro_batch_size, + int stride, + int element_count) +{ + // WARP_SIZE and WARP_BATCH must match the return values batches_per_warp and + // warp_size of method warp_softmax_backward_kernel. + constexpr int next_power_of_two = 1 << log2_elements; + constexpr int WARP_SIZE = (next_power_of_two < C10_WARP_SIZE) ? next_power_of_two : C10_WARP_SIZE; + constexpr int WARP_ITERATIONS = next_power_of_two / WARP_SIZE; + constexpr int WARP_BATCH = (next_power_of_two <= 128) ? 
2 : 1; + constexpr int ELEMENTS_PER_LDG_STG = 4; + + int first_batch = (blockDim.y * blockIdx.y + threadIdx.y) * gridDim.x * WARP_BATCH + blockIdx.x; + int local_seq = blockIdx.x + 1; + + // micro_batch_size might not be a multiple of WARP_BATCH. Check how + // many batches have to computed within this WARP. + int local_batches = micro_batch_size - first_batch; + if (local_batches > WARP_BATCH) + local_batches = WARP_BATCH; + + // there might be multiple batches per warp. compute the index within the batch + int local_idx = threadIdx.x; + + // the first element to process by the current thread + int thread_offset = first_batch * stride + ELEMENTS_PER_LDG_STG * local_idx; + grad += thread_offset; + output += thread_offset; + gradInput += thread_offset; + + // load data from global memory + acc_t grad_reg[WARP_BATCH][WARP_ITERATIONS] { 0.0f }; + acc_t output_reg[WARP_BATCH][WARP_ITERATIONS] { 0.0f }; + input_t temp_grad[ELEMENTS_PER_LDG_STG]; + input_t temp_output[ELEMENTS_PER_LDG_STG]; + #pragma unroll + for (int i = 0; i < WARP_BATCH; ++i) { + int batch_element_count = (i >= local_batches) ? 0 : local_seq; + + #pragma unroll + for (int it = 0; it < WARP_ITERATIONS; it+=ELEMENTS_PER_LDG_STG) { + int element_index = ELEMENTS_PER_LDG_STG * local_idx + it * WARP_SIZE; + if (element_index < batch_element_count) { + copy_vector(temp_grad, grad + i * element_count * stride + it * WARP_SIZE); + copy_vector(temp_output, output + i * element_count * stride + it * WARP_SIZE); + + #pragma unroll + for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) { + if (element_index + element < batch_element_count) { + output_reg[i][it + element] = (acc_t)temp_output[element]; + } + } + #pragma unroll + for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) { + if (element_index + element < batch_element_count) { + grad_reg[i][it + element] = (acc_t)temp_grad[element] * output_reg[i][it + element]; + } + } + } + } + } + + acc_t sum[WARP_BATCH]; + #pragma unroll + for (int i = 0; i < WARP_BATCH; ++i) { + sum[i] = grad_reg[i][0]; + #pragma unroll + for (int it = 1; it < WARP_ITERATIONS; ++it) { + sum[i] += grad_reg[i][it]; + } + } + warp_reduce(sum); + + // store result + #pragma unroll + for (int i = 0; i < WARP_BATCH; ++i) { + if (i >= local_batches) + break; + #pragma unroll + for (int it = 0; it < WARP_ITERATIONS; it+=ELEMENTS_PER_LDG_STG) { + int element_index = ELEMENTS_PER_LDG_STG * local_idx + it * WARP_SIZE; + if (element_index < element_count) { + // compute gradients + output_t out[ELEMENTS_PER_LDG_STG]; + #pragma unroll + for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) { + out[element] = (output_t)(scale * (grad_reg[i][it + element] - output_reg[i][it + element] * sum[i])); + } + copy_vector(gradInput + i * element_count * stride + it * WARP_SIZE, out); + } + } + } +} + +} // end of anonymous namespace + +template +void dispatch_scaled_upper_triang_masked_softmax_forward( + output_t *dst, + const input_t *src, + const input_t scale, + int softmax_elements, + int softmax_elements_stride, + int attn_batches) +{ + TORCH_INTERNAL_ASSERT(softmax_elements >= 0 && softmax_elements <= 2048 ); + if (softmax_elements == 0) { + return; + } else { + int log2_elements = log2_ceil(softmax_elements); + const int next_power_of_two = 1 << log2_elements; + int seq_len = softmax_elements; + int batch_count = attn_batches * seq_len; + + // This value must match the WARP_SIZE constexpr value computed inside softmax_warp_forward. + int warp_size = (next_power_of_two < C10_WARP_SIZE) ? 
next_power_of_two : C10_WARP_SIZE; + + // This value must match the WARP_BATCH constexpr value computed inside softmax_warp_forward. + int batches_per_warp = (next_power_of_two <= 128) ? 2 : 1; + + // use 128 threads per block to maximimize gpu utilization + constexpr int threads_per_block = 128; + + int warps_per_block = (threads_per_block / warp_size); + int batches_per_block = warps_per_block * batches_per_warp; + TORCH_INTERNAL_ASSERT(attn_batches % batches_per_block == 0); + int blocks_per_seq = attn_batches / batches_per_block; + dim3 blocks(seq_len, blocks_per_seq, 1); + dim3 threads(warp_size, warps_per_block, 1); + // Launch code would be more elegant if C++ supported FOR CONSTEXPR + switch (log2_elements) { + case 0: // 1 + scaled_upper_triang_masked_softmax_warp_forward + <<>>(dst, src, scale, batch_count, softmax_elements_stride, softmax_elements); + break; + case 1: // 2 + scaled_upper_triang_masked_softmax_warp_forward + <<>>(dst, src, scale, batch_count, softmax_elements_stride, softmax_elements); + break; + case 2: // 4 + scaled_upper_triang_masked_softmax_warp_forward + <<>>(dst, src, scale, batch_count, softmax_elements_stride, softmax_elements); + break; + case 3: // 8 + scaled_upper_triang_masked_softmax_warp_forward + <<>>(dst, src, scale, batch_count, softmax_elements_stride, softmax_elements); + break; + case 4: // 16 + scaled_upper_triang_masked_softmax_warp_forward + <<>>(dst, src, scale, batch_count, softmax_elements_stride, softmax_elements); + break; + case 5: // 32 + scaled_upper_triang_masked_softmax_warp_forward + <<>>(dst, src, scale, batch_count, softmax_elements_stride, softmax_elements); + break; + case 6: // 64 + scaled_upper_triang_masked_softmax_warp_forward + <<>>(dst, src, scale, batch_count, softmax_elements_stride, softmax_elements); + break; + case 7: // 128 + scaled_upper_triang_masked_softmax_warp_forward + <<>>(dst, src, scale, batch_count, softmax_elements_stride, softmax_elements); + break; + case 8: // 256 + scaled_upper_triang_masked_softmax_warp_forward + <<>>(dst, src, scale, batch_count, softmax_elements_stride, softmax_elements); + break; + case 9: // 512 + scaled_upper_triang_masked_softmax_warp_forward + <<>>(dst, src, scale, batch_count, softmax_elements_stride, softmax_elements); + break; + case 10: // 1024 + scaled_upper_triang_masked_softmax_warp_forward + <<>>(dst, src, scale, batch_count, softmax_elements_stride, softmax_elements); + break; + case 11: // 2048 + scaled_upper_triang_masked_softmax_warp_forward + <<>>(dst, src, scale, batch_count, softmax_elements_stride, softmax_elements); + break; + default: + break; + } + } +} + +template +void dispatch_scaled_upper_triang_masked_softmax_backward( + output_t *grad_input, + input_t *grad, + const input_t *output, + const acc_t scale, + int softmax_elements, + int softmax_elements_stride, + int attn_batches) +{ + TORCH_INTERNAL_ASSERT( softmax_elements >= 0 && softmax_elements <= 2048 ); + if (softmax_elements == 0) { + return; + } else { + int log2_elements = log2_ceil(softmax_elements); + const int next_power_of_two = 1 << log2_elements; + int seq_len = softmax_elements; + int batch_count = attn_batches * seq_len; + + // This value must match the WARP_SIZE constexpr value computed inside softmax_warp_backward. + int warp_size = (next_power_of_two < C10_WARP_SIZE) ? next_power_of_two : C10_WARP_SIZE; + + // This value must match the WARP_BATCH constexpr value computed inside softmax_warp_backward. + int batches_per_warp = (next_power_of_two <= 128) ? 
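// Worked example of the launch geometry computed above (a sketch written for this note;
// log2_ceil_sketch mirrors the log2_ceil helper used by the dispatchers). For
// softmax_elements = 2048 on a 32-lane-warp device:
//   log2_elements = 11, next_power_of_two = 2048
//   warp_size = min(2048, 32) = 32, batches_per_warp = 1
//   warps_per_block = 128 / 32 = 4, batches_per_block = 4 * 1 = 4
//   blocks = dim3(seq_len, attn_batches / 4, 1), threads = dim3(32, 4, 1)
// On a ROCm device C10_WARP_SIZE is typically 64, so warps_per_block = 2 and
// batches_per_block = 2, which is why attn_batches must divide evenly
// (the TORCH_INTERNAL_ASSERT above).
inline int log2_ceil_sketch(int value) {
    int log2_value = 0;
    while ((1 << log2_value) < value) ++log2_value;
    return log2_value;  // e.g. 2048 -> 11, 8192 -> 13
}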
2 : 1; + + // use 128 threads per block to maximimize gpu utilization + constexpr int threads_per_block = 128; + + int warps_per_block = (threads_per_block / warp_size); + int batches_per_block = warps_per_block * batches_per_warp; + TORCH_INTERNAL_ASSERT(attn_batches % batches_per_block == 0); + int blocks_per_seq = attn_batches / batches_per_block; + dim3 blocks(seq_len, blocks_per_seq, 1); + dim3 threads(warp_size, warps_per_block, 1); + // Launch code would be more elegant if C++ supported FOR CONSTEXPR + switch (log2_elements) { + case 0: // 1 + scaled_upper_triang_masked_softmax_warp_backward + <<>>(grad_input, grad, output, scale, batch_count, softmax_elements_stride, softmax_elements); + break; + case 1: // 2 + scaled_upper_triang_masked_softmax_warp_backward + <<>>(grad_input, grad, output, scale, batch_count, softmax_elements_stride, softmax_elements); + break; + case 2: // 4 + scaled_upper_triang_masked_softmax_warp_backward + <<>>(grad_input, grad, output, scale, batch_count, softmax_elements_stride, softmax_elements); + break; + case 3: // 8 + scaled_upper_triang_masked_softmax_warp_backward + <<>>(grad_input, grad, output, scale, batch_count, softmax_elements_stride, softmax_elements); + break; + case 4: // 16 + scaled_upper_triang_masked_softmax_warp_backward + <<>>(grad_input, grad, output, scale, batch_count, softmax_elements_stride, softmax_elements); + break; + case 5: // 32 + scaled_upper_triang_masked_softmax_warp_backward + <<>>(grad_input, grad, output, scale, batch_count, softmax_elements_stride, softmax_elements); + break; + case 6: // 64 + scaled_upper_triang_masked_softmax_warp_backward + <<>>(grad_input, grad, output, scale, batch_count, softmax_elements_stride, softmax_elements); + break; + case 7: // 128 + scaled_upper_triang_masked_softmax_warp_backward + <<>>(grad_input, grad, output, scale, batch_count, softmax_elements_stride, softmax_elements); + break; + case 8: // 256 + scaled_upper_triang_masked_softmax_warp_backward + <<>>(grad_input, grad, output, scale, batch_count, softmax_elements_stride, softmax_elements); + break; + case 9: // 512 + scaled_upper_triang_masked_softmax_warp_backward + <<>>(grad_input, grad, output, scale, batch_count, softmax_elements_stride, softmax_elements); + break; + case 10: // 1024 + scaled_upper_triang_masked_softmax_warp_backward + <<>>(grad_input, grad, output, scale, batch_count, softmax_elements_stride, softmax_elements); + break; + case 11: // 2048 + scaled_upper_triang_masked_softmax_warp_backward + <<>>(grad_input, grad, output, scale, batch_count, softmax_elements_stride, softmax_elements); + break; + default: + break; + } + } +} diff --git a/megatron/fused_kernels/rocm/scaled_upper_triang_masked_softmax_cuda.cu b/megatron/fused_kernels/rocm/scaled_upper_triang_masked_softmax_cuda.cu new file mode 100644 index 0000000000..59e4525849 --- /dev/null +++ b/megatron/fused_kernels/rocm/scaled_upper_triang_masked_softmax_cuda.cu @@ -0,0 +1,100 @@ +/* coding=utf-8 + * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
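// A note on the switch-based dispatch used in both directions above: because
// log2_elements is a template parameter, each supported power of two gets its own kernel
// instantiation, selected at run time by the switch. In the upstream Apex/Megatron source
// each case launches the instantiated kernel on the current torch stream; reconstructed
// here for illustration (case 5, forward) rather than quoted verbatim from this file:
//
//   case 5: // 32
//       scaled_upper_triang_masked_softmax_warp_forward<input_t, output_t, acc_t, 5>
//           <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
//               dst, src, scale, batch_count, softmax_elements_stride, softmax_elements);
//       break;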
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#ifndef __HIP_PLATFORM_HCC__ +#include +#endif +#include +#include +#include "scaled_upper_triang_masked_softmax.h" +#include "type_shim.h" + +namespace multihead_attn { +namespace fused_softmax { +namespace scaled_upper_triang_masked_softmax { + +torch::Tensor fwd_cuda( + torch::Tensor const& input, + float scale_factor) +{ + // input is a 3d tensor with dimensions [attn_batches, seq_len, seq_len] + const int attn_batches = input.size(0); + const int seq_len = input.size(1); + TORCH_INTERNAL_ASSERT(seq_len <= 2048); + + // Output + auto act_options = input.options().requires_grad(false); + torch::Tensor softmax_results = + torch::empty({attn_batches, seq_len, seq_len}, act_options); + + // Softmax Intermediate Result Ptr + void* input_ptr = static_cast(input.data_ptr()); + void* softmax_results_ptr = static_cast(softmax_results.data_ptr()); + + DISPATCH_HALF_AND_BFLOAT( + input.scalar_type(), + "dispatch_scaled_upper_triang_masked_softmax_forward", + dispatch_scaled_upper_triang_masked_softmax_forward( + reinterpret_cast(softmax_results_ptr), + reinterpret_cast(input_ptr), + scale_factor, + seq_len, + seq_len, + attn_batches); + ); + return softmax_results; +} + + +torch::Tensor bwd_cuda( + torch::Tensor const& output_grads_, + torch::Tensor const& softmax_results_, + float scale_factor) { + + auto output_grads = output_grads_.contiguous(); + auto softmax_results = softmax_results_.contiguous(); + + //output grads is a 3d tensor with dimensions [attn_batches, seq_len, seq_len] + const int attn_batches = output_grads.size(0); + const int seq_len = output_grads.size(1); + TORCH_INTERNAL_ASSERT(output_grads.size(1) == output_grads.size(2)); + + void* output_grads_ptr = static_cast(output_grads.data_ptr()); + + //Softmax Grad + DISPATCH_HALF_AND_BFLOAT( + output_grads_.scalar_type(), + "dispatch_scaled_upper_triang_masked_softmax_backward", + dispatch_scaled_upper_triang_masked_softmax_backward( + reinterpret_cast(output_grads_ptr), + reinterpret_cast(output_grads_ptr), + reinterpret_cast(softmax_results.data_ptr()), + scale_factor, + seq_len, + seq_len, + attn_batches); + ); + + //backward pass is completely in-place + return output_grads; +} +} +} +} diff --git a/megatron/fused_kernels/rocm/type_shim.h b/megatron/fused_kernels/rocm/type_shim.h new file mode 100644 index 0000000000..6437dcc7c7 --- /dev/null +++ b/megatron/fused_kernels/rocm/type_shim.h @@ -0,0 +1,91 @@ +/* coding=utf-8 + * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +#include +#include "compat.h" + + +#define DISPATCH_HALF_AND_BFLOAT(TYPE, NAME, ...) 
\ + switch(TYPE) \ + { \ + case at::ScalarType::Half: \ + { \ + using scalar_t = at::Half; \ + __VA_ARGS__; \ + break; \ + } \ + case at::ScalarType::BFloat16: \ + { \ + using scalar_t = at::BFloat16; \ + __VA_ARGS__; \ + break; \ + } \ + default: \ + AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \ + } + + + +#define DISPATCH_FLOAT_HALF_AND_BFLOAT_INOUT_TYPES(TYPEIN, TYPEOUT, NAME, ...) \ + switch(TYPEIN) \ + { \ + case at::ScalarType::Float: \ + { \ + using scalar_t_in = float; \ + switch(TYPEOUT) \ + { \ + case at::ScalarType::Float: \ + { \ + using scalar_t_out = float; \ + __VA_ARGS__; \ + break; \ + } \ + case at::ScalarType::Half: \ + { \ + using scalar_t_out = at::Half; \ + __VA_ARGS__; \ + break; \ + } \ + case at::ScalarType::BFloat16: \ + { \ + using scalar_t_out = at::BFloat16; \ + __VA_ARGS__; \ + break; \ + } \ + default: \ + AT_ERROR(#NAME, " not implemented for '", toString(TYPEOUT), "'"); \ + } \ + break; \ + } \ + case at::ScalarType::Half: \ + { \ + using scalar_t_in = at::Half; \ + using scalar_t_out = at::Half; \ + __VA_ARGS__; \ + break; \ + } \ + case at::ScalarType::BFloat16: \ + { \ + using scalar_t_in = at::BFloat16; \ + using scalar_t_out = at::BFloat16; \ + __VA_ARGS__; \ + break; \ + } \ + default: \ + AT_ERROR(#NAME, " not implemented for '", toString(TYPEIN), "'"); \ + } + diff --git a/megatron/fused_kernels/utils.py b/megatron/fused_kernels/utils.py new file mode 100644 index 0000000000..2425aae9c2 --- /dev/null +++ b/megatron/fused_kernels/utils.py @@ -0,0 +1,9 @@ +import os + + +def _create_build_dir(buildpath): + try: + os.mkdir(buildpath) + except OSError: + if not os.path.isdir(buildpath): + print(f"Creation of the build directory {buildpath} failed") From 22de429cef7f41cd4cf643945262a2c8eb6e16c7 Mon Sep 17 00:00:00 2001 From: Loubna Ben Allal <44069155+loubnabnl@users.noreply.github.com> Date: Fri, 26 May 2023 10:09:45 +0200 Subject: [PATCH 110/144] Add tokens-per-second-per-gpu to the printed logs instead of just wandb logs --- megatron/training.py | 1 + 1 file changed, 1 insertion(+) diff --git a/megatron/training.py b/megatron/training.py index 468a600291..c02a91f39f 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -655,6 +655,7 @@ def add_to_logging(name): log_string += ' number of nan iterations: {:3d} |'.format( total_loss_dict[nan_iters_key]) log_string += ' TFLOPs: {:.2f} |'.format(tflops) + log_string += ' tokens-per-second-per-gpu: {:.2f} |'.format(tokens_per_sec_per_gpu) total_loss_dict[advanced_iters_key] = 0 total_loss_dict[skipped_iters_key] = 0 total_loss_dict[nan_iters_key] = 0 From 6a77fd0867be569aaac470ea59a200723102e949 Mon Sep 17 00:00:00 2001 From: mayank31398 Date: Fri, 26 May 2023 14:36:38 +0530 Subject: [PATCH 111/144] fix --- megatron/fused_kernels/__init__.py | 11 +++++++++-- megatron/initialize.py | 6 +++--- tools/checkpoint_loader_megatron.py | 4 ++-- tools/checkpoint_saver_megatron.py | 4 ++-- 4 files changed, 16 insertions(+), 9 deletions(-) diff --git a/megatron/fused_kernels/__init__.py b/megatron/fused_kernels/__init__.py index 8f097e7d7d..8730321172 100644 --- a/megatron/fused_kernels/__init__.py +++ b/megatron/fused_kernels/__init__.py @@ -1,10 +1,17 @@ +import torch + + def load(args): if args.use_kernels_from_apex: return - if args.device == "cuda": + if torch.version.hip is None: + if torch.distributed.get_rank() == 0: + print("running on CUDA devices") from megatron.fused_kernels.cuda import load as load_kernels - elif args.device == "rocm": + else: + if torch.distributed.get_rank() 
== 0: + print("running on ROCm devices") from megatron.fused_kernels.rocm import load as load_kernels load_kernels(args) diff --git a/megatron/initialize.py b/megatron/initialize.py index 5ec87e4158..3db0a209ef 100644 --- a/megatron/initialize.py +++ b/megatron/initialize.py @@ -31,7 +31,7 @@ except ModuleNotFoundError: print('Wandb import failed', flush=True) -from megatron.fused_kernels import cuda +import megatron.fused_kernels as fused_kernels from megatron import get_adlr_autoresume from megatron import get_args from megatron import get_tensorboard_writer @@ -198,11 +198,11 @@ def _compile_dependencies(): if torch.distributed.get_rank() == 0: start_time = time.time() print('> compiling and loading fused kernels ...', flush=True) - cuda.load(args) + fused_kernels.load(args) torch.distributed.barrier() else: torch.distributed.barrier() - cuda.load(args) + fused_kernels.load(args) # Simple barrier to make sure all ranks have passed the # compilation phase successfully before moving on to the # rest of the program. We think this might ensure that diff --git a/tools/checkpoint_loader_megatron.py b/tools/checkpoint_loader_megatron.py index c67cf565a4..84bf441a13 100644 --- a/tools/checkpoint_loader_megatron.py +++ b/tools/checkpoint_loader_megatron.py @@ -5,7 +5,7 @@ import torch -from megatron.fused_kernels import cuda +import megatron.fused_kernels as fused_kernels def add_arguments(parser): group = parser.add_argument_group(title='Megatron loader') @@ -133,7 +133,7 @@ def get_models(count, dtype, pre_process, post_process): set_global_variables(margs) mpu.initialize.set_tensor_model_parallel_world_size(margs.tensor_model_parallel_size) mpu.initialize.set_pipeline_model_parallel_world_size(margs.pipeline_model_parallel_size) - cuda.load(margs) + fused_kernels.load(margs) # Get true (non-padded) vocab size if args.true_vocab_size is not None: diff --git a/tools/checkpoint_saver_megatron.py b/tools/checkpoint_saver_megatron.py index 0a370befc1..edb7791927 100644 --- a/tools/checkpoint_saver_megatron.py +++ b/tools/checkpoint_saver_megatron.py @@ -6,7 +6,7 @@ import torch -from megatron.fused_kernels import cuda +import megatron.fused_kernels as fused_kernels def add_arguments(parser): group = parser.add_argument_group(title='Megatron saver') @@ -161,7 +161,7 @@ def get_models(count, dtype, pre_process, post_process): mpu.initialize.set_pipeline_model_parallel_world_size(args.target_pipeline_parallel_size) mpu.initialize.set_tensor_model_parallel_rank(0) mpu.initialize.set_pipeline_model_parallel_rank(0) - cuda.load(margs) + fused_kernels.load(margs) # Embeddings #----------- From 9008fbeef76835390ba5009c4a4356e17f394d68 Mon Sep 17 00:00:00 2001 From: mayank31398 Date: Fri, 26 May 2023 14:49:13 +0530 Subject: [PATCH 112/144] fused --- megatron/fused_kernels/__init__.py | 6 ++---- megatron/initialize.py | 2 +- tools/checkpoint_loader_megatron.py | 2 +- tools/checkpoint_saver_megatron.py | 2 +- 4 files changed, 5 insertions(+), 7 deletions(-) diff --git a/megatron/fused_kernels/__init__.py b/megatron/fused_kernels/__init__.py index 8730321172..b833745da5 100644 --- a/megatron/fused_kernels/__init__.py +++ b/megatron/fused_kernels/__init__.py @@ -6,12 +6,10 @@ def load(args): return if torch.version.hip is None: - if torch.distributed.get_rank() == 0: - print("running on CUDA devices") + print("running on CUDA devices") from megatron.fused_kernels.cuda import load as load_kernels else: - if torch.distributed.get_rank() == 0: - print("running on ROCm devices") + print("running on ROCm devices") 
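# Note on the check above: torch.version.hip is None in CUDA builds of PyTorch and holds
# the ROCm version string in ROCm builds, so the kernel backend can be selected without an
# explicit device flag or any torch.distributed call.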
from megatron.fused_kernels.rocm import load as load_kernels load_kernels(args) diff --git a/megatron/initialize.py b/megatron/initialize.py index 3db0a209ef..3170ba6236 100644 --- a/megatron/initialize.py +++ b/megatron/initialize.py @@ -31,7 +31,7 @@ except ModuleNotFoundError: print('Wandb import failed', flush=True) -import megatron.fused_kernels as fused_kernels +from megatron import fused_kernels from megatron import get_adlr_autoresume from megatron import get_args from megatron import get_tensorboard_writer diff --git a/tools/checkpoint_loader_megatron.py b/tools/checkpoint_loader_megatron.py index 84bf441a13..ed5f7ac80f 100644 --- a/tools/checkpoint_loader_megatron.py +++ b/tools/checkpoint_loader_megatron.py @@ -5,7 +5,7 @@ import torch -import megatron.fused_kernels as fused_kernels +from megatron import fused_kernels def add_arguments(parser): group = parser.add_argument_group(title='Megatron loader') diff --git a/tools/checkpoint_saver_megatron.py b/tools/checkpoint_saver_megatron.py index edb7791927..a3e699080b 100644 --- a/tools/checkpoint_saver_megatron.py +++ b/tools/checkpoint_saver_megatron.py @@ -6,7 +6,7 @@ import torch -import megatron.fused_kernels as fused_kernels +from megatron import fused_kernels def add_arguments(parser): group = parser.add_argument_group(title='Megatron saver') From 23cf759e5592a176c3f4b476139acad0c5e01d86 Mon Sep 17 00:00:00 2001 From: mayank31398 Date: Thu, 1 Jun 2023 03:06:23 +0530 Subject: [PATCH 113/144] add missing get_batch_per_block --- .../rocm/scaled_masked_softmax.cpp | 22 ++++++++++++++++++- .../rocm/scaled_masked_softmax.h | 15 ++++++++++++- .../rocm/scaled_masked_softmax_cuda.cu | 5 +++++ .../rocm/scaled_upper_triang_masked_softmax.h | 1 + 4 files changed, 41 insertions(+), 2 deletions(-) diff --git a/megatron/fused_kernels/rocm/scaled_masked_softmax.cpp b/megatron/fused_kernels/rocm/scaled_masked_softmax.cpp index d5334710cf..1852aee6fd 100644 --- a/megatron/fused_kernels/rocm/scaled_masked_softmax.cpp +++ b/megatron/fused_kernels/rocm/scaled_masked_softmax.cpp @@ -32,6 +32,12 @@ torch::Tensor bwd_cuda( torch::Tensor const& softmax_results, float scale_factor); +int get_batch_per_block_cuda( + int query_seq_len, + int key_seq_len, + int batches, + int attn_heads); + torch::Tensor fwd( torch::Tensor const& input, torch::Tensor const& mask, @@ -63,6 +69,14 @@ torch::Tensor bwd( return bwd_cuda(output_grads, softmax_results, scale_factor); } +int get_batch_per_block( + int query_seq_len, + int key_seq_len, + int batches, + int attn_heads) { + return get_batch_per_block_cuda(query_seq_len, key_seq_len, batches, attn_heads); +} + } // end namespace scaled_masked_softmax } // end namespace fused_softmax } // end namespace multihead_attn @@ -71,7 +85,13 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { m.def("forward", &multihead_attn::fused_softmax::scaled_masked_softmax::fwd, "Self Multihead Attention scaled, time masked softmax -- Forward."); - m.def("backward", + + m.def("backward", &multihead_attn::fused_softmax::scaled_masked_softmax::bwd, "Self Multihead Attention scaled, time masked softmax -- Backward."); + + m.def("get_batch_per_block", + &multihead_attn::fused_softmax::scaled_masked_softmax::get_batch_per_block, + "Return Batch per block size." 
+ ); } diff --git a/megatron/fused_kernels/rocm/scaled_masked_softmax.h b/megatron/fused_kernels/rocm/scaled_masked_softmax.h index 78e97e4ec6..0866e7258b 100644 --- a/megatron/fused_kernels/rocm/scaled_masked_softmax.h +++ b/megatron/fused_kernels/rocm/scaled_masked_softmax.h @@ -310,9 +310,22 @@ __global__ void scaled_masked_softmax_warp_backward( } } } - } // end of anonymous namespace +int get_batch_per_block(int query_seq_len, int key_seq_len, int batches, int attn_heads){ + int log2_elements = log2_ceil(key_seq_len); + const int next_power_of_two = 1 << log2_elements; + + int warp_size = (next_power_of_two < C10_WARP_SIZE) ? next_power_of_two : C10_WARP_SIZE; + int batches_per_warp = (next_power_of_two <= 128) ? 2 : 1; + + constexpr int threads_per_block = 128; + int warps_per_block = (threads_per_block / warp_size); + int batches_per_block = warps_per_block * batches_per_warp; + + return batches_per_block; +} + template void dispatch_scaled_masked_softmax_forward( output_t *dst, diff --git a/megatron/fused_kernels/rocm/scaled_masked_softmax_cuda.cu b/megatron/fused_kernels/rocm/scaled_masked_softmax_cuda.cu index c034dc3ad7..590f1d250c 100644 --- a/megatron/fused_kernels/rocm/scaled_masked_softmax_cuda.cu +++ b/megatron/fused_kernels/rocm/scaled_masked_softmax_cuda.cu @@ -30,6 +30,11 @@ namespace multihead_attn { namespace fused_softmax { namespace scaled_masked_softmax { +int get_batch_per_block_cuda(int query_seq_len, int key_seq_len, int batches, int attn_heads){ + return get_batch_per_block(query_seq_len, key_seq_len, batches, attn_heads); +} + + torch::Tensor fwd_cuda( torch::Tensor const& input, torch::Tensor const& mask, diff --git a/megatron/fused_kernels/rocm/scaled_upper_triang_masked_softmax.h b/megatron/fused_kernels/rocm/scaled_upper_triang_masked_softmax.h index addca0a0a3..341e8edcd7 100644 --- a/megatron/fused_kernels/rocm/scaled_upper_triang_masked_softmax.h +++ b/megatron/fused_kernels/rocm/scaled_upper_triang_masked_softmax.h @@ -451,6 +451,7 @@ void dispatch_scaled_upper_triang_masked_softmax_backward( int warps_per_block = (threads_per_block / warp_size); int batches_per_block = warps_per_block * batches_per_warp; TORCH_INTERNAL_ASSERT(attn_batches % batches_per_block == 0); + int blocks_per_seq = attn_batches / batches_per_block; dim3 blocks(seq_len, blocks_per_seq, 1); dim3 threads(warp_size, warps_per_block, 1); From f20d10ab80acd09d1abdf41e56b8e285207f4382 Mon Sep 17 00:00:00 2001 From: mayank31398 Date: Thu, 1 Jun 2023 03:33:37 +0530 Subject: [PATCH 114/144] increase sequence length to 8k --- .../rocm/scaled_masked_softmax.h | 21 +++++++++++++++++-- .../rocm/scaled_masked_softmax_cuda.cu | 2 +- 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/megatron/fused_kernels/rocm/scaled_masked_softmax.h b/megatron/fused_kernels/rocm/scaled_masked_softmax.h index 0866e7258b..835ffe55d4 100644 --- a/megatron/fused_kernels/rocm/scaled_masked_softmax.h +++ b/megatron/fused_kernels/rocm/scaled_masked_softmax.h @@ -338,7 +338,7 @@ void dispatch_scaled_masked_softmax_forward( int attn_heads, int pad_batches) { - TORCH_INTERNAL_ASSERT(key_seq_len >= 0 && key_seq_len <= 2048 ); + TORCH_INTERNAL_ASSERT(key_seq_len >= 0 && key_seq_len <= 8192 ); if (key_seq_len == 0) { return; } else { @@ -410,6 +410,14 @@ void dispatch_scaled_masked_softmax_forward( scaled_masked_softmax_warp_forward <<>>(dst, src, mask, scale, batch_count, key_seq_len, pad_batches); break; + case 12: // 4096 + scaled_masked_softmax_warp_forward + <<>>(dst, src, mask, scale, batch_count, 
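// Worked values for get_batch_per_block above, which the host side can query (via the new
// binding) to check that the batch count divides evenly before taking the fused path --
// treat that usage as context rather than something shown in this hunk. With
// threads_per_block = 128:
//   key_seq_len = 2048 -> next_power_of_two = 2048, warp_size = C10_WARP_SIZE,
//     batches_per_warp = 1, so batches_per_block = 128 / 32 = 4 with a 32-lane warp
//     and 128 / 64 = 2 with a 64-lane ROCm wavefront;
//   key_seq_len = 128 -> batches_per_warp = 2, giving 8 and 4 respectively.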
key_seq_len, pad_batches); + break; + case 13: // 8192 + scaled_masked_softmax_warp_forward + <<>>(dst, src, mask, scale, batch_count, key_seq_len, pad_batches); + break; default: break; } @@ -427,7 +435,7 @@ void dispatch_scaled_masked_softmax_backward( int batches, int attn_heads) { - TORCH_INTERNAL_ASSERT( key_seq_len >= 0 && key_seq_len <= 2048 ); + TORCH_INTERNAL_ASSERT( key_seq_len >= 0 && key_seq_len <= 8192 ); if (key_seq_len == 0) { return; } else { @@ -498,6 +506,15 @@ void dispatch_scaled_masked_softmax_backward( scaled_masked_softmax_warp_backward <<>>(grad_input, grad, output, scale, batch_count, key_seq_len); break; + case 12: // 4096 + scaled_masked_softmax_warp_backward + <<>>(grad_input, grad, output, scale, batch_count, key_seq_len); + break; + case 13: // 8192 + scaled_masked_softmax_warp_backward + <<>>(grad_input, grad, output, scale, batch_count, key_seq_len); + break; + default: break; } diff --git a/megatron/fused_kernels/rocm/scaled_masked_softmax_cuda.cu b/megatron/fused_kernels/rocm/scaled_masked_softmax_cuda.cu index 590f1d250c..3b88b9c605 100644 --- a/megatron/fused_kernels/rocm/scaled_masked_softmax_cuda.cu +++ b/megatron/fused_kernels/rocm/scaled_masked_softmax_cuda.cu @@ -46,7 +46,7 @@ torch::Tensor fwd_cuda( const int attn_heads = input.size(1); const int query_seq_len = input.size(2); const int key_seq_len = input.size(3); - TORCH_INTERNAL_ASSERT(key_seq_len <= 2048); + TORCH_INTERNAL_ASSERT(key_seq_len <= 8192); TORCH_INTERNAL_ASSERT(query_seq_len > 1); TORCH_INTERNAL_ASSERT(pad_batches == 1 || pad_batches == batches); TORCH_INTERNAL_ASSERT(mask.size(1) == 1); From cc965d98e9048c87df5d299ac6b8ad0aa7cd3970 Mon Sep 17 00:00:00 2001 From: mayank31398 Date: Fri, 2 Jun 2023 09:04:41 +0530 Subject: [PATCH 115/144] don't use Apex kernels --- megatron/fused_kernels/tests/test_fused_kernels.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/fused_kernels/tests/test_fused_kernels.py b/megatron/fused_kernels/tests/test_fused_kernels.py index 95ac16568f..8256cd60a4 100644 --- a/megatron/fused_kernels/tests/test_fused_kernels.py +++ b/megatron/fused_kernels/tests/test_fused_kernels.py @@ -283,7 +283,7 @@ class DummyArgs: rank: int = 0 masked_softmax_fusion: bool = True gradient_accumulation_fusion: bool = True - use_kernels_from_apex: bool = True + use_kernels_from_apex: bool = False if __name__ == "__main__": From e9a7e7e1ca2ea123e72150b2ccbd430f7e49545f Mon Sep 17 00:00:00 2001 From: mayank31398 Date: Fri, 2 Jun 2023 10:21:10 +0530 Subject: [PATCH 116/144] 8192 upper --- .../rocm/scaled_upper_triang_masked_softmax.h | 21 +++++++++++++++++-- ...scaled_upper_triang_masked_softmax_cuda.cu | 2 +- 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/megatron/fused_kernels/rocm/scaled_upper_triang_masked_softmax.h b/megatron/fused_kernels/rocm/scaled_upper_triang_masked_softmax.h index 341e8edcd7..d4b913d7c0 100644 --- a/megatron/fused_kernels/rocm/scaled_upper_triang_masked_softmax.h +++ b/megatron/fused_kernels/rocm/scaled_upper_triang_masked_softmax.h @@ -340,7 +340,7 @@ void dispatch_scaled_upper_triang_masked_softmax_forward( int softmax_elements_stride, int attn_batches) { - TORCH_INTERNAL_ASSERT(softmax_elements >= 0 && softmax_elements <= 2048 ); + TORCH_INTERNAL_ASSERT(softmax_elements >= 0 && softmax_elements <= 8192 ); if (softmax_elements == 0) { return; } else { @@ -361,6 +361,7 @@ void dispatch_scaled_upper_triang_masked_softmax_forward( int warps_per_block = (threads_per_block / warp_size); int batches_per_block = 
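// Context for the 2048 -> 8192 limit bump in these asserts and switch statements: each
// thread keeps next_power_of_two / warp_size accumulators live (WARP_ITERATIONS), so the
// cap is effectively a per-thread register/working-set budget. A small sketch of that
// growth (the function name is local to this note; a 32-lane warp is assumed):
constexpr int warp_iterations_sketch(int next_power_of_two, int warp_size = 32) {
    // 2048 -> 64 values per thread, 4096 -> 128, 8192 -> 256
    return next_power_of_two < warp_size ? 1 : next_power_of_two / warp_size;
}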
warps_per_block * batches_per_warp; TORCH_INTERNAL_ASSERT(attn_batches % batches_per_block == 0); + int blocks_per_seq = attn_batches / batches_per_block; dim3 blocks(seq_len, blocks_per_seq, 1); dim3 threads(warp_size, warps_per_block, 1); @@ -414,6 +415,14 @@ void dispatch_scaled_upper_triang_masked_softmax_forward( scaled_upper_triang_masked_softmax_warp_forward <<>>(dst, src, scale, batch_count, softmax_elements_stride, softmax_elements); break; + case 12: // 4096 + scaled_upper_triang_masked_softmax_warp_forward + <<>>(dst, src, scale, batch_count, softmax_elements_stride, softmax_elements); + break; + case 13: // 8192 + scaled_upper_triang_masked_softmax_warp_forward + <<>>(dst, src, scale, batch_count, softmax_elements_stride, softmax_elements); + break; default: break; } @@ -430,7 +439,7 @@ void dispatch_scaled_upper_triang_masked_softmax_backward( int softmax_elements_stride, int attn_batches) { - TORCH_INTERNAL_ASSERT( softmax_elements >= 0 && softmax_elements <= 2048 ); + TORCH_INTERNAL_ASSERT( softmax_elements >= 0 && softmax_elements <= 8192 ); if (softmax_elements == 0) { return; } else { @@ -505,6 +514,14 @@ void dispatch_scaled_upper_triang_masked_softmax_backward( scaled_upper_triang_masked_softmax_warp_backward <<>>(grad_input, grad, output, scale, batch_count, softmax_elements_stride, softmax_elements); break; + case 12: // 4096 + scaled_upper_triang_masked_softmax_warp_backward + <<>>(grad_input, grad, output, scale, batch_count, softmax_elements_stride, softmax_elements); + break; + case 13: // 8192 + scaled_upper_triang_masked_softmax_warp_backward + <<>>(grad_input, grad, output, scale, batch_count, softmax_elements_stride, softmax_elements); + break; default: break; } diff --git a/megatron/fused_kernels/rocm/scaled_upper_triang_masked_softmax_cuda.cu b/megatron/fused_kernels/rocm/scaled_upper_triang_masked_softmax_cuda.cu index 59e4525849..4aa9a702a5 100644 --- a/megatron/fused_kernels/rocm/scaled_upper_triang_masked_softmax_cuda.cu +++ b/megatron/fused_kernels/rocm/scaled_upper_triang_masked_softmax_cuda.cu @@ -37,7 +37,7 @@ torch::Tensor fwd_cuda( // input is a 3d tensor with dimensions [attn_batches, seq_len, seq_len] const int attn_batches = input.size(0); const int seq_len = input.size(1); - TORCH_INTERNAL_ASSERT(seq_len <= 2048); + TORCH_INTERNAL_ASSERT(seq_len <= 8192); // Output auto act_options = input.options().requires_grad(false); From 250ab293c40047f5d7385ebe90ab14d15fcc00d0 Mon Sep 17 00:00:00 2001 From: mayank31398 Date: Fri, 2 Jun 2023 10:49:21 +0530 Subject: [PATCH 117/144] 8192 upper --- megatron/fused_kernels/tests/test_fused_kernels.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/fused_kernels/tests/test_fused_kernels.py b/megatron/fused_kernels/tests/test_fused_kernels.py index 8256cd60a4..778fba1b82 100644 --- a/megatron/fused_kernels/tests/test_fused_kernels.py +++ b/megatron/fused_kernels/tests/test_fused_kernels.py @@ -301,7 +301,7 @@ class DummyArgs: print("\n[Fail] Please install `transformers` package to test fused kernels\n") exit(-1) - from megatron.fused_kernels.cuda import load + from megatron.fused_kernels import load load(DummyArgs()) test_load_fused_kernels() From e8c74d5456a42d5583d53d8ccc6e7bf8229a340e Mon Sep 17 00:00:00 2001 From: mayank31398 Date: Fri, 2 Jun 2023 11:09:35 +0530 Subject: [PATCH 118/144] drop useless script --- examples/debug.sh | 60 ----------------------------------------------- 1 file changed, 60 deletions(-) delete mode 100644 examples/debug.sh diff --git a/examples/debug.sh 
b/examples/debug.sh deleted file mode 100644 index 8aedab0acf..0000000000 --- a/examples/debug.sh +++ /dev/null @@ -1,60 +0,0 @@ -export NCCL_SOCKET_IFNAME="ib,bond" -export NCCL_IB_CUDA_SUPPORT=1 - -MASTER_ADDR=$(echo ${LSB_MCPU_HOSTS} | tr ' ' '\n' | head -n 1) -MASTER_PORT=5${LSB_JOBID: -5:-1} -NNODES=$(echo ${LSB_MCPU_HOSTS} | tr ' ' '\n' | sed 'n; d' | wc -w) -GPUS_PER_NODE=$(echo $CUDA_VISIBLE_DEVICES | tr ',' '\n' | wc -w) -NODE_RANK=$(($(echo ${LSB_MCPU_HOSTS} | tr ' ' '\n' | sed 'n; d' | grep -n -m1 $HOSTNAME | cut -d':' -f1)-1)) - - - -DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT" - -CHECKPOINT_PATH=checkpoints # Adjust: Directory to store the checkpoints -DATA_PATH=data/debug_text_document # Adjust: Prefix of the preprocessed dataset. -TOKENIZER_FILE=tokenizer.json # Adjust - -GPT_ARGS="\ ---tensor-model-parallel-size 1 \ ---pipeline-model-parallel-size 1 \ ---recompute-activations \ ---num-layers 24 \ ---hidden-size 2048 \ ---num-attention-heads 16 \ ---attention-head-type multiquery \ ---init-method-std 0.022 \ ---seq-length 2048 \ ---max-position-embeddings 2048 \ ---attention-dropout 0.1 \ ---hidden-dropout 0.1 \ ---micro-batch-size 2 \ ---global-batch-size 192 \ ---lr 0.0002 \ ---train-iters 3000 \ ---lr-decay-iters 600000 \ ---lr-decay-style cosine \ ---lr-warmup-fraction 0.02 \ ---weight-decay .1 \ ---adam-beta2 .95 \ ---clip-grad 1.0 \ ---fp16 \ ---log-interval 10 \ ---save-interval 4000 \ ---eval-interval 200 \ ---eval-iters 10 \ ---initial-loss-scale 65536 \ ---fim-rate 0.5 \ -" - -TENSORBOARD_ARGS="--tensorboard-dir ${CHECKPOINT_PATH}/tensorboard" - -torchrun $DISTRIBUTED_ARGS \ - pretrain_gpt.py \ - $GPT_ARGS \ - --tokenizer-type TokenizerFromFile \ - --tokenizer-file $TOKENIZER_FILE \ - --save $CHECKPOINT_PATH \ - --load $CHECKPOINT_PATH \ - --data-path $DATA_PATH \ - $TENSORBOARD_ARGS From 4a33f292ee06688a1fbbc4cf7dc4f661db7202e3 Mon Sep 17 00:00:00 2001 From: mayank31398 Date: Sat, 3 Jun 2023 23:35:30 +0530 Subject: [PATCH 119/144] fused kernel import --- tools/checkpoint_loader_megatron.py | 3 +-- tools/checkpoint_saver_megatron.py | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/tools/checkpoint_loader_megatron.py b/tools/checkpoint_loader_megatron.py index ed5f7ac80f..b8681aca89 100644 --- a/tools/checkpoint_loader_megatron.py +++ b/tools/checkpoint_loader_megatron.py @@ -5,7 +5,6 @@ import torch -from megatron import fused_kernels def add_arguments(parser): group = parser.add_argument_group(title='Megatron loader') @@ -34,7 +33,7 @@ def _load_checkpoint(queue, args): from megatron.global_vars import set_args, set_global_variables from megatron.checkpointing import load_args_from_checkpoint, load_checkpoint from megatron.model import ModelType, module - from megatron import mpu + from megatron import mpu, fused_kernels except ModuleNotFoundError: print("Unable to import Megatron, please specify the path to Megatron using --megatron-path. 
Exiting.") queue.put("exit") diff --git a/tools/checkpoint_saver_megatron.py b/tools/checkpoint_saver_megatron.py index a3e699080b..c6b29012f9 100644 --- a/tools/checkpoint_saver_megatron.py +++ b/tools/checkpoint_saver_megatron.py @@ -6,7 +6,6 @@ import torch -from megatron import fused_kernels def add_arguments(parser): group = parser.add_argument_group(title='Megatron saver') @@ -38,7 +37,7 @@ def save_checkpoint(queue, args): from megatron.global_vars import set_global_variables, get_args from megatron.model import ModelType from megatron.tokenizer.tokenizer import _vocab_size_with_padding - from megatron import mpu + from megatron import mpu, fused_kernels except ModuleNotFoundError: print("Unable to import Megatron, please specify the path to Megatron using --megatron-path. Exiting.") exit(1) From 21045b59127cd2d5509f1ca27d81fae7b485bd22 Mon Sep 17 00:00:00 2001 From: mayank31398 Date: Sat, 3 Jun 2023 23:39:04 +0530 Subject: [PATCH 120/144] drop use_kernels_from_apex --- megatron/arguments.py | 2 -- megatron/fused_kernels/__init__.py | 3 --- megatron/fused_kernels/tests/test_fused_kernels.py | 1 - tools/checkpoint_loader_megatron.py | 3 --- tools/checkpoint_saver_megatron.py | 3 --- 5 files changed, 12 deletions(-) diff --git a/megatron/arguments.py b/megatron/arguments.py index ac47a212e4..39d0c8f825 100644 --- a/megatron/arguments.py +++ b/megatron/arguments.py @@ -641,8 +641,6 @@ def _add_training_args(parser): 'training if SIGTERM is received') group.add_argument('--tensorboard-dir', type=str, default=None, help='Write TensorBoard logs to this directory.') - group.add_argument("--use-kernels-from-apex", action="store_true", - help="use Apex kernels instead of Megatron") group.add_argument('--no-masked-softmax-fusion', action='store_false', help='Disable fusion of query_key_value scaling, ' diff --git a/megatron/fused_kernels/__init__.py b/megatron/fused_kernels/__init__.py index b833745da5..450c43072b 100644 --- a/megatron/fused_kernels/__init__.py +++ b/megatron/fused_kernels/__init__.py @@ -2,9 +2,6 @@ def load(args): - if args.use_kernels_from_apex: - return - if torch.version.hip is None: print("running on CUDA devices") from megatron.fused_kernels.cuda import load as load_kernels diff --git a/megatron/fused_kernels/tests/test_fused_kernels.py b/megatron/fused_kernels/tests/test_fused_kernels.py index 778fba1b82..8f0fe38006 100644 --- a/megatron/fused_kernels/tests/test_fused_kernels.py +++ b/megatron/fused_kernels/tests/test_fused_kernels.py @@ -283,7 +283,6 @@ class DummyArgs: rank: int = 0 masked_softmax_fusion: bool = True gradient_accumulation_fusion: bool = True - use_kernels_from_apex: bool = False if __name__ == "__main__": diff --git a/tools/checkpoint_loader_megatron.py b/tools/checkpoint_loader_megatron.py index b8681aca89..bb701e1572 100644 --- a/tools/checkpoint_loader_megatron.py +++ b/tools/checkpoint_loader_megatron.py @@ -5,7 +5,6 @@ import torch - def add_arguments(parser): group = parser.add_argument_group(title='Megatron loader') @@ -16,8 +15,6 @@ def add_arguments(parser): 'trim padding from the embedding table.') group.add_argument('--megatron-path', type=str, default=None, help='Base directory of deepspeed repository') - group.add_argument("--use-kernels-from-apex", action="store_true", - help="use Apex kernels instead of Megatron") def _load_checkpoint(queue, args): diff --git a/tools/checkpoint_saver_megatron.py b/tools/checkpoint_saver_megatron.py index c6b29012f9..95b34a01d1 100644 --- a/tools/checkpoint_saver_megatron.py +++ 
b/tools/checkpoint_saver_megatron.py @@ -6,7 +6,6 @@ import torch - def add_arguments(parser): group = parser.add_argument_group(title='Megatron saver') @@ -19,8 +18,6 @@ def add_arguments(parser): group.add_argument('--target-pipeline-parallel-size', type=int, help='Target tensor model parallel size, default to the pipeline parall size ' 'in the input checkpoint if provided by the loader, otherwise to 1') - group.add_argument("--use-kernels-from-apex", action="store_true", - help="use Apex kernels instead of Megatron") def save_checkpoint(queue, args): From 972f301b3bfd94404279f90467e6179d7d4cdb75 Mon Sep 17 00:00:00 2001 From: Raymond Li Date: Mon, 5 Jun 2023 12:59:42 -0400 Subject: [PATCH 121/144] remove unused kernels --- .../fused_kernels/rocm/layer_norm_cuda.cpp | 201 ---- .../rocm/layer_norm_cuda_kernel.cu | 866 ------------------ 2 files changed, 1067 deletions(-) delete mode 100644 megatron/fused_kernels/rocm/layer_norm_cuda.cpp delete mode 100644 megatron/fused_kernels/rocm/layer_norm_cuda_kernel.cu diff --git a/megatron/fused_kernels/rocm/layer_norm_cuda.cpp b/megatron/fused_kernels/rocm/layer_norm_cuda.cpp deleted file mode 100644 index 8f28e7b4ad..0000000000 --- a/megatron/fused_kernels/rocm/layer_norm_cuda.cpp +++ /dev/null @@ -1,201 +0,0 @@ -/* coding=utf-8 - * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/*This code is copied fron NVIDIA apex: - * https://github.com/NVIDIA/apex - * with minor changes. 
*/ - -#include -#include -#include -#include "compat.h" - -namespace { - -void compute_n1_n2( - at::Tensor input, - at::IntArrayRef normalized_shape, - int& n1, - int& n2) { - int idiff = input.ndimension() - normalized_shape.size(); - n2 = 1; - for (int i = 0; i < (int)normalized_shape.size(); ++i) { - assert( input.sizes()[i+idiff] == normalized_shape[i] ); - n2 *= normalized_shape[i]; - } - n1 = 1; - for (int i = 0; i < idiff; ++i) { - n1 *= input.sizes()[i]; - } -} - -void check_args( - at::IntArrayRef normalized_shape, - at::Tensor gamma, - at::Tensor beta - ) -{ - TORCH_CHECK(!gamma.defined() || gamma.sizes().equals(normalized_shape)); - TORCH_CHECK(!beta.defined() || beta.sizes().equals(normalized_shape)); -} - -void check_args( - at::Tensor input, - at::IntArrayRef normalized_shape, - int& n1, - int& n2 - ) -{ - int64_t normalized_ndim = normalized_shape.size(); - - if (normalized_ndim < 1) { - std::stringstream ss; - ss << "Expected normalized_shape to be at least 1-dimensional, i.e., " - << "containing at least one element, but got normalized_shape=" - << normalized_shape; - throw std::runtime_error(ss.str()); - } - - auto input_shape = input.sizes(); - auto input_ndim = input.dim(); - - if (input_ndim < normalized_ndim || - !input_shape.slice(input_ndim - normalized_ndim).equals(normalized_shape)) { - std::stringstream ss; - ss << "Given normalized_shape=" << normalized_shape - << ", expected input with shape [*"; - for (auto size : normalized_shape) { - ss << ", " << size; - } - ss << "], but got input of size" << input_shape; - throw std::runtime_error(ss.str()); - } - - compute_n1_n2(input,normalized_shape,n1,n2); -} - - -void check_args( - at::Tensor input, - at::IntArrayRef normalized_shape, - at::Tensor gamma, - at::Tensor beta, - int& n1, - int& n2 - ) -{ - check_args(input,normalized_shape,n1,n2); - check_args(normalized_shape,gamma,beta); -} -} - -void cuda_layer_norm( - at::Tensor* output, - at::Tensor* mean, - at::Tensor* invvar, - at::Tensor* input, - int n1, - int n2, - at::IntArrayRef normalized_shape, - at::Tensor* gamma, - at::Tensor* beta, - double epsilon); - -#define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x " must be a CUDA tensor") -#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous") -#define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x) - -std::vector layer_norm_affine( - at::Tensor input, - at::IntArrayRef normalized_shape, - at::Tensor gamma, - at::Tensor beta, - double epsilon) { - - CHECK_INPUT(input); - CHECK_INPUT(gamma); - CHECK_INPUT(beta); - int n1, n2; - check_args(input, normalized_shape, gamma, beta, n1, n2); - - at::Tensor output = at::empty_like( - input, gamma.options().dtype(gamma.scalar_type())); - at::Tensor mean = at::empty( - {n1}, input.options().dtype(at::ScalarType::Float)); - at::Tensor invvar = at::empty_like(mean); - - cuda_layer_norm(&output, &mean, &invvar, &input, n1, n2, - normalized_shape, &gamma, &beta, epsilon); - - return {output, mean, invvar}; - -} - - -void cuda_layer_norm_gradient( - at::Tensor* dout, - at::Tensor* mean, - at::Tensor* invvar, - at::Tensor* input, - int n1, - int n2, - at::IntArrayRef normalized_shape, - at::Tensor* gamma, - at::Tensor* beta, - double epsilon, - at::Tensor* grad_input, - at::Tensor* grad_gamma, - at::Tensor* grad_beta - ); - -std::vector layer_norm_gradient_affine( - at::Tensor dout, - at::Tensor mean, - at::Tensor invvar, - at::Tensor input, - at::IntArrayRef normalized_shape, - at::Tensor gamma, - at::Tensor beta, - double epsilon) { - - 
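// (On the removed bindings above: compute_n1_n2 flattens an input of shape
// [d0, ..., dk, *normalized_shape] into n1 = d0 * ... * dk rows of
// n2 = prod(normalized_shape) elements, e.g. an input of [8, 512, 1024] with
// normalized_shape [1024] gives n1 = 4096 and n2 = 1024; the kernels then normalize each
// of the n1 rows independently.)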
CHECK_INPUT(dout); - CHECK_INPUT(mean); - CHECK_INPUT(invvar); - CHECK_INPUT(input); - CHECK_INPUT(gamma); - CHECK_INPUT(beta); - int n1, n2; - check_args(input, normalized_shape, gamma, beta, n1, n2); - - at::Tensor grad_input = at::empty_like(input); - at::Tensor grad_gamma = at::empty_like(gamma); - at::Tensor grad_beta = at::empty_like(beta); - - cuda_layer_norm_gradient(&dout, &mean, &invvar, &input, n1, n2, - normalized_shape, &gamma, &beta, epsilon, - &grad_input, &grad_gamma, &grad_beta); - - return {grad_input, grad_gamma, grad_beta}; - -} - - -PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { - m.def("forward_affine", &layer_norm_affine, - "LayerNorm forward (CUDA)"); - m.def("backward_affine", &layer_norm_gradient_affine, - "LayerNorm backward (CUDA)"); -} diff --git a/megatron/fused_kernels/rocm/layer_norm_cuda_kernel.cu b/megatron/fused_kernels/rocm/layer_norm_cuda_kernel.cu deleted file mode 100644 index 8a07806b13..0000000000 --- a/megatron/fused_kernels/rocm/layer_norm_cuda_kernel.cu +++ /dev/null @@ -1,866 +0,0 @@ -/* coding=utf-8 - * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/*This code is copied fron NVIDIA apex: - * https://github.com/NVIDIA/apex - * with minor changes. */ - -#include "ATen/ATen.h" -#include "ATen/AccumulateType.h" -#include "ATen/cuda/CUDAContext.h" -#include "ATen/cuda/DeviceUtils.cuh" - -#include -#include - -#include "type_shim.h" - -template __device__ -void cuWelfordOnlineSum( - const U curr, - U& mu, - U& sigma2, - U& count) -{ - count = count + U(1); - U delta = curr - mu; - U lmean = mu + delta / count; - mu = lmean; - U delta2 = curr - lmean; - sigma2 = sigma2 + delta * delta2; -} - -template __device__ -void cuChanOnlineSum( - const U muB, - const U sigma2B, - const U countB, - U& mu, - U& sigma2, - U& count) -{ - U delta = muB - mu; - U nA = count; - U nB = countB; - count = count + countB; - U nX = count; - if (nX > U(0)) { - nA = nA / nX; - nB = nB / nX; - mu = nA*mu + nB*muB; - sigma2 = sigma2 + sigma2B + delta * delta * nA * nB * nX; - } else { - mu = U(0); - sigma2 = U(0); - } -} - -template __device__ -void cuWelfordMuSigma2( - const T* __restrict__ vals, - const int n1, - const int n2, - const int i1, - U& mu, - U& sigma2, - U* buf, - const int GPU_WARP_SIZE) -{ - // Assumptions: - // 1) blockDim.x == warpSize - // 2) Tensor is contiguous - // 3) 2*blockDim.y*sizeof(U)+blockDim.y*sizeof(int) shared memory available. 
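// The two helpers above implement Welford's online mean/variance update
// (cuWelfordOnlineSum) and Chan et al.'s parallel combine of two partial results
// (cuChanOnlineSum). A plain scalar reference of the same recurrences, written for this
// note (the struct name is not part of the repository):
struct WelfordSketch {
    double mean = 0.0, m2 = 0.0, count = 0.0;
    void add(double x) {                      // one new sample (cuWelfordOnlineSum)
        count += 1.0;
        double delta = x - mean;
        mean += delta / count;
        m2 += delta * (x - mean);
    }
    void merge(const WelfordSketch& other) {  // combine two partials (cuChanOnlineSum)
        double delta = other.mean - mean;
        double n = count + other.count;
        if (n > 0.0) {
            mean = (count * mean + other.count * other.mean) / n;
            m2 += other.m2 + delta * delta * count * other.count / n;
            count = n;
        }
    }
    // the kernel finally uses sigma2 / n2, i.e. the population variance of the row
    double variance() const { return count > 0.0 ? m2 / count : 0.0; }
};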
- // - // compute variance and mean over n2 - U count = U(0); - mu= U(0); - sigma2 = U(0); - if (i1 < n1) { - // one warp normalizes one n1 index, - // synchronization is implicit - // initialize with standard Welford algorithm - const int numx = blockDim.x * blockDim.y; - const int thrx = threadIdx.x + threadIdx.y * blockDim.x; - const T* lvals = vals + i1*n2; - int l = 4*thrx; - for (; l+3 < n2; l+=4*numx) { - for (int k = 0; k < 4; ++k) { - U curr = static_cast(lvals[l+k]); - cuWelfordOnlineSum(curr,mu,sigma2,count); - } - } - for (; l < n2; ++l) { - U curr = static_cast(lvals[l]); - cuWelfordOnlineSum(curr,mu,sigma2,count); - } - // intra-warp reductions - for (int stride = GPU_WARP_SIZE / 2; stride > 0; stride /= 2) { - U sigma2B = WARP_SHFL_DOWN(sigma2, stride); - U muB = WARP_SHFL_DOWN(mu, stride); - U countB = WARP_SHFL_DOWN(count, stride); - cuChanOnlineSum(muB, sigma2B, countB, mu, sigma2, count); - } - // threadIdx.x == 0 has correct values for each warp - // inter-warp reductions - if (blockDim.y > 1) { - U* ubuf = (U*)buf; - U* ibuf = (U*)(ubuf + blockDim.y); - for (int offset = blockDim.y/2; offset > 0; offset /= 2) { - // upper half of warps write to shared - if (threadIdx.x == 0 && threadIdx.y >= offset && threadIdx.y < 2*offset) { - const int wrt_y = threadIdx.y - offset; - ubuf[2*wrt_y] = mu; - ubuf[2*wrt_y+1] = sigma2; - ibuf[wrt_y] = count; - } - __syncthreads(); - // lower half merges - if (threadIdx.x == 0 && threadIdx.y < offset) { - U muB = ubuf[2*threadIdx.y]; - U sigma2B = ubuf[2*threadIdx.y+1]; - U countB = ibuf[threadIdx.y]; - cuChanOnlineSum(muB,sigma2B,countB,mu,sigma2,count); - } - __syncthreads(); - } - // threadIdx.x = 0 && threadIdx.y == 0 only thread that has correct values - if (threadIdx.x == 0 && threadIdx.y == 0) { - ubuf[0] = mu; - ubuf[1] = sigma2; - } - __syncthreads(); - mu = ubuf[0]; - sigma2 = ubuf[1]/U(n2); - // don't care about final value of count, we know count == n2 - } else { - mu = WARP_SHFL(mu, 0); - sigma2 = WARP_SHFL(sigma2/U(n2), 0); - } - } -} - -template<> __device__ -void cuWelfordMuSigma2( - const at::Half* __restrict__ vals, - const int n1, - const int n2, - const int i1, - float& mu, - float& sigma2, - float* buf, - const int GPU_WARP_SIZE) -{ - // Assumptions: - // 1) blockDim.x == warpSize - // 2) Tensor is contiguous - // 3) 2*blockDim.y*sizeof(U)+blockDim.y*sizeof(int) shared memory available. - // - // compute variance and mean over n2 - float count = 0.0f; - mu= float(0); - sigma2 = float(0); - if (i1 < n1) { - // one warp normalizes one n1 index, - // synchronization is implicit - // initialize with standard Welford algorithm - const int numx = blockDim.x * blockDim.y; - const int thrx = threadIdx.x + threadIdx.y * blockDim.x; - const at::Half* lvals = vals + i1*n2; - int l = 8*thrx; - if ((((size_t)lvals)&3) != 0) { - // 16 bit alignment - // first thread consumes first point - if (thrx == 0) { - float curr = static_cast(lvals[0]); - cuWelfordOnlineSum(curr,mu,sigma2,count); - } - ++l; - } - // at this point, lvals[l] are 32 bit aligned for all threads. 
- for (; l+7 < n2; l+=8*numx) { - for (int k = 0; k < 8; k+=2) { - float2 curr = __half22float2(*((__half2*)(lvals+l+k))); - cuWelfordOnlineSum(curr.x,mu,sigma2,count); - cuWelfordOnlineSum(curr.y,mu,sigma2,count); - } - } - for (; l < n2; ++l) { - float curr = static_cast(lvals[l]); - cuWelfordOnlineSum(curr,mu,sigma2,count); - } - // intra-warp reductions - for (int stride = GPU_WARP_SIZE / 2; stride > 0; stride /= 2) { - float sigma2B = WARP_SHFL_DOWN(sigma2, stride); - float muB = WARP_SHFL_DOWN(mu, stride); - float countB = WARP_SHFL_DOWN(count, stride); - cuChanOnlineSum(muB, sigma2B, countB, mu, sigma2, count); - } - // threadIdx.x == 0 has correct values for each warp - // inter-warp reductions - if (blockDim.y > 1) { - float* ubuf = (float*)buf; - float* ibuf = (float*)(ubuf + blockDim.y); - for (int offset = blockDim.y/2; offset > 0; offset /= 2) { - // upper half of warps write to shared - if (threadIdx.x == 0 && threadIdx.y >= offset && threadIdx.y < 2*offset) { - const int wrt_y = threadIdx.y - offset; - ubuf[2*wrt_y] = mu; - ubuf[2*wrt_y+1] = sigma2; - ibuf[wrt_y] = count; - } - __syncthreads(); - // lower half merges - if (threadIdx.x == 0 && threadIdx.y < offset) { - float muB = ubuf[2*threadIdx.y]; - float sigma2B = ubuf[2*threadIdx.y+1]; - float countB = ibuf[threadIdx.y]; - cuChanOnlineSum(muB,sigma2B,countB,mu,sigma2,count); - } - __syncthreads(); - } - // threadIdx.x = 0 && threadIdx.y == 0 only thread that has correct values - if (threadIdx.x == 0 && threadIdx.y == 0) { - ubuf[0] = mu; - ubuf[1] = sigma2; - } - __syncthreads(); - mu = ubuf[0]; - sigma2 = ubuf[1]/float(n2); - // don't care about final value of count, we know count == n2 - } else { - mu = WARP_SHFL(mu, 0); - sigma2 = WARP_SHFL(sigma2/float(n2), 0); - } - } -} -#ifndef __HIP_PLATFORM_HCC__ -template U rsqrt(U v) { -#else -template __device__ U rsqrt(U v) { -#endif - return U(1) / sqrt(v); -} -#ifndef __HIP_PLATFORM_HCC__ -template<> float rsqrt(float v) { -#else -template<> __device__ float rsqrt(float v) { -#endif - return rsqrtf(v); -} -#ifndef __HIP_PLATFORM_HCC__ -template<> double rsqrt(double v) { -#else -template<> __device__ double rsqrt(double v) { -#endif - return rsqrt(v); -} - -namespace { -// This is the un-specialized struct. Note that we prevent instantiation of this -// struct by putting an undefined symbol in the function body so it won't compile. 
-// template -// struct SharedMemory -// { -// // Ensure that we won't compile any un-specialized types -// __device__ T *getPointer() -// { -// extern __device__ void error(void); -// error(); -// return NULL; -// } -// }; -// https://github.com/NVIDIA/apex/issues/246 -template -struct SharedMemory; - -template <> -struct SharedMemory -{ - __device__ float *getPointer() - { - extern __shared__ float s_float[]; - return s_float; - } -}; - -} - -template __global__ -void cuApplyLayerNorm( - V* __restrict__ output_vals, - U* __restrict__ mean, - U* __restrict__ invvar, - const T* __restrict__ vals, - const int n1, - const int n2, - const U epsilon, - const V* __restrict__ gamma, - const V* __restrict__ beta, - const int GPU_WARP_SIZE - ) -{ - // Assumptions: - // 1) blockDim.x == warpSize - // 2) Tensors are contiguous - // -#ifndef __HIP_PLATFORM_HCC__ - for (auto i1=blockIdx.y; i1 < n1; i1 += gridDim.y) { -#else - for (int i1=blockIdx.y; i1 < n1; i1 += gridDim.y) { -#endif - SharedMemory shared; - U* buf = shared.getPointer(); - U mu,sigma2; - cuWelfordMuSigma2(vals,n1,n2,i1,mu,sigma2,buf,GPU_WARP_SIZE); - const T* lvals = vals + i1*n2; - V* ovals = output_vals + i1*n2; - U c_invvar = rsqrt(sigma2 + epsilon); - const int numx = blockDim.x * blockDim.y; - const int thrx = threadIdx.x + threadIdx.y * blockDim.x; - if (gamma != NULL && beta != NULL) { - for (int i = thrx; i < n2; i+=numx) { - U curr = static_cast(lvals[i]); - ovals[i] = gamma[i] * static_cast(c_invvar * (curr - mu)) + beta[i]; - } - } else { - for (int i = thrx; i < n2; i+=numx) { - U curr = static_cast(lvals[i]); - ovals[i] = static_cast(c_invvar * (curr - mu)); - } - } - if (threadIdx.x == 0 && threadIdx.y == 0) { - mean[i1] = mu; - invvar[i1] = c_invvar; - } - __syncthreads(); - } -} - -template __device__ -void cuLoadWriteStridedInputs( - const int i1_block, - const int thr_load_row_off, - const int thr_load_col_off, - const int i2_off, - const int row_stride, - U* warp_buf1, - U* warp_buf2, - const T* input, - const V* dout, - const int i1_end, - const int n2, - const U* __restrict__ mean, - const U* __restrict__ invvar - ) -{ - int i1 = i1_block+thr_load_row_off; - if (i1 < i1_end) { - U curr_mean = mean[i1]; - U curr_invvar = invvar[i1]; - for (int k = 0; k < blockDim.y; ++k) { - int i2 = i2_off + k; - int load_idx = i1*n2+i2; - int write_idx = thr_load_row_off*row_stride+thr_load_col_off+k; - if (i2(input[load_idx]); - U curr_dout = static_cast(dout[load_idx]); - warp_buf1[write_idx] = curr_dout; - warp_buf2[write_idx] = curr_dout * (curr_input - curr_mean) * curr_invvar; - } else { - warp_buf1[write_idx] = U(0); - warp_buf2[write_idx] = U(0); - } - } - } else { - for (int k = 0; k < blockDim.y; ++k) { - int write_idx = thr_load_row_off*row_stride+thr_load_col_off+k; - warp_buf1[write_idx] = U(0); - warp_buf2[write_idx] = U(0); - } - } -} - -template __device__ -void cuLoadAddStridedInputs( - const int i1_block, - const int thr_load_row_off, - const int thr_load_col_off, - const int i2_off, - const int row_stride, - U* warp_buf1, - U* warp_buf2, - const T* input, - const V* dout, - const int i1_end, - const int n2, - const U* __restrict__ mean, - const U* __restrict__ invvar - ) -{ - int i1 = i1_block+thr_load_row_off; - if (i1 < i1_end) { - U curr_mean = mean[i1]; - U curr_invvar = invvar[i1]; - for (int k = 0; k < blockDim.y; ++k) { - int i2 = i2_off + k; - int load_idx = i1*n2+i2; - int write_idx = thr_load_row_off*row_stride+thr_load_col_off+k; - if (i2(input[load_idx]); - U curr_dout = 
static_cast(dout[load_idx]); - warp_buf1[write_idx] += curr_dout; - warp_buf2[write_idx] += curr_dout * (curr_input - curr_mean) * curr_invvar; - } - } - } -} - -template __global__ -void cuComputePartGradGammaBeta( - const V* __restrict__ dout, - const T* __restrict__ input, - const int n1, - const int n2, - const U* __restrict__ mean, - const U* __restrict__ invvar, - U epsilon, - U* part_grad_gamma, - U* part_grad_beta) -{ - const int numsegs_n1 = (n1+blockDim.y*blockDim.y-1) / (blockDim.y*blockDim.y); - const int segs_per_block = (numsegs_n1 + gridDim.y - 1) / gridDim.y; - const int i1_beg = blockIdx.y * segs_per_block * blockDim.y*blockDim.y; - const int i1_beg_plus_one = (blockIdx.y+1) * segs_per_block * blockDim.y*blockDim.y; - const int i1_end = i1_beg_plus_one < n1 ? i1_beg_plus_one : n1; - const int row_stride = blockDim.x+1; - const int thr_load_col_off = (threadIdx.x*blockDim.y)&(blockDim.x-1); - const int thr_load_row_off = (threadIdx.x*blockDim.y)/blockDim.x + threadIdx.y*blockDim.y; - const int i2_off = blockIdx.x * blockDim.x + thr_load_col_off; - SharedMemory shared; - U* buf = shared.getPointer(); // buf has at least blockDim.x * blockDim.y * blockDim.y + (blockDim.y - 1)*(blockDim.x/blockDim.y) elements - U* warp_buf1 = (U*)buf; - U* warp_buf2 = warp_buf1 + blockDim.y * blockDim.y * row_stride; - // compute partial sums from strided inputs - // do this to increase number of loads in flight - cuLoadWriteStridedInputs(i1_beg,thr_load_row_off,thr_load_col_off,i2_off,row_stride,warp_buf1,warp_buf2,input,dout,i1_end,n2,mean,invvar); - for (int i1_block = i1_beg+blockDim.y*blockDim.y; i1_block < i1_end; i1_block+=blockDim.y*blockDim.y) { - cuLoadAddStridedInputs(i1_block,thr_load_row_off,thr_load_col_off,i2_off,row_stride,warp_buf1,warp_buf2,input,dout,i1_end,n2,mean,invvar); - } - __syncthreads(); - // inter-warp reductions - // sum within each warp - U acc1 = U(0); - U acc2 = U(0); - for (int k = 0; k < blockDim.y; ++k) { - int row1 = threadIdx.y + k*blockDim.y; - int idx1 = row1*row_stride + threadIdx.x; - acc1 += warp_buf1[idx1]; - acc2 += warp_buf2[idx1]; - } - warp_buf1[threadIdx.y*row_stride+threadIdx.x] = acc1; - warp_buf2[threadIdx.y*row_stride+threadIdx.x] = acc2; - __syncthreads(); - // sum all warps - for (int offset = blockDim.y/2; offset > 1; offset /= 2) { - if (threadIdx.y < offset) { - int row1 = threadIdx.y; - int row2 = threadIdx.y + offset; - int idx1 = row1*row_stride + threadIdx.x; - int idx2 = row2*row_stride + threadIdx.x; - warp_buf1[idx1] += warp_buf1[idx2]; - warp_buf2[idx1] += warp_buf2[idx2]; - } - __syncthreads(); - } - int i2 = blockIdx.x * blockDim.x + threadIdx.x; - if (threadIdx.y == 0 && i2 < n2) { - int row1 = threadIdx.y; - int row2 = threadIdx.y + 1; - int idx1 = row1*row_stride + threadIdx.x; - int idx2 = row2*row_stride + threadIdx.x; - part_grad_beta[blockIdx.y*n2+i2] = warp_buf1[idx1] + warp_buf1[idx2]; - part_grad_gamma[blockIdx.y*n2+i2] = warp_buf2[idx1] + warp_buf2[idx2]; - } -} - -template __global__ -void cuComputeGradGammaBeta( - const U* part_grad_gamma, - const U* part_grad_beta, - const int part_size, - const int n1, - const int n2, - V* grad_gamma, - V* grad_beta) -{ - // sum partial gradients for gamma and beta - SharedMemory shared; - U* buf = shared.getPointer(); - int i2 = blockIdx.x * blockDim.x + threadIdx.x; - if (i2 < n2) { - // each warp does sequential reductions until reduced part_size is num_warps - int num_warp_reductions = part_size / blockDim.y; - U sum_gamma = U(0); - U sum_beta = U(0); - const U* 
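// (For the removed backward path above: the quantities being reduced are
//     dgamma_j = sum_i dout_ij * (input_ij - mean_i) * invvar_i    and
//     dbeta_j  = sum_i dout_ij,
// accumulated in two stages -- per-block partial sums in cuComputePartGradGammaBeta
// (warp_buf2 and warp_buf1 respectively), then a final reduction over the part_size
// partial buffers in cuComputeGradGammaBeta.)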
part_grad_gamma_ptr = part_grad_gamma + threadIdx.y * num_warp_reductions * n2 + i2; - const U* part_grad_beta_ptr = part_grad_beta + threadIdx.y * num_warp_reductions * n2 + i2; - for (int warp_offset = 0; warp_offset < num_warp_reductions; ++warp_offset) { - sum_gamma += part_grad_gamma_ptr[warp_offset*n2]; - sum_beta += part_grad_beta_ptr[warp_offset*n2]; - } - // inter-warp reductions - const int nbsize3 = blockDim.x * blockDim.y / 2; - for (int offset = blockDim.y/2; offset >= 1; offset /= 2) { - // top half write to shared memory - if (threadIdx.y >= offset && threadIdx.y < 2*offset) { - const int write_idx = (threadIdx.y - offset) * blockDim.x + threadIdx.x; - buf[write_idx] = sum_gamma; - buf[write_idx+nbsize3] = sum_beta; - } - __syncthreads(); - // bottom half sums - if (threadIdx.y < offset) { - const int read_idx = threadIdx.y * blockDim.x + threadIdx.x; - sum_gamma += buf[read_idx]; - sum_beta += buf[read_idx+nbsize3]; - } - __syncthreads(); - } - // write out fully summed gradients - if (threadIdx.y == 0) { - grad_gamma[i2] = sum_gamma; - grad_beta[i2] = sum_beta; - } - } -} - -template __global__ -void cuComputeGradInput( - const V* __restrict__ dout, - const T* __restrict__ input, - const int n1, - const int n2, - const U* __restrict__ mean, - const U* __restrict__ invvar, - U epsilon, - const V* gamma, - T* grad_input) -{ -#ifndef __HIP_PLATFORM_HCC__ - for (auto i1=blockIdx.y; i1 < n1; i1 += gridDim.y) { -#else - for (int i1=blockIdx.y; i1 < n1; i1 += gridDim.y) { -#endif - U sum_loss1 = U(0); - U sum_loss2 = U(0); - const U c_mean = mean[i1]; - const U c_invvar = invvar[i1]; - const T* k_input = input + i1*n2; - const V* k_dout = dout + i1*n2; - const int numx = blockDim.x * blockDim.y; - const int thrx = threadIdx.x + threadIdx.y * blockDim.x; - if (gamma != NULL) { - int l = 4*thrx; - for (; l+3 < n2; l+=4*numx) { - for (int k = 0; k < 4; ++k) { - const U c_h = static_cast(k_input[l+k]); - const U c_loss = static_cast(k_dout[l+k]); - sum_loss1 += c_loss * gamma[l+k]; - sum_loss2 += c_loss * gamma[l+k] * (c_h - c_mean) * c_invvar; - } - } - for (; l < n2; ++l) { - const U c_h = static_cast(k_input[l]); - const U c_loss = static_cast(k_dout[l]); - sum_loss1 += c_loss * gamma[l]; - sum_loss2 += c_loss * gamma[l] * (c_h - c_mean) * c_invvar; - } - } else { - int l = 4*thrx; - for (; l+3 < n2; l+=4*numx) { - for (int k = 0; k < 4; ++k) { - const U c_h = static_cast(k_input[l+k]); - const U c_loss = static_cast(k_dout[l+k]); - sum_loss1 += c_loss; - sum_loss2 += c_loss * (c_h - c_mean) * c_invvar; - } - } - for (; l < n2; ++l) { - const U c_h = static_cast(k_input[l]); - const U c_loss = static_cast(k_dout[l]); - sum_loss1 += c_loss; - sum_loss2 += c_loss * (c_h - c_mean) * c_invvar; - } - } - // intra-warp reductions - for (int mask = blockDim.x/2; mask > 0; mask /= 2) { - sum_loss1 += WARP_SHFL_XOR(sum_loss1, mask); - sum_loss2 += WARP_SHFL_XOR(sum_loss2, mask); - } - // inter-warp reductions - if (blockDim.y > 1) { - SharedMemory shared; - U* buf = shared.getPointer(); - for (int offset = blockDim.y/2; offset > 0; offset /= 2) { - // upper half of warps write to shared - if (threadIdx.y >= offset && threadIdx.y < 2*offset) { - const int wrt_i = (threadIdx.y - offset) * blockDim.x + threadIdx.x; - buf[2*wrt_i] = sum_loss1; - buf[2*wrt_i+1] = sum_loss2; - } - __syncthreads(); - // lower half merges - if (threadIdx.y < offset) { - const int read_i = threadIdx.y * blockDim.x + threadIdx.x; - sum_loss1 += buf[2*read_i]; - sum_loss2 += buf[2*read_i+1]; - } - __syncthreads(); - 
} - if (threadIdx.y == 0) { - buf[2*threadIdx.x] = sum_loss1; - buf[2*threadIdx.x+1] = sum_loss2; - } - __syncthreads(); - if (threadIdx.y !=0) { - sum_loss1 = buf[2*threadIdx.x]; - sum_loss2 = buf[2*threadIdx.x+1]; - } - } - // all threads now have the two sums over l - U fH = (U)n2; - U term1 = (U(1) / fH) * c_invvar; - T* k_grad_input = grad_input + i1*n2; - if (gamma != NULL) { - for (int l = thrx; l < n2; l+=numx) { - const U c_h = static_cast(k_input[l]); - const U c_loss = static_cast(k_dout[l]); - U f_grad_input = fH * c_loss * gamma[l]; - f_grad_input -= sum_loss1; - f_grad_input -= (c_h - c_mean) * c_invvar * sum_loss2; - f_grad_input *= term1; - k_grad_input[l] = static_cast(f_grad_input); - } - } else { - for (int l = thrx; l < n2; l+=numx) { - const U c_h = static_cast(k_input[l]); - const U c_loss = static_cast(k_dout[l]); - U f_grad_input = fH * c_loss; - f_grad_input -= sum_loss1; - f_grad_input -= (c_h - c_mean) * c_invvar * sum_loss2; - f_grad_input *= term1; - k_grad_input[l] = static_cast(f_grad_input); - } - } - // prevent race where buf is written again before reads are done - __syncthreads(); - } -} - - - - -template -void HostApplyLayerNorm( - V* output, - U* mean, - U* invvar, - const T* input, - int n1, - int n2, - double epsilon, - const V* gamma, - const V* beta - ) -{ - auto stream = at::cuda::getCurrentCUDAStream().stream(); - const int warp_size = at::cuda::warp_size(); - dim3 threads(warp_size,4,1); -#ifndef __HIP_PLATFORM_HCC__ - threads.y = 1; -#endif - const uint64_t maxGridY = - at::cuda::getCurrentDeviceProperties()->maxGridSize[1]; - const dim3 blocks(1, std::min((uint64_t)n1, maxGridY), 1); - int nshared = - threads.y > 1 ? - threads.y*sizeof(U)+(threads.y/2)*sizeof(U) : - 0; - cuApplyLayerNorm<<>>( - output, - mean, - invvar, - input, - n1,n2, - U(epsilon), - gamma, - beta, - warp_size); -} - - -void cuda_layer_norm( - at::Tensor* output, - at::Tensor* mean, - at::Tensor* invvar, - at::Tensor* input, - int n1, - int n2, - #ifdef VERSION_GE_1_1 - at::IntArrayRef normalized_shape, - #else - at::IntList normalized_shape, - #endif - at::Tensor* gamma, - at::Tensor* beta, - double epsilon) -{ - using namespace at; - DISPATCH_FLOAT_HALF_AND_BFLOAT_INOUT_TYPES( - input->scalar_type(), output->scalar_type(), "cuda_layer_norm_kernel", - HostApplyLayerNorm( - output->DATA_PTR(), - mean->DATA_PTR(), - invvar->DATA_PTR(), - input->DATA_PTR(), - n1,n2, - epsilon, - gamma != NULL ? gamma->DATA_PTR() : NULL, - beta != NULL ? beta->DATA_PTR() : NULL); - ) -} - - -template -void HostLayerNormGradient( - const V* dout, - const U* mean, - const U* invvar, - at::Tensor* input, - int n1, - int n2, - const V* gamma, - const V* beta, - double epsilon, - T* grad_input, - V* grad_gamma, - V* grad_beta - ) -{ - auto stream = at::cuda::getCurrentCUDAStream().stream(); - const int warp_size = at::cuda::warp_size(); - - if (gamma != NULL && beta != NULL) { - // compute grad_gamma(j) and grad_beta(j) -#ifndef __HIP_PLATFORM_HCC__ - const int part_size = warp_size; -#else - const int part_size = 16; -#endif - const dim3 threads2(warp_size,4,1); - const dim3 blocks2((n2+threads2.x-1)/threads2.x,part_size,1); - const int nshared2_a = 2 * sizeof(U) * threads2.y * threads2.y * - (threads2.x + 1); - const int nshared2_b = threads2.x * threads2.y * sizeof(U); - const int nshared2 = nshared2_a > nshared2_b ? 
nshared2_a : nshared2_b; - at::Tensor part_grad_gamma = at::empty( - {part_size,n2}, input->options().dtype(at::ScalarType::Float)); - at::Tensor part_grad_beta = at::empty_like(part_grad_gamma); - cuComputePartGradGammaBeta<<>>( - dout, - input->DATA_PTR(), - n1,n2, - mean, - invvar, - U(epsilon), - part_grad_gamma.DATA_PTR(), - part_grad_beta.DATA_PTR()); - - const dim3 threads3(warp_size,8,1); - const dim3 blocks3((n2+threads2.x-1)/threads2.x,1,1); - const int nshared3 = threads3.x * threads3.y * sizeof(U); - cuComputeGradGammaBeta<<>>( - part_grad_gamma.DATA_PTR(), - part_grad_beta.DATA_PTR(), - part_size, - n1,n2, - grad_gamma, - grad_beta); - } - - // compute grad_input - const uint64_t maxGridY = - at::cuda::getCurrentDeviceProperties()->maxGridSize[1]; - const dim3 blocks1(1, std::min((uint64_t)n1, maxGridY), 1); - dim3 threads1(warp_size,4,1); -#ifndef __HIP_PLATFORM_HCC__ - threads1.y = 2; -#endif - int nshared = - threads1.y > 1 ? - threads1.y*threads1.x*sizeof(U) : - 0; - cuComputeGradInput<<>>( - dout, - input->DATA_PTR(), - n1,n2, - mean, - invvar, - U(epsilon), - gamma, - grad_input); -} - - -void cuda_layer_norm_gradient( - at::Tensor* dout, - at::Tensor* mean, - at::Tensor* invvar, - at::Tensor* input, - int n1, - int n2, - #ifdef VERSION_GE_1_1 - at::IntArrayRef normalized_shape, - #else - at::IntList normalized_shape, - #endif - at::Tensor* gamma, - at::Tensor* beta, - double epsilon, - at::Tensor* grad_input, - at::Tensor* grad_gamma, - at::Tensor* grad_beta) -{ - using namespace at; - DISPATCH_FLOAT_HALF_AND_BFLOAT_INOUT_TYPES( - input->scalar_type(), gamma->scalar_type(), - "cuda_layer_norm_gradient_kernel", - HostLayerNormGradient( - dout->DATA_PTR(), - mean->DATA_PTR(), - invvar->DATA_PTR(), - input, - n1,n2, - // TMJ pass NULL argument for gamma, beta, grad_gamma and grad_beta - // if gamma Tensor is NULL on input. - gamma != NULL ? gamma->DATA_PTR() : NULL, - gamma != NULL ? beta->DATA_PTR() : NULL, - epsilon, - grad_input->DATA_PTR(), - gamma != NULL ? grad_gamma->DATA_PTR() : NULL, - gamma != NULL ? 
grad_beta->DATA_PTR() : NULL); - ) -} From b291323585739d45a0ae07db73a73a1a7c43a2cd Mon Sep 17 00:00:00 2001 From: Loubna Ben Allal <44069155+loubnabnl@users.noreply.github.com> Date: Mon, 12 Jun 2023 20:28:35 +0200 Subject: [PATCH 122/144] Create finetune_starcoderplus.slurm --- examples/finetune_starcoderplus.slurm | 141 ++++++++++++++++++++++++++ 1 file changed, 141 insertions(+) create mode 100644 examples/finetune_starcoderplus.slurm diff --git a/examples/finetune_starcoderplus.slurm b/examples/finetune_starcoderplus.slurm new file mode 100644 index 0000000000..e99c04dde9 --- /dev/null +++ b/examples/finetune_starcoderplus.slurm @@ -0,0 +1,141 @@ +#!/bin/bash +#SBATCH --job-name=starcoderplus +#SBATCH --nodes=64 +#SBATCH --ntasks-per-node=1 +#SBATCH --exclusive +#SBATCH --gres=gpu:8 +#SBATCH --partition=production-cluster +#SBATCH --output=/fsx/leandro/logs/starcoderplus/bcs-%x-%j.out + +set -x -e +source /admin/home/leandro/.bashrc + +conda activate megatron + +echo "START TIME: $(date)" + +# File Path setup +SCRIPT_REPO=/fsx/leandro/git/Megatron-LM-BC +pushd $SCRIPT_REPO + +LOG_PATH=$SCRIPT_REPO/main_log.txt + +# Training setup +GPUS_PER_NODE=8 +MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1) +MASTER_PORT=6000 +NNODES=$SLURM_NNODES +NODE_RANK=$SLURM_PROCID +WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) + +# File path setup +STARCODER_PATH=/fsx/boomcode/starcoder/ +CHECKPOINT_PATH=/fsx/boomcode/starcoderplus/$SLURM_JOB_ID +TOKENIZER_FILE=/fsx/boomcode/tokenizer-starcoder/tokenizer.json +WEIGHTS_TRAIN=/fsx/boomcode/datamix/train_data_paths.txt.tmp +WEIGHTS_VALID=/fsx/boomcode/datamix/valid_data_paths.txt.tmp + +mkdir -p $CHECKPOINT_PATH/tensorboard + +GPT_ARGS="\ + --tensor-model-parallel-size 4 \ + --pipeline-model-parallel-size 4 \ + --sequence-parallel \ + --num-layers 40 \ + --hidden-size 6144 \ + --num-attention-heads 48 \ + --attention-head-type multiquery \ + --init-method-std 0.01275 \ + --seq-length 8192 \ + --max-position-embeddings 8192 \ + --attention-dropout 0.1 \ + --hidden-dropout 0.1 \ + --micro-batch-size 1 \ + --global-batch-size 512 \ + --lr 0.0001 \ + --min-lr 0.00001 \ + --train-iters 400000 \ + --lr-decay-iters 150000 \ + --lr-decay-style cosine \ + --lr-warmup-iters 1000 \ + --weight-decay .1 \ + --adam-beta2 .95 \ + --clip-grad 1.0 \ + --bf16 \ + --use-flash-attn \ + --fim-rate 0.5 \ + --log-interval 10 \ + --save-interval 2500 \ + --eval-interval 2500 \ + --eval-iters 2 \ + --valid-num-workers 0 \ + --override-opt_param-scheduler \ + --no-load-optim \ + --no-load-rng \ + --finetune \ +" + +TENSORBOARD_ARGS="--tensorboard-dir ${CHECKPOINT_PATH}/tensorboard" + +CMD=" \ + $SCRIPT_REPO/pretrain_gpt.py \ + $GPT_ARGS \ + --tokenizer-type TokenizerFromFile \ + --tokenizer-file $TOKENIZER_FILE \ + --save $CHECKPOINT_PATH \ + --load $STARCODER_PATH \ + --train-weighted-split-paths-path $WEIGHTS_TRAIN \ + --valid-weighted-split-paths-path $WEIGHTS_VALID \ + --structured-logs \ + --structured-logs-dir $CHECKPOINT_PATH/logs \ + $TENSORBOARD_ARGS \ + --wandb-entity-name lvwerra \ + --wandb-project-name starcoder-plus \ + " + +export LAUNCHER="python -u -m torch.distributed.run \ + --nproc_per_node $GPUS_PER_NODE \ + --nnodes $NNODES \ + --rdzv_endpoint $MASTER_ADDR:$MASTER_PORT \ + --rdzv_backend c10d \ + --max_restarts 0 \ + --tee 3 \ + " + +echo $CMD + +# hide duplicated errors using this hack - will be properly fixed in pt-1.12 +# export TORCHELASTIC_ERROR_FILE=/tmp/torch-elastic-error.json + +# force crashing on nccl issues like hanging broadcast 
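# (note: NCCL_ASYNC_ERROR_HANDLING below is the spelling used by the PyTorch versions this
#  script targets; newer PyTorch releases expose the same switch as TORCH_NCCL_ASYNC_ERROR_HANDLING)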
+export NCCL_ASYNC_ERROR_HANDLING=1 +# export NCCL_DEBUG=INFO +# export NCCL_DEBUG_SUBSYS=COLL +# export NCCL_SOCKET_NTHREADS=1 +# export NCCL_NSOCKS_PERTHREAD=1 +# export CUDA_LAUNCH_BLOCKING=1 + +# AWS specific +export NCCL_PROTO=simple +export RDMAV_FORK_SAFE=1 +export FI_EFA_FORK_SAFE=1 +export FI_EFA_USE_DEVICE_RDMA=1 +export FI_PROVIDER=efa +export FI_LOG_LEVEL=1 +export NCCL_IB_DISABLE=1 +export NCCL_SOCKET_IFNAME=ens + +export CUDA_HOME=/usr/local/cuda-11.6 + +# srun error handling: +# --wait=60: wait 60 sec after the first task terminates before terminating all remaining tasks +# --kill-on-bad-exit=1: terminate a step if any task exits with a non-zero exit code +SRUN_ARGS=" \ + --wait=60 \ + --kill-on-bad-exit=1 \ + " + +# py-spy top -s -i -n -- $LAUNCHER --node_rank $SLURM_PROCID --role $SLURMD_NODENAME: $CMD +clear; srun $SRUN_ARGS --jobid $SLURM_JOB_ID bash -c "$LAUNCHER --node_rank \$SLURM_PROCID --role \$SLURMD_NODENAME: $CMD" 2>&1 | tee $LOG_PATH + +echo "END TIME: $(date)" From 203b0712b617a2f9157de10c6ec295b73801f93e Mon Sep 17 00:00:00 2001 From: Raymond Li Date: Mon, 12 Jun 2023 21:07:07 +0000 Subject: [PATCH 123/144] try with LayerNorm import from megatron.model --- megatron/model/__init__.py | 1 + megatron/model/transformer.py | 4 +--- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/megatron/model/__init__.py b/megatron/model/__init__.py index 0502c4ade3..f5025bf25d 100644 --- a/megatron/model/__init__.py +++ b/megatron/model/__init__.py @@ -1,5 +1,6 @@ # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +from .fused_layer_norm import MixedFusedLayerNorm as LayerNorm from .distributed import DistributedDataParallel from .bert_model import BertModel diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index 9be81da5dc..98d0ac6750 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -15,9 +15,7 @@ from .module import MegatronModule from megatron.core import mpu, tensor_parallel from megatron.core.enums import ModelType -# TODO: which import? -# from megatron.model import LayerNorm -from .fused_layer_norm import MixedFusedLayerNorm as LayerNorm +from megatron.model import LayerNorm from megatron.model.enums import AttnMaskType, LayerType, AttnType, PositionEmbeddingType from megatron.model.fused_softmax import FusedScaleMaskSoftmax from megatron.model.fused_bias_gelu import bias_gelu_impl From 48c80465a3a148bd6f5190383f991384ceec5991 Mon Sep 17 00:00:00 2001 From: Raymond Li Date: Mon, 12 Jun 2023 22:01:25 +0000 Subject: [PATCH 124/144] fix the merge --- megatron/model/transformer.py | 2 ++ megatron/training.py | 23 +++++++++++++---------- 2 files changed, 15 insertions(+), 10 deletions(-) diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index 98d0ac6750..3c1387a8c4 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -577,6 +577,8 @@ def __init__(self, init_method, self.attention_head_type = args.attention_head_type self.sequence_parallel = args.sequence_parallel + self.use_flash_attn = args.use_flash_attn + projection_size = args.kv_channels * args.num_attention_heads # Per attention head and per partition values. 
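The hunk above only stores args.use_flash_attn on the attention module; for orientation, a minimal
sketch of how that flag is typically consumed at forward time in this fork. The exact attribute
names (core_attention, core_attention_flash) and the einops rearrange calls are assumptions based
on the surrounding patches, so treat this as an illustration rather than part of the diff:

    # inside ParallelAttention.forward, once q/k/v are shaped [sq, b, np, hn]
    if not self.use_flash_attn:
        context_layer = self.core_attention(
            query_layer, key_layer, value_layer, attention_mask, alibi)
    else:
        # FlashAttention expects contiguous [b, sq, np, hn] tensors
        q, k, v = [rearrange(x, 's b ... -> b s ...').contiguous()
                   for x in (query_layer, key_layer, value_layer)]
        context_layer = self.core_attention_flash(q, k, v)
        context_layer = rearrange(context_layer, 'b s h d -> s b (h d)').contiguous()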
diff --git a/megatron/training.py b/megatron/training.py index 6c117a451d..dc169a9cff 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -564,6 +564,7 @@ def training_log(loss_dict, total_loss_dict, learning_rate, iteration, total_iterations = total_loss_dict[advanced_iters_key] + \ total_loss_dict[skipped_iters_key] + mem_stats = None # Tensorboard values. # Timer requires all the ranks to call. @@ -665,6 +666,8 @@ def training_log(loss_dict, total_loss_dict, learning_rate, iteration, log_string += ' number of nan iterations: {:3d} |'.format( total_loss_dict[nan_iters_key]) log_string += ' TFLOPs: {:.2f} |'.format(tflops) + if args.log_memory_to_tensorboard and mem_stats is not None: + log_string += ' mem-reserved (GB): {:.2f} |'.format(mem_stats["reserved_bytes.all.current"]*1e-9) total_loss_dict[advanced_iters_key] = 0 total_loss_dict[skipped_iters_key] = 0 total_loss_dict[nan_iters_key] = 0 @@ -1023,14 +1026,14 @@ def build_train_valid_test_data_loaders( mpu.get_tensor_model_parallel_src_rank(), group=mpu.get_tensor_model_parallel_group()) args.do_train = flags[0].item() - num_valid_ds = flags[1].item() - num_test_ds = flags[2].item() - assert num_test_ds >= 0 - assert num_valid_ds >= 0 - args.do_valid = num_valid_ds > 0 - args.do_test = num_test_ds > 0 + args.num_valid_ds = flags[1].item() + args.num_test_ds = flags[2].item() + assert args.num_test_ds >= 0 + assert args.num_valid_ds >= 0 + args.do_valid = args.num_valid_ds > 0 + args.do_test = args.num_test_ds > 0 - return train_dataloader, valid_dataloader, test_dataloader + return train_dataloader, valid_dataloaders, test_dataloaders def build_train_valid_test_data_iterators( @@ -1039,7 +1042,7 @@ def build_train_valid_test_data_iterators( args = get_args() # Build loaders. - train_dataloader, valid_dataloader, test_dataloader = \ + train_dataloader, valid_dataloaders, test_dataloaders = \ build_train_valid_test_data_loaders( build_train_valid_test_datasets_provider) @@ -1058,13 +1061,13 @@ def build_train_valid_test_data_iterators( else iter(cyclic_iter(valid_dataloaders)) for vdl in valid_dataloaders] else: - valid_data_iterators = [None] * num_valid_ds + valid_data_iterators = [None] * args.num_valid_ds if test_dataloaders is not None: test_data_iterators = [iter(tdl) if dl_type == 'single' \ else iter(cyclic_iter(test_dataloaders)) for tdl in test_dataloaders] else: - test_data_iterators = [None] * num_test_ds + test_data_iterators = [None] * args.num_test_ds return train_data_iterator, valid_data_iterators, test_data_iterators From ac497ce615812aaea5ab40eb09e5d85417ee148f Mon Sep 17 00:00:00 2001 From: Raymond Li Date: Tue, 13 Jun 2023 21:46:04 +0000 Subject: [PATCH 125/144] move setting of TORCH_CUDA_ARCH_LIST --- megatron/fused_kernels/__init__.py | 9 +++++++++ megatron/fused_kernels/cuda/__init__.py | 3 --- megatron/fused_kernels/rocm/__init__.py | 8 -------- 3 files changed, 9 insertions(+), 11 deletions(-) diff --git a/megatron/fused_kernels/__init__.py b/megatron/fused_kernels/__init__.py index 450c43072b..74eb94fb69 100644 --- a/megatron/fused_kernels/__init__.py +++ b/megatron/fused_kernels/__init__.py @@ -1,6 +1,15 @@ +import os import torch +# Setting this param to a list has a problem of generating different +# compilation commands (with diferent order of architectures) and +# leading to recompilation of fused kernels. 
Set it to empty string +# to avoid recompilation and assign arch flags explicity in +# extra_cuda_cflags below +os.environ["TORCH_CUDA_ARCH_LIST"] = "" + + def load(args): if torch.version.hip is None: print("running on CUDA devices") diff --git a/megatron/fused_kernels/cuda/__init__.py b/megatron/fused_kernels/cuda/__init__.py index d88834f670..9bddf7233b 100644 --- a/megatron/fused_kernels/cuda/__init__.py +++ b/megatron/fused_kernels/cuda/__init__.py @@ -6,9 +6,6 @@ from torch.utils import cpp_extension from megatron.fused_kernels.utils import _create_build_dir -# Do not override TORCH_CUDA_ARCH_LIST to allow for pre-compilation in Dockerfile -# os.environ["TORCH_CUDA_ARCH_LIST"] = "" - def load(args): diff --git a/megatron/fused_kernels/rocm/__init__.py b/megatron/fused_kernels/rocm/__init__.py index ff5ad0ab6b..f71a47e961 100644 --- a/megatron/fused_kernels/rocm/__init__.py +++ b/megatron/fused_kernels/rocm/__init__.py @@ -19,14 +19,6 @@ from megatron.fused_kernels.utils import _create_build_dir -# Setting this param to a list has a problem of generating different -# compilation commands (with diferent order of architectures) and -# leading to recompilation of fused kernels. Set it to empty string -# to avoid recompilation and assign arch flags explicity in -# extra_cuda_cflags below -os.environ["TORCH_CUDA_ARCH_LIST"] = "" - - def load(args): # Build path From 04031a8b2b483f9c923ca166a49d4a81618a8550 Mon Sep 17 00:00:00 2001 From: Raymond Li Date: Wed, 14 Jun 2023 20:13:33 +0000 Subject: [PATCH 126/144] fix call to blendable dataset --- megatron/data/gpt_dataset.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/megatron/data/gpt_dataset.py b/megatron/data/gpt_dataset.py index 607e5d0deb..0c52b14147 100644 --- a/megatron/data/gpt_dataset.py +++ b/megatron/data/gpt_dataset.py @@ -145,7 +145,8 @@ def build_dataset_group(dataset_group_name, paths, weights, splits, data_impl, assert ds is not None, \ f"Got an empty split when trying to create dataset: {prefixes[i], splits[i]}" datasets.append(ds) - all_datasets = BlendableDataset(datasets, weights) + total_size = sum(len(ds) for ds in datasets) + all_datasets = BlendableDataset(datasets, weights, total_size) return all_datasets From 17217f8aa6a3f4a9f231a5ee88a614a4c55a6457 Mon Sep 17 00:00:00 2001 From: Raymond Li Date: Wed, 14 Jun 2023 17:29:51 -0400 Subject: [PATCH 127/144] fix blended dataset size in dataset groups --- megatron/data/gpt_dataset.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/megatron/data/gpt_dataset.py b/megatron/data/gpt_dataset.py index 0c52b14147..183f3cd460 100644 --- a/megatron/data/gpt_dataset.py +++ b/megatron/data/gpt_dataset.py @@ -106,6 +106,7 @@ def build_dataset_group(dataset_group_name, paths, weights, splits, data_impl, ''' assert train_valid_test in ["train","valid","test"] + index = ["train","valid","test"].index(train_valid_test) # Single dataset. 
if len(paths) == 1: @@ -145,8 +146,7 @@ def build_dataset_group(dataset_group_name, paths, weights, splits, data_impl, assert ds is not None, \ f"Got an empty split when trying to create dataset: {prefixes[i], splits[i]}" datasets.append(ds) - total_size = sum(len(ds) for ds in datasets) - all_datasets = BlendableDataset(datasets, weights, total_size) + all_datasets = BlendableDataset(datasets, weights, train_valid_test_num_samples[index]) return all_datasets From 0229a695488fc9440c0c78cac43240c101ffc51d Mon Sep 17 00:00:00 2001 From: Raymond Li Date: Mon, 19 Jun 2023 09:45:16 -0400 Subject: [PATCH 128/144] find_checkpoint_rank_0 returns a single value --- megatron/checkpointing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/checkpointing.py b/megatron/checkpointing.py index ca5f8878f6..787668bf38 100644 --- a/megatron/checkpointing.py +++ b/megatron/checkpointing.py @@ -140,7 +140,7 @@ def find_checkpoint_rank_0(checkpoints_path, iteration, release=False): if os.path.isfile(filename): return filename - return None, None + return None def get_checkpoint_tracker_filename(checkpoints_path): From 37353b1767b95f6fcd9d392ef3023d3c6a93b1d1 Mon Sep 17 00:00:00 2001 From: Raymond Li Date: Mon, 19 Jun 2023 10:39:53 -0400 Subject: [PATCH 129/144] fix checkpoint merge tools --- megatron/checkpointing.py | 2 +- megatron/global_vars.py | 5 +++-- tools/checkpoint_loader_megatron.py | 2 +- tools/checkpoint_saver_megatron.py | 2 +- 4 files changed, 6 insertions(+), 5 deletions(-) diff --git a/megatron/checkpointing.py b/megatron/checkpointing.py index 787668bf38..74854a4985 100644 --- a/megatron/checkpointing.py +++ b/megatron/checkpointing.py @@ -233,7 +233,7 @@ def save_checkpoint(iteration, model, optimizer, opt_param_scheduler): checkpoint_name = get_checkpoint_name(args.save, iteration) # Save distributed optimizer's custom parameter state. 
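# (the change below tightens the guard so that the distributed optimizer's state file is only
#  written when optimizer saving has not been disabled via --no-save-optim)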
- if args.use_distributed_optimizer: + if args.use_distributed_optimizer and not args.no_save_optim: optim_checkpoint_name = \ get_distributed_optimizer_checkpoint_name(checkpoint_name) ensure_directory_exists(optim_checkpoint_name) diff --git a/megatron/global_vars.py b/megatron/global_vars.py index e3831167fd..4e0118e10e 100644 --- a/megatron/global_vars.py +++ b/megatron/global_vars.py @@ -80,7 +80,7 @@ def _set_signal_handler(): -def set_global_variables(args): +def set_global_variables(args, build_tokenizer=True): """Set args, tokenizer, tensorboard-writer, adlr-autoresume, and timers.""" assert args is not None @@ -89,7 +89,8 @@ def set_global_variables(args): set_args(args) _build_num_microbatches_calculator(args) - _ = _build_tokenizer(args) + if build_tokenizer: + _ = _build_tokenizer(args) _set_tensorboard_writer(args) _set_adlr_autoresume(args) _set_timers(args) diff --git a/tools/checkpoint_loader_megatron.py b/tools/checkpoint_loader_megatron.py index 2b39781642..18e3ddfebe 100644 --- a/tools/checkpoint_loader_megatron.py +++ b/tools/checkpoint_loader_megatron.py @@ -152,7 +152,7 @@ def get_models(count, dtype): models[vp_rank].append(model_[vp_rank]) return models - set_global_variables(margs) + set_global_variables(margs, build_tokenizer=False) mpu.set_tensor_model_parallel_world_size(margs.tensor_model_parallel_size) mpu.set_pipeline_model_parallel_world_size(margs.pipeline_model_parallel_size) mpu.set_virtual_pipeline_model_parallel_world_size(margs.virtual_pipeline_model_parallel_size) diff --git a/tools/checkpoint_saver_megatron.py b/tools/checkpoint_saver_megatron.py index aeab753326..47f1b6c666 100644 --- a/tools/checkpoint_saver_megatron.py +++ b/tools/checkpoint_saver_megatron.py @@ -164,7 +164,7 @@ def check_message(msg): validate_args(margs) - set_global_variables(margs) + set_global_variables(margs, build_tokenizer=False) # margs = megatron args margs = get_args() From 3dbd929f28e23d69ee6cacd97d34d1d5ee6324fa Mon Sep 17 00:00:00 2001 From: Raymond Li Date: Mon, 19 Jun 2023 14:33:18 -0400 Subject: [PATCH 130/144] remove --finetune-from argument to make checkpoint loading logic simpler --- megatron/arguments.py | 3 -- megatron/checkpointing.py | 60 ++++----------------------------------- 2 files changed, 6 insertions(+), 57 deletions(-) diff --git a/megatron/arguments.py b/megatron/arguments.py index 45214d1701..644fbb7a51 100644 --- a/megatron/arguments.py +++ b/megatron/arguments.py @@ -959,9 +959,6 @@ def _add_checkpointing_args(parser): group.add_argument('--use-checkpoint-args', action='store_true', help='Override any command line arguments with arguments ' 'from the checkpoint') - group.add_argument('--finetune-from', type=str, default=None, - help='Directory containing a model checkpoint for finetuning.' - 'Will be loaded if the `--load` directory contains no checkpoint') group.add_argument('--exit-on-missing-checkpoint', action='store_true', help="If '--load' is set, but checkpoint is not found " "(e.g., path typo), then exit instead of random " diff --git a/megatron/checkpointing.py b/megatron/checkpointing.py index 74854a4985..5a30619cd8 100644 --- a/megatron/checkpointing.py +++ b/megatron/checkpointing.py @@ -505,57 +505,10 @@ def load_checkpoint(model, optimizer, opt_param_scheduler, load_arg='load', stri """ args = get_args() load_dir = getattr(args, load_arg) - - # TODO: remove this redundant code. 
the tracker is already handled in _load_base_checkpoint - # TODO: retire the finetune_from arguments - # Determine from which directory we'll try to load - # ====== - if iteration is None: - # Read the tracker file and set the iteration. - tracker_filename = get_checkpoint_tracker_filename(load_dir) - - # If we can directly load from load_dir, we resume an experiment - if os.path.isfile(tracker_filename) and load_arg != 'finetune_from': - args.finetune=False - print_rank_0(f"Resuming from {load_dir}") - # Finetuning from a pretrained model - elif os.path.isfile(tracker_filename) and load_arg == 'finetune_from': - assert arg.finetune - print_rank_0(f"Finetuning from {load_dir}") - else: - assert not os.path.isfile(tracker_filename) - # No tracker file and we are in finetuning, try to load from the `finetune_from` dir - if args.finetune: - print_rank_0('WARNING: could not find the metadata file {} '.format( - tracker_filename)) - print_rank_0(' will try to load from `--finetune-from` instead') - load_dir = getattr(args, 'finetune_from') - tracker_filename = get_checkpoint_tracker_filename(load_dir) - # If no tracker file, return iteration zero. - if not os.path.isfile(tracker_filename): - print_rank_0('WARNING: could not find the metadata file {} '.format( - tracker_filename)) - print_rank_0(' will not load any checkpoints and will start from ' - 'random') - return 0 - - assert os.path.isfile(tracker_filename) - - # read the tracker file and either set the iteration or - # mark it as a release checkpoint. - iteration, release = read_metadata(tracker_filename) - else: - # Iteration given as argument: do nothing - release = False - # ======= model = unwrap_model(model) - state_dict, release = \ - _load_base_checkpoint(load_dir, - rank0=False, - iteration=iteration, - release=release) + state_dict, release = _load_base_checkpoint(load_dir, rank0=False, iteration=iteration) # Checkpoint not loaded. if state_dict is None: @@ -593,12 +546,11 @@ def load_checkpoint(model, optimizer, opt_param_scheduler, load_arg='load', stri if 'args' in state_dict and not args.finetune: checkpoint_args = state_dict['args'] check_checkpoint_args(checkpoint_args) - if not args.finetune: - args.consumed_train_samples = getattr(checkpoint_args, - 'consumed_train_samples', 0) - update_num_microbatches(consumed_samples=args.consumed_train_samples) - args.consumed_valid_samples = getattr(checkpoint_args, - 'consumed_valid_samples', 0) + args.consumed_train_samples = getattr(checkpoint_args, + 'consumed_train_samples', 0) + update_num_microbatches(consumed_samples=args.consumed_train_samples) + args.consumed_valid_samples = getattr(checkpoint_args, + 'consumed_valid_samples', 0) else: print_rank_0('could not find arguments in the checkpoint ...') From 8196de1648dc73582ed9d43dab28cc002e89428b Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Wed, 21 Jun 2023 12:19:06 -0400 Subject: [PATCH 131/144] Skip unnecessary compilation --- megatron/initialize.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/megatron/initialize.py b/megatron/initialize.py index 40b2370e8e..e387c4ee78 100644 --- a/megatron/initialize.py +++ b/megatron/initialize.py @@ -158,6 +158,15 @@ def _compile_dependencies(): print('>>> done with dataset index builder. Compilation time: {:.3f} ' 'seconds'.format(time.time() - start_time), flush=True) + try: + # Skip the rest if the kernels are unnecessary or already available (ex. 
from apex) + if args.use_flash_attn or args.masked_softmax_fusion: + import scaled_upper_triang_masked_softmax_cuda + import scaled_masked_softmax_cuda + return + except ImportError: + pass + # ================== # Load fused kernels # ================== From 5b06c12d267f9d30dad108f3d02fbc489d70b91e Mon Sep 17 00:00:00 2001 From: Loubna Ben Allal <44069155+loubnabnl@users.noreply.github.com> Date: Wed, 28 Jun 2023 17:51:52 +0200 Subject: [PATCH 132/144] Create pretrain_bigcode_7b.slurm --- examples/pretrain_bigcode_7b.slurm | 143 +++++++++++++++++++++++++++++ 1 file changed, 143 insertions(+) create mode 100644 examples/pretrain_bigcode_7b.slurm diff --git a/examples/pretrain_bigcode_7b.slurm b/examples/pretrain_bigcode_7b.slurm new file mode 100644 index 0000000000..536ccc0e80 --- /dev/null +++ b/examples/pretrain_bigcode_7b.slurm @@ -0,0 +1,143 @@ +#!/bin/bash +#SBATCH --job-name=7b-starcoder +#SBATCH --nodes=64 +#SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node! +#SBATCH --cpus-per-task=38 +#SBATCH --gres=gpu:8 +#SBATCH --partition=production-cluster +#SBATCH --output=/fsx/bigcode/bigcode-training/logs/7b/%x-%j.out + +set -x -e +source /admin/home/loubna/.bashrc + +conda activate megatron + +echo "START TIME: $(date)" + +# File Path setup +SCRIPT_REPO=/fsx/loubna/code/Megatron-LM +pushd $SCRIPT_REPO + +LOG_PATH=$SCRIPT_REPO/main_log.txt + +# Training setup +GPUS_PER_NODE=8 +# so processes know who to talk to +MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1) +MASTER_PORT=6000 +NNODES=$SLURM_NNODES +NODE_RANK=$SLURM_PROCID +WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) + +# File path setup +CHECKPOINT_PATH=/fsx/bigcode/experiments/pretraining/7b-starcoder +# Starcoder tokenizer and data paths in /fsx/bigcode +TOKENIZER_FILE=/fsx/bigcode/bigcode-training/tokenizer-starcoder/tokenizer.json +WEIGHTS_TRAIN=/fsx/bigcode/bigcode-training/code/bigcode-data-mix/data/train_data_paths.txt.tmp +WEIGHTS_VALID=/fsx/bigcode/bigcode-training/code/bigcode-data-mix/data/valid_data_paths.txt.tmp + +mkdir -p $CHECKPOINT_PATH/tensorboard + +GPT_ARGS="\ + --tensor-model-parallel-size 2 \ + --pipeline-model-parallel-size 1 \ + --num-layers 42 \ + --hidden-size 4096 \ + --num-attention-heads 32 \ + --attention-head-type multiquery \ + --init-method-std 0.015625 \ + --seq-length 8192 \ + --max-position-embeddings 8192 \ + --attention-dropout 0.1 \ + --hidden-dropout 0.1 \ + --micro-batch-size 1 \ + --global-batch-size 512 \ + --lr 0.0003 \ + --min-lr 0.00003 \ + --train-iters 250000 \ + --lr-decay-iters 250000 \ + --lr-decay-style cosine \ + --lr-warmup-iters 2000 \ + --weight-decay .1 \ + --adam-beta2 .95 \ + --clip-grad 1.0 \ + --bf16 \ + --use-flash-attn \ + --fim-rate 0.5 \ + --log-interval 10 \ + --save-interval 2500 \ + --eval-interval 2500 \ + --eval-iters 2 \ + --use-distributed-optimizer \ + --valid-num-workers 0 \ +" + +TENSORBOARD_ARGS="--tensorboard-dir ${CHECKPOINT_PATH}/tensorboard" + +CMD=" \ + /fsx/loubna/code/Megatron-LM/pretrain_gpt.py \ + $GPT_ARGS \ + --tokenizer-type TokenizerFromFile \ + --tokenizer-file $TOKENIZER_FILE \ + --save $CHECKPOINT_PATH \ + --load $CHECKPOINT_PATH \ + --train-weighted-split-paths-path $WEIGHTS_TRAIN \ + --valid-weighted-split-paths-path $WEIGHTS_VALID \ + --structured-logs \ + --structured-logs-dir $CHECKPOINT_PATH/logs \ + $TENSORBOARD_ARGS \ + --wandb-entity-name loubnabnl \ + --wandb-project-name bigcode-pretraining \ + " + +export LAUNCHER="python -u -m torch.distributed.run \ + --nproc_per_node $GPUS_PER_NODE \ + --nnodes 
$NNODES \ + --rdzv_endpoint $MASTER_ADDR:$MASTER_PORT \ + --rdzv_backend c10d \ + --max_restarts 0 \ + --tee 3 \ + " + +echo $CMD + +# hide duplicated errors using this hack - will be properly fixed in pt-1.12 +# export TORCHELASTIC_ERROR_FILE=/tmp/torch-elastic-error.json + +# force crashing on nccl issues like hanging broadcast +export NCCL_ASYNC_ERROR_HANDLING=1 +# export NCCL_DEBUG=INFO +# export NCCL_DEBUG_SUBSYS=COLL +# export NCCL_SOCKET_NTHREADS=1 +# export NCCL_NSOCKS_PERTHREAD=1 +# export CUDA_LAUNCH_BLOCKING=1 + +# AWS specific +export NCCL_PROTO=simple +export RDMAV_FORK_SAFE=1 +export FI_EFA_FORK_SAFE=1 +export FI_EFA_USE_DEVICE_RDMA=1 +export FI_PROVIDER=efa +export FI_LOG_LEVEL=1 +export NCCL_IB_DISABLE=1 +export NCCL_SOCKET_IFNAME=ens + +export CUDA_HOME=/usr/local/cuda-11.6 +# This is needed for torch1.12.1 otherwise it doesn't link correctly, not sur what the issue was. +#export PATH="/usr/local/cuda-11.6/bin:$PATH" +#export LD_LIBRARY_PATH="/usr/local/cuda-11.6/lib64:$LD_LIBRARY_PATH" +#export LD_PRELOAD=$CUDA_HOME/lib/libnccl.so +#export LD_LIBRARY_PATH=$CUDA_HOME/efa/lib:$CUDA_HOME/lib:$CUDA_HOME/lib64:$LD_LIBRARY_PATH + +# srun error handling: +# --wait=60: wait 60 sec after the first task terminates before terminating all remaining tasks +# --kill-on-bad-exit=1: terminate a step if any task exits with a non-zero exit code +SRUN_ARGS=" \ + --wait=60 \ + --kill-on-bad-exit=1 \ + " + +# py-spy top -s -i -n -- $LAUNCHER --node_rank $SLURM_PROCID --role $SLURMD_NODENAME: $CMD +clear; srun $SRUN_ARGS --jobid $SLURM_JOB_ID bash -c "$LAUNCHER --node_rank \$SLURM_PROCID --role \$SLURMD_NODENAME: $CMD" 2>&1 | tee $LOG_PATH + +echo "END TIME: $(date)" From 513d00d11d6ce368d9a1c2bdf4fda62f20085fdc Mon Sep 17 00:00:00 2001 From: Loubna Ben Allal <44069155+loubnabnl@users.noreply.github.com> Date: Mon, 3 Jul 2023 10:01:41 +0100 Subject: [PATCH 133/144] Create pretrain_bigcode_1b.slurm --- examples/pretrain_bigcode_1b.slurm | 142 +++++++++++++++++++++++++++++ 1 file changed, 142 insertions(+) create mode 100644 examples/pretrain_bigcode_1b.slurm diff --git a/examples/pretrain_bigcode_1b.slurm b/examples/pretrain_bigcode_1b.slurm new file mode 100644 index 0000000000..c9b850211f --- /dev/null +++ b/examples/pretrain_bigcode_1b.slurm @@ -0,0 +1,142 @@ +#!/bin/bash +#SBATCH --job-name=1b-starcoder +#SBATCH --nodes=16 +#SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node! 
+#SBATCH --cpus-per-task=38 +#SBATCH --gres=gpu:8 +#SBATCH --partition=production-cluster +#SBATCH --output=/fsx/bigcode/bigcode-training/logs/1b/%x-%j.out + +set -x -e +source /admin/home/loubna/.bashrc + +conda activate megatron + +echo "START TIME: $(date)" + +# File Path setup +SCRIPT_REPO=/fsx/loubna/code/Megatron-LM +pushd $SCRIPT_REPO + +LOG_PATH=$SCRIPT_REPO/main_log.txt + +# Training setup +GPUS_PER_NODE=8 +# so processes know who to talk to +MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1) +MASTER_PORT=6000 +NNODES=$SLURM_NNODES +NODE_RANK=$SLURM_PROCID +WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) + +# File path setup +CHECKPOINT_PATH=/fsx/bigcode/experiments/pretraining/1b # Adjust: Directory to store the checkpoints +# Starcoder tokenizer and data paths in /fsx/bigcode +TOKENIZER_FILE=/fsx/loubna/starcoder-tokenizer/15b/tokenizer.json +WEIGHTS_TRAIN=/fsx/bigcode/bigcode-training/code/bigcode-data-mix/data/train_data_paths.txt.tmp +WEIGHTS_VALID=/fsx/bigcode/bigcode-training/code/bigcode-data-mix/data/valid_data_paths.txt.tmp + +mkdir -p $CHECKPOINT_PATH/tensorboard + +GPT_ARGS="\ + --tensor-model-parallel-size 1 \ + --pipeline-model-parallel-size 1 \ + --num-layers 24 \ + --hidden-size 2048 \ + --num-attention-heads 16 \ + --attention-head-type multiquery \ + --init-method-std 0.02209 \ + --seq-length 8192 \ + --max-position-embeddings 8192 \ + --attention-dropout 0.1 \ + --hidden-dropout 0.1 \ + --micro-batch-size 1 \ + --global-batch-size 128 \ + --lr 0.0004 \ + --min-lr 0.00004 \ + --train-iters 1000000 \ + --lr-decay-iters 1000000 \ + --lr-decay-style cosine \ + --lr-warmup-iters 2000 \ + --weight-decay .1 \ + --adam-beta2 .95 \ + --clip-grad 1.0 \ + --bf16 \ + --use-flash-attn \ + --fim-rate 0.5 \ + --log-interval 10 \ + --save-interval 10000 \ + --eval-interval 10000 \ + --eval-iters 2 \ + --valid-num-workers 0 \ +" + +TENSORBOARD_ARGS="--tensorboard-dir /fsx/bigcode/experiments/pretraining/1b/tensorboard" + +CMD=" \ + /fsx/loubna/code/Megatron-LM/pretrain_gpt.py \ + $GPT_ARGS \ + --tokenizer-type TokenizerFromFile \ + --tokenizer-file $TOKENIZER_FILE \ + --save $CHECKPOINT_PATH \ + --load $CHECKPOINT_PATH \ + --train-weighted-split-paths-path $WEIGHTS_TRAIN \ + --valid-weighted-split-paths-path $WEIGHTS_VALID \ + --structured-logs \ + --structured-logs-dir $CHECKPOINT_PATH/logs \ + $TENSORBOARD_ARGS \ + --wandb-entity-name loubnabnl \ + --wandb-project-name bigcode-pretraining \ + " + +export LAUNCHER="python -u -m torch.distributed.run \ + --nproc_per_node $GPUS_PER_NODE \ + --nnodes $NNODES \ + --rdzv_endpoint $MASTER_ADDR:$MASTER_PORT \ + --rdzv_backend c10d \ + --max_restarts 0 \ + --tee 3 \ + " + +echo $CMD + +# hide duplicated errors using this hack - will be properly fixed in pt-1.12 +# export TORCHELASTIC_ERROR_FILE=/tmp/torch-elastic-error.json + +# force crashing on nccl issues like hanging broadcast +export NCCL_ASYNC_ERROR_HANDLING=1 +# export NCCL_DEBUG=INFO +# export NCCL_DEBUG_SUBSYS=COLL +# export NCCL_SOCKET_NTHREADS=1 +# export NCCL_NSOCKS_PERTHREAD=1 +# export CUDA_LAUNCH_BLOCKING=1 + +# AWS specific +export NCCL_PROTO=simple +export RDMAV_FORK_SAFE=1 +export FI_EFA_FORK_SAFE=1 +export FI_EFA_USE_DEVICE_RDMA=1 +export FI_PROVIDER=efa +export FI_LOG_LEVEL=1 +export NCCL_IB_DISABLE=1 +export NCCL_SOCKET_IFNAME=ens + +export CUDA_HOME=/usr/local/cuda-11.6 +# This is needed for torch1.12.1 otherwise it doesn't link correctly, not sur what the issue was. 
+#export PATH="/usr/local/cuda-11.6/bin:$PATH" +#export LD_LIBRARY_PATH="/usr/local/cuda-11.6/lib64:$LD_LIBRARY_PATH" +#export LD_PRELOAD=$CUDA_HOME/lib/libnccl.so +#export LD_LIBRARY_PATH=$CUDA_HOME/efa/lib:$CUDA_HOME/lib:$CUDA_HOME/lib64:$LD_LIBRARY_PATH + +# srun error handling: +# --wait=60: wait 60 sec after the first task terminates before terminating all remaining tasks +# --kill-on-bad-exit=1: terminate a step if any task exits with a non-zero exit code +SRUN_ARGS=" \ + --wait=60 \ + --kill-on-bad-exit=1 \ + " + +# py-spy top -s -i -n -- $LAUNCHER --node_rank $SLURM_PROCID --role $SLURMD_NODENAME: $CMD +clear; srun $SRUN_ARGS --jobid $SLURM_JOB_ID bash -c "$LAUNCHER --node_rank \$SLURM_PROCID --role \$SLURMD_NODENAME: $CMD" 2>&1 | tee $LOG_PATH + +echo "END TIME: $(date)" From 5a9c239df517f57cd0e888811fcad31195c07c27 Mon Sep 17 00:00:00 2001 From: Loubna Ben Allal <44069155+loubnabnl@users.noreply.github.com> Date: Mon, 3 Jul 2023 10:02:10 +0100 Subject: [PATCH 134/144] Create pretrain_bigcode_3b.slurm --- examples/pretrain_bigcode_3b.slurm | 143 +++++++++++++++++++++++++++++ 1 file changed, 143 insertions(+) create mode 100644 examples/pretrain_bigcode_3b.slurm diff --git a/examples/pretrain_bigcode_3b.slurm b/examples/pretrain_bigcode_3b.slurm new file mode 100644 index 0000000000..1d411664e5 --- /dev/null +++ b/examples/pretrain_bigcode_3b.slurm @@ -0,0 +1,143 @@ +#!/bin/bash +#SBATCH --job-name=3b-bigcode +#SBATCH --nodes=32 +#SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node! +#SBATCH --cpus-per-task=40 +#SBATCH --gres=gpu:8 +#SBATCH --partition=production-cluster +#SBATCH --output=/fsx/bigcode/bigcode-training/logs/3b/%x-%j.out + +set -x -e +source /admin/home/loubna/.bashrc + +conda activate megatron + +echo "START TIME: $(date)" + +# File Path setup +SCRIPT_REPO=/fsx/loubna/code/Megatron-LM +pushd $SCRIPT_REPO + +LOG_PATH=$SCRIPT_REPO/main_log.txt + +# Training setup +GPUS_PER_NODE=8 +# so processes know who to talk to +MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1) +MASTER_PORT=6000 +NNODES=$SLURM_NNODES +NODE_RANK=$SLURM_PROCID +WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) + +# File path setup +CHECKPOINT_PATH=/fsx/bigcode/experiments/pretraining/3b # Adjust: Directory to store the checkpoints +# Starcoder tokenizer and data paths in /fsx/bigcode +TOKENIZER_FILE=/fsx/bigcode/bigcode-training/tokenizer-starcoder/tokenizer.json +WEIGHTS_TRAIN=/fsx/bigcode/bigcode-training/code/bigcode-data-mix/data/train_data_paths.txt.tmp +WEIGHTS_VALID=/fsx/bigcode/bigcode-training/code/bigcode-data-mix/data/valid_data_paths.txt.tmp + +mkdir -p $CHECKPOINT_PATH/tensorboard + +GPT_ARGS="\ + --tensor-model-parallel-size 1 \ + --pipeline-model-parallel-size 1 \ + --num-layers 36 \ + --hidden-size 2816 \ + --num-attention-heads 22 \ + --attention-head-type multiquery \ + --init-method-std 0.01884 \ + --seq-length 8192 \ + --max-position-embeddings 8192 \ + --attention-dropout 0.1 \ + --hidden-dropout 0.1 \ + --micro-batch-size 1 \ + --global-batch-size 256 \ + --lr 0.0005 \ + --min-lr 0.00005 \ + --train-iters 500000 \ + --lr-decay-iters 500000 \ + --lr-decay-style cosine \ + --lr-warmup-iters 2000 \ + --weight-decay .1 \ + --adam-beta2 .95 \ + --clip-grad 1.0 \ + --bf16 \ + --use-flash-attn \ + --fim-rate 0.5 \ + --log-interval 10 \ + --save-interval 5000 \ + --eval-interval 5000 \ + --eval-iters 2 \ + --use-distributed-optimizer \ + --valid-num-workers 0 \ +" + +TENSORBOARD_ARGS="--tensorboard-dir ${CHECKPOINT_PATH}/tensorboard" + +CMD=" \ + 
/fsx/loubna/code/Megatron-LM/pretrain_gpt.py \ + $GPT_ARGS \ + --tokenizer-type TokenizerFromFile \ + --tokenizer-file $TOKENIZER_FILE \ + --save $CHECKPOINT_PATH \ + --load $CHECKPOINT_PATH \ + --train-weighted-split-paths-path $WEIGHTS_TRAIN \ + --valid-weighted-split-paths-path $WEIGHTS_VALID \ + --structured-logs \ + --structured-logs-dir $CHECKPOINT_PATH/logs \ + $TENSORBOARD_ARGS \ + --wandb-entity-name loubnabnl \ + --wandb-project-name bigcode-3b \ + " + +export LAUNCHER="python -u -m torch.distributed.run \ + --nproc_per_node $GPUS_PER_NODE \ + --nnodes $NNODES \ + --rdzv_endpoint $MASTER_ADDR:$MASTER_PORT \ + --rdzv_backend c10d \ + --max_restarts 0 \ + --tee 3 \ + " + +echo $CMD + +# hide duplicated errors using this hack - will be properly fixed in pt-1.12 +# export TORCHELASTIC_ERROR_FILE=/tmp/torch-elastic-error.json + +# force crashing on nccl issues like hanging broadcast +export NCCL_ASYNC_ERROR_HANDLING=1 +# export NCCL_DEBUG=INFO +# export NCCL_DEBUG_SUBSYS=COLL +# export NCCL_SOCKET_NTHREADS=1 +# export NCCL_NSOCKS_PERTHREAD=1 +# export CUDA_LAUNCH_BLOCKING=1 + +# AWS specific +export NCCL_PROTO=simple +export RDMAV_FORK_SAFE=1 +export FI_EFA_FORK_SAFE=1 +export FI_EFA_USE_DEVICE_RDMA=1 +export FI_PROVIDER=efa +export FI_LOG_LEVEL=1 +export NCCL_IB_DISABLE=1 +export NCCL_SOCKET_IFNAME=ens + +export CUDA_HOME=/usr/local/cuda-11.6 +# This is needed for torch1.12.1 otherwise it doesn't link correctly, not sur what the issue was. +#export PATH="/usr/local/cuda-11.6/bin:$PATH" +#export LD_LIBRARY_PATH="/usr/local/cuda-11.6/lib64:$LD_LIBRARY_PATH" +#export LD_PRELOAD=$CUDA_HOME/lib/libnccl.so +#export LD_LIBRARY_PATH=$CUDA_HOME/efa/lib:$CUDA_HOME/lib:$CUDA_HOME/lib64:$LD_LIBRARY_PATH + +# srun error handling: +# --wait=60: wait 60 sec after the first task terminates before terminating all remaining tasks +# --kill-on-bad-exit=1: terminate a step if any task exits with a non-zero exit code +SRUN_ARGS=" \ + --wait=60 \ + --kill-on-bad-exit=1 \ + " + +# py-spy top -s -i -n -- $LAUNCHER --node_rank $SLURM_PROCID --role $SLURMD_NODENAME: $CMD +clear; srun $SRUN_ARGS --jobid $SLURM_JOB_ID bash -c "$LAUNCHER --node_rank \$SLURM_PROCID --role \$SLURMD_NODENAME: $CMD" 2>&1 | tee $LOG_PATH + +echo "END TIME: $(date)" From a993f05468d7b07e66572001cee7bec91cdeb1cc Mon Sep 17 00:00:00 2001 From: mayank31398 Date: Wed, 19 Jul 2023 02:58:55 +0530 Subject: [PATCH 135/144] outputs not matching non-flash case in MQA --- megatron/model/transformer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index 3c1387a8c4..8051dbd143 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -414,7 +414,7 @@ def forward(self, query_layer, key_layer, value_layer, attention_mask, alibi): # alibi: (batch_size * num_attention_heads, 1, max_seq_len) # TODO: ideally, alibi would have the shape: (1, num_heads * sq, sk) matmul_input_buffer = alibi[:bs * np, :, :sk].view(bs, np, sk) - matmul_input_buffer = matmul_input_buffer.repeat(1, sq, 1) # [b, np * sq, sk] + matmul_input_buffer = matmul_input_buffer.unsqueeze(2).expand(bs, np, sq, sk).reshape(bs, np * sq, sk) # [b, np * sq, sk] if alibi is None: # Raw attention scores. 
[b, np * sq, sk] From ebea9f29d32ea775dc0a7f9e1b4da2b4a714be50 Mon Sep 17 00:00:00 2001 From: Mayank Mishra <32954280+mayank31398@users.noreply.github.com> Date: Fri, 21 Jul 2023 21:02:31 +0530 Subject: [PATCH 136/144] convert reshape to view (#73) --- megatron/core/tensor_parallel/layers.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/megatron/core/tensor_parallel/layers.py b/megatron/core/tensor_parallel/layers.py index f43d79f46a..15e0fbb025 100644 --- a/megatron/core/tensor_parallel/layers.py +++ b/megatron/core/tensor_parallel/layers.py @@ -279,8 +279,7 @@ def backward(ctx, grad_output): # https://github.com/pytorch/pytorch/blob/c47cf9bc7f9e02f649ab4ed53fe4d35732c92ab6/torch/_refs/__init__.py#L2761 grad_output = grad_output.contiguous() # Convert the tensor shapes to 2D for execution compatibility - # TODO: Is the reshape preventing us from getting a speedup here? - grad_output = grad_output.reshape(grad_output.shape[0] * grad_output.shape[1], + grad_output = grad_output.view(grad_output.shape[0] * grad_output.shape[1], grad_output.shape[2]) total_input = total_input.view(total_input.shape[0] * total_input.shape[1], total_input.shape[2]) From 462980b69ab7a3164a3372d0bd63242adb0a8305 Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Fri, 21 Jul 2023 16:56:01 -0400 Subject: [PATCH 137/144] Support flash attn 2 (#72) --- megatron/model/transformer.py | 41 +++++++++++++++++++++++++++++------ 1 file changed, 34 insertions(+), 7 deletions(-) diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index 8051dbd143..66dd08d412 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -6,9 +6,8 @@ import numpy as np import torch import torch.nn.functional as F - from typing import Optional -from torch import nn +from packaging.version import Version from megatron import get_timers, get_args, get_retro_args, core, get_num_microbatches from megatron.utils import print_rank_0 @@ -36,9 +35,15 @@ rearrange = None try: - from flash_attn.flash_attn_interface import flash_attn_unpadded_func + import flash_attn as _flash_attn + if Version(getattr(_flash_attn, "__version__", "1")) >= Version("2"): + from flash_attn.flash_attn_interface import flash_attn_func + FLASH_VERSION = 2 + else: + from flash_attn.flash_attn_interface import flash_attn_unpadded_func + FLASH_VERSION = 1 except ImportError: - flash_attn_unpadded_func = None + FLASH_VERSION = None """ We use the following notation throughout this file: @@ -508,7 +513,7 @@ class FlashSelfAttention(torch.nn.Module): def __init__(self, causal=False, softmax_scale=None, attention_dropout=0.0, device=None, dtype=None): super().__init__() - assert flash_attn_unpadded_func is not None, ('Please install FlashAttention first, ' + assert FLASH_VERSION is not None, ('Please install FlashAttention first, ' 'e.g., with pip install flash-attn') assert rearrange is not None, 'Please install einops first, e.g., with pip install einops' self.causal = causal @@ -521,10 +526,31 @@ def forward(self, q, k, v): --------- q, k, v: The tensor containing the query, key, and value. 
(B, S, H, D) """ - assert all((i.dtype in [torch.float16, torch.bfloat16] for i in (q,k,v))) assert all((i.is_cuda for i in (q,k,v))) + if FLASH_VERSION==1: + return self._forward_v1(q,k,v) + + seqlen_q, seqlen_k = q.shape[1], k.shape[1] + + if self.training: + # during training q,k,v always have same seqlen + assert seqlen_k == seqlen_q + is_causal = self.causal + dropout_p = self.dropout_p + else: + # turn off FA causal mask after first inference autoregressive iteration + # only on first autoregressive step q,k,v have same seqlen + is_causal = self.causal and (seqlen_q == seqlen_k) + dropout_p = 0 + + output = flash_attn_func(q, k, v, dropout_p,softmax_scale=self.softmax_scale, causal=is_causal) + + return output + + + def _forward_v1(self, q, k, v): batch_size, seqlen_q = q.shape[0], q.shape[1] seqlen_k = k.shape[1] @@ -647,7 +673,7 @@ def __init__(self, init_method, self.checkpoint_core_attention = args.recompute_granularity == 'selective' if self.use_flash_attn: - if flash_attn_unpadded_func is None: + if FLASH_VERSION is None: raise ImportError('FlashAttention is not installed, please install with ' 'pip install flash-attn') assert attention_type == AttnType.self_attn, ('FlashAttention code path only supports ' @@ -882,6 +908,7 @@ def forward(self, hidden_states, attention_mask, sq, b, np, hn = query_layer.size() # Expand kv to be compatible with flash-attn implementation # [sq, b, 1, hn] -> [sq, b, np, hn] + # TODO: This should be skippable for flash 2, but getting illegal memory access. key_layer = key_layer.expand((sq, b, np, hn)) value_layer = value_layer.expand((sq, b, np, hn)) q, k, v = [rearrange(x, 's b ... -> b s ...').contiguous() From ebd38e9b418d5c9c198d7b95ac2583567e179237 Mon Sep 17 00:00:00 2001 From: Binyuan Hui Date: Mon, 24 Jul 2023 22:33:35 +0800 Subject: [PATCH 138/144] Fix train-iters typo & format script (#74) --- examples/pretrain_gpt_1B_santacoder.sh | 38 +++++++++++++------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/examples/pretrain_gpt_1B_santacoder.sh b/examples/pretrain_gpt_1B_santacoder.sh index 7602fcc335..d14f538153 100644 --- a/examples/pretrain_gpt_1B_santacoder.sh +++ b/examples/pretrain_gpt_1B_santacoder.sh @@ -21,32 +21,32 @@ GPT_ARGS="\ --tensor-model-parallel-size 1 \ --pipeline-model-parallel-size 1 \ --recompute-activations \ ---num-layers 24 \ ---hidden-size 2048 \ ---num-attention-heads 16 \ ---attention-head-type multiquery \ ---init-method-std 0.022 \ + --num-layers 24 \ + --hidden-size 2048 \ + --num-attention-heads 16 \ + --attention-head-type multiquery \ + --init-method-std 0.022 \ --seq-length 2048 \ --max-position-embeddings 2048 \ ---attention-dropout 0.1 \ ---hidden-dropout 0.1 \ + --attention-dropout 0.1 \ + --hidden-dropout 0.1 \ --micro-batch-size 2 \ --global-batch-size 192 \ ---lr 0.0002 \ ---train-iters 3000 \ ---lr-decay-iters 600000 \ ---lr-decay-style cosine \ ---lr-warmup-fraction 0.02 \ ---weight-decay .1 \ ---adam-beta2 .95 \ ---clip-grad 1.0 \ ---fp16 \ + --lr 0.0002 \ + --train-iters 300000 \ + --lr-decay-iters 600000 \ + --lr-decay-style cosine \ + --lr-warmup-fraction 0.02 \ + --weight-decay .1 \ + --adam-beta2 .95 \ + --clip-grad 1.0 \ + --fp16 \ --log-interval 10 \ --save-interval 4000 \ --eval-interval 200 \ --eval-iters 10 \ ---initial-loss-scale 65536 \ ---fim-rate 0.5 \ + --initial-loss-scale 65536 \ + --fim-rate 0.5 \ " TENSORBOARD_ARGS="--tensorboard-dir ${CHECKPOINT_PATH}/tensorboard" @@ -59,4 +59,4 @@ torchrun $DISTRIBUTED_ARGS \ --save $CHECKPOINT_PATH \ --load $CHECKPOINT_PATH 
\ --data-path $DATA_PATH \ - $TENSORBOARD_ARGS \ No newline at end of file + $TENSORBOARD_ARGS From f5981104be122e0507d9b5dcc42084a6c3fc0769 Mon Sep 17 00:00:00 2001 From: loubnabnl Date: Fri, 10 Nov 2023 13:26:47 +0000 Subject: [PATCH 139/144] add file level FIM and sanity check --- megatron/arguments.py | 6 ++ megatron/data/data_samplers.py | 44 ++++++++++++- megatron/data/gpt_dataset.py | 116 ++++++++++++++++++++++----------- 3 files changed, 127 insertions(+), 39 deletions(-) diff --git a/megatron/arguments.py b/megatron/arguments.py index 644fbb7a51..d4762e0c2f 100644 --- a/megatron/arguments.py +++ b/megatron/arguments.py @@ -1276,6 +1276,12 @@ def __call__(self, parser, args, values, option_string=None): group.add_argument('--fim-spm-rate', type=float, default=0.5, help='Probability that the a FIM sample uses the SPM format over the PSM format. ' 'At 1, exclusively train with SPM. At 0, exclusively train with PSM') + group.add_argument('--fim-split-sample', type=str, default=None, + help='String around which to split the sample for FIM. If None (default), FIM is applied on the sample-level') + group.add_argument('--fragment-fim-rate', type=float, default=0.5, + help='Rate of FIM on each fragment when fim_split_sample is not None.') + group.add_argument('--sanity-check-dataloader-interval', type=int, default=0, + help='Optional interval to print dataloader samples.') return parser diff --git a/megatron/data/data_samplers.py b/megatron/data/data_samplers.py index 5435485ea6..c3ebd87f6d 100644 --- a/megatron/data/data_samplers.py +++ b/megatron/data/data_samplers.py @@ -41,10 +41,52 @@ def build_pretraining_data_loader(dataset, consumed_samples, num_workers=None): num_workers = args.num_workers if num_workers is None else num_workers # Torch dataloader. 
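# (context for the hunk below: when --sanity-check-dataloader-interval is set, the wrapper that
#  replaces this bare return decodes a handful of batches with the HF tokenizer on the last rank,
#  writes them to sanity_check.txt and sanity_check_<step>.pkl, and finally hits an assert False
#  so the run stops once the inspection output has been produced)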
- return torch.utils.data.DataLoader(dataset, + dataloader = torch.utils.data.DataLoader(dataset, batch_sampler=batch_sampler, num_workers=num_workers, pin_memory=True) + + if args.sanity_check_dataloader_interval is not None: + from transformers import AutoTokenizer + from megatron import is_last_rank + + NUM_BATCHES = 10 + sanity_check_dataloader_interval = args.sanity_check_dataloader_interval + tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_file.split("tokenizer.json")[0]) + + if is_last_rank(): + check_step = -1 + + with open("sanity_check.txt", "w") as f: + f.write("") + for i, batch in enumerate(dataloader): + check_step += 1 + if i % sanity_check_dataloader_interval == 0: + with open("sanity_check.txt", "a") as f: + f.write("\n\n") + f.write("*" * 40) + f.write(f"Sanity check {check_step}") + f.write("*" * 40) + print(batch) + + import joblib + + joblib.dump(batch, f"sanity_check_{check_step}.pkl") + texts = tokenizer.batch_decode( + batch["text"], skip_special_tokens=False, clean_up_tokenization_spaces=False + ) + + for j, text in enumerate(texts): + print(f"\n\n>>Batch {i} || Sample {j}<<\n") + print(text[:1000]) + with open("sanity_check.txt", "a", encoding='utf-8') as f: + f.write(f"\n\n>>Batch {i} || Sample {j}<<\n") + f.write(text) + + if i // sanity_check_dataloader_interval == NUM_BATCHES - 1: + break + assert False + return dataloader class MegatronPretrainingSampler: diff --git a/megatron/data/gpt_dataset.py b/megatron/data/gpt_dataset.py index 183f3cd460..16a29d0d60 100644 --- a/megatron/data/gpt_dataset.py +++ b/megatron/data/gpt_dataset.py @@ -335,7 +335,12 @@ def __init__(self, name, data_prefix, documents, indexed_dataset, self.args = get_args() self.tokenizer = get_tokenizer() self.np_rng = np.random.RandomState(seed=seed) # rng state for FIM - + + self.fim_rate = self.args.fim_rate + self.fim_spm_rate = self.args.fim_spm_rate + self.fragment_fim_rate = self.args.fragment_fim_rate + self.fim_split_sample = self.tokenizer.vocab[self.args.fim_split_sample] if self.args.fim_split_sample is not None else None + try: self.suffix_tok_id, self.prefix_tok_id, self.middle_tok_id, self.pad_tok_id = (self.tokenizer.special_tokens[tok] for tok in [FIM_SUFFIX, FIM_PREFIX, FIM_MIDDLE, FIM_PAD]) except KeyError: @@ -386,38 +391,75 @@ def __getitem__(self, idx): # # do FIM here, if enabled # TODO: Do we handle the following point from FIM paper? # To transform data in the character space for context-level FIM, the tokenized documents have to be decoded back into strings before FIM augmentation. Depending on the vocabulary, some care has to be given to ensure decoding does not introduce any spurious characters into training. For example, utf-8 characters are encoded as multiple tokens with a BPE vocabulary; they can result in fragments from chunking and fail to decode. To prevent unforeseen errors midway through training, we encourage checking for these fragments at the beginning or end of a context and removing them. - fim_rate = self.args.fim_rate - - if fim_rate != 0: - assert (fim_rate <= 1 and fim_rate >= 0), "FIM rate must be a probability 0 <= rate <= 1" - - eod = self.tokenizer.eod - segment_breaks = np.argwhere(sample == eod) # split sample by document - - if segment_breaks.shape != (0, 1): # then there is an EOD token in this example - curr_start_position = 0 - new_samples = [] - for loc in np.nditer(segment_breaks): - # Only permute non-empty segments. 
- if loc - curr_start_position > 0: - # permute {prefix, suffix, middle} or {suffix, prefix, middle} - permuted, self.np_rng = \ - permute(sample[curr_start_position:loc], self.np_rng, self.args, self.tokenizer, truncate_or_pad=False, - suffix_tok_id=self.suffix_tok_id, prefix_tok_id=self.prefix_tok_id, middle_tok_id=self.middle_tok_id, pad_tok_id=self.pad_tok_id) - new_samples += [permuted, [eod]] - - curr_start_position = loc + 1 # jump over the EOD token - # Permute the segment after the last EOD - permuted, self.np_rng = \ - permute(sample[curr_start_position:], self.np_rng, self.args, self.tokenizer, truncate_or_pad=False, - suffix_tok_id=self.suffix_tok_id, prefix_tok_id=self.prefix_tok_id, middle_tok_id=self.middle_tok_id, pad_tok_id=self.pad_tok_id) - new_samples.append(permuted) - - sample = np.concatenate(new_samples) - else: - sample, self.np_rng = permute(sample, self.np_rng, self.args, self.tokenizer, truncate_or_pad=False, - suffix_tok_id=self.suffix_tok_id, prefix_tok_id=self.prefix_tok_id, middle_tok_id=self.middle_tok_id, pad_tok_id=self.pad_tok_id) - + eod = self.tokenizer.eod + segment_breaks = np.argwhere(sample == eod) # split sample by document + + if self.fim_rate == 0: + return sample.astype(np.int64) + + def fim_permute_sequence(sequence, rate): + return permute( + sequence, + self.np_rng, + rate, + self.fim_spm_rate, + self.tokenizer, + truncate_or_pad=False, + suffix_tok_id=self.suffix_tok_id, + prefix_tok_id=self.prefix_tok_id, + middle_tok_id=self.middle_tok_id, + pad_tok_id=self.pad_tok_id, + ) + + def fim_split_and_permute_sequence(sequence): + """ + If self.fim_split_sample is not None, split the sequence. + Then apply FIM on the fragments, or the whole sequence if self.fim_split_sample is None. + """ + if self.fim_split_sample is None: + return fim_permute_sequence(sequence, self.fim_rate) + # fim_split_sample is set: split the sample on this token and permute each fragment separately. + # Typically, if each sample is a repository, then we split again on the file level. + # Each fragment is a file, and we permute the files. + fragment_breaks = np.argwhere(sequence == self.fim_split_sample) + if fragment_breaks.shape == (0, 1): + # no split token in this sample + return fim_permute_sequence(sequence, self.fim_rate) + if not self.np_rng.binomial(1, self.fim_rate): + # don't do FIM preproc + return sequence + # Do FIM on each fragment + curr_start_position = 0 + new_samples = [] + for loc in np.nditer(fragment_breaks): + if loc - curr_start_position > 0: + permuted = fim_permute_sequence(sequence[curr_start_position:loc], self.fragment_fim_rate) + new_samples += [permuted, [self.fim_split_sample]] + curr_start_position = loc + 1 # Jump over the split token + # Permute the segment after the last split token + permuted = fim_permute_sequence(sequence[curr_start_position:], self.fragment_fim_rate) + new_samples.append(permuted) + return np.concatenate(new_samples) + + if segment_breaks.shape != (0, 1): # then there is an EOD token in this example + curr_start_position = 0 + new_samples = [] + for loc in np.nditer(segment_breaks): + # Only permute non-empty segments. 
+ if loc - curr_start_position > 0: + # permute {prefix, suffix, middle} or {suffix, prefix, middle} + permuted = fim_split_and_permute_sequence(sample[curr_start_position:loc]) + new_samples += [permuted, [eod]] + + curr_start_position = loc + 1 # jump over the EOD token + # Permute the segment after the last EOD + permuted = fim_split_and_permute_sequence(sample[curr_start_position:]) + new_samples.append(permuted) + + sample = np.concatenate(new_samples) + else: + sample = fim_split_and_permute_sequence(sample) + # Truncate or pad sequence to max-length diff = sample.shape[0] - sample_len if diff > 0: # too long @@ -681,14 +723,12 @@ def _build_shuffle_idx(num_samples, total_size, np_rng): # From https://github.com/EleutherAI/gpt-neox/blob/FIM-clean/megatron/data/gpt2_dataset.py#L339 -def permute(sample, np_rng, args, tokenizer, truncate_or_pad=True, +def permute(sample, np_rng, fim_rate, fim_spm_rate, tokenizer, truncate_or_pad=True, suffix_tok_id=None, prefix_tok_id=None, middle_tok_id=None, pad_tok_id=None): """ Take in a sample (np array w/ size (0,chunklength)) and perform a FIM transformation on it. Maintain the same sample length (if transform creates a few extra tokens, drop them). """ - fim_rate = args.fim_rate - if np_rng.binomial(1, fim_rate): # sample bernoulli dist contents = tokenizer.detokenize(sample) @@ -726,7 +766,7 @@ def permute(sample, np_rng, args, tokenizer, truncate_or_pad=True, elif diff < 0: # too short suffix = np.concatenate([suffix, np.full((-1 * diff), pad_tok_id)]) - if np_rng.binomial(1, args.fim_spm_rate): + if np_rng.binomial(1, fim_spm_rate): # SPM (variant 2 from FIM paper) new_sample = np.concatenate([ [prefix_tok_id, suffix_tok_id], suffix, @@ -744,4 +784,4 @@ def permute(sample, np_rng, args, tokenizer, truncate_or_pad=True, # don't do FIM preproc new_sample = sample - return new_sample, np_rng + return new_sample From fd6d7058d89a4e5b1c4a387498aef296dc4a0018 Mon Sep 17 00:00:00 2001 From: loubnabnl Date: Fri, 10 Nov 2023 15:28:35 +0000 Subject: [PATCH 140/144] use default None for sanity check interval --- megatron/arguments.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/megatron/arguments.py b/megatron/arguments.py index d4762e0c2f..ea671fcffc 100644 --- a/megatron/arguments.py +++ b/megatron/arguments.py @@ -1184,6 +1184,9 @@ def __call__(self, parser, args, values, option_string=None): with open(values, "r") as fi: lines = fi.readlines() + print(f"Loading {option_string} from {values}") + print(f"nb lines {len(lines)}") + print(f"first line {lines[0]}") assert len(lines) == 1, f"Got multiple lines {len(lines)} instead of 1 expected" assert lines[0][-2:] == "\"\n" and lines[0][0] == "\"", f"Invalid input format, got {lines}" values = lines[0][1:-2].split("\" \"") @@ -1280,7 +1283,7 @@ def __call__(self, parser, args, values, option_string=None): help='String around which to split the sample for FIM. 
If None (default), FIM is applied on the sample-level') group.add_argument('--fragment-fim-rate', type=float, default=0.5, help='Rate of FIM on each fragment when fim_split_sample is not None.') - group.add_argument('--sanity-check-dataloader-interval', type=int, default=0, + group.add_argument('--sanity-check-dataloader-interval', type=int, default=None, help='Optional interval to print dataloader samples.') return parser From 4f8a0e428b759762d1c15d1cce3ddc2014d0351e Mon Sep 17 00:00:00 2001 From: loubnabnl Date: Fri, 10 Nov 2023 15:31:45 +0000 Subject: [PATCH 141/144] remove extra prints --- megatron/arguments.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/megatron/arguments.py b/megatron/arguments.py index ea671fcffc..658eaf3bc8 100644 --- a/megatron/arguments.py +++ b/megatron/arguments.py @@ -1184,9 +1184,6 @@ def __call__(self, parser, args, values, option_string=None): with open(values, "r") as fi: lines = fi.readlines() - print(f"Loading {option_string} from {values}") - print(f"nb lines {len(lines)}") - print(f"first line {lines[0]}") assert len(lines) == 1, f"Got multiple lines {len(lines)} instead of 1 expected" assert lines[0][-2:] == "\"\n" and lines[0][0] == "\"", f"Invalid input format, got {lines}" values = lines[0][1:-2].split("\" \"") From 01e9ce607105c05ce3354f4ee43fe429db125613 Mon Sep 17 00:00:00 2001 From: loubnabnl Date: Fri, 10 Nov 2023 16:54:07 +0000 Subject: [PATCH 142/144] add thera rope as arg --- megatron/arguments.py | 2 ++ megatron/model/language_model.py | 2 +- megatron/model/rotary_pos_embedding.py | 4 ++-- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/megatron/arguments.py b/megatron/arguments.py index 658eaf3bc8..01040e5023 100644 --- a/megatron/arguments.py +++ b/megatron/arguments.py @@ -583,6 +583,8 @@ def _add_network_size_args(parser): help='Use rotary positional embeddings or not') group.add_argument('--rotary-percent', type=float, default=1.0, help='Percent of rotary dimension to use, default 100%') + group.add_argument('--rotary-theta', type=int, default=10000, + help='Theta/frequency value for rotary positional embeddings') group.add_argument('--no-position-embedding', action='store_false', help='Disable position embedding.', diff --git a/megatron/model/language_model.py b/megatron/model/language_model.py index bd589fe6b6..fc2e8fe348 100644 --- a/megatron/model/language_model.py +++ b/megatron/model/language_model.py @@ -382,7 +382,7 @@ def __init__(self, # partial rotary embeddings, which is better than full rotary # Wang and Komatsuzaki et al # https://github.com/kingoflolz/mesh-transformer-jax/ - self.rotary_pos_emb = RotaryEmbedding(rotary_dim) + self.rotary_pos_emb = RotaryEmbedding(rotary_dim, args.rotary_theta) # Encoder (usually set to True, False if part of an encoder-decoder # architecture and in encoder-only stage). 
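The rotary_pos_embedding.py change below moves the hard-coded base of 10000 into the constructor, so --rotary-theta feeds directly into the inverse-frequency table. As a rough standalone illustration of what a larger base does (the generation script later in this series passes --rotary-theta 100000), the following sketch recomputes the same table outside Megatron; the head dimension of 128 is an assumption for illustration only, not a value taken from these patches.

# Standalone sketch of the frequency table that --rotary-theta controls (illustrative only).
import math
import torch

def rotary_inv_freq(dim: int, theta: float) -> torch.Tensor:
    # Same formula as RotaryEmbedding after this patch: 1 / theta**(2i/dim)
    return 1.0 / (theta ** (torch.arange(0, dim, 2).float() / dim))

dim = 128  # assumed head dimension, for illustration
for theta in (10000.0, 100000.0):
    inv_freq = rotary_inv_freq(dim, theta)
    # Wavelength (in token positions) of the slowest-rotating dimension pair.
    slowest = 2 * math.pi / inv_freq[-1].item()
    print(f"theta={theta:.0f}: slowest wavelength ~ {slowest:.0f} positions")

Raising the base stretches the slowest wavelengths well past the 4096-token context configured elsewhere in this series, which is the usual motivation for exposing it as a flag.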
diff --git a/megatron/model/rotary_pos_embedding.py b/megatron/model/rotary_pos_embedding.py index 80c74d62d4..e7f6450513 100644 --- a/megatron/model/rotary_pos_embedding.py +++ b/megatron/model/rotary_pos_embedding.py @@ -12,9 +12,9 @@ __all__ = ['RotaryEmbedding', 'apply_rotary_pos_emb'] class RotaryEmbedding(nn.Module): - def __init__(self, dim): + def __init__(self, dim, theta): super().__init__() - inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2).float() / dim)) + inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2).float() / dim)) self.register_buffer('inv_freq', inv_freq) if importlib.util.find_spec('einops') is None: raise RuntimeError("einops is required for Rotary Embedding") From c8372cbd0fc8b757d4be93a3fb26a3f35dac6652 Mon Sep 17 00:00:00 2001 From: loubnabnl Date: Tue, 14 Nov 2023 14:47:06 +0000 Subject: [PATCH 143/144] add humaneval generations using a server --- .gitignore | 5 ++ examples/run_generation_server_starcoder2.sh | 46 +++++++++++ megatron/text_generation_server.py | 2 +- tools/run_requests.py | 85 ++++++++++++++++++++ 4 files changed, 137 insertions(+), 1 deletion(-) create mode 100644 examples/run_generation_server_starcoder2.sh create mode 100644 tools/run_requests.py diff --git a/.gitignore b/.gitignore index cac3499524..6d2ffc62a0 100644 --- a/.gitignore +++ b/.gitignore @@ -6,3 +6,8 @@ build *~ slurm* logs +*.yaml +*.json +*.wandb +*.pkl +*.log \ No newline at end of file diff --git a/examples/run_generation_server_starcoder2.sh b/examples/run_generation_server_starcoder2.sh new file mode 100644 index 0000000000..35bce418d1 --- /dev/null +++ b/examples/run_generation_server_starcoder2.sh @@ -0,0 +1,46 @@ +#!/bin/bash +# This example will start serving the 1B model. +# You may need to adapt Flask port if it's occupied in MegatronServer class, we chnaged it from 5000 (default) to 8080 +DISTRIBUTED_ARGS="--nproc_per_node 1 \ + --nnodes 1 \ + --node_rank 0 \ + --master_addr ip-26-0-156-56 \ + --master_port 6000" + +CHECKPOINT=/fsx/bigcode/experiments/pretraining/starcoder2-1B/checkpoints +CHECKPOINT=/fsx/loubna/data/extra/generations_starcoder2_1b_200k/megatron + +#/mp_rank_00/model_optim_rng.pt +VOCAB_FILE=/fsx/bigcode/experiments/pretraining/starcoder2-1B/checkpoints/conversions/vocab.json +MERGE_FILE=/fsx/bigcode/experiments/pretraining/starcoder2-1B/checkpoints/conversions/merges.txt +TOKENIZER_FILE=/fsx/loubna/data/tokenizer/starcoder2-smol-internal-1/tokenizer.json + +export CUDA_DEVICE_MAX_CONNECTIONS=1 + +#pip install flask-restful + +torchrun $DISTRIBUTED_ARGS tools/run_text_generation_server.py \ + --tensor-model-parallel-size 1 \ + --pipeline-model-parallel-size 1 \ + --num-layers 24 \ + --hidden-size 2048 \ + --num-attention-heads 16 \ + --attention-head-type multiquery \ + --init-method-std 0.02209 \ + --seq-length 4096 \ + --use-rotary-position-embeddings \ + --max-position-embeddings 4096 \ + --rotary-theta 100000 \ + --attention-dropout 0.1 \ + --hidden-dropout 0.1 \ + --load ${CHECKPOINT} \ + --tokenizer-type TokenizerFromFile \ + --tokenizer-file $TOKENIZER_FILE \ + --bf16 \ + --micro-batch-size 1 \ + --seq-length 1024 \ + --out-seq-length 512 \ + --temperature 0 \ + --top_p 0.9 \ + --seed 42 + --output_file diff --git a/megatron/text_generation_server.py b/megatron/text_generation_server.py index 58550f2e63..80dd0b288a 100644 --- a/megatron/text_generation_server.py +++ b/megatron/text_generation_server.py @@ -238,4 +238,4 @@ def __init__(self, model): api.add_resource(MegatronGenerate, '/api', resource_class_args=[model]) def run(self, url): 
- self.app.run(url, threaded=True, debug=False) + self.app.run(url, threaded=True, debug=False, port=8080) diff --git a/tools/run_requests.py b/tools/run_requests.py new file mode 100644 index 0000000000..115f4d4b1d --- /dev/null +++ b/tools/run_requests.py @@ -0,0 +1,85 @@ +import requests +import json +from human_eval.data import write_jsonl, read_problems + + +NUM_SAMPLES_PER_TASK = 1 +stop_tokens = ["\nclass", "\ndef", "\n#", "\n@", "\nprint", "\nif", "\n```", "", "", "<|endoftext|>"] + + +def query_server(prompt): + url = 'http://localhost:8080/api' + headers = {'Content-Type': 'application/json; charset=UTF-8'} + data = {"prompts": [prompt], "tokens_to_generate": 512} + response = requests.put(url, json=data, headers=headers) + result = json.loads(response.text)["text"] + return result[0] + + +def stop_at_stop_token(decoded_string, stop_tokens): + """ + Produces the prefix of decoded_string that ends at the first occurrence of + a stop_token. + WARNING: the decoded_string *must not* include the prompt, which may have stop tokens + itself. + """ + min_stop_index = len(decoded_string) + for stop_token in stop_tokens: + stop_index = decoded_string.find(stop_token) + if stop_index != -1 and stop_index < min_stop_index: + min_stop_index = stop_index + return decoded_string[:min_stop_index] + + +def postprocess_generation(generation, prompt): + """Defines the postprocessing for a LM generation. + :param generation: str + code generation from LM + :param idx: int + (not used for Humaneval-Task) + """ + if not generation.startswith(prompt[:20]): + print(f"issue with generation: {generation}") + print(f"origin prompt: {prompt}") + generation = generation[len(prompt) :] + return prompt + stop_at_stop_token(generation, stop_tokens) + + +def main(): + problems = read_problems() + prompts = [ + problems[task_id]["prompt"] + for task_id in problems + for _ in range(NUM_SAMPLES_PER_TASK) + ] + + errors = [] + success = 0 + generations = [] + postprocessed_generations = [] + for i, prompt in enumerate(prompts): + prompt = prompt.strip() + try: + result = query_server(prompt) + generations.append([result]) + postprocessed_generations.append([postprocess_generation(result, prompt)]) + success += 1 + except Exception as e: + print(f"Error processing problem '{i}': {e}") + errors.append(i) + if i % 10 == 0: + print(f"Processed {i} problems") + print(f"Failed problem generations are: {errors}") + #print(f"Example:\n{result}END\n") + + print(f"Done! {success} successful problems out of {len(prompts)}, failed are: {errors}") + + with open('megatron_generations.json', 'w') as f: + json.dump(generations, f) + + with open('megatron_postprocessed_generations.json', 'w') as f: + json.dump(postprocessed_generations, f) + + +if __name__ == '__main__': + main() From 7c325cd34a109880a3fc03432e8b3e4576b22701 Mon Sep 17 00:00:00 2001 From: loubnabnl Date: Tue, 14 Nov 2023 14:50:51 +0000 Subject: [PATCH 144/144] Revert "add humaneval generations using a server" This reverts commit c8372cbd0fc8b757d4be93a3fb26a3f35dac6652. 
--- .gitignore | 5 -- examples/run_generation_server_starcoder2.sh | 46 ----------- megatron/text_generation_server.py | 2 +- tools/run_requests.py | 85 -------------------- 4 files changed, 1 insertion(+), 137 deletions(-) delete mode 100644 examples/run_generation_server_starcoder2.sh delete mode 100644 tools/run_requests.py diff --git a/.gitignore b/.gitignore index 6d2ffc62a0..cac3499524 100644 --- a/.gitignore +++ b/.gitignore @@ -6,8 +6,3 @@ build *~ slurm* logs -*.yaml -*.json -*.wandb -*.pkl -*.log \ No newline at end of file diff --git a/examples/run_generation_server_starcoder2.sh b/examples/run_generation_server_starcoder2.sh deleted file mode 100644 index 35bce418d1..0000000000 --- a/examples/run_generation_server_starcoder2.sh +++ /dev/null @@ -1,46 +0,0 @@ -#!/bin/bash -# This example will start serving the 1B model. -# You may need to adapt Flask port if it's occupied in MegatronServer class, we chnaged it from 5000 (default) to 8080 -DISTRIBUTED_ARGS="--nproc_per_node 1 \ - --nnodes 1 \ - --node_rank 0 \ - --master_addr ip-26-0-156-56 \ - --master_port 6000" - -CHECKPOINT=/fsx/bigcode/experiments/pretraining/starcoder2-1B/checkpoints -CHECKPOINT=/fsx/loubna/data/extra/generations_starcoder2_1b_200k/megatron - -#/mp_rank_00/model_optim_rng.pt -VOCAB_FILE=/fsx/bigcode/experiments/pretraining/starcoder2-1B/checkpoints/conversions/vocab.json -MERGE_FILE=/fsx/bigcode/experiments/pretraining/starcoder2-1B/checkpoints/conversions/merges.txt -TOKENIZER_FILE=/fsx/loubna/data/tokenizer/starcoder2-smol-internal-1/tokenizer.json - -export CUDA_DEVICE_MAX_CONNECTIONS=1 - -#pip install flask-restful - -torchrun $DISTRIBUTED_ARGS tools/run_text_generation_server.py \ - --tensor-model-parallel-size 1 \ - --pipeline-model-parallel-size 1 \ - --num-layers 24 \ - --hidden-size 2048 \ - --num-attention-heads 16 \ - --attention-head-type multiquery \ - --init-method-std 0.02209 \ - --seq-length 4096 \ - --use-rotary-position-embeddings \ - --max-position-embeddings 4096 \ - --rotary-theta 100000 \ - --attention-dropout 0.1 \ - --hidden-dropout 0.1 \ - --load ${CHECKPOINT} \ - --tokenizer-type TokenizerFromFile \ - --tokenizer-file $TOKENIZER_FILE \ - --bf16 \ - --micro-batch-size 1 \ - --seq-length 1024 \ - --out-seq-length 512 \ - --temperature 0 \ - --top_p 0.9 \ - --seed 42 - --output_file diff --git a/megatron/text_generation_server.py b/megatron/text_generation_server.py index 80dd0b288a..58550f2e63 100644 --- a/megatron/text_generation_server.py +++ b/megatron/text_generation_server.py @@ -238,4 +238,4 @@ def __init__(self, model): api.add_resource(MegatronGenerate, '/api', resource_class_args=[model]) def run(self, url): - self.app.run(url, threaded=True, debug=False, port=8080) + self.app.run(url, threaded=True, debug=False) diff --git a/tools/run_requests.py b/tools/run_requests.py deleted file mode 100644 index 115f4d4b1d..0000000000 --- a/tools/run_requests.py +++ /dev/null @@ -1,85 +0,0 @@ -import requests -import json -from human_eval.data import write_jsonl, read_problems - - -NUM_SAMPLES_PER_TASK = 1 -stop_tokens = ["\nclass", "\ndef", "\n#", "\n@", "\nprint", "\nif", "\n```", "", "", "<|endoftext|>"] - - -def query_server(prompt): - url = 'http://localhost:8080/api' - headers = {'Content-Type': 'application/json; charset=UTF-8'} - data = {"prompts": [prompt], "tokens_to_generate": 512} - response = requests.put(url, json=data, headers=headers) - result = json.loads(response.text)["text"] - return result[0] - - -def stop_at_stop_token(decoded_string, stop_tokens): - """ - 
Produces the prefix of decoded_string that ends at the first occurrence of - a stop_token. - WARNING: the decoded_string *must not* include the prompt, which may have stop tokens - itself. - """ - min_stop_index = len(decoded_string) - for stop_token in stop_tokens: - stop_index = decoded_string.find(stop_token) - if stop_index != -1 and stop_index < min_stop_index: - min_stop_index = stop_index - return decoded_string[:min_stop_index] - - -def postprocess_generation(generation, prompt): - """Defines the postprocessing for a LM generation. - :param generation: str - code generation from LM - :param idx: int - (not used for Humaneval-Task) - """ - if not generation.startswith(prompt[:20]): - print(f"issue with generation: {generation}") - print(f"origin prompt: {prompt}") - generation = generation[len(prompt) :] - return prompt + stop_at_stop_token(generation, stop_tokens) - - -def main(): - problems = read_problems() - prompts = [ - problems[task_id]["prompt"] - for task_id in problems - for _ in range(NUM_SAMPLES_PER_TASK) - ] - - errors = [] - success = 0 - generations = [] - postprocessed_generations = [] - for i, prompt in enumerate(prompts): - prompt = prompt.strip() - try: - result = query_server(prompt) - generations.append([result]) - postprocessed_generations.append([postprocess_generation(result, prompt)]) - success += 1 - except Exception as e: - print(f"Error processing problem '{i}': {e}") - errors.append(i) - if i % 10 == 0: - print(f"Processed {i} problems") - print(f"Failed problem generations are: {errors}") - #print(f"Example:\n{result}END\n") - - print(f"Done! {success} successful problems out of {len(prompts)}, failed are: {errors}") - - with open('megatron_generations.json', 'w') as f: - json.dump(generations, f) - - with open('megatron_postprocessed_generations.json', 'w') as f: - json.dump(postprocessed_generations, f) - - -if __name__ == '__main__': - main()
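The final patch reverts the HumanEval tooling, but the text-generation REST endpoint it exercised is unchanged, and with the port=8080 edit rolled back the Flask server is back on its default port. For reference, a minimal client in the spirit of the removed tools/run_requests.py could look like the sketch below; the host, port 5000 (Flask's default, as noted in the removed example script), and the example prompt are assumptions, not values taken from the patches.

# Minimal client sketch for the Megatron text-generation server after the revert.
# Host/port and the example prompt are placeholders (Flask default port is 5000).
import json
import requests

def query_server(prompt: str, tokens_to_generate: int = 512,
                 url: str = "http://localhost:5000/api") -> str:
    headers = {"Content-Type": "application/json; charset=UTF-8"}
    payload = {"prompts": [prompt], "tokens_to_generate": tokens_to_generate}
    response = requests.put(url, json=payload, headers=headers)
    # The server answers with a JSON body whose "text" field holds one completion per prompt.
    return json.loads(response.text)["text"][0]

if __name__ == "__main__":
    print(query_server("def fibonacci(n):"))

Any stop-token truncation (the stop_at_stop_token helper in the removed file) still has to be applied on top of the raw completion returned here.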