Commit b0f3cfb

Merge branch 'multi-query-attention' into remove_hf_transformers
2 parents: d47f623 + 654d0d8
File tree: 9 files changed, +537 -92 lines

9 files changed

+537
-92
lines changed
Lines changed: 62 additions & 0 deletions
@@ -0,0 +1,62 @@
#! /bin/bash

set -u  # stop on unset variables

# Runs the SantaCoder 1B model

GPUS_PER_NODE=8
MASTER_ADDR=${MASTER_NODE}  # Adjust
MASTER_PORT=6000
NNODES=12  # Adjust
# NODE_RANK=0  # Adjust
WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))

DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"

CHECKPOINT_PATH=/my/experiment/path  # Adjust: Directory to store the checkpoints
DATA_PATH=/preprocessed/data/path  # Adjust: Prefix of the preprocessed dataset
TOKENIZER_FILE=/tokenizer/path  # Adjust

GPT_ARGS="\
--tensor-model-parallel-size 1 \
--pipeline-model-parallel-size 1 \
--recompute-activations \
--num-layers 24 \
--hidden-size 2048 \
--num-attention-heads 16 \
--attention-head-type multiquery \
--init-method-std 0.022 \
--seq-length 2048 \
--max-position-embeddings 2048 \
--attention-dropout 0.1 \
--hidden-dropout 0.1 \
--micro-batch-size 2 \
--global-batch-size 192 \
--lr 0.0002 \
--train-iters 3000 \
--lr-decay-iters 600000 \
--lr-decay-style cosine \
--lr-warmup-fraction 0.02 \
--weight-decay .1 \
--adam-beta2 .95 \
--clip-grad 1.0 \
--fp16 \
--log-interval 10 \
--save-interval 4000 \
--eval-interval 200 \
--eval-iters 10 \
--initial-loss-scale 65536 \
--fim-rate 0.5 \
"

TENSORBOARD_ARGS="--tensorboard-dir ${CHECKPOINT_PATH}/tensorboard"

torchrun $DISTRIBUTED_ARGS \
    pretrain_gpt.py \
    $GPT_ARGS \
    --tokenizer-type TokenizerFromFileWithFIM \
    --tokenizer-file $TOKENIZER_FILE \
    --save $CHECKPOINT_PATH \
    --load $CHECKPOINT_PATH \
    --data-path $DATA_PATH \
    $TENSORBOARD_ARGS
Lines changed: 65 additions & 0 deletions
@@ -0,0 +1,65 @@
#! /bin/bash

# Runs the "345M" parameter model

GPUS_PER_NODE=8
MASTER_ADDR=localhost
MASTER_PORT=6000
NNODES=1  # Adjust
NODE_RANK=0  # Adjust
WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))

DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"

# paths to multilingual preprocessed datasets
DATA_PATH_EN=<Specify path and file prefix>_text_document
DATA_PATH_AR=<Specify path and file prefix>_text_document
DATA_PATH_KR=<Specify path and file prefix>_text_document
DATA_PATH_JP=<Specify path and file prefix>_text_document

CHECKPOINT_PATH=<Specify path>


torchrun $DISTRIBUTED_ARGS \
    pretrain_gpt.py \
    --num-layers 24 \
    --hidden-size 1024 \
    --num-attention-heads 16 \
    --micro-batch-size 4 \
    --global-batch-size 8 \
    --seq-length 1024 \
    --max-position-embeddings 1024 \
    --train-iters 1000 \
    --lr-decay-iters 320000 \
    --save $CHECKPOINT_PATH \
    --load $CHECKPOINT_PATH \
    --train-weighted-split-paths "TRAIN: 0.3 0:0.6 $DATA_PATH_EN, 1 0:0.6 $DATA_PATH_AR, 1 0:0.6 $DATA_PATH_KR, 1 0:0.6 $DATA_PATH_JP" \
    --valid-weighted-split-paths \
        "VALID_EN: 1 0.6:0.8 $DATA_PATH_EN" \
        "VALID_AR: 1 0.6:0.8 $DATA_PATH_AR" \
        "VALID_JP: 1 0.6:0.8 $DATA_PATH_JP" \
        "VALID_KR: 1 0.6:0.8 $DATA_PATH_KR" \
        "VALID_EN-AR-JP-KR_BALANCED: 1 0.6:0.8 $DATA_PATH_EN, 1 0.6:0.8 $DATA_PATH_AR, 1 0.6:0.8 $DATA_PATH_JP, 1 0.6:0.8 $DATA_PATH_KR" \
    --test-weighted-split-paths \
        "TEST_EN: 1 0.8:1 $DATA_PATH_EN" \
        "TEST_AR: 1 0.8:1 $DATA_PATH_AR" \
        "TEST_JP: 1 0.8:1 $DATA_PATH_JP" \
        "TEST_KR: 1 0.8:1 $DATA_PATH_KR" \
        "TEST_EN-AR-JP-KR_BALANCED: 1 0.8:1 $DATA_PATH_EN, 1 0.8:1 $DATA_PATH_AR, 1 0.8:1 $DATA_PATH_JP, 1 0.8:1 $DATA_PATH_KR" \
    --vocab-file gpt2-vocab.json \
    --merge-file gpt2-merges.txt \
    --data-impl mmap \
    --distributed-backend nccl \
    --lr 0.00015 \
    --min-lr 1.0e-5 \
    --lr-decay-style cosine \
    --weight-decay 1e-2 \
    --clip-grad 1.0 \
    --lr-warmup-fraction .01 \
    --checkpoint-activations \
    --log-interval 100 \
    --save-interval 10000 \
    --eval-interval 1000 \
    --eval-iters 10 \
    --fp16

megatron/arguments.py

Lines changed: 126 additions & 1 deletion
@@ -17,6 +17,7 @@
 
 import argparse
 import os
+import re
 
 import torch
 
@@ -100,6 +101,30 @@ def validate_args(args, defaults={}):
                 ' to be less than pipeline model parallel size ({})'.format(
                     args.pipeline_model_parallel_size)
 
+    # --data-path and --(train|valid|test)-weighted-split-paths are mutually exclusive
+    message = "Data loading Mode 1: --data-path and --split " \
+              "and Mode 2: --(train|valid|test)-weighted-split-paths " \
+              "are mutually exclusive i.e. cannot be set together."
+
+    if args.data_path:
+        assert args.train_weighted_split_paths is None, message
+        setattr(args, "valid_weighted_split_names", None)
+        setattr(args, "valid_weighted_split_weights", None)
+        setattr(args, "valid_weighted_split_splits", None)
+
+        setattr(args, "test_weighted_split_names", None)
+        setattr(args, "test_weighted_split_weights", None)
+        setattr(args, "test_weighted_split_splits", None)
+
+        # args.split defaults to None; it is set here so that it can be
+        # checked for overlap with the 2nd mode of data loading below
+        if args.split is None:
+            args.split = "969, 30, 1"
+
+    if args.train_weighted_split_paths or args.valid_weighted_split_paths or \
+            args.test_weighted_split_paths:
+        assert args.data_path is None and args.split is None, message
+
     # Deprecated arguments
     assert args.batch_size is None, '--batch-size argument is no longer ' \
         'valid, use --micro-batch-size instead'
@@ -863,16 +888,114 @@ def _add_validation_args(parser):
 def _add_data_args(parser):
     group = parser.add_argument_group(title='data and dataloader')
 
+    # option 1 for data loading (mutually exclusive with option 2)
     group.add_argument('--data-path', nargs='*', default=None,
                        help='Path to the training dataset. Accepted format: '
                        '1) a single data path, 2) multiple datasets in the '
                        'form: dataset1-weight dataset1-path dataset2-weight '
                        'dataset2-path ...')
-    group.add_argument('--split', type=str, default='969, 30, 1',
+    group.add_argument('--split', type=str, default=None,
                        help='Comma-separated list of proportions for training,'
                        ' validation, and test split. For example the split '
                        '`90,5,5` will use 90%% of data for training, 5%% for '
                        'validation and 5%% for test.')
+    # option 2 for data loading (mutually exclusive with option 1)
+    # see https://github.com/bigscience-workshop/Megatron-DeepSpeed/pull/97/files
+
+    # helper class to parse the --xxx-weighted-split-paths arguments;
+    # besides the paths, it also sets the matching weights, splits and names args
+    class parse_data_paths(argparse.Action):
+        def __call__(self, parser, args, values, option_string=None):
+
+            if option_string == "--train-weighted-split-paths":
+                assert len(values) == 1, 'Only 1 dataset group is allowed to ' \
+                    'be passed for the argument --train-weighted-split-paths'
+
+            # make sure the string is given in the correct format
+            err_message = 'Each data group should be input in the following format ' \
+                '"GIVEN_NAME WEIGHT1 START:END PATH1, WEIGHT2 START:END PATH2" ' \
+                'where START < END'
+            for v in values:
+                # each prefix consists of several datasets separated by commas
+                prefix = ":".join(v.split(":")[1:])  # remove GIVEN_NAME
+                datasets = prefix.split(",")
+                # check that each dataset is formatted like `WEIGHT START:END PATH`
+                for d in datasets:
+                    assert len(d.split()) == 3, err_message
+                    start, end = d.split()[1].split(":")
+                    assert float(start) < float(end), err_message
+
+            names = [v.split(":")[0] for v in values]
+
+            prefixes = [":".join(v.split(":")[1:]).strip() for v in values]
+            weights = [[d.split()[0] for d in p.split(",")] for p in prefixes]
+            splits = [[d.split()[1] for d in p.split(",")] for p in prefixes]
+            paths = [[d.split()[2] for d in p.split(",")] for p in prefixes]
+
+            # # to keep consistency with Option 1 of data loading (through --data-path)
+            # # paths would contain strings of the following form
+            # # "WEIGHTS1 PATH1 WEIGHTS2 PATH2 WEIGHTS3 PATH3" for each dataset group
+            # # while data will be parsed in additional arguments below
+            # paths_option1_style = []
+            # for p, w in zip(paths, weights):
+            #     paths_option1_style.append(" ".join([f"{w_i} {p_i}" for p_i, w_i in zip(p, w)]))
+            # setattr(args, self.dest, paths_option1_style)
+            setattr(args, self.dest, paths)
+            setattr(args, self.dest.replace("paths", "weights"), weights)
+            setattr(args, self.dest.replace("paths", "splits"), splits)
+            setattr(args, self.dest.replace("paths", "names"), names)
+
+    group.add_argument('--train-weighted-split-paths', nargs='*', default=None,
+                       help='Weights, splits and paths to groups of datasets. '
+                       'Accepted format: ONE dataset group may be '
+                       'submitted in the following form between double quotes: '
+                       '"GIVEN_NAME WEIGHT1 START:END PATH1, WEIGHT2 START:END PATH2", '
+                       'e.g.: "NAME_ABC: 0.6 0:0.6 A, 0.3 0:1 B, 0.1 0:1 C". '
+                       'WEIGHT is used to up- and down-sample each dataset A, B, C in the group; '
+                       'START:END indicates the split portion of the dataset',
+                       action=parse_data_paths)
+
+    group.add_argument('--valid-weighted-split-paths', nargs='*', default=None,
+                       help='Weights, splits and paths to groups of datasets. '
+                       'Accepted format: one or many dataset groups may be '
+                       'submitted in the following form, each between double quotes: '
+                       '"GIVEN_NAME WEIGHT1 START:END PATH1, WEIGHT2 START:END PATH2", '
+                       'e.g.: "NAME_ABC: 0.6 0.6:0.8 A, 0.3 0:1 B, 0.1 0:1 C" '
+                       '"NAME_CDE: 0.6 0.6:0.8 C, 0.3 0:1 D, 0.1 0:1 E". '
+                       'Validation will be run on each of those groups independently',
+                       action=parse_data_paths)
+
+    group.add_argument('--test-weighted-split-paths', nargs='*', default=None,
+                       help='Weights, splits and paths to groups of datasets. '
+                       'Accepted format: one or many dataset groups may be '
+                       'submitted in the following form, each between double quotes: '
+                       '"GIVEN_NAME WEIGHT1 START:END PATH1, WEIGHT2 START:END PATH2", '
+                       'e.g.: "NAME_ABC: 0.6 0.6:0.8 A, 0.3 0:1 B, 0.1 0:1 C" '
+                       '"NAME_CDE: 0.6 0.6:0.8 C, 0.3 0:1 D, 0.1 0:1 E". '
+                       'Test will be run on each of those groups independently',
+                       action=parse_data_paths)
+
+    class parse_data_paths_path(argparse.Action):
+        def __call__(self, parser, args, values, option_string=None):
+            expected_option_strings = ["--train-weighted-split-paths-path", "--valid-weighted-split-paths-path", "--test-weighted-split-paths-path"]
+            assert option_string in expected_option_strings, f"Expected {option_string} to be in {expected_option_strings}"
+
+            with open(values, "r") as fi:
+                lines = fi.readlines()
+                assert len(lines) == 1, f"Got {len(lines)} lines instead of the 1 expected"
+                assert lines[0][-2:] == "\"\n" and lines[0][0] == "\"", f"Invalid input format, got {lines}"
+                values = lines[0][1:-2].split("\" \"")
+                weighted_split_paths_dest = re.sub(r"_path$", "", self.dest)
+                weighted_split_paths_option = re.sub(r"-path$", "", self.option_strings[0])
+                setattr(args, weighted_split_paths_dest, values)
+                parse_data_paths(option_strings=[weighted_split_paths_option], dest=weighted_split_paths_dest)(parser, args, values, option_string=weighted_split_paths_option)
+
+    # option 2-bis: load the x-weighted-split-paths from a file, in case the argument is very long
+    group.add_argument('--train-weighted-split-paths-path', type=str, action=parse_data_paths_path, default=None)
+    group.add_argument('--valid-weighted-split-paths-path', type=str, action=parse_data_paths_path, default=None)
+    group.add_argument('--test-weighted-split-paths-path', type=str, action=parse_data_paths_path, default=None)
+
     group.add_argument('--vocab-file', type=str, default=None,
                        help='Path to the vocab file.')
     group.add_argument('--merge-file', type=str, default=None,
@@ -903,6 +1026,8 @@ def _add_data_args(parser):
                        help='Warm up mmap files.')
     group.add_argument('--num-workers', type=int, default=2,
                        help="Dataloader number of workers.")
+    group.add_argument('--valid-num-workers', type=int, default=2,
+                       help="Dataloader number of workers for validation.")
     group.add_argument('--tokenizer-type', type=str,
                        default=None,
                        choices=['BertWordPieceLowerCase',
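
To make the accepted group-string format concrete, the short standalone sketch below mirrors the string handling of parse_data_paths outside of argparse and runs it on the example group from the help text; it is an illustration of the parsing rules, not code from this commit.

# Standalone sketch of the parse_data_paths string handling (illustration only).
def parse_group(value):
    name = value.split(":")[0]
    prefix = ":".join(value.split(":")[1:]).strip()  # drop GIVEN_NAME
    datasets = prefix.split(",")                     # "WEIGHT START:END PATH" entries
    weights = [d.split()[0] for d in datasets]
    splits = [d.split()[1] for d in datasets]
    paths = [d.split()[2] for d in datasets]
    return name, weights, splits, paths

# Example group taken from the --valid-weighted-split-paths help text.
print(parse_group("NAME_ABC: 0.6 0.6:0.8 A, 0.3 0:1 B, 0.1 0:1 C"))
# -> ('NAME_ABC', ['0.6', '0.3', '0.1'], ['0.6:0.8', '0:1', '0:1'], ['A', 'B', 'C'])

The --*-weighted-split-paths-path variants expect the same group strings stored in a file: per parse_data_paths_path, the file must contain exactly one line that starts with a double quote and ends with a double quote followed by a newline, with the groups separated by a quote-space-quote sequence.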

megatron/data/data_samplers.py

Lines changed: 3 additions & 2 deletions
@@ -24,7 +24,7 @@
 from megatron import mpu
 
 
-def build_pretraining_data_loader(dataset, consumed_samples):
+def build_pretraining_data_loader(dataset, consumed_samples, num_workers=None):
     """Buld dataloader given an input dataset."""
 
     if dataset is None:
@@ -52,10 +52,11 @@ def build_pretraining_data_loader(dataset, consumed_samples):
         raise Exception('{} dataloader type is not supported.'.format(
                 args.dataloader_type))
 
+    num_workers = args.num_workers if num_workers is None else num_workers
     # Torch dataloader.
     return torch.utils.data.DataLoader(dataset,
                                        batch_sampler=batch_sampler,
-                                       num_workers=args.num_workers,
+                                       num_workers=num_workers,
                                        pin_memory=True)
 
 class MegatronPretrainingSampler:
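
Nothing in this file decides where the override comes from; a hypothetical call site (not part of this diff) could wire the new --valid-num-workers flag to a validation loader like this:

# Hypothetical helper, assuming the standard Megatron entry points; not part of this commit.
from megatron import get_args
from megatron.data.data_samplers import build_pretraining_data_loader

def build_valid_loader(valid_dataset, consumed_samples=0):
    args = get_args()
    # Falls back to args.num_workers inside build_pretraining_data_loader when None is passed.
    return build_pretraining_data_loader(valid_dataset, consumed_samples,
                                         num_workers=args.valid_num_workers)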

megatron/data/dataset_utils.py

Lines changed: 24 additions & 3 deletions
@@ -41,8 +41,7 @@
 DSET_TYPES = [DSET_TYPE_BERT, DSET_TYPE_ICT, DSET_TYPE_T5]
 
 
-def get_datasets_weights_and_num_samples(data_prefix,
-                                         train_valid_test_num_samples):
+def analyze_data_prefix(data_prefix):
 
     # The data prefix should be in the format of:
     #   weight-1, data-prefix-1, weight-2, data-prefix-2, ..
@@ -59,10 +58,16 @@ def get_datasets_weights_and_num_samples(data_prefix,
         weight_sum += weight
     assert weight_sum > 0.0
     weights = [weight / weight_sum for weight in weights]
+    return prefixes, weights
+
+
+def get_datasets_weights_and_num_samples(data_prefix,
+                                         train_valid_test_num_samples):
 
-    # Add 0.5% (the 1.005 factor) so in case the bleding dataset does
+    # Add 0.5% (the 1.005 factor) so in case the blending dataset does
     # not uniformly distribute the number of samples, we still have
     # samples left to feed to the network.
+    prefixes, weights = analyze_data_prefix(data_prefix)
     datasets_train_valid_test_num_samples = []
     for weight in weights:
         datasets_train_valid_test_num_samples.append(
@@ -603,6 +608,22 @@ def get_indexed_dataset_(data_prefix, data_impl, skip_warmup):
     return indexed_dataset
 
 
+def get_split_by_range_(range_string, size):
+    """ Get dataset splits based on a range:
+    range_string is in the form START:END, e.g. 0.2:0.8
+    outputs an array of two values [start_index, end_index]
+    """
+    # some checks that the range is given in the correct form
+    splits = [float(i) for i in range_string.split(":")]
+    assert len(splits) == 2, "splits should be passed as start:end"
+    assert splits[0] <= 1 and splits[1] <= 1
+    splits_sum = sum(splits)
+    assert splits_sum > 0.0
+    splits_index = [round(s * float(size)) for s in splits]
+    assert len(splits_index) == 2
+    return splits_index
+
+
 def get_train_valid_test_split_(splits_string, size):
     """ Get dataset splits from comma or '/' separated string list."""
 
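As a quick sanity check on the new helper: get_split_by_range_ simply scales the two fractions in range_string by the dataset size and rounds to document indices. The condensed re-implementation below (illustration only; its checks are slightly stricter than the original's) reproduces that arithmetic for the 0.6:0.8 validation and 0.8:1 test ranges used in the multilingual example script above, assuming a dataset of 10,000 documents.

# Condensed illustration of get_split_by_range_ (not the module code itself).
def split_by_range(range_string, size):
    start, end = (float(x) for x in range_string.split(":"))
    assert 0.0 <= start < end <= 1.0, "expected START:END fractions with START < END"
    return [round(start * size), round(end * size)]

print(split_by_range("0.6:0.8", 10_000))  # -> [6000, 8000]   documents used for validation
print(split_by_range("0.8:1", 10_000))    # -> [8000, 10000]  documents held out for test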