Skip to content

Commit 17217f8

Browse files
committed
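Fix blended dataset size in dataset groups: size the BlendableDataset by the requested number of samples for the split instead of the sum of the component dataset lengths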
1 parent 04031a8 commit 17217f8

File tree

1 file changed

+2
-2
lines changed

1 file changed

+2
-2
lines changed

megatron/data/gpt_dataset.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -106,6 +106,7 @@ def build_dataset_group(dataset_group_name, paths, weights, splits, data_impl,
106106
'''
107107

108108
assert train_valid_test in ["train","valid","test"]
109+
index = ["train","valid","test"].index(train_valid_test)
109110

110111
# Single dataset.
111112
if len(paths) == 1:
@@ -145,8 +146,7 @@ def build_dataset_group(dataset_group_name, paths, weights, splits, data_impl,
145146
assert ds is not None, \
146147
f"Got an empty split when trying to create dataset: {prefixes[i], splits[i]}"
147148
datasets.append(ds)
148-
total_size = sum(len(ds) for ds in datasets)
149-
all_datasets = BlendableDataset(datasets, weights, total_size)
149+
all_datasets = BlendableDataset(datasets, weights, train_valid_test_num_samples[index])
150150

151151
return all_datasets
152152

0 commit comments

Comments
 (0)