From a1c82a1a7c3d6256fdfd1c14269a628ed40bb95d Mon Sep 17 00:00:00 2001
From: prasad83
Date: Wed, 28 Jun 2023 23:39:04 +0530
Subject: [PATCH 1/5] Fixed RAdam import path

RAdam is the name used in setup.py.
---
 trainer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/trainer.py b/trainer.py
index ab7d560..7d69219 100644
--- a/trainer.py
+++ b/trainer.py
@@ -6,7 +6,7 @@
 import torch.nn as nn
 from torch.nn.parallel import DistributedDataParallel
 from torch.utils.tensorboard import SummaryWriter
 
-from radam import RAdam
+import RAdam
 
 from model import GPT, GPTLMHead, GPTClsHead

From cfe5b2f45367e5ea7f2305355d45e091e8dc1bf9 Mon Sep 17 00:00:00 2001
From: prasad83
Date: Wed, 28 Jun 2023 23:47:11 +0530
Subject: [PATCH 2/5] Show progress converting examples to features

Showing visual progress with tqdm while converting examples to features is useful.
---
 data_utils.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/data_utils.py b/data_utils.py
index 2bebc28..1ca0df6 100644
--- a/data_utils.py
+++ b/data_utils.py
@@ -5,6 +5,7 @@
 import torch
 import torch.distributed as dist
 from torch.utils.data import TensorDataset
+from tqdm import tqdm
 
 class PretrainInputExample:
     """A single example for unsupervised pre-training.
@@ -53,7 +54,7 @@ def convert_examples_to_features(examples,
 
     # Create features
     features = []
-    for i, example in enumerate(examples):
+    for i, example in enumerate(tqdm(examples)):
         tokens = tokenizer.tokenize(example.text)
         tokens = [bos_token] + tokens[:args.max_seq_len-2] + [eos_token] # BOS, EOS
         tokens += [pad_token] * (args.max_seq_len - len(tokens))
@@ -116,4 +117,4 @@ def create_examples(args, tokenizer, mode='train'):
     all_label_ids = torch.tensor([feature.label_id for feature in features], dtype=torch.long)
     dataset = TensorDataset(all_input_ids, all_label_ids)
 
-    return dataset
\ No newline at end of file
+    return dataset

From 94e4b426f8ba9087174c958e95166360f7f151a4 Mon Sep 17 00:00:00 2001
From: prasad83
Date: Thu, 29 Jun 2023 00:11:28 +0530
Subject: [PATCH 3/5] Fixed negative torch.device index

torch >= 2.x does not accept a negative index in torch.device.
---
 trainer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/trainer.py b/trainer.py
index 7d69219..30e89a3 100644
--- a/trainer.py
+++ b/trainer.py
@@ -36,7 +36,7 @@ def __init__(self, args, train_loader, test_loader, tokenizer):
         self.vocab_size = tokenizer.vocab_size
         self.pad_id = tokenizer.pad_token_id
         self.eos_id = tokenizer.eos_token_id
-        self.device = torch.device('cuda' if torch.cuda.is_available() and not args.no_cuda else 'cpu', args.local_rank)
+        self.device = torch.device('cuda' if torch.cuda.is_available() and not args.no_cuda else 'cpu', max(0, args.local_rank))
         self.writer = SummaryWriter() if args.local_rank in [-1, 0] else None
         self.n_gpus = torch.distributed.get_world_size() if args.distributed else torch.cuda.device_count()
         assert args.pretrain != args.finetune # Do not set both finetune and pretrain arguments to the same (True, False)

From 4f8d751bbc531711376666756ec98b12118ec32c Mon Sep 17 00:00:00 2001
From: prasad83
Date: Thu, 29 Jun 2023 00:55:01 +0530
Subject: [PATCH 4/5] Fixed pretrain modulo division by zero

If n_batches is less than 5, n_batches//5 is zero and the modulo operation raises a division-by-zero error.
---
 trainer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/trainer.py b/trainer.py
index 30e89a3..c44621d 100644
--- a/trainer.py
+++ b/trainer.py
@@ -101,7 +101,7 @@ def pretrain(self, epoch):
 
             if self.args.local_rank in [-1, 0]:
                 self.writer.add_scalar('Loss/pre-train', loss.item(), ((epoch-1)*n_batches)+i)
-            if i % (n_batches//5) == 0 and i != 0:
+            if n_batches > 5 && i % (n_batches//5) == 0 and i != 0:
                 print('Iteration {} ({}/{})\tLoss: {:.4f}'.format(i, i, n_batches, losses/i))
 
         print('Train Epoch {} [rank: {}]\t>\tLoss: {:.4f}'.format(epoch, self.args.local_rank, losses/n_batches))

From 8e864fa7c25289b9128f0a5790cde0965bbce720 Mon Sep 17 00:00:00 2001
From: prasad83
Date: Thu, 29 Jun 2023 00:56:11 +0530
Subject: [PATCH 5/5] Fixed modulo division by zero in pretrain and finetune

---
 trainer.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/trainer.py b/trainer.py
index c44621d..241ff53 100644
--- a/trainer.py
+++ b/trainer.py
@@ -101,7 +101,7 @@ def pretrain(self, epoch):
 
             if self.args.local_rank in [-1, 0]:
                 self.writer.add_scalar('Loss/pre-train', loss.item(), ((epoch-1)*n_batches)+i)
-            if n_batches > 5 && i % (n_batches//5) == 0 and i != 0:
+            if n_batches > 5 and i % (n_batches//5) == 0 and i != 0:
                 print('Iteration {} ({}/{})\tLoss: {:.4f}'.format(i, i, n_batches, losses/i))
 
         print('Train Epoch {} [rank: {}]\t>\tLoss: {:.4f}'.format(epoch, self.args.local_rank, losses/n_batches))
@@ -134,7 +134,7 @@ def finetune(self, epoch):
             if self.args.local_rank in [-1, 0]:
                 self.writer.add_scalar('Loss/fine-tune', loss.item(), ((epoch-1)*n_batches)+i)
                 self.writer.add_scalar('Accuracy/fine-tune', acc, ((epoch-1)*n_batches)+i)
-            if i % (n_batches//5) == 0 and i != 0:
+            if n_batches > 5 and i % (n_batches//5) == 0 and i != 0:
                 print('Iteration {} ({}/{})\tLoss: {:.4f} Acc: {:.1f}%'.format(i, i, n_batches, losses/i, accs/i*100.))
 
         print('Train Epoch {} [rank: {}]\t>\tLoss: {:.4f} / Acc: {:.1f}%'.format(epoch, self.args.local_rank, losses/n_batches, accs/n_batches*100.))
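
A minimal standalone sketch (separate from the patch series itself) of the guarded progress-logging condition that patches 4 and 5 converge on; the helper name should_log is hypothetical, while n_batches and i mirror the loop variables in trainer.py:

# Hypothetical helper illustrating the PATCH 4/5 guard; not part of trainer.py.
def should_log(i: int, n_batches: int) -> bool:
    # n_batches // 5 is 0 whenever n_batches < 5, so "i % (n_batches // 5)" alone
    # raises ZeroDivisionError; the n_batches > 5 check short-circuits before that.
    return n_batches > 5 and i % (n_batches // 5) == 0 and i != 0

print(should_log(10, 50))  # True: 50 // 5 == 10 and 10 % 10 == 0
print(should_log(3, 4))    # False: guard trips first, so no division by zero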