From a1c82a1a7c3d6256fdfd1c14269a628ed40bb95d Mon Sep 17 00:00:00 2001
From: prasad83
Date: Wed, 28 Jun 2023 23:39:04 +0530
Subject: [PATCH 1/5] Fixed RAdam import path

RAdam is the name used in setup.py.
---
 trainer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/trainer.py b/trainer.py
index ab7d560..7d69219 100644
--- a/trainer.py
+++ b/trainer.py
@@ -6,7 +6,7 @@
 import torch.nn as nn
 from torch.nn.parallel import DistributedDataParallel
 from torch.utils.tensorboard import SummaryWriter
 
-from radam import RAdam
+import RAdam
 
 from model import GPT, GPTLMHead, GPTClsHead

From cfe5b2f45367e5ea7f2305355d45e091e8dc1bf9 Mon Sep 17 00:00:00 2001
From: prasad83
Date: Wed, 28 Jun 2023 23:47:11 +0530
Subject: [PATCH 2/5] Show progress converting examples to features

Showing visual progress with tqdm while converting examples to features is useful.
---
 data_utils.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/data_utils.py b/data_utils.py
index 2bebc28..1ca0df6 100644
--- a/data_utils.py
+++ b/data_utils.py
@@ -5,6 +5,7 @@
 import torch
 import torch.distributed as dist
 from torch.utils.data import TensorDataset
+from tqdm import tqdm
 
 class PretrainInputExample:
     """A single example for unsupervised pre-training.
@@ -53,7 +54,7 @@ def convert_examples_to_features(examples,
 
     # Create features
     features = []
-    for i, example in enumerate(examples):
+    for i, example in enumerate(tqdm(examples)):
         tokens = tokenizer.tokenize(example.text)
         tokens = [bos_token] + tokens[:args.max_seq_len-2] + [eos_token] # BOS, EOS
         tokens += [pad_token] * (args.max_seq_len - len(tokens))
@@ -116,4 +117,4 @@ def create_examples(args, tokenizer, mode='train'):
     all_label_ids = torch.tensor([feature.label_id for feature in features], dtype=torch.long)
     dataset = TensorDataset(all_input_ids, all_label_ids)
 
-    return dataset
\ No newline at end of file
+    return dataset

From 94e4b426f8ba9087174c958e95166360f7f151a4 Mon Sep 17 00:00:00 2001
From: prasad83
Date: Thu, 29 Jun 2023 00:11:28 +0530
Subject: [PATCH 3/5] Fixed negative torch.device index

torch >= 2.x does not accept a negative index in torch.device.
---
 trainer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/trainer.py b/trainer.py
index 7d69219..30e89a3 100644
--- a/trainer.py
+++ b/trainer.py
@@ -36,7 +36,7 @@ def __init__(self, args, train_loader, test_loader, tokenizer):
         self.vocab_size = tokenizer.vocab_size
         self.pad_id = tokenizer.pad_token_id
         self.eos_id = tokenizer.eos_token_id
-        self.device = torch.device('cuda' if torch.cuda.is_available() and not args.no_cuda else 'cpu', args.local_rank)
+        self.device = torch.device('cuda' if torch.cuda.is_available() and not args.no_cuda else 'cpu', max(0, args.local_rank))
         self.writer = SummaryWriter() if args.local_rank in [-1, 0] else None
         self.n_gpus = torch.distributed.get_world_size() if args.distributed else torch.cuda.device_count()
         assert args.pretrain != args.finetune # Do not set both finetune and pretrain arguments to the same (True, False)

From 4f8d751bbc531711376666756ec98b12118ec32c Mon Sep 17 00:00:00 2001
From: prasad83
Date: Thu, 29 Jun 2023 00:55:01 +0530
Subject: [PATCH 4/5] Fixed pretrain modulo division by zero

If n_batches is less than 5, n_batches//5 is zero and the modulo operation raises a division-by-zero error.
---
 trainer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/trainer.py b/trainer.py
index 30e89a3..c44621d 100644
--- a/trainer.py
+++ b/trainer.py
@@ -101,7 +101,7 @@ def pretrain(self, epoch):
 
             if self.args.local_rank in [-1, 0]:
                 self.writer.add_scalar('Loss/pre-train', loss.item(), ((epoch-1)*n_batches)+i)
-            if i % (n_batches//5) == 0 and i != 0:
+            if n_batches > 5 && i % (n_batches//5) == 0 and i != 0:
                 print('Iteration {} ({}/{})\tLoss: {:.4f}'.format(i, i, n_batches, losses/i))
 
         print('Train Epoch {} [rank: {}]\t>\tLoss: {:.4f}'.format(epoch, self.args.local_rank, losses/n_batches))

From 8e864fa7c25289b9128f0a5790cde0965bbce720 Mon Sep 17 00:00:00 2001
From: prasad83
Date: Thu, 29 Jun 2023 00:56:11 +0530
Subject: [PATCH 5/5] Fixed modulo division by zero in pretrain and finetune

---
 trainer.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/trainer.py b/trainer.py
index c44621d..241ff53 100644
--- a/trainer.py
+++ b/trainer.py
@@ -101,7 +101,7 @@ def pretrain(self, epoch):
 
             if self.args.local_rank in [-1, 0]:
                 self.writer.add_scalar('Loss/pre-train', loss.item(), ((epoch-1)*n_batches)+i)
-            if n_batches > 5 && i % (n_batches//5) == 0 and i != 0:
+            if n_batches > 5 and i % (n_batches//5) == 0 and i != 0:
                 print('Iteration {} ({}/{})\tLoss: {:.4f}'.format(i, i, n_batches, losses/i))
 
         print('Train Epoch {} [rank: {}]\t>\tLoss: {:.4f}'.format(epoch, self.args.local_rank, losses/n_batches))
@@ -134,7 +134,7 @@ def finetune(self, epoch):
             if self.args.local_rank in [-1, 0]:
                 self.writer.add_scalar('Loss/fine-tune', loss.item(), ((epoch-1)*n_batches)+i)
                 self.writer.add_scalar('Accuracy/fine-tune', acc, ((epoch-1)*n_batches)+i)
-            if i % (n_batches//5) == 0 and i != 0:
+            if n_batches > 5 and i % (n_batches//5) == 0 and i != 0:
                 print('Iteration {} ({}/{})\tLoss: {:.4f} Acc: {:.1f}%'.format(i, i, n_batches, losses/i, accs/i*100.))
 
         print('Train Epoch {} [rank: {}]\t>\tLoss: {:.4f} / Acc: {:.1f}%'.format(epoch, self.args.local_rank, losses/n_batches, accs/n_batches*100.))
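
A minimal standalone sketch (separate from the patch series itself) of the guarded progress-logging condition that patches 4 and 5 converge on; the helper name should_log is hypothetical, while n_batches and i mirror the loop variables in trainer.py:

# Hypothetical helper illustrating the PATCH 4/5 guard; not part of trainer.py.
def should_log(i: int, n_batches: int) -> bool:
    # n_batches // 5 is 0 whenever n_batches < 5, so "i % (n_batches // 5)" alone
    # raises ZeroDivisionError; the n_batches > 5 check short-circuits before that.
    return n_batches > 5 and i % (n_batches // 5) == 0 and i != 0

print(should_log(10, 50))  # True: 50 // 5 == 10 and 10 % 10 == 0
print(should_log(3, 4))    # False: guard trips first, so no division by zero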