Commit cec50e3

Peter Szemraj authored

pad vocab for CUDA (#10)

CUDA likes the embedding layer (vocab) to be a multiple of 64/128, so we pad it accordingly.

Signed-off-by: Peter Szemraj <peterszemraj+dev@gmail.com>
Co-authored-by: Peter Szemraj <peterszemraj+dev@gmail.com>
1 parent ff329e4 commit cec50e3
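
For context, the padding is plain ceiling division to the next multiple of 128. The sketch below mirrors the formula the commit adds in cli_tools/pretrain_mpnet.py; the 30,527 vocab size in the example is an assumption for microsoft/mpnet-base and is only illustrative.

# Illustrative sketch of the rounding this commit applies (not repo code).
def pad_vocab_size(vocab_size: int, multiple: int = 128) -> int:
    """Round vocab_size up to the nearest multiple of `multiple`."""
    return ((vocab_size + multiple - 1) // multiple) * multiple

# Assumed example: a 30,527-token vocab pads to 30,592 (the next multiple of 128).
print(pad_vocab_size(30527))  # 30592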

File tree

2 files changed: +26 / -5 lines

annotated_mpnet/modeling/mpnet_for_pretraining.py

Lines changed: 7 additions & 4 deletions

@@ -52,11 +52,14 @@ class MPNetForPretraining(nn.Module):
     def __init__(self, args, tokenizer) -> None:
         super().__init__()
 
+        # Use padded_vocab_size if available, otherwise use the tokenizer's vocab_size
+        vocab_size = getattr(args, "padded_vocab_size", tokenizer.vocab_size)
+
         # Let's define the encoder here
         self.args = args
         self.sentence_encoder = SentenceEncoder(
             padding_idx=tokenizer.vocab[tokenizer.pad_token],
-            vocab_size=tokenizer.vocab_size,
+            vocab_size=vocab_size,  # Use the padded vocab size
             num_encoder_layers=args.encoder_layers,
             embedding_dim=args.encoder_embed_dim,
             ffn_embedding_dim=args.encoder_ffn_dim,
@@ -71,15 +74,15 @@ def __init__(self, args, tokenizer) -> None:
             normalize_before=args.normalize_before,
         )
 
-        # Add the language modeling head so that we can do pretraining
+        # Add the language modeling head
         self.lm_head = MPNetLMHead(
             embed_dim=args.encoder_embed_dim,
-            output_dim=tokenizer.vocab_size,
+            output_dim=vocab_size,  # Use the padded vocab size
             activation_fn=args.activation_fn,
             weight=self.sentence_encoder.embed_tokens.weight,
         )
 
-        # Finally initialize the weights according to the guidelines in the original BERT paper
+        # Initialize the weights
         self.apply(init_final_params)
 
     def output_layer(
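
A minimal sketch (not from the repo) of why the getattr fallback matters: an args namespace created before this change has no padded_vocab_size attribute, so the model falls back to the tokenizer's native vocab size, while a post-change namespace picks up the padded value. FakeTokenizer and both sizes here are assumptions for illustration.

# Minimal sketch of the getattr fallback; FakeTokenizer and the sizes are assumptions.
from argparse import Namespace

class FakeTokenizer:
    vocab_size = 30527  # assumed native vocab size, illustrative only

tok = FakeTokenizer()
old_args = Namespace()                         # pre-change args: no padded_vocab_size
new_args = Namespace(padded_vocab_size=30592)  # post-change args: padded size stored

print(getattr(old_args, "padded_vocab_size", tok.vocab_size))  # 30527 (fallback)
print(getattr(new_args, "padded_vocab_size", tok.vocab_size))  # 30592 (padded)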

cli_tools/pretrain_mpnet.py

Lines changed: 19 additions & 1 deletion

@@ -139,7 +139,25 @@ def main(args) -> None:
     # Now let's instantiate the tokenizer
     tokenizer = AutoTokenizer.from_pretrained("microsoft/mpnet-base")
 
-    # Instantiate the tensorboard writers here as well
+    # Check and adjust vocab_size parameter for better GPU performance
+    original_vocab_size = tokenizer.vocab_size
+    target_vocab_size = (
+        (original_vocab_size + 127) // 128
+    ) * 128  # Round up to nearest multiple of 128
+
+    if target_vocab_size > original_vocab_size:
+        LOGGER.info(
+            f"Padding model's vocab_size from {original_vocab_size} to {target_vocab_size} "
+            "(div. by 128) for GPU performance"
+        )
+        # Store both sizes in args for reference during conversion
+        args.original_vocab_size = original_vocab_size
+        args.padded_vocab_size = target_vocab_size
+    else:
+        args.original_vocab_size = original_vocab_size
+        args.padded_vocab_size = original_vocab_size
+
+    # Instantiate the tensorboard writers here
     if args.tensorboard_log_dir is not None:
         writers = {
             "train": SummaryWriter(os.path.join(args.tensorboard_log_dir, "train")),
