
Commit 675b957

pszemraj and Peter Szemraj authored
Config fixes (#13)
This PR validates important config attributes during training/export:
- token_ids during training and export
- dropout settings during export

---------

Signed-off-by: peter szemraj <peterszemraj@gmail.com>
Signed-off-by: Peter Szemraj <peterszemraj+dev@gmail.com>
Co-authored-by: Peter Szemraj <peterszemraj+dev@gmail.com>
1 parent 25604f4 commit 675b957
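Note (not part of the commit): a minimal sketch of how an exported directory could be sanity-checked after running the converter below, assuming the tokenizer was saved alongside the model; the export path is a placeholder.

# Hypothetical sanity check (not from this commit): confirm the exported config's
# token IDs match the tokenizer saved next to it. "path/to/exported_mpnet" is a
# placeholder for the converter's output directory.
from transformers import AutoConfig, AutoTokenizer

export_dir = "path/to/exported_mpnet"  # placeholder
config = AutoConfig.from_pretrained(export_dir)
tokenizer = AutoTokenizer.from_pretrained(export_dir)

for name in ("pad_token_id", "bos_token_id", "eos_token_id"):
    cfg_id, tok_id = getattr(config, name), getattr(tokenizer, name)
    assert cfg_id == tok_id, f"{name} mismatch: config={cfg_id}, tokenizer={tok_id}"
print("config and tokenizer token IDs agree")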

File tree: 3 files changed (+42, -16 lines changed)

annotated_mpnet/transformer_modules/positional_embedding.py

Lines changed: 2 additions & 4 deletions
@@ -32,9 +32,7 @@ def PositionalEmbedding(
 
     # If we specified "learned" to be True, we want to create a learned positional embedding module
     if learned:
-        # If we specify a padding index, we need to update the total number of embeddings
-        if padding_idx is not None:
-            num_embeddings = num_embeddings + padding_idx + 1
+        num_embeddings = num_embeddings + 2  # Add 2 for CLS and SEP
 
         # Instantiate the learned positional embeddings
         m = LearnedPositionalEmbedding(num_embeddings, embedding_dim, padding_idx)
@@ -48,7 +46,7 @@ def PositionalEmbedding(
     # Branch to create sinusoidal embeddings if "learned" is False
     else:
         m = SinusoidalPositionalEmbedding(
-            embedding_dim, padding_idx, init_size=num_embeddings + padding_idx + 1
+            embedding_dim, padding_idx, init_size=num_embeddings + 2  # Add 2 for CLS and SEP
         )
 
     return m
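A quick arithmetic illustration (not from the commit) of why the explicit "+ 2" matches the removed branch in the common case: MPNet-style tokenizers conventionally use padding index 1, so the old learned-embedding formula already produced the same table size; the new code simply makes the CLS/SEP offset unconditional.

# Illustration only: positional-embedding table size under the old and new code
# paths, assuming the conventional MPNet padding_idx of 1.
num_embeddings, padding_idx = 512, 1
old_size = num_embeddings + padding_idx + 1  # removed learned-embedding branch
new_size = num_embeddings + 2                # Add 2 for CLS and SEP
assert old_size == new_size == 514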

cli_tools/convert_pretrained_mpnet_to_hf_model.py

Lines changed: 26 additions & 5 deletions
@@ -50,7 +50,7 @@ def convert_mpnet_checkpoint_to_pytorch(
     # Load up the state dicts (one for the weights and one for the args) from the provided
     # serialization path
     with safe_globals([Namespace]):
-        state_dicts = torch.load(mpnet_checkpoint_path)
+        state_dicts = torch.load(mpnet_checkpoint_path, map_location="cpu")
 
     # Extract the model args so that we can properly set the config later on
     # Extract the weights so we can set them within the constructs of the model
@@ -80,9 +80,20 @@ def convert_mpnet_checkpoint_to_pytorch(
         max_position_embeddings=mpnet_args.max_positions + 2,
         relative_attention_num_buckets=mpnet_args.relative_attention_num_buckets,
         hidden_act=mpnet_args.activation_fn,
+        # Note: there are three dropouts in MPNetForPretraining, but only two in MPNetForMaskedLM
+        hidden_dropout_prob=mpnet_args.activation_dropout,
+        attention_probs_dropout_prob=mpnet_args.attention_dropout,
         layer_norm_eps=1e-5,
     )
 
+    # if the mpnet_args contain token_ids, ensure model config matches
+    if hasattr(mpnet_args, "pad_token_id"):
+        config.pad_token_id = mpnet_args.pad_token_id
+    if hasattr(mpnet_args, "bos_token_id"):
+        config.bos_token_id = mpnet_args.bos_token_id
+    if hasattr(mpnet_args, "eos_token_id"):
+        config.eos_token_id = mpnet_args.eos_token_id
+
     # Now load the model with randomized weights
     model = MPNetForMaskedLM(config)
 
@@ -210,17 +221,27 @@ def convert_mpnet_checkpoint_to_pytorch(
     pathlib.Path(pytorch_dump_folder_path).mkdir(parents=True, exist_ok=True)
     LOGGER.info(f"Saving model to {pytorch_dump_folder_path}")
 
-    # Now that the config and weights are loaded into the model class, we can use HF's builtin
-    # save_pretrained function to dump the appropriate contents to the provided dir path
-    model.save_pretrained(pytorch_dump_folder_path)
-
     if save_tokenizer and hasattr(mpnet_args, "tokenizer_name"):
         LOGGER.info(f"Saving tokenizer to {pytorch_dump_folder_path}")
         tokenizer = AutoTokenizer.from_pretrained(
             mpnet_args.tokenizer_name, model_max_length=mpnet_args.max_positions
         )
+
+        # Synchronize token IDs between tokenizer and model config
+        model.config.bos_token_id = tokenizer.bos_token_id
+        model.config.eos_token_id = tokenizer.eos_token_id
+        model.config.pad_token_id = tokenizer.pad_token_id
+        LOGGER.info(
+            f"Updated config with tokenizer IDs: BOS={tokenizer.bos_token_id}, "
+            f"EOS={tokenizer.eos_token_id}, PAD={tokenizer.pad_token_id}"
+        )
+
         tokenizer.save_pretrained(pytorch_dump_folder_path)
 
+    # Now that the config and weights are loaded into the model class, we can use HF's builtin
+    # save_pretrained function to dump the appropriate contents to the provided dir path
+    model.save_pretrained(pytorch_dump_folder_path)
+
     LOGGER.info("Done!")
 
 
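A possible post-export check (a sketch, not code from this commit) that the dropout and token-ID fields written above actually land in the saved config; the export path and the example training args are placeholders.

# Sketch: compare the exported HF config against the training args it was built
# from. The path and the Namespace values below are placeholders for illustration.
from argparse import Namespace
from transformers import MPNetConfig

mpnet_args = Namespace(activation_dropout=0.1, attention_dropout=0.1, pad_token_id=1)
exported = MPNetConfig.from_pretrained("path/to/exported_mpnet")  # placeholder path

assert exported.hidden_dropout_prob == mpnet_args.activation_dropout
assert exported.attention_probs_dropout_prob == mpnet_args.attention_dropout
assert exported.pad_token_id == mpnet_args.pad_token_id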

cli_tools/pretrain_mpnet.py

Lines changed: 14 additions & 7 deletions
@@ -30,13 +30,15 @@
 
 import wandb
 from annotated_mpnet.data import (
-    DataCollatorForMaskedPermutedLanguageModeling, HFStreamingDataset,
-    MPNetDataset, RandomSamplerWithSeed)
+    DataCollatorForMaskedPermutedLanguageModeling,
+    HFStreamingDataset,
+    MPNetDataset,
+    RandomSamplerWithSeed,
+)
 from annotated_mpnet.modeling import MPNetForPretraining
 from annotated_mpnet.scheduler import PolynomialDecayLRScheduler
 from annotated_mpnet.tracking import AverageMeter
-from annotated_mpnet.utils.utils import (SUPPORTED_ACTIVATIONS,
-                                         validate_tokenizer)
+from annotated_mpnet.utils.utils import SUPPORTED_ACTIVATIONS, validate_tokenizer
 
 
 def accuracy(output: torch.Tensor, target: torch.Tensor) -> int:
@@ -160,9 +162,9 @@ def main(args) -> None:
         args.tokenizer_name, model_max_length=args.max_tokens
     )
     is_valid, details = validate_tokenizer(tokenizer)
-    assert is_valid and details["whole_word_mask"], (
-        f"Invalid tokenizer: {args.tokenizer_name}. Debug w/ verbose output from validate_tokenizer()"
-    )
+    assert (
+        is_valid and details["whole_word_mask"]
+    ), f"Invalid tokenizer: {args.tokenizer_name}. Debug w/ verbose output from validate_tokenizer()"
 
     # Check and adjust model vocab_size for better GPU performance
     original_vocab_size = tokenizer.vocab_size
@@ -182,6 +184,11 @@ def main(args) -> None:
     args.original_vocab_size = original_vocab_size
     args.padded_vocab_size = original_vocab_size
 
+    # Explicitly store token IDs in args for consistent usage
+    args.pad_token_id = tokenizer.pad_token_id
+    args.bos_token_id = tokenizer.bos_token_id
+    args.eos_token_id = tokenizer.eos_token_id
+
     # -----------------------------------
 
     # Instantiate the tensorboard writers
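Taken together with the converter change above, the new args fields create a simple hand-off: pretraining records the tokenizer's IDs on the args Namespace that gets checkpointed, and export copies them onto the HF config only when present, so older checkpoints without these fields still convert. A self-contained sketch of that hand-off (the token ID values shown are the usual MPNet defaults, for illustration only):

from argparse import Namespace

# pretrain_mpnet.py side: store the tokenizer's IDs on args (illustrative values)
args = Namespace()
args.pad_token_id, args.bos_token_id, args.eos_token_id = 1, 0, 2

# convert_pretrained_mpnet_to_hf_model.py side: copy them onto the config if present
config = Namespace(pad_token_id=None, bos_token_id=None, eos_token_id=None)
for name in ("pad_token_id", "bos_token_id", "eos_token_id"):
    if hasattr(args, name):
        setattr(config, name, getattr(args, name))

assert (config.pad_token_id, config.bos_token_id, config.eos_token_id) == (1, 0, 2)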

0 commit comments
