Save load fix (#8)

pszemraj · Peter Szemraj · web-flow · commit 4a0fcfa951ff · 2025-03-05T01:26:08.000-05:00
Fixes checkpoint serialization (save/load) issues, both by registering the
required safe globals for loading and by handling checkpoints saved from a compiled model.

---------

Signed-off-by: Peter Szemraj <peterszemraj+dev@gmail.com>
Co-authored-by: Peter Szemraj <peterszemraj+dev@gmail.com>
diff --git a/cli_tools/convert_pretrained_mpnet_to_hf_model.py b/cli_tools/convert_pretrained_mpnet_to_hf_model.py
@@ -6,7 +6,10 @@
 doing here.
 """
 
+import argparse
 import logging
+import pathlib
+from argparse import Namespace
 
 from rich.logging import RichHandler
 
@@ -16,10 +19,9 @@
 )
 LOGGER = logging.getLogger(__name__)
 
-import argparse
-import pathlib
 
 import torch
+from torch.serialization import safe_globals
 from transformers import MPNetConfig, MPNetForMaskedLM
 from transformers.models.mpnet import MPNetLayer
 from transformers.utils import logging as hf_logging
@@ -45,12 +47,21 @@ def convert_mpnet_checkpoint_to_pytorch(
 
     # Load up the state dicts (one for the weights and one for the args) from the provided
     # serialization path
-    state_dicts = torch.load(mpnet_checkpoint_path)
+    with safe_globals([Namespace]):
+        state_dicts = torch.load(mpnet_checkpoint_path)
 
     # Extract the model args so that we can properly set the config later on
     # Extract the weights so we can set them within the constructs of the model
     mpnet_args = state_dicts["args"]
+    if isinstance(mpnet_args, dict):
+        mpnet_args = Namespace(**mpnet_args)
+
     mpnet_weight = state_dicts["model_states"]
+    # Fix for torch.compile() _orig_mod prefix
+    mpnet_weight = {k.replace("_orig_mod.", ""): v for k, v in mpnet_weight.items()}
+
+    print("Keys after removing _orig_mod prefix (if present):")
+    print(list(mpnet_weight.keys())[:5])  # Print first few keys to verify
 
     # Now we use the args (and one componennt of the weight to get the vocab size) to set the
     # MPNetConfig object, which will properly instantiate the MPNetForMaskedLM model to the specs
@@ -205,8 +216,9 @@ def cli_main():
     """
     Wrapper function so we can define a CLI entrypoint when setting up this package
     """
-    parser = argparse.ArgumentParser()
-    # Required parameters
+    parser = argparse.ArgumentParser(
+        description="Convert MPNet .pt checkpoint to Huggingface model"
+    )
     parser.add_argument(
         "--mpnet-checkpoint-path",
         default=None,
diff --git a/cli_tools/pretrain_mpnet.py b/cli_tools/pretrain_mpnet.py
@@ -2,8 +2,13 @@
 Pretraining script for MPNet
 """
 
+import argparse
+import gc
 import logging
+import math
+import os
 import sys
+from argparse import Namespace
 
 from rich.logging import RichHandler
 
@@ -13,15 +18,12 @@
 )
 LOGGER = logging.getLogger(__name__)
 
-import argparse
-import gc
-import math
-import os
 
 import torch
 import torch.nn.functional as F
 from datasets import load_dataset
 from rich.progress import track
+from torch.serialization import safe_globals
 from torch.utils.tensorboard import SummaryWriter
 from transformers import AutoTokenizer
 
@@ -397,7 +399,7 @@ def main(args) -> None:
                 and steps > 0
             ):
                 torch.save(
-                    {"args": args, "model_states": model.state_dict()},
+                    {"args": vars(args), "model_states": model.state_dict()},
                     os.path.join(args.checkpoint_dir, f"checkpoint{steps + 1}.pt"),
                 )
 
@@ -595,7 +597,7 @@ def main(args) -> None:
 
             # Now let's go ahead and save this in the checkpoints directory
             torch.save(
-                {"args": args, "model_states": model.state_dict()},
+                {"args": vars(args), "model_states": model.state_dict()},
                 os.path.join(args.checkpoint_dir, "best_checkpoint.pt"),
             )
 
@@ -630,13 +632,23 @@ def main(args) -> None:
     # use the test dataloader we built above to get a final test metric using the best checkpoint
 
     # Begin by loading the model states and args from the best checkpoint
-    dicts = torch.load(os.path.join(args.checkpoint_dir, "best_checkpoint.pt"))
+    with safe_globals([Namespace]):
+        dicts = torch.load(os.path.join(args.checkpoint_dir, "best_checkpoint.pt"))
+
+    # Handle args that might be dict or Namespace
+    loaded_args = dicts["args"]
+    if isinstance(loaded_args, dict):
+        loaded_args = Namespace(**loaded_args)
+
+    # Handle potential _orig_mod prefix in state dict from compiled models
+    model_states = dicts["model_states"]
+    model_states = {k.replace("_orig_mod.", ""): v for k, v in model_states.items()}
 
     # Load an empty shell of the model architecture using those args
-    test_model = MPNetForPretraining(dicts["args"], tokenizer)
+    test_model = MPNetForPretraining(loaded_args, tokenizer)
 
     # Now apply the model states to this newly instantiated model
-    test_model.load_state_dict(dicts["model_states"])
+    test_model.load_state_dict(model_states)
 
     # Finally make sure the model is in eval mode and is sent to the proper device
     test_model.to(device)
diff --git a/setup.py b/setup.py
@@ -57,7 +57,7 @@ def include_dirs(self, dirs):
 
 setup(
     name="annotated_mpnet",
-    version="0.1.1",
+    version="0.1.2",
     description="Raw Torch, heavily annotated, pretrainable MPNet",
     url="https://github.com/pszemraj/annotated-mpnet",
     long_description=readme,