
Commit 99012bd

feat: add 4bit and 8bit quantization support with bitsandbytes
Add a quantization utility based on HfQuantizers. Modify the pipeline to accept a quantization_config. Lays the groundwork for allowing a bf16 VAE. Update requirements to include bitsandbytes. Closes #45, closes #64
1 parent ec1a9a2 commit 99012bd
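
For orientation, a minimal usage sketch of the new option. The 'bnb_4bit' shorthand and the from_pretrained signature come from the pipeline.py diff below; the pipeline class name and the generation arguments are assumptions based on the existing API:

    from OmniGen import OmniGenPipeline

    # 'bnb_4bit' is expanded to an NF4 BitsAndBytesConfig inside from_pretrained
    pipe = OmniGenPipeline.from_pretrained("Shitao/OmniGen-v1", quantization_config='bnb_4bit')
    images = pipe(prompt="a photo of a cat", height=512, width=512)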

File tree: 4 files changed (+69 −18 lines)


OmniGen/model.py

Lines changed: 20 additions & 7 deletions
@@ -12,9 +12,10 @@
 from huggingface_hub import snapshot_download
 from safetensors.torch import load_file
 from accelerate import init_empty_weights
+from transformers import BitsAndBytesConfig
 
 from OmniGen.transformer import Phi3Config, Phi3Transformer
-
+from OmniGen.utils import quantize_bnb
 
 def modulate(x, shift, scale):
     return x * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1)
@@ -187,9 +188,13 @@ def __init__(
 
         self.llm = Phi3Transformer(config=transformer_config)
         self.llm.config.use_cache = False
+
+        # bnb 4bit quantized models cannot be offloaded
+        self.offloadable = True
+        self.quantization_config = None
 
     @classmethod
-    def from_pretrained(cls, model_name: str|os.PathLike, dtype: torch.dtype = torch.bfloat16, low_cpu_mem_usage: bool = True,):
+    def from_pretrained(cls, model_name: str|os.PathLike, dtype: torch.dtype = torch.bfloat16, quantization_config: BitsAndBytesConfig = None, low_cpu_mem_usage: bool = True,):
         if not os.path.exists(model_name):
             cache_folder = os.getenv('HF_HUB_CACHE')
             model_name = snapshot_download(repo_id=model_name,
@@ -201,22 +206,30 @@ def from_pretrained(cls, model_name: str|os.PathLike, dtype: torch.dtype = torch
             model_path = os.path.join(model_name, 'model.pt')
             ckpt = torch.load(model_path, map_location='cpu')
         else:
-            print("Loading safetensors")
+            #print("Loading safetensors")
             ckpt = load_file(model_path, 'cpu')
 
         if low_cpu_mem_usage:
             with init_empty_weights():
                 config = Phi3Config.from_pretrained(model_name)
                 model = cls(config)
-
-            model.load_state_dict(ckpt, assign=True)
-            model = model.to(dtype)
+
+            if quantization_config:
+                model = quantize_bnb(model, ckpt, quantization_config=quantization_config, pre_quantized=False)
+                if getattr(quantization_config, 'load_in_4bit', None):
+                    model.offloadable = False
+                model.quantization_config = quantization_config
+            else:
+                model.load_state_dict(ckpt, assign=True)
         else:
+            if quantization_config:
+                raise ValueError('Quantization not supported for `low_cpu_mem_usage=False`.')
+
             config = Phi3Config.from_pretrained(model_name)
             model = cls(config)
             model.load_state_dict(ckpt)
-            model = model.to(dtype)
 
+        model = model.to(dtype)
         del ckpt
         torch.cuda.empty_cache()
        gc.collect()
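
Instead of a string preset, callers can pass an explicit BitsAndBytesConfig straight to the model class. A minimal sketch, mirroring the defaults the pipeline's 'bnb_4bit' preset uses:

    import torch
    from transformers import BitsAndBytesConfig
    from OmniGen import OmniGen

    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float32,  # same compute dtype as the 'bnb_4bit' preset
        bnb_4bit_quant_type='nf4',
        bnb_4bit_use_double_quant=False,
    )
    model = OmniGen.from_pretrained("Shitao/OmniGen-v1", dtype=torch.bfloat16,
                                    quantization_config=bnb_config)
    assert not model.offloadable  # from_pretrained clears the flag for 4-bit weights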

OmniGen/pipeline.py

Lines changed: 22 additions & 11 deletions
@@ -1,6 +1,6 @@
 import os
 import inspect
-from typing import Any, Callable, Dict, List, Optional, Union
+from typing import Any, Callable, Dict, List, Optional, Union, Literal
 import gc
 
 from PIL import Image
@@ -17,6 +17,7 @@
     scale_lora_layers,
     unscale_lora_layers,
 )
+from transformers import BitsAndBytesConfig
 from safetensors.torch import load_file
 
 from OmniGen import OmniGen, OmniGenProcessor, OmniGenScheduler
@@ -76,7 +77,7 @@ def __init__(
         self.model_cpu_offload = False
 
     @classmethod
-    def from_pretrained(cls, model_name, vae_path: str=None, device=None, low_cpu_mem_usage=True):
+    def from_pretrained(cls, model_name, vae_path: str=None, device=None, quantization_config:Literal['bnb_4bit','bnb_8bit']|BitsAndBytesConfig=None, low_cpu_mem_usage=True):
         if not os.path.exists(model_name) or (not os.path.exists(os.path.join(model_name, 'model.safetensors')) and model_name == "Shitao/OmniGen-v1"):
             logger.info("Model not found, downloading...")
             cache_folder = os.getenv('HF_HUB_CACHE')
@@ -87,8 +88,16 @@ def from_pretrained(cls, model_name, vae_path: str=None, device=None, low_cpu_me
 
         if device is None:
             device = best_available_device()
-
-        model = OmniGen.from_pretrained(model_name, dtype=torch.bfloat16, low_cpu_mem_usage=low_cpu_mem_usage)
+
+        if isinstance(quantization_config, str):
+            if quantization_config == 'bnb_4bit':
+                quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float32, bnb_4bit_quant_type='nf4', bnb_4bit_use_double_quant=False)
+            elif quantization_config == 'bnb_8bit':
+                quantization_config = BitsAndBytesConfig(load_in_8bit=True)
+            else:
+                raise NotImplementedError(f'Unknown `quantization_config` {quantization_config!r}')
+
+        model = OmniGen.from_pretrained(model_name, dtype=torch.bfloat16, quantization_config=quantization_config, low_cpu_mem_usage=low_cpu_mem_usage)
         processor = OmniGenProcessor.from_pretrained(model_name)
 
         if vae_path is None:
@@ -98,7 +107,7 @@ def from_pretrained(cls, model_name, vae_path: str=None, device=None, low_cpu_me
             logger.info(f"No VAE found in {model_name}, downloading stabilityai/sdxl-vae from HF")
             vae_path = "stabilityai/sdxl-vae"
 
-        vae = AutoencoderKL.from_pretrained(vae_path).to(device)
+        vae = AutoencoderKL.from_pretrained(vae_path)
 
         return cls(vae, model, processor, device)
 
@@ -131,7 +140,8 @@ def move_to_device(self, data):
 
     def enable_model_cpu_offload(self):
         self.model_cpu_offload = True
-        self.model.to("cpu")
+        if self.model.offloadable:
+            self.model.to("cpu")
         self.vae.to("cpu")
         torch.cuda.empty_cache() # Clear VRAM
         gc.collect() # Run garbage collection to free system RAM
@@ -221,6 +231,7 @@ def __call__(
         if max_input_image_size != self.processor.max_image_size:
             self.processor = OmniGenProcessor(self.processor.text_tokenizer, max_image_size=max_input_image_size)
         self.model.to(dtype)
+        #self.vae.to(dtype) # Uncomment this line to allow bfloat16 VAE
         if offload_model:
             self.enable_model_cpu_offload()
         else:
@@ -250,12 +261,12 @@ def __call__(
             for temp_pixel_values in input_data['input_pixel_values']:
                 temp_input_latents = []
                 for img in temp_pixel_values:
-                    img = self.vae_encode(img.to(self.device), dtype)
+                    img = self.vae_encode(img.to(self.vae.device, self.vae.dtype), dtype)
                     temp_input_latents.append(img)
                 input_img_latents.append(temp_input_latents)
         else:
             for img in input_data['input_pixel_values']:
-                img = self.vae_encode(img.to(self.device), dtype)
+                img = self.vae_encode(img.to(self.vae.device, self.vae.dtype), dtype)
                 input_img_latents.append(img)
         if input_images is not None and self.model_cpu_offload:
             self.vae.to('cpu')
@@ -279,7 +290,7 @@ def __call__(
         else:
             func = self.model.forward_with_cfg
 
-        if self.model_cpu_offload:
+        if self.model_cpu_offload and self.model.offloadable:
             for name, param in self.model.named_parameters():
                 if 'layers' in name and 'layers.0' not in name:
                     param.data = param.data.cpu()
@@ -294,13 +305,13 @@ def __call__(
         samples = scheduler(latents, func, model_kwargs, use_kv_cache=use_kv_cache, offload_kv_cache=offload_kv_cache)
         samples = samples.chunk((1+num_cfg), dim=0)[0]
 
-        if self.model_cpu_offload:
+        if self.model_cpu_offload and self.model.offloadable:
            self.model.to('cpu')
            torch.cuda.empty_cache()
            gc.collect()
 
        self.vae.to(self.device)
-        samples = samples.to(torch.float32)
+        samples = samples.to(self.vae.dtype)
        if self.vae.config.shift_factor is not None:
            samples = samples / self.vae.config.scaling_factor + self.vae.config.shift_factor
        else:
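
Note the offload interaction introduced above: 4-bit bitsandbytes weights cannot move between devices, so enable_model_cpu_offload and the per-layer offload path now consult model.offloadable and leave the transformer in place, moving only the VAE. A short sketch (8-bit models keep offloadable=True and behave as before):

    pipe = OmniGenPipeline.from_pretrained("Shitao/OmniGen-v1", quantization_config='bnb_4bit')
    pipe.enable_model_cpu_offload()  # offloadable is False for 4-bit, so only the VAE moves to CPU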

OmniGen/utils.py

Lines changed: 26 additions & 0 deletions
@@ -1,9 +1,14 @@
+import gc
 import logging
 
 from PIL import Image
 import torch
 import numpy as np
 
+from transformers import BitsAndBytesConfig
+from transformers.quantizers import AutoHfQuantizer
+from transformers.integrations import replace_with_bnb_linear, set_module_quantized_tensor_to_device
+
 def create_logger(logging_dir):
     """
     Create a logger that writes to a log file and stdout.
@@ -108,3 +113,24 @@ def vae_encode_list(vae, x, weight_dtype):
         latents.append(img)
     return latents
 
+
+
+@torch.no_grad()
+def quantize_bnb(meta_model, state_dict:dict, quantization_config:BitsAndBytesConfig, pre_quantized=False):
+    # from transformers.integrations import get_keys_to_not_convert
+
+    quantizer = AutoHfQuantizer.from_config(quantization_config, pre_quantized=pre_quantized)
+    no_convert = [] #get_keys_to_not_convert(meta_model.llm) # might be worth investigating
+
+    model = replace_with_bnb_linear(meta_model, modules_to_not_convert=no_convert, quantization_config=quantizer.quantization_config)
+
+    for param_name, param in state_dict.items():
+        if not quantizer.check_quantized_param(model, param, param_name, state_dict):
+            set_module_quantized_tensor_to_device(model, param_name, device=0, value=param)
+        else:
+            quantizer.create_quantized_param(model, param, param_name, target_device=0, state_dict=state_dict)
+
+    del state_dict
+    torch.cuda.empty_cache()
+    gc.collect()
+    return model
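
quantize_bnb materializes a meta-device model directly into quantized storage: replace_with_bnb_linear swaps nn.Linear modules for bitsandbytes equivalents, then each checkpoint tensor is streamed onto GPU 0, quantized when the quantizer claims it. A hedged sketch of standalone use, mirroring the call site in OmniGen.from_pretrained (the snapshot path is illustrative):

    from accelerate import init_empty_weights
    from safetensors.torch import load_file
    from transformers import BitsAndBytesConfig

    from OmniGen import OmniGen
    from OmniGen.transformer import Phi3Config
    from OmniGen.utils import quantize_bnb

    model_dir = "/path/to/OmniGen-v1"  # illustrative local snapshot

    with init_empty_weights():  # build the module on the meta device; no weight memory allocated
        model = OmniGen(Phi3Config.from_pretrained(model_dir))

    ckpt = load_file(f"{model_dir}/model.safetensors", "cpu")
    model = quantize_bnb(model, ckpt, quantization_config=BitsAndBytesConfig(load_in_8bit=True))
    # quantize_bnb hardcodes device 0, so the quantized weights land on cuda:0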

requirements.txt

Lines changed: 1 addition & 0 deletions
@@ -8,3 +8,4 @@ pillow==10.2.0
 peft==0.13.2
 diffusers==0.30.3
 timm==0.9.16
+bitsandbytes==0.44.1
