This repository was archived by the owner on Oct 25, 2024. It is now read-only.

Commit 4212741

[LLM Runtime] Remove use_cache in WOQ (#818)
1 parent 5e607e6 commit 4212741

5 files changed: +13 additions, -15 deletions

intel_extension_for_transformers/llm/runtime/graph/README.md

Lines changed: 0 additions & 1 deletion
@@ -128,7 +128,6 @@ Argument description of WeightOnlyQuantConfig:
 | scale_dtype | String | Data type of scales: fp32/bf16 (default fp32) |
 | use_ggml | Bool | Enable ggml for quantization and inference (default: False) |
 | use_quant | Bool | Determine whether or not the model will be quantized. (default: True) |
-| use_cache | Bool | Use local quantized model if file exists (default: False) |
 
 Argument description of generate function:
 | Argument | Type | Description |
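
After this change, use_cache is no longer an accepted argument of WeightOnlyQuantConfig: an already-quantized binary on disk is reused automatically, and deleting that file forces a rebuild. A minimal usage sketch of the post-change API, assuming the intel_extension_for_transformers.transformers imports used by the tests in this commit (the model id is a placeholder):

import torch
from intel_extension_for_transformers.transformers import AutoModel, WeightOnlyQuantConfig

model_name = "meta-llama/Llama-2-7b-hf"  # placeholder; any supported Hugging Face model id

# No use_cache flag anymore: if the quantized .bin already exists it is reused,
# otherwise the model is converted and quantized during this call.
woq_config = WeightOnlyQuantConfig(compute_dtype="int8", weight_dtype="int4")
model = AutoModel.from_pretrained(model_name,
                                  quantization_config=woq_config,
                                  use_llm_runtime=True,
                                  trust_remote_code=True)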

intel_extension_for_transformers/llm/runtime/graph/__init__.py

Lines changed: 7 additions & 5 deletions
@@ -75,7 +75,7 @@ def get_model_type(model_config):
             model_type = "chatglm2"
         return model_type
 
-    def init(self, model_name, use_quant=True, use_cache=False, use_gptq=False, **quant_kwargs):
+    def init(self, model_name, use_quant=True, use_gptq=False, **quant_kwargs):
         self.config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
         self.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
         self.model_type = Model.get_model_type(self.config)
@@ -106,15 +106,18 @@ def init(self, model_name, use_quant=True, use_cache=False, use_gptq=False, **qu
             self.bin_file = fp32_bin
         else:
             self.bin_file = quant_bin
-        if use_cache and os.path.exists(self.bin_file):
+
+        if os.path.exists(self.bin_file):
+            print("{} existed, will use cache file. Otherwise please remove the file".
+                  format(self.bin_file))
             return
 
         if use_gptq:
             convert_model(model_name, quant_bin, "f32")
             return
 
 
-        if not use_cache or not os.path.exists(fp32_bin):
+        if not os.path.exists(fp32_bin):
             convert_model(model_name, fp32_bin, "f32")
         assert os.path.exists(fp32_bin), "Fail to convert pytorch model"
 
@@ -125,8 +128,7 @@ def init(self, model_name, use_quant=True, use_cache=False, use_gptq=False, **qu
         assert os.path.exists(quant_bin), "Fail to quantize model"
 
         # clean
-        if not use_cache:
-            os.remove(fp32_bin)
+        os.remove(fp32_bin)
 
     def init_from_bin(self, model_type, model_path, **generate_kwargs):
         self.__import_package(model_type)
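
Taken together, the three hunks above change Model.init() so that an existing binary at self.bin_file is always reused (previously this required use_cache=True) and the intermediate fp32 dump is always removed after quantization. A condensed, hypothetical sketch of the resulting control flow; convert_model and quantize_model stand in for the real helpers, and the path arguments are placeholders, not the actual method:

import os

def prepare_bin(fp32_bin, quant_bin, bin_file, use_gptq, convert_model, quantize_model):
    # Approximation of the post-commit Model.init() flow (illustrative only).
    if os.path.exists(bin_file):
        # Cached binary is reused unconditionally; delete the file to force a rebuild.
        print("{} existed, will use cache file. Otherwise please remove the file".format(bin_file))
        return
    if use_gptq:
        convert_model(quant_bin, "f32")
        return
    if not os.path.exists(fp32_bin):
        convert_model(fp32_bin, "f32")
    assert os.path.exists(fp32_bin), "Fail to convert pytorch model"
    quantize_model(fp32_bin, quant_bin)
    assert os.path.exists(quant_bin), "Fail to quantize model"
    # clean: the fp32 intermediate is now always removed (no use_cache gate).
    os.remove(fp32_bin)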

intel_extension_for_transformers/llm/runtime/graph/tests/test_llm_runtime.py

Lines changed: 6 additions & 6 deletions
@@ -55,7 +55,7 @@ def test_llm_runtime(self):
         print(tokenizer.decode(pt_generate_ids))
 
         # check output ids
-        woq_config = WeightOnlyQuantConfig(use_cache=True, use_quant=False)
+        woq_config = WeightOnlyQuantConfig(use_quant=False)
         itrex_model = AutoModel.from_pretrained(model_name, quantization_config=woq_config, use_llm_runtime=True, trust_remote_code=True)
         itrex_generate_ids = itrex_model.generate(inputs.input_ids, do_sample=False, max_new_tokens=100)[0]
         print(tokenizer.decode(itrex_generate_ids))
@@ -64,10 +64,10 @@ def test_llm_runtime(self):
 
         # check diff of logits
         woq_configs = {
-            "fp32": WeightOnlyQuantConfig(use_cache=True, use_quant=False),
-            # "ggml_int4": WeightOnlyQuantConfig(compute_dtype="int8", weight_dtype="int4", use_cache=True, use_ggml=True),
-            "jblas_int4": WeightOnlyQuantConfig(compute_dtype="int8", weight_dtype="int4", use_cache=True),
-            # "jblas_int8": WeightOnlyQuantConfig(compute_dtype="bf16", weight_dtype="int8", use_cache=True),
+            "fp32": WeightOnlyQuantConfig(use_quant=False),
+            # "ggml_int4": WeightOnlyQuantConfig(compute_dtype="int8", weight_dtype="int4",use_ggml=True),
+            "jblas_int4": WeightOnlyQuantConfig(compute_dtype="int8", weight_dtype="int4"),
+            # "jblas_int8": WeightOnlyQuantConfig(compute_dtype="bf16", weight_dtype="int8"),
         }
         for config_type in woq_configs:
             itrex_model = AutoModel.from_pretrained(model_name, quantization_config=woq_configs[config_type],
@@ -98,7 +98,7 @@ def test_beam_search(self):
         pt_generate_ids = torch.load("/tf_dataset2/inc-ut/nlptoolkit_ut_model/beam_pt_generate_ids.pth").tolist()
 
         # llm runtime fp32
-        woq_config = WeightOnlyQuantConfig(use_quant=False, use_cache=True)
+        woq_config = WeightOnlyQuantConfig(use_quant=False)
         itrex_model = AutoModelForCausalLM.from_pretrained(
             model_name, quantization_config=woq_config, trust_remote_code=True)
         itrex_generate_ids = itrex_model.generate(

intel_extension_for_transformers/transformers/modeling/modeling_auto.py

Lines changed: 0 additions & 1 deletion
@@ -184,7 +184,6 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
                 compute_dtype=quantization_config.compute_dtype,
                 use_ggml=quantization_config.use_ggml,
                 use_quant=quantization_config.use_quant,
-                use_cache=quantization_config.use_cache,
                 use_gptq=quantization_config.use_gptq,
             )
             return model

intel_extension_for_transformers/transformers/utils/quantization_config.py

Lines changed: 0 additions & 2 deletions
@@ -43,7 +43,6 @@ def __init__(
         algorithm="RTN",
         use_ggml=False,
         use_quant=True,
-        use_cache=False,
         use_gptq=False,
         **kwargs,
     ):
@@ -70,7 +69,6 @@ def __init__(
         self.calib_iters = kwargs.pop("calib_iters", 100)
         self.use_ggml = use_ggml
         self.use_quant = use_quant
-        self.use_cache = use_cache
         self.use_gptq = use_gptq
 
         if compute_dtype is None:
