
Commit 37e0cc3

Authored by patrickhyw and Ubuntu

codellama-7b working on GPUs with 24GB memory (#9)

* now only calling initialize_past_key_values once in rest_test.py
* fixed readme typos

Co-authored-by: Ubuntu <ubuntu@ip-172-31-23-53.ec2.internal>

1 parent: a5b8fed

File tree

2 files changed: +19 −9 lines

README.md

Lines changed: 3 additions & 3 deletions

@@ -73,7 +73,7 @@ python3 get_datastore_chat.py --model-path lmsys/vicuna-7b-v1.5 # get datastore_
 Build a Python code generation datastore from [The Stack](https://huggingface.co/datasets/bigcode/the-stack) within 20 minutes (requires 924MB disk storage)
 ```bash
 cd datastore
-python3 get_datastore_code.py --model-path codellama/CodeLlama-7b-instruct-hf # get datastore_code_small.idx in this folder
+python3 get_datastore_code.py --model-path codellama/CodeLlama-7b-instruct-hf # get datastore_stack_small.idx in this folder
 ```

 ### Build a large one

@@ -85,7 +85,7 @@ python3 get_datastore_chat.py --model-path lmsys/vicuna-7b-v1.5 --large-datastor
 (optionally) Build a Python code generation datastore from [The Stack](https://huggingface.co/datasets/bigcode/the-stack) (requires 27GB disk storage)
 ```bash
 cd datastore
-python3 get_datastore_code.py --model-path codellama/CodeLlama-7b-instruct-hf --large-datastore True # get datastore_code_large.idx in this folder
+python3 get_datastore_code.py --model-path codellama/CodeLlama-7b-instruct-hf --large-datastore True # get datastore_stack_large.idx in this folder
 ```

 ## Inference

@@ -99,7 +99,7 @@ RAYON_NUM_THREADS=6 CUDA_VISIBLE_DEVICES=0 python3 gen_model_answer_rest.py --mo
 ### Inference on HumanEval
 ```bash
 cd human_eval
-RAYON_NUM_THREADS=6 CUDA_VISIBLE_DEVICES=0 python3 rest_test.py --model-path codellama/CodeLlama-7b-instruct-hf --datastore-path ../datastore/datastore_code_small.idx
+RAYON_NUM_THREADS=6 CUDA_VISIBLE_DEVICES=0 python3 rest_test.py --model-path codellama/CodeLlama-7b-instruct-hf --datastore-path ../datastore/datastore_stack_small.idx
 ```

 ### Free Chat
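Before launching inference, it can help to confirm that the renamed index file is actually present. A minimal shell sketch (the path assumes the repository layout shown in the README diff above; the stand-in file is created here for demonstration only):

```shell
# Check for the renamed small datastore index before running inference
# (file name taken from this commit's README changes; run from the repo root).
DATASTORE="datastore/datastore_stack_small.idx"
mkdir -p datastore
: > "$DATASTORE"   # demo only: create an empty stand-in index file
if [ -f "$DATASTORE" ]; then
    echo "found $DATASTORE"
else
    echo "missing $DATASTORE - run get_datastore_code.py first" >&2
fi
```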

human_eval/rest_test.py

Lines changed: 16 additions & 6 deletions

@@ -30,12 +30,22 @@ def run_eval(model, tokenizer, datastore, max_token_span, num_draft, temperature
     accept_lengths_tree = []
     with torch.inference_mode():

-        past_key_values, past_key_values_data, current_length_data = initialize_past_key_values(model.base_model)
-        model.past_key_values = past_key_values
-        model.past_key_values_data = past_key_values_data
-        model.current_length_data = current_length_data
-
-        model.current_length_data.zero_() # this is for rerun
+        # Initialize the past key and value states
+        if hasattr(model, "past_key_values"):
+            past_key_values = model.past_key_values
+            past_key_values_data = model.past_key_values_data
+            current_length_data = model.current_length_data
+            # Reset the past key and value states
+            current_length_data.zero_()
+        else:
+            (
+                past_key_values,
+                past_key_values_data,
+                current_length_data,
+            ) = initialize_past_key_values(model.base_model)
+            model.past_key_values = past_key_values
+            model.past_key_values_data = past_key_values_data
+            model.current_length_data = current_length_data


         new_token = 0
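The change above replaces a fresh key-value-cache allocation on every call with an allocate-once, reset-on-rerun pattern, which is what lets CodeLlama-7b fit on a 24GB GPU. A minimal, self-contained sketch of the same control flow (`DummyModel` and `initialize_past_key_values_stub` are hypothetical stand-ins for REST's model wrapper and its `initialize_past_key_values`; only the branching logic matches the commit):

```python
# Sketch of the commit's "allocate once, reset on rerun" caching pattern.
# DummyModel and initialize_past_key_values_stub are hypothetical stand-ins.

class DummyModel:
    base_model = None  # placeholder for the underlying transformer


def initialize_past_key_values_stub(base_model):
    # Stand-in allocator: backing storage plus per-layer length counters.
    past_key_values_data = [[0.0] * 8 for _ in range(2)]
    current_length_data = [0, 0]
    past_key_values = past_key_values_data  # views into the storage
    return past_key_values, past_key_values_data, current_length_data


def setup_cache(model):
    if hasattr(model, "past_key_values"):
        # Rerun: reuse the existing buffers, just reset the length counters.
        model.current_length_data[:] = [0] * len(model.current_length_data)
    else:
        # First call: allocate the buffers once and attach them to the model.
        (
            model.past_key_values,
            model.past_key_values_data,
            model.current_length_data,
        ) = initialize_past_key_values_stub(model.base_model)
    return model.past_key_values_data


model = DummyModel()
first = setup_cache(model)
model.current_length_data[0] = 5            # simulate tokens written in a run
second = setup_cache(model)
assert first is second                       # same storage, no re-allocation
assert model.current_length_data == [0, 0]   # counters reset for the rerun
```

The `hasattr` guard is what makes repeated calls to `run_eval` cheap: the large backing tensors survive across reruns and only the length counters are zeroed.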

0 commit comments