|
5 | 5 | from exllamav2 import model_init |
6 | 6 | from exllamav2 import ExLlamaV2Cache, ExLlamaV2Cache_Q4, ExLlamaV2Cache_Q6, ExLlamaV2Cache_Q8 |
7 | 7 | from exllamav2.generator import ExLlamaV2DynamicGenerator, ExLlamaV2DynamicJob, ExLlamaV2Sampler |
8 | | -import argparse, contextlib |
| 8 | +import argparse, contextlib, subprocess |
9 | 9 | import util |
10 | 10 |
|
11 | 11 | # Args |
|
20 | 20 | parser.add_argument("--max_tokens", type = int, default = 768, help = "Max number of tokens for each completion") |
21 | 21 | parser.add_argument("-pf", "--prompt_format", type = str, help = "Instruct format to apply. Default is raw completion (for base models)")
22 | 22 | parser.add_argument("-v", "--verbose", action = "store_true", help = "Spam completions to console while generating") |
| 23 | +parser.add_argument("-e", "--eval", action = "store_true", help = "Run evaluation script on output file after sampling") |
23 | 24 | model_init.add_args(parser) |
24 | 25 | args = parser.parse_args() |
25 | 26 |
|
|
52 | 53 | "<|start_header_id|>assistant<|end_header_id|>\n\n" |
53 | 54 | "Sure! Here is how you might implement the function:\n\n```python\n{{problem}} ", |
54 | 55 | " " |
| 56 | + ), |
| 57 | + "gemma": ( |
| 58 | + "<bos><start_of_turn>user\n" |
| 59 | + "Complete the following Python function:\n\n{{problem}}<|eot_id|>" |
| 60 | + "<start_of_turn>model\n" |
| 61 | + "```python\n{{problem}} ", |
| 62 | + " " |
55 | 63 | ) |
56 | 64 | } |
57 | 65 |
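
Each entry in prompt_formats is presumably a (template, prefix) pair: `{{problem}}` in the template is replaced with the HumanEval prompt text, and the prefix (the function-body indentation) is prepended to each sampled completion before it is written out. A minimal sketch under that assumption; the helper names are illustrative, not the script's:

    def build_prompt(template: str, problem: str) -> str:
        # Substitute the HumanEval problem into the instruct template.
        return template.replace("{{problem}}", problem)

    def finish_sample(prefix: str, completion: str) -> str:
        # Re-attach the indentation so prompt + sample parses as valid Python.
        return prefix + completion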
|
|
192 | 200 | print(f" -- Saving: {args.output}") |
193 | 201 | write_jsonl(args.output, samples) |
194 | 202 |
|
| 203 | +# Optionally launch eval script |
| 204 | + |
| 205 | +if args.eval: |
| 206 | + subprocess.run(["evaluate_functional_correctness", args.output]) |
| 207 | + |
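Note on the --eval hook: evaluate_functional_correctness is the console script installed by OpenAI's human-eval package (pip install human-eval), which runs each sampled completion against the HumanEval test suites and reports pass@k. Keep in mind that human-eval deliberately ships with the exec call in human_eval/execution.py commented out, since it executes untrusted model-generated code; it must be uncommented before either entry point will run anything. If the console script is not on PATH, the same check can be done through the package's Python API. A minimal sketch, assuming that API; the output file name and k values below are illustrative:

    # Equivalent of the subprocess call above, via human-eval's Python API.
    # File name and k are illustrative, not the script's defaults.
    from human_eval.evaluation import evaluate_functional_correctness

    results = evaluate_functional_correctness(
        sample_file = "humaneval_output.jsonl",  # JSONL produced by this script
        k = [1],          # pass@k values to report
        n_workers = 4,    # parallel execution workers
        timeout = 3.0,    # seconds allowed per completion's tests
    )
    print(results)  # e.g. {"pass@1": ...}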