This repository was archived by the owner on Oct 25, 2024. It is now read-only.

Commit 309a594

[Doc] update README for Qwen chat (#808)
1 parent ad032c9 commit 309a594

File tree: 2 files changed, +22 −5 lines changed

intel_extension_for_transformers/llm/runtime/graph/README.md

Lines changed: 21 additions & 4 deletions
@@ -178,8 +178,7 @@ while True:
         break
     b_prompt = "[INST]{}[/INST]".format(prompt) # prompt template for llama2
     inputs = tokenizer(b_prompt, return_tensors="pt").input_ids
-    outputs = model.generate(inputs, streamer=streamer, interactive=True, ignore_prompt=True,
-                             num_beams=1, max_new_tokens=-1, ctx_size = 1024, do_sample=True, threads=28, repetition_penalty=1.1)
+    outputs = model.generate(inputs, streamer=streamer, interactive=True, ignore_prompt=True, do_sample=True)
 ```
 
 Chat with ChatGLM2:
@@ -199,10 +198,28 @@ while True:
         break
     prompt = tokenizer.build_prompt(prompt) # prompt template for chatglm2
     inputs = tokenizer([prompt], return_tensors="pt").input_ids
-    outputs = model.generate(inputs, streamer=streamer, interactive=True, ignore_prompt=True,
-                             num_beams=1, max_new_tokens=-1, ctx_size = 1024, do_sample=True, threads=28, repetition_penalty=1.1, n_keep=2)
+    outputs = model.generate(inputs, streamer=streamer, interactive=True, ignore_prompt=True, do_sample=True, n_keep=2)
 ```
 
+Chat with Qwen:
+```python
+from transformers import AutoTokenizer, TextStreamer
+from intel_extension_for_transformers.transformers import AutoModelForCausalLM, WeightOnlyQuantConfig
+
+model_name = "Qwen/Qwen-7B-Chat"  # or local path to model
+woq_config = WeightOnlyQuantConfig(compute_dtype="int8", weight_dtype="int4")
+tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+streamer = TextStreamer(tokenizer)
+model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=woq_config, trust_remote_code=True)
+
+while True:
+    prompt = input("> ").strip()
+    if prompt == "quit":
+        break
+    prompt = "\n<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n".format(prompt) # prompt template for qwen
+    inputs = tokenizer([prompt], return_tensors="pt").input_ids
+    outputs = model.generate(inputs, streamer=streamer, interactive=True, ignore_prompt=True, do_sample=True)
+```
 
 ## How to use: Python script
 Install from binary
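For context on the new README example: the snippet below is not part of the commit; it is a minimal sketch of how the single-turn ChatML-style template from the added Qwen example could be extended to carry multi-turn history. Only the `<|im_start|>`/`<|im_end|>` user and assistant markers come from the diff; the history-concatenation scheme and the `build_qwen_prompt` helper name are assumptions for illustration.

```python
# Illustrative sketch only: extends the single-turn Qwen template shown in the
# README diff above to a running multi-turn prompt. The history handling is an
# assumption, not something this commit specifies.
def build_qwen_prompt(history, user_message):
    """history: list of (user, assistant) turn pairs already exchanged."""
    prompt = ""
    for user_turn, assistant_turn in history:
        prompt += "\n<|im_start|>user\n{}<|im_end|>".format(user_turn)
        prompt += "\n<|im_start|>assistant\n{}<|im_end|>".format(assistant_turn)
    # Open an assistant turn so generation continues as the model's reply,
    # matching the template used in the README example.
    prompt += "\n<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n".format(user_message)
    return prompt

print(build_qwen_prompt([("Hi", "Hello! How can I help?")], "What is INT4 weight-only quantization?"))
```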

intel_extension_for_transformers/llm/runtime/graph/scripts/convert_qwen.py

Lines changed: 1 addition & 1 deletion
@@ -110,7 +110,7 @@ def main(args_in: Optional[List[str]] = None) -> None:
     fout.write(struct.pack("i", 0))
     fout.write(struct.pack("i", 0))
     fout.write(struct.pack("i", 0))
-    fout.write(struct.pack("i", tokenizer.special_tokens['<|im_start|>']))
+    fout.write(struct.pack("i", tokenizer.special_tokens['<|endoftext|>']))
     fout.write(struct.pack("i", tokenizer.special_tokens['<|endoftext|>']))
     fout.write(struct.pack("i", tokenizer.pad_token_id if tokenizer.pad_token_id is not None else -1))
     fout.write(struct.pack("i", tokenizer.sep_token_id if tokenizer.sep_token_id is not None else -1))
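The one-line change above swaps which token id the converter writes for the first of the two special-token fields: previously the Qwen `<|im_start|>` id, now the `<|endoftext|>` id, matching the field written right after it. Below is a rough sketch, not part of the commit, of what those two packed fields contain after the change, assuming the same `tokenizer.special_tokens` mapping the script relies on.

```python
# Rough sketch, assuming the Qwen tokenizer's special_tokens dict used by
# convert_qwen.py. Qwen ships a custom tokenizer, so trust_remote_code is required.
import struct
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen-7B-Chat", trust_remote_code=True)

# Before this commit the first of these two fields held the <|im_start|> id;
# after it, both hold the <|endoftext|> id, each packed as a 4-byte int.
eos_id = tokenizer.special_tokens['<|endoftext|>']
packed = struct.pack("i", eos_id) + struct.pack("i", eos_id)
print(eos_id, packed.hex())
```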
