
Commit 5abef3c

fix model name in examples

1 parent 39b98a2 commit 5abef3c

7 files changed: 28 additions, 28 deletions

README.md

Lines changed: 2 additions & 2 deletions
@@ -53,15 +53,15 @@ vec-inf list
 
 You can also view the default setup for a specific supported model by providing the model name, for example `Meta-Llama-3.1-70B-Instruct`:
 ```bash
-vec-inf list Meta-Llama-3.1-70B-Instruct
+vec-inf list Meta-Llama-3.1-8B-Instruct
 ```
 <img width="400" alt="list_model_img" src="https://github.com/user-attachments/assets/5dec7a33-ba6b-490d-af47-4cf7341d0b42">
 
 `launch`, `list`, and `status` command supports `--json-mode`, where the command output would be structured as a JSON string.
 
 ## Send inference requests
 Once the inference server is ready, you can start sending in inference requests. We provide example scripts for sending inference requests in [`examples`](examples) folder. Make sure to update the model server URL and the model weights location in the scripts. For example, you can run `python examples/inference/llm/completions.py`, and you should expect to see an output like the following:
-> {"id":"cmpl-bdf43763adf242588af07af88b070b62","object":"text_completion","created":2983960,"model":"/model-weights/Llama-2-7b-hf","choices":[{"index":0,"text":"\nCanada is close to the actual continent of North America. Aside from the Arctic islands","logprobs":null,"finish_reason":"length"}],"usage":{"prompt_tokens":8,"total_tokens":28,"completion_tokens":20}}
+> {"id":"cmpl-c08d8946224747af9cce9f4d9f36ceb3","object":"text_completion","created":1725394970,"model":"Meta-Llama-3.1-8B-Instruct","choices":[{"index":0,"text":" is a question that many people may wonder. The answer is, of course, Ottawa. But if","logprobs":null,"finish_reason":"length","stop_reason":null}],"usage":{"prompt_tokens":8,"total_tokens":28,"completion_tokens":20}}
 
 **NOTE**: For multimodal models, currently only `ChatCompletion` is available, and only one image can be provided for each prompt.
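The README text above also mentions that `launch`, `list`, and `status` support `--json-mode`. A minimal sketch of how that flag might be combined with the updated example command (the flag name and model come from the README; the exact argument order is an assumption):

```bash
# Sketch only: list the default setup for the example model as a JSON string.
# Argument order is assumed, not taken from the repository docs.
vec-inf list Meta-Llama-3.1-8B-Instruct --json-mode
```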

examples/inference/llm/chat_completions.py

Lines changed: 9 additions & 6 deletions
@@ -5,11 +5,14 @@
 
 # Update the model path accordingly
 completion = client.chat.completions.create(
-    model="/model-weights/Meta-Llama-3-8B-Instruct",
-    messages=[
-        {"role": "system", "content": "You are a pirate chatbot who always responds in pirate speak!"},
-        {"role": "user", "content": "Who are you?"},
-    ]
+    model="Meta-Llama-3-8B-Instruct",
+    messages=[
+        {
+            "role": "system",
+            "content": "You are a pirate chatbot who always responds in pirate speak!",
+        },
+        {"role": "user", "content": "Who are you?"},
+    ],
 )
 
-print(completion)
+print(completion)
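Not part of the commit, but for readers following this example: with the standard OpenAI Python client response shape, the assistant's reply can be read off the first choice. A small hedged sketch, assuming `completion` is the object created above:

```python
# Assumes `completion` is the ChatCompletion returned by the call above.
# The first choice holds the assistant message; print only its text.
reply = completion.choices[0].message.content
print(reply)
```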

examples/inference/llm/completions.py

Lines changed: 2 additions & 2 deletions
@@ -5,9 +5,9 @@
 
 # Update the model path accordingly
 completion = client.completions.create(
-    model="/model-weights/Meta-Llama-3-8B",
+    model="Meta-Llama-3.1-8B-Instruct",
     prompt="Where is the capital of Canada?",
     max_tokens=20,
 )
 
-print(completion)
+print(completion)
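Pieced together from this file and the client construction visible in `examples/logits/logits.py` further down, a self-contained sketch of the updated completions example might look as follows. The `gpuXXX:XXXX` placeholder is kept from the repository's examples; replace it with your actual server URL.

```python
from openai import OpenAI

# Point the client at the running inference server; the placeholder URL
# mirrors the other example scripts in this commit.
client = OpenAI(base_url="http://gpuXXX:XXXX/v1", api_key="EMPTY")

completion = client.completions.create(
    model="Meta-Llama-3.1-8B-Instruct",  # model name as fixed by this commit
    prompt="Where is the capital of Canada?",
    max_tokens=20,
)

print(completion)
```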

examples/inference/llm/completions.sh

Lines changed: 1 addition & 1 deletion
@@ -5,7 +5,7 @@ export API_BASE_URL=http://gpuXXX:XXXX/v1
 curl ${API_BASE_URL}/completions \
   -H "Content-Type: application/json" \
   -d '{
-    "model": "/model-weights/Meta-Llama-3-8B",
+    "model": "Meta-Llama-3-8B",
     "prompt": "What is the capital of Canada?",
     "max_tokens": 20
   }'
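As a usage note (not part of the commit): if `jq` is installed, piping the response through it makes the returned JSON easier to read.

```bash
# Same request as the script above, pretty-printed; assumes jq is available.
curl -s ${API_BASE_URL}/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "Meta-Llama-3-8B",
    "prompt": "What is the capital of Canada?",
    "max_tokens": 20
  }' | jq .
```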

examples/inference/vlm/vision_completions.py

Lines changed: 1 addition & 2 deletions
@@ -5,7 +5,7 @@
 
 # Update the model path accordingly
 completion = client.chat.completions.create(
-    model="/model-weights/llava-1.5-13b-hf",
+    model="llava-1.5-13b-hf",
     messages=[
         {
             "role": "user",
@@ -24,4 +24,3 @@
 )
 
 print(completion)
-
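The hunk above elides the body of the user message (lines 9–23 of the file). Purely as an illustration of the typical OpenAI-style vision payload, and not a reproduction of the elided lines, a single-image request usually has roughly this shape (the prompt text and image URL below are made up):

```python
# Illustrative only: a typical chat-completions message list with one image.
# The actual content of the elided lines in the example file may differ.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "What is in this image?"},
            {
                "type": "image_url",
                "image_url": {"url": "https://example.com/some-image.jpg"},
            },
        ],
    }
]
```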

examples/logits/logits.py

Lines changed: 2 additions & 2 deletions
@@ -4,10 +4,10 @@
 client = OpenAI(base_url="http://gpuXXX:XXXXX/v1", api_key="EMPTY")
 
 completion = client.completions.create(
-    model="/model-weights/Meta-Llama-3-8B",
+    model="Meta-Llama-3-8B",
     prompt="Where is the capital of Canada?",
     max_tokens=1,
-    logprobs=32000 # Set to model vocab size to get logits
+    logprobs=32000,  # Set to model vocab size to get logits
 )
 
 print(completion.choices[0].logprobs)
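The comment in this file says the large `logprobs` value is meant to recover per-token logits. A hedged sketch of inspecting the result, assuming the legacy completions `logprobs` fields (`tokens`, `top_logprobs`) exposed by the OpenAI Python client:

```python
# Assumes `completion` was created as in the example above.
logprobs = completion.choices[0].logprobs
print(logprobs.tokens[0])             # the single generated token
print(len(logprobs.top_logprobs[0]))  # size of the token -> log-probability map
```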

profile/gen.py

Lines changed: 11 additions & 13 deletions
@@ -1,15 +1,16 @@
-import requests
 import time
 
+import requests
+
 # Change the ENDPOINT and MODEL_PATH to match your setup
-ENDPOINT = "http://gpuXXX:XXXX/v1"
+ENDPOINT = "http://gpuXXX:XXXX/v1"
 MODEL_PATH = "Meta-Llama-3-70B"
 
 # Configuration
-API_KEY = 'EMPTY'
+API_KEY = "EMPTY"
 HEADERS = {
-    'Authorization': f'Bearer {API_KEY}',
-    'Content-Type': 'application/json',
+    "Authorization": f"Bearer {API_KEY}",
+    "Content-Type": "application/json",
 }
 
 # Sample prompts for testing
@@ -66,15 +67,12 @@
     "What are the ethical implications of cloning?",
     "Explain the significance of the Pyramids of Giza.",
     "Describe the process of making wine.",
-    "How does the GPS system work?"
+    "How does the GPS system work?",
 ]
 
+
 def send_request(prompt):
-    data = {
-        'model': f"/model-weights/{MODEL_PATH}",
-        'prompt': prompt,
-        'max_tokens': 100
-    }
+    data = {"model": f"{MODEL_PATH}", "prompt": prompt, "max_tokens": 100}
     start_time = time.time()
     response = requests.post(f"{ENDPOINT}/completions", headers=HEADERS, json=data)
     duration = time.time() - start_time
@@ -83,13 +81,13 @@ def send_request(prompt):
     else:
         return None
 
+
 def main():
     for i in range(10):
         print("Sending 20x requests 0-52...")
         send_request(PROMPTS * 20)
     print("Done!")
 
 
-
-if __name__ == '__main__':
+if __name__ == "__main__":
     main()
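A usage assumption, not stated in the commit: after editing `ENDPOINT` and `MODEL_PATH` at the top of the file, the profiling script is presumably run directly.

```bash
# Run the profiling script against the configured endpoint.
python profile/gen.py
```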
