|
44 | 44 | "endpoint_name = name_from_base(model_id.split(\"/\")[1]+\"-ep\")\n", |
45 | 45 | "model = DJLModel(\n", |
46 | 46 | " name=model_name, role=role,\n", |
47 | | - " image_uri=f\"763104351884.dkr.ecr.{boto3.Session().region_name}.amazonaws.com/djl-inference:0.33.0-lmi15.0.0-cu128-v1.3\",\n", |
| 47 | + " image_uri=f\"763104351884.dkr.ecr.{boto3.Session().region_name}.amazonaws.com/djl-inference:0.34.0-lmi16.0.0-cu128-v1.2\",\n", |
48 | 48 | " env={\n", |
49 | | - " \"HF_MODEL_ID\": model_id, # config: https://qwen.readthedocs.io/en/latest/framework/function_call.html#vllm \n", |
50 | | - " \"OPTION_MAX_MODEL_LEN\": f\"{1024*20}\",\n", |
|  | 49 | + "        \"HF_MODEL_ID\": model_id,\n", |
| 50 | + " \"OPTION_MAX_MODEL_LEN\": f\"{1024*16}\",\n", |
51 | 51 | " \"OPTION_QUANTIZE\": \"fp8\",\n", |
52 | | - " # vllm serve {model_id} --enable-auto-tool-choice --tool-call-parser hermes\n", |
53 |  | - "        \"OPTION_ROLLING_BATCH\": \"vllm\",\n", |
|  | 52 | + "        \"OPTION_DTYPE\": \"bf16\",\n", |
|  | 53 | + "        \"SERVING_FAIL_FAST\": \"true\",\n", |
|  | 54 | + "        \"OPTION_ROLLING_BATCH\": \"disable\",\n", |
|  | 55 | + "        \"OPTION_ASYNC_MODE\": \"true\",\n", |
|  | 56 | + "        \"OPTION_ENTRYPOINT\": \"djl_python.lmi_vllm.vllm_async_service\",\n", |
54 | 57 | "        \"OPTION_ENABLE_AUTO_TOOL_CHOICE\": \"true\",\n", |
55 | 58 | "        \"OPTION_TOOL_CALL_PARSER\": \"hermes\",\n", |
56 | | - " # --enable-reasoning --reasoning-parser deepseek_r1\n", |
57 | | - " # \"OPTION_ENABLE_REASONING\": \"true\",\n", |
58 | | - " # \"OPTION_REASONING_PARSER\": \"qwen3\", # currently not available in djl lmi15\n", |
|  | 59 | + "        \"OPTION_ENABLE_REASONING\": \"true\",\n", |
|  | 60 | + "        \"OPTION_REASONING_PARSER\": \"qwen3\",\n", |
59 | 61 | "    }\n", |
60 | 62 | ")\n", |
61 | 63 | "model.deploy(\n", |
|
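With auto tool choice and the hermes tool-call parser enabled above, the endpoint should accept OpenAI-style `tools` in the request payload. Below is a minimal invocation sketch, assuming the LMI container exposes its Chat Completions ("messages") schema and reusing `endpoint_name` from the cell above; the `get_weather` tool and the exact response field names are illustrative assumptions, not taken from this notebook.

```python
import json
import boto3

smr = boto3.client("sagemaker-runtime")

# Hypothetical tool definition in the OpenAI function-calling schema,
# which the hermes tool-call parser configured above is meant to emit.
tools = [{
    "type": "function",
    "function": {
        "name": "get_weather",
        "description": "Get the current weather for a city",
        "parameters": {
            "type": "object",
            "properties": {"city": {"type": "string"}},
            "required": ["city"],
        },
    },
}]

response = smr.invoke_endpoint(
    EndpointName=endpoint_name,  # defined in the cell above
    ContentType="application/json",
    Body=json.dumps({
        "messages": [{"role": "user", "content": "What's the weather in Seattle?"}],
        "tools": tools,
        "max_tokens": 512,
    }),
)
body = json.loads(response["Body"].read())
# With the reasoning parser enabled, the thinking text is expected to be
# separated from the final answer (e.g. under a reasoning field) rather
# than left inline in the message content; inspect the raw body to confirm.
print(json.dumps(body, indent=2))
```

Because `OPTION_ENABLE_REASONING` and `OPTION_REASONING_PARSER=qwen3` are set, the container should strip the model's `<think>` output into a separate field of the response message instead of returning it verbatim in `content`.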