|
26 | 26 | "cell_type": "code", |
27 | 27 | "execution_count": null, |
28 | 28 | "id": "fd08268e", |
29 | | - "metadata": {}, |
30 | | - "outputs": [], |
| 29 | + "metadata": { |
| 30 | + "execution": { |
| 31 | + "iopub.execute_input": "2025-10-15T17:49:49.105299Z", |
| 32 | + "iopub.status.busy": "2025-10-15T17:49:49.105048Z", |
| 33 | + "iopub.status.idle": "2025-10-15T18:01:24.859971Z", |
| 34 | + "shell.execute_reply": "2025-10-15T18:01:24.859446Z", |
| 35 | + "shell.execute_reply.started": "2025-10-15T17:49:49.105277Z" |
| 36 | + } |
| 37 | + }, |
| 38 | + "outputs": [ |
| 39 | + { |
| 40 | + "name": "stdout", |
| 41 | + "output_type": "stream", |
| 42 | + "text": [ |
| 43 | + "sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml\n", |
| 44 | + "sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml\n", |
| 45 | + "----------------------!" |
| 46 | + ] |
| 47 | + }, |
| 48 | + { |
| 49 | + "data": { |
| 50 | + "text/plain": [ |
| 51 | + "<sagemaker.djl_inference.djl_predictor.DJLPredictor at 0x7f5e46e66d50>" |
| 52 | + ] |
| 53 | + }, |
| 54 | + "execution_count": 1, |
| 55 | + "metadata": {}, |
| 56 | + "output_type": "execute_result" |
| 57 | + } |
| 58 | + ], |
31 | 59 | "source": [ |
32 | 60 | "from sagemaker.djl_inference import DJLModel\n", |
33 | 61 | "from sagemaker.utils import name_from_base\n", |
|
44 | 72 | "endpoint_name = name_from_base(model_id.split(\"/\")[1]+\"-ep\")\n", |
45 | 73 | "model = DJLModel(\n", |
46 | 74 | " name=model_name, role=role,\n", |
47 | | - " image_uri=f\"763104351884.dkr.ecr.{boto3.Session().region_name}.amazonaws.com/djl-inference:0.33.0-lmi15.0.0-cu128-v1.3\",\n", |
| 75 | + " image_uri=f\"763104351884.dkr.ecr.{boto3.Session().region_name}.amazonaws.com/djl-inference:0.34.0-lmi16.0.0-cu128-v1.2\",\n", |
48 | 76 | " env={\n", |
49 | | - " \"HF_MODEL_ID\": model_id, # config: https://qwen.readthedocs.io/en/latest/framework/function_call.html#vllm \n", |
50 | | - " \"OPTION_MAX_MODEL_LEN\": f\"{1024*20}\",\n", |
| 77 | + " \"HF_MODEL_ID\": model_id,\n", |
| 78 | + " \"OPTION_MAX_MODEL_LEN\": f\"{1024*16}\",\n", |
51 | 79 | " \"OPTION_QUANTIZE\": \"fp8\",\n", |
52 | | - " # vllm serve {model_id} --enable-auto-tool-choice --tool-call-parser hermes\n", |
53 | | - " \"OPTION_ROLLING_BATCH\": \"vllm\",\n", |
| 80 | + " 'OPTION_DTYPE': 'bf16',\n", |
| 81 | + " 'SERVING_FAIL_FAST': 'true',\n", |
| 82 | + " 'OPTION_ROLLING_BATCH': 'disable',\n", |
| 83 | + " 'OPTION_ASYNC_MODE': 'true',\n", |
| 84 | + " 'OPTION_ENTRYPOINT': 'djl_python.lmi_vllm.vllm_async_service',\n", |
54 | 85 | " \"OPTION_ENABLE_AUTO_TOOL_CHOICE\": \"true\",\n", |
55 | 86 | " \"OPTION_TOOL_CALL_PARSER\": \"hermes\",\n", |
56 | | - " # --enable-reasoning --reasoning-parser deepseek_r1\n", |
57 | | - " # \"OPTION_ENABLE_REASONING\": \"true\",\n", |
58 | | - " # \"OPTION_REASONING_PARSER\": \"qwen3\", # currently not available in djl lmi15\n", |
| 87 | + " \"OPTION_ENABLE_REASONING\": \"true\",\n", |
| 88 | + " \"OPTION_REASONING_PARSER\": \"qwen3\",\n", |
59 | 89 | " }\n", |
60 | 90 | ")\n", |
61 | 91 | "model.deploy(\n", |
|
67 | 97 | }, |
68 | 98 | { |
69 | 99 | "cell_type": "code", |
70 | | - "execution_count": null, |
| 100 | + "execution_count": 2, |
71 | 101 | "id": "18e6de49-11f7-4e36-b7bb-322282a51e53", |
72 | | - "metadata": {}, |
73 | | - "outputs": [], |
| 102 | + "metadata": { |
| 103 | + "execution": { |
| 104 | + "iopub.execute_input": "2025-10-15T18:01:24.860899Z", |
| 105 | + "iopub.status.busy": "2025-10-15T18:01:24.860666Z", |
| 106 | + "iopub.status.idle": "2025-10-15T18:01:24.864407Z", |
| 107 | + "shell.execute_reply": "2025-10-15T18:01:24.863955Z", |
| 108 | + "shell.execute_reply.started": "2025-10-15T18:01:24.860877Z" |
| 109 | + } |
| 110 | + }, |
| 111 | + "outputs": [ |
| 112 | + { |
| 113 | + "name": "stdout", |
| 114 | + "output_type": "stream", |
| 115 | + "text": [ |
| 116 | + "Endpoint name: Qwen3-4B-ep-2025-10-15-17-49-51-128\n", |
| 117 | + "Stored 'SAGEMAKER_ENDPOINT_NAME' (str)\n" |
| 118 | + ] |
| 119 | + } |
| 120 | + ], |
74 | 121 | "source": [ |
75 | 122 | "SAGEMAKER_ENDPOINT_NAME = model.endpoint_name\n", |
76 | 123 | "print(f\"Endpoint name: {SAGEMAKER_ENDPOINT_NAME}\")\n", |
|
0 commit comments