@@ -307,6 +307,8 @@ type ChatCompletionRequest struct {
 	// Such as think mode for qwen3. "chat_template_kwargs": {"enable_thinking": false}
 	// https://qwen.readthedocs.io/en/latest/deployment/vllm.html#thinking-non-thinking-modes
 	ChatTemplateKwargs map[string]any `json:"chat_template_kwargs,omitempty"`
+	// Specifies the latency tier to use for processing the request.
+	ServiceTier ServiceTier `json:"service_tier,omitempty"`
 }
 
 type StreamOptions struct {
@@ -390,6 +392,15 @@ const (
 	FinishReasonNull FinishReason = "null"
 )
 
+type ServiceTier string
+
+const (
+	ServiceTierAuto     ServiceTier = "auto"
+	ServiceTierDefault  ServiceTier = "default"
+	ServiceTierFlex     ServiceTier = "flex"
+	ServiceTierPriority ServiceTier = "priority"
+)
+
 func (r FinishReason) MarshalJSON() ([]byte, error) {
 	if r == FinishReasonNull || r == "" {
 		return []byte("null"), nil
@@ -422,6 +433,7 @@ type ChatCompletionResponse struct {
 	Usage               Usage                `json:"usage"`
 	SystemFingerprint   string               `json:"system_fingerprint"`
 	PromptFilterResults []PromptFilterResult `json:"prompt_filter_results,omitempty"`
+	ServiceTier         ServiceTier          `json:"service_tier,omitempty"`
 
 	httpHeader
 }
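
A minimal usage sketch for the new field. Only `ServiceTier`, its constants, and the response field come from the diff above; the client calls (`NewClient`, `CreateChatCompletion`), the import path, the API key, and the model name are assumptions based on the surrounding package and are placeholders, not part of this change.

```go
package main

import (
	"context"
	"fmt"
	"log"

	openai "github.com/sashabaranov/go-openai" // assumed import path for this package
)

func main() {
	client := openai.NewClient("your-api-key") // placeholder key

	// Request the "flex" latency tier via the new ServiceTier field.
	req := openai.ChatCompletionRequest{
		Model:       "gpt-4o", // illustrative model name
		ServiceTier: openai.ServiceTierFlex,
		Messages: []openai.ChatCompletionMessage{
			{Role: openai.ChatMessageRoleUser, Content: "Hello"},
		},
	}

	resp, err := client.CreateChatCompletion(context.Background(), req)
	if err != nil {
		log.Fatal(err)
	}

	// The response echoes the tier that processed the request,
	// via the ServiceTier field added to ChatCompletionResponse.
	fmt.Println("served at tier:", resp.ServiceTier)
}
```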