Skip to content

Commit 7a20a25

Browse files
author
Paweł Kędzia
committed
Merge branch 'features/fix_routing_api'
2 parents c52711e + 13fbca3 commit 7a20a25

File tree

7 files changed

+19
-14
lines changed

7 files changed

+19
-14
lines changed

CHANGELOG.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,4 +10,4 @@
1010
| 0.1.1 | Prometheus metrics logging. Workers/Threads/Worker class can be set via environment variables. Streaming fixes. Multi-providers for single model with default-balanced strategy. |
1111
| 0.2.0 | Add balancing strategies: `balanced`, `weighted`, `dynamic_weighted` and `first_available`, which work for streaming and non-streaming requests. Included Prometheus metrics logging via `/metrics` endpoint. First stage of `llm_router_lib` library, to simplify usage of `llm-router-api`. |
1212
| 0.2.1 | Fix stream: OpenAI->Ollama, Ollama->OpenAI. Add Redis caching of availability of model providers (when using `first_available` strategy). Add `llm_router_web` module with simple flask-based frontend to manage llm-router config files. |
13-
| 0.2.2 | Update dockerfile and requirements. |
13+
| 0.2.2 | Update dockerfile and requirements. Fix routing with vLLM. |

llm_router_api/core/api_types/dispatcher.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,13 @@ def _get_impl(cls, api_type: str) -> ApiTypesI:
103103
)
104104
return impl()
105105

106+
def get_proper_endpoint(self, api_type: str, endpoint_url: str) -> str:
107+
endpoint_url = endpoint_url.strip("/")
108+
if endpoint_url in ["chat/completions", "api/chat/completions"]:
109+
return self.chat_ep(api_type=api_type)
110+
111+
return self.completions_ep(api_type=api_type)
112+
106113
@classmethod
107114
def chat_ep(cls, api_type: str) -> str:
108115
"""

llm_router_api/core/api_types/ollama.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ def chat_method(self) -> str:
1818
return "POST"
1919

2020
def completions_ep(self) -> str:
21-
return "/api/generate"
21+
return self.chat_ep()
2222

2323
def completions_method(self) -> str:
2424
return "POST"

llm_router_api/core/api_types/openai.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ def chat_method(self) -> str:
3535
return "POST"
3636

3737
def completions_ep(self) -> str:
38-
return "/v1/completions"
38+
return self.chat_ep()
3939

4040
def completions_method(self) -> str:
4141
return "POST"

llm_router_api/core/api_types/vllm.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ def chat_method(self) -> str:
1717
return "POST"
1818

1919
def completions_ep(self) -> str:
20-
return "/v1/completions"
20+
return self.chat_ep()
2121

2222
def completions_method(self) -> str:
2323
return "POST"

llm_router_api/endpoints/builtin/openai.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,7 @@ def __init__(
8080
prompt_handler=prompt_handler,
8181
model_handler=model_handler,
8282
dont_add_api_prefix=True,
83-
api_types=["openai", "lmstudio", "vllm"],
83+
api_types=["openai", "lmstudio"],
8484
direct_return=direct_return,
8585
method="POST",
8686
)

llm_router_api/endpoints/endpoint_i.py

Lines changed: 7 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -669,6 +669,7 @@ def run_ep(
669669
map_prompt = params.pop("map_prompt", {})
670670
prompt_str_force = params.pop("prompt_str_force", "")
671671
prompt_str_postfix = params.pop("prompt_str_postfix", "")
672+
params.pop("response_time", "")
672673

673674
# self.logger.debug(json.dumps(params or {}, indent=2, ensure_ascii=False))
674675

@@ -712,12 +713,12 @@ def run_ep(
712713

713714
use_streaming = bool((params or {}).get("stream", False))
714715

715-
if simple_proxy and not use_streaming:
716-
ep_pref = ""
717-
if self.add_api_prefix and DEFAULT_API_PREFIX:
718-
ep_pref = DEFAULT_API_PREFIX.strip()
719-
ep_url = ep_pref.strip("/") + "/" + self.name.lstrip("/")
716+
# Prepare proper endpoint url
717+
ep_url = self._api_type_dispatcher.get_proper_endpoint(
718+
api_type=api_model_provider.api_type, endpoint_url=self.name
719+
)
720720

721+
if simple_proxy and not use_streaming:
721722
return self._return_response_or_rerun(
722723
api_model_provider=api_model_provider,
723724
ep_url=ep_url,
@@ -732,10 +733,6 @@ def run_ep(
732733
self.logger.debug(f" -> prompt_name: {prompt_name}")
733734
self.logger.debug(f" -> prompt_str: {str(prompt_str)[:40]}...")
734735

735-
ep_url = self._api_type_dispatcher.chat_ep(
736-
api_type=api_model_provider.api_type
737-
)
738-
739736
if api_model_provider.api_type in ["openai"]:
740737
params = self._filter_params_to_acceptable(
741738
api_type=api_model_provider.api_type, params=params
@@ -813,6 +810,7 @@ def _return_response_or_rerun(
813810
call_for_each_user_msg=self._call_for_each_user_msg,
814811
)
815812
except Exception as e:
813+
self.logger.error(e)
816814
status_code_force = 500
817815

818816
self.unset_model(

0 commit comments

Comments
 (0)