
Commit 4daa42f

disable prompt caching when using Bedrock Invoke API
1 parent 353850c commit 4daa42f

File tree

1 file changed (+19 −8 lines)


jupyter_ai_jupyternaut/jupyternaut/chat_models.py

Lines changed: 19 additions & 8 deletions
@@ -344,18 +344,29 @@ async def acompletion_with_retry(
         """Use tenacity to retry the async completion call."""
         retry_decorator = _create_retry_decorator(self, run_manager=run_manager)
 
+        # Enables ephemeral prompt caching of the last system message by
+        # default when passed to `litellm.acompletion()`.
+        #
+        # See: https://docs.litellm.ai/docs/tutorials/prompt_caching
+        cache_control_kwargs = {
+            "cache_control_injection_points": [
+                { "location": "message", "role": "system" }
+            ]
+        }
+
+        # Disable ephemeral prompt caching on Amazon Bedrock when the
+        # InvokeModel API is used instead of the Converse API. This is
+        # motivated by an upstream bug in LiteLLM that has yet to be patched.
+        #
+        # See: github.com/BerriAI/litellm/issues/17479
+        if self.model.startswith("bedrock/") and not self.model.startswith("bedrock/converse/"):
+            cache_control_kwargs = {}
+
         @retry_decorator
         async def _completion_with_retry(**kwargs: Any) -> Any:
             return await self.client.acompletion(
                 **kwargs,
-                # Enables ephemeral prompt caching of the last system message by
-                # default.
-                cache_control_injection_points=[
-                    {
-                        "location": "message",
-                        "role": "system",
-                    }
-                ],
+                **cache_control_kwargs,
             )
 
         return await _completion_with_retry(**kwargs)
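
For context, the new guard keys entirely off the `self.model` string prefix: any "bedrock/" model that is not explicitly routed through "bedrock/converse/" is treated as using the InvokeModel API, so the caching kwargs are dropped for it. Below is a minimal, self-contained sketch of that check; the model IDs are hypothetical and only illustrate which route keeps the caching kwargs, while the real code builds the dict inside acompletion_with_retry and spreads it into the litellm acompletion call as shown in the diff above.

    def cache_kwargs_for(model: str) -> dict:
        # Same injection-point payload the commit builds before the guard.
        kwargs = {
            "cache_control_injection_points": [
                {"location": "message", "role": "system"}
            ]
        }
        # Mirror of the new guard: Bedrock models not routed through the
        # Converse API get prompt caching disabled.
        if model.startswith("bedrock/") and not model.startswith("bedrock/converse/"):
            return {}
        return kwargs

    # Hypothetical model IDs, for illustration only:
    print(cache_kwargs_for("bedrock/anthropic.claude-3-haiku"))           # {} -> caching disabled (Invoke route)
    print(cache_kwargs_for("bedrock/converse/anthropic.claude-3-haiku"))  # injection points kept (Converse route)
    print(cache_kwargs_for("openai/gpt-4o"))                              # injection points kept (non-Bedrock)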
