
Commit 4daa42f

disable prompt caching when using Bedrock Invoke API
1 parent 353850c commit 4daa42f

File tree

1 file changed (+19 −8 lines)


jupyter_ai_jupyternaut/jupyternaut/chat_models.py

Lines changed: 19 additions & 8 deletions
@@ -344,18 +344,29 @@ async def acompletion_with_retry(
         """Use tenacity to retry the async completion call."""
         retry_decorator = _create_retry_decorator(self, run_manager=run_manager)
 
+        # Enables ephemeral prompt caching of the last system message by
+        # default when passed to `litellm.acompletion()`.
+        #
+        # See: https://docs.litellm.ai/docs/tutorials/prompt_caching
+        cache_control_kwargs = {
+            "cache_control_injection_points": [
+                { "location": "message", "role": "system" }
+            ]
+        }
+
+        # Disable ephemeral prompt caching on Amazon Bedrock when the
+        # InvokeModel API is used instead of the Converse API. This is
+        # motivated by an upstream bug in LiteLLM that has yet to be patched.
+        #
+        # See: github.com/BerriAI/litellm/issues/17479
+        if self.model.startswith("bedrock/") and not self.model.startswith("bedrock/converse/"):
+            cache_control_kwargs = {}
+
         @retry_decorator
         async def _completion_with_retry(**kwargs: Any) -> Any:
             return await self.client.acompletion(
                 **kwargs,
-                # Enables ephemeral prompt caching of the last system message by
-                # default.
-                cache_control_injection_points=[
-                    {
-                        "location": "message",
-                        "role": "system",
-                    }
-                ],
+                **cache_control_kwargs,
             )
 
         return await _completion_with_retry(**kwargs)
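
For context, the new guard keys entirely off the `self.model` string prefix: any "bedrock/" model that is not explicitly routed through "bedrock/converse/" is treated as using the InvokeModel API, so the caching kwargs are dropped for it. Below is a minimal, self-contained sketch of that check; the model IDs are hypothetical and only illustrate which route keeps the caching kwargs, while the real code builds the dict inside acompletion_with_retry and spreads it into the litellm acompletion call as shown in the diff above.

    def cache_kwargs_for(model: str) -> dict:
        # Same injection-point payload the commit builds before the guard.
        kwargs = {
            "cache_control_injection_points": [
                {"location": "message", "role": "system"}
            ]
        }
        # Mirror of the new guard: Bedrock models not routed through the
        # Converse API get prompt caching disabled.
        if model.startswith("bedrock/") and not model.startswith("bedrock/converse/"):
            return {}
        return kwargs

    # Hypothetical model IDs, for illustration only:
    print(cache_kwargs_for("bedrock/anthropic.claude-3-haiku"))           # {} -> caching disabled (Invoke route)
    print(cache_kwargs_for("bedrock/converse/anthropic.claude-3-haiku"))  # injection points kept (Converse route)
    print(cache_kwargs_for("openai/gpt-4o"))                              # injection points kept (non-Bedrock)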
