From 1c6b29d9337a182eaff04b93abaadb9e3f46574b Mon Sep 17 00:00:00 2001 From: Akshay Sood <50393320+akshay-sood@users.noreply.github.com> Date: Tue, 19 May 2026 21:05:46 +0530 Subject: [PATCH] feat: allow disabling prompt caching per model via a0_explicit_caching kwarg Models that don't support prompt caching (e.g. NVIDIA Nemotron on Bedrock) fail with 403 errors when cache_control markers are present in messages. This adds support for a new model kwarg `a0_explicit_caching: false` that can be set in preset additional settings to disable prompt caching for specific models. The check is placed before _convert_messages() so cache_control markers are never injected into the message payload. --- models.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/models.py b/models.py index 8518f0d1e3..54d18782c7 100644 --- a/models.py +++ b/models.py @@ -498,6 +498,10 @@ async def unified_call( if user_message: messages.append(HumanMessage(content=user_message)) + # Allow model kwargs to disable explicit caching (for models that don't support it) + if self.kwargs.get("a0_explicit_caching") is False or kwargs.get("a0_explicit_caching") is False: + explicit_caching = False + # convert to litellm format msgs_conv = self._convert_messages(messages, explicit_caching=explicit_caching) @@ -510,6 +514,7 @@ async def unified_call( call_kwargs: dict[str, Any] = _without_stream_kwarg({**self.kwargs, **kwargs}) max_retries: int = int(call_kwargs.pop("a0_retry_attempts", 2)) retry_delay_s: float = float(call_kwargs.pop("a0_retry_delay_seconds", 1.5)) + call_kwargs.pop("a0_explicit_caching", None) # strip before passing to LiteLLM stream = reasoning_callback is not None or response_callback is not None or tokens_callback is not None # results