From 1c6b29d9337a182eaff04b93abaadb9e3f46574b Mon Sep 17 00:00:00 2001
From: Akshay Sood <50393320+akshay-sood@users.noreply.github.com>
Date: Tue, 19 May 2026 21:05:46 +0530
Subject: [PATCH] feat: allow disabling prompt caching per model via
 a0_explicit_caching kwarg

Models that don't support prompt caching (e.g. NVIDIA Nemotron on Bedrock)
fail with 403 errors when cache_control markers are present in messages.

This adds support for a new model kwarg `a0_explicit_caching: false` that
can be set in preset additional settings to disable prompt caching for
specific models.

The check is placed before _convert_messages() so cache_control markers
are never injected into the message payload.
---
 models.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/models.py b/models.py
index 8518f0d1e3..54d18782c7 100644
--- a/models.py
+++ b/models.py
@@ -498,6 +498,10 @@ async def unified_call(
         if user_message:
             messages.append(HumanMessage(content=user_message))
 
+        # Let instance-level or per-call kwargs disable explicit caching (for models that don't support it); only a literal False counts
+        if self.kwargs.get("a0_explicit_caching") is False or kwargs.get("a0_explicit_caching") is False:
+            explicit_caching = False
+
         # convert to litellm format
         msgs_conv = self._convert_messages(messages, explicit_caching=explicit_caching)
 
@@ -510,6 +514,7 @@ async def unified_call(
         call_kwargs: dict[str, Any] = _without_stream_kwarg({**self.kwargs, **kwargs})
         max_retries: int = int(call_kwargs.pop("a0_retry_attempts", 2))
         retry_delay_s: float = float(call_kwargs.pop("a0_retry_delay_seconds", 1.5))
+        call_kwargs.pop("a0_explicit_caching", None)  # strip before passing to LiteLLM
         stream = reasoning_callback is not None or response_callback is not None or tokens_callback is not None
 
         # results