51 changes: 51 additions & 0 deletions tests/test_renderer_client.py
@@ -140,6 +140,57 @@ def test_attach_tool_call_names_unknown_tool_call_id_left_unset():
assert "name" not in out[1]


@pytest.mark.asyncio
async def test_to_native_tool_returns_openai_envelope():
"""``RendererClient.to_native_tool`` must wrap each ``Tool`` in the
    OpenAI envelope (``{"type": "function", "function": {...}}``), the
    same shape ``OpenAIChatCompletionsClient`` sends server-side under
    TITO/MITO. Modern function-calling models (Qwen3 family, GLM, Kimi)
    saw the envelope at training time, so the renderer client's prompt
    must match. Regression test for the bare-form bug where rollout-mode
    tool envs produced uniformly zero rewards because the model never
    emitted ``<tool_call>`` blocks under an out-of-distribution prompt.
    """
from verifiers.types import Tool

client = object.__new__(RendererClient)
tool = Tool(
name="get_weather",
description="Get the weather for a city",
parameters={
"type": "object",
"properties": {"city": {"type": "string"}},
"required": ["city"],
},
)

native = await client.to_native_tool(tool)
assert native["type"] == "function"
assert native["function"]["name"] == "get_weather"
assert native["function"]["description"] == "Get the weather for a city"
assert native["function"]["parameters"]["required"] == ["city"]
assert "strict" not in native["function"]


@pytest.mark.asyncio
async def test_to_native_tool_propagates_strict_flag():
    """When ``Tool.strict`` is set the envelope must carry it through;
    OpenAI's strict-schema enforcement only kicks in on the inner function
    object, never the envelope itself."""
from verifiers.types import Tool

client = object.__new__(RendererClient)
tool = Tool(
name="get_weather",
description="Get the weather for a city",
parameters={"type": "object", "properties": {}},
strict=True,
)

native = await client.to_native_tool(tool)
assert native["function"]["strict"] is True
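    # For reference, a sketch of the envelope shape the two tests above pin
    # down (values are the tests' own; "strict" is attached only when set):
    #
    #     {"type": "function",
    #      "function": {"name": "get_weather",
    #                   "description": "Get the weather for a city",
    #                   "parameters": {...},
    #                   "strict": True}}   # only when Tool.strict is set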


@pytest.mark.asyncio
async def test_renderer_client_accepts_dict_native_response_with_content():
client = object.__new__(RendererClient)
96 changes: 91 additions & 5 deletions verifiers/clients/renderer_client.py
@@ -394,6 +394,19 @@ class RendererClient(
] = {}
_shared_pools_lock: ClassVar[threading.Lock] = threading.Lock()

# Cache of ``max_model_len`` per (api_base_url, model). Fixed at server
# startup, so cache forever per process. See
# ``_get_max_model_len`` for why this is needed.
# TODO(vllm-upstream): Drop once vLLM patches
# ``vllm/entrypoints/serve/disagg/serving.py::ServingTokens.serve_tokens``
# to apply ``get_max_tokens()`` like
# ``vllm/entrypoints/openai/chat_completion/serving.py`` does at L284.
# Once that lands, ``/inference/v1/generate`` will default ``max_tokens``
# to ``max_model_len - prompt_len`` server-side and this cache + the
# fallback in ``get_native_response`` become dead code.
_max_model_len_cache: ClassVar[dict[tuple[str, str], int]] = {}
_max_model_len_lock: ClassVar[asyncio.Lock | None] = None
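    # Illustrative cache shape (key and value hypothetical), populated
    # lazily by ``_get_max_model_len``:
    #   {("http://localhost:8000/v1", "Qwen/Qwen3-8B"): 32768}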

def __init__(
self,
config: ClientConfig,
@@ -461,6 +474,63 @@ def _get_renderer_or_pool(self, model: str) -> Renderer | RendererPool:

return self._shared_pools[cache_key]

async def _get_max_model_len(self, model: str) -> int | None:
"""Fetch ``max_model_len`` for ``model`` from vLLM's ``/v1/models``.

``vllm.SamplingParams.max_tokens`` defaults to ``16``.
``/v1/chat/completions`` masks this server-side via
``get_max_tokens(max_model_len, request.max_tokens, prompt_len, ...)``
so callers that omit ``max_tokens`` get the full remaining context.
``/inference/v1/generate`` (the endpoint this client talks to) is a
thin pass-through that hands ``SamplingParams`` to the engine
verbatim, so the 16-token default leaks through and silently caps
every generation at 16 tokens: long enough to start a tool-call
envelope but not long enough to close one, producing reward 0 on
any tool-using rollout.

Until vLLM applies the same defaulting in ``serve_tokens``, query
``max_model_len`` once per (server, model) and cache forever.
Returns ``None`` if the server doesn't surface ``max_model_len``
in ``/v1/models``; the caller falls back to the original behaviour
(no client-side default) in that case.
"""
base = str(self.client.base_url).rstrip("/")
key = (base, model)
cached = self._max_model_len_cache.get(key)
if cached is not None:
return cached

if RendererClient._max_model_len_lock is None:
RendererClient._max_model_len_lock = asyncio.Lock()
async with RendererClient._max_model_len_lock:
cached = self._max_model_len_cache.get(key)
if cached is not None:
return cached
try:
resp = await self.client.get(
"/models",
cast_to=cast(Any, dict[str, Any]),
)
except Exception as exc:
self.logger.warning(
"RendererClient: failed to fetch max_model_len from /v1/models "
"(%s); generations will use vLLM's SamplingParams default of 16 "
"tokens unless caller sets max_tokens. Set "
"max_completion_tokens explicitly to avoid silent truncation.",
exc,
)
return None

for entry in (resp or {}).get("data", []) or []:
if not isinstance(entry, dict) or entry.get("id") != model:
continue
mml = entry.get("max_model_len")
if isinstance(mml, int) and mml > 0:
self._max_model_len_cache[key] = mml
return mml
break
return None
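    # Sketch of the ``/v1/models`` payload ``_get_max_model_len`` parses
    # (abridged and illustrative; whether ``max_model_len`` appears varies
    # by server version, which is exactly the ``None`` fallback path):
    #
    #     {"object": "list",
    #      "data": [{"id": "Qwen/Qwen3-8B",
    #                "object": "model",
    #                "max_model_len": 32768}]}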

# ── Type conversions ────────────────────────────────────────────

async def to_native_prompt(
@@ -472,11 +542,14 @@ async def to_native_prompt(
)

async def to_native_tool(self, tool: Tool) -> ToolSpec:
return ToolSpec(
name=tool.name,
description=tool.description or "",
parameters=tool.parameters or {},
)
function: dict[str, Any] = {
"name": tool.name,
"description": tool.description,
"parameters": tool.parameters,
}
if tool.strict is not None:
function["strict"] = tool.strict
return cast(ToolSpec, {"type": "function", "function": function})
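    # For comparison, the bare form the previous ``ToolSpec(...)`` call
    # emitted (a sketch) is the inner dict alone; the envelope wraps that
    # same dict one level deeper, matching ``OpenAIChatCompletionsClient``:
    #
    #     {"name": ..., "description": ..., "parameters": ...}
    #     -> {"type": "function", "function": {"name": ..., ...}}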

# ── Core request cycle ──────────────────────────────────────────

@@ -518,6 +591,19 @@ async def get_native_response(
tools=tools,
)

# /inference/v1/generate hands SamplingParams to the engine verbatim
# and skips the get_max_tokens() defaulting that /v1/chat/completions
# applies. Replicate that defaulting here so caller-omitted max_tokens
# doesn't silently fall to vLLM's 16-token SamplingParams default.
# TODO(vllm-upstream): Drop once vLLM patches serve_tokens
# (vllm/entrypoints/serve/disagg/serving.py) to call get_max_tokens().
if "max_tokens" not in sampling_params:
max_model_len = await self._get_max_model_len(model)
if max_model_len is not None:
sampling_params["max_tokens"] = max(
1, max_model_len - len(prompt_ids)
)
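        # Worked example (numbers illustrative): with max_model_len=32768
        # and a 1200-token prompt, max_tokens defaults to 32768 - 1200 =
        # 31568, mirroring /v1/chat/completions; without this block the
        # engine's 16-token SamplingParams default would silently apply.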

return await generate(
client=self.client,
renderer=renderer,