Skip to content
Closed
Show file tree
Hide file tree
Changes from 9 commits
Commits
Show all changes
18 commits
Select commit Hold shift + click to select a range
0d74eca
feat(rl): import RL surface from bis/parity-tokenize-tcp as baseline …
biswapanda May 4, 2026
d295ebc
feat(rl): composite state endpoint, 3-mode pause, RL extras, drop dea…
biswapanda May 4, 2026
f034171
feat(rl): /v1/chat/completions absorbs TITO + full SamplingParams parity
biswapanda May 5, 2026
a2cc90d
fix(rl): typed Result on get_stop_token_ids + tests + relax mutual-ex…
biswapanda May 5, 2026
2cb5e60
rl api docs
biswapanda May 7, 2026
8e0e019
update
biswapanda May 7, 2026
dc62cb7
chore(rl): scrub internal review markers from PR-added comments + docs
biswapanda May 8, 2026
0ee6cbe
fix(rl): address blocking review items — unwrap/expect, structured tr…
biswapanda May 8, 2026
4aac7e8
fix(rl): address CodeRabbit review findings on RL surface
biswapanda May 8, 2026
8e08e32
chore(rl): address Tier 1/2/3/5 review issues — drop dead code, dedup…
biswapanda May 8, 2026
6007c77
refactor(rl): extract dynamo-rl crate at lib/rl (PR A — pure refactor…
biswapanda May 8, 2026
67058be
feat(rl/vllm): add WeightTransport trait + FilesystemTransport/NcclTr…
biswapanda May 8, 2026
93a7e41
style(llm): cargo fmt — strip trailing whitespace + reformat delta.rs…
biswapanda May 8, 2026
575afd9
feat(rl): PR C — drop legacy /v1/rl/{state,health,ready,liveness,weig…
biswapanda May 8, 2026
b6e471d
feat(rl): PR B — request-plane fan-out via Discovery+PushRouter (work…
biswapanda May 8, 2026
626d3e4
rl: add request-plane admin fanout
biswapanda May 8, 2026
7c15b6b
rl: allow lora unload without transport
biswapanda May 8, 2026
dbda27c
feat(vllm): gate RL endpoint on --enable-rl / DYN_ENABLE_RL
biswapanda May 11, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 39 additions & 0 deletions components/src/dynamo/frontend/vllm_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -633,6 +633,40 @@ async def _generate_and_stream(
break
choice = post.process_output(output)
if choice:
# ── RL logprobs injection ──────────────────────
# The vLLM worker sends log_probs/top_logprobs in
# the engine_response dict. Since we can't easily
# construct LogprobsLists for EngineCoreOutput, we
# inject them directly into the choice here.
worker_log_probs = engine_response.get("log_probs")
worker_top_logprobs = engine_response.get("top_logprobs")
if worker_log_probs is not None and choice.get("logprobs") is None:
oai_logprobs_content = []
new_tids = engine_response.get("token_ids", [])
for i, lp in enumerate(worker_log_probs):
# Always populate token/bytes so consumers never see a
# missing key. If top_logprobs is absent or the token
# string cannot be resolved we fall back to the numeric
# ID as a string — better than a KeyError / silent None.
tid_str = str(new_tids[i]) if i < len(new_tids) else ""
entry: dict = {
"logprob": lp,
"token": tid_str,
"bytes": None,
}
# Resolve the human-readable token string and top_logprobs
# from the engine's top_logprobs table when available.
if worker_top_logprobs and i < len(worker_top_logprobs):
tops = worker_top_logprobs[i]
entry["top_logprobs"] = tops
if i < len(new_tids):
for tp in tops:
if tp.get("token_id") == new_tids[i]:
entry["token"] = tp.get("token", tid_str)
break
oai_logprobs_content.append(entry)
choice["logprobs"] = {"content": oai_logprobs_content}

choices.append(choice)

if choices:
Expand All @@ -646,6 +680,11 @@ async def _generate_and_stream(
if usage := engine_response.get("completion_usage"):
dynamo_out["usage"] = usage

# ── RL: pass output token IDs for nvext.completion_token_ids ──
new_token_ids = engine_response.get("token_ids", [])
if new_token_ids:
dynamo_out["_completion_token_ids"] = new_token_ids

yield dynamo_out
_nvtx.end_range(rng_stream)
except Exception as e:
Expand Down
457 changes: 457 additions & 0 deletions components/src/dynamo/vllm/handlers.py

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions components/src/dynamo/vllm/publisher.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ def record(
*args: object,
**kwargs: object,
) -> None:
# scheduler_stats can be None right after a weight reload / cache reset.
if scheduler_stats is None:
return

Expand Down
35 changes: 32 additions & 3 deletions components/src/dynamo/vllm/worker_factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -366,7 +366,7 @@ async def _create_decode_worker(
component_name=config.component,
)

# Register engine routes
# Register engine routes (sleep/wake_up + RL weight-lifecycle + RL LoRA)
self.register_engine_routes(runtime, handler)

# Parse endpoint types from --endpoint-types flag
Expand Down Expand Up @@ -576,7 +576,7 @@ async def _create_prefill_worker(
component_name=config.component,
)

# Register engine routes
# Register engine routes (sleep/wake_up + RL weight-lifecycle + RL LoRA)
self.register_engine_routes(runtime, handler)

await self._maybe_wait_for_failover_lock(handler, runtime, config)
Expand Down Expand Up @@ -676,6 +676,35 @@ def register_engine_routes(
runtime.register_engine_route("wake_up", handler.wake_up)
runtime.register_engine_route("scale_elastic_ep", handler.scale_elastic_ep)

# RL weight-lifecycle routes — driven by the
# /v1/rl/{pause,resume,update_weights} bracket in the Rust frontend.
# Names line up with the SGLang RL admin routes so a single admin
# coordinator can talk to either backend.
runtime.register_engine_route("pause_generation", handler.pause_generation)
runtime.register_engine_route("resume_generation", handler.resume_generation)
runtime.register_engine_route("flush_cache", handler.flush_cache)
runtime.register_engine_route(
"update_weights_from_path", handler.update_weights_from_path
)
runtime.register_engine_route("get_weight_version", handler.get_weight_version)

# RL state + liveness — drive /v1/rl/state and /v1/rl/liveness in the
# Rust frontend. /v1/rl/state aggregates these per-worker snapshots
# into the composite RlStateResponse.
runtime.register_engine_route("get_state", handler.get_state)
runtime.register_engine_route("liveness_probe", handler.liveness_probe)

# RL LoRA adapter routes: filesystem-native hot-swap used by RL
# trainers every step to broadcast new adapter weights into the engine.
runtime.register_engine_route("load_lora_adapter", handler.load_lora_adapter)
runtime.register_engine_route(
"unload_lora_adapter", handler.unload_lora_adapter
)

logger.info(
"Registered engine routes: /engine/sleep, /engine/wake_up, /engine/scale_elastic_ep, /engine/start_profile, /engine/stop_profile"
"Registered engine routes: sleep, wake_up, scale_elastic_ep, "
"start_profile, stop_profile, pause_generation, resume_generation, "
"flush_cache, update_weights_from_path, get_weight_version, "
"get_state, liveness_probe, "
"load_lora_adapter, unload_lora_adapter"
)
37 changes: 36 additions & 1 deletion container/deps/vllm/install_vllm.sh
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@

set -euo pipefail

VLLM_VER="0.20.0"
VLLM_VER="0.19.1"
VLLM_REF="v${VLLM_VER}"
DEVICE="cuda"

Expand Down Expand Up @@ -300,4 +300,39 @@ if [ "$DEVICE" = "cuda" ]; then
# TODO we will be able to specify which pplx and deepep commit we want in future
TORCH_CUDA_ARCH_LIST="$TORCH_CUDA_ARCH_LIST" bash install_python_libraries.sh
fi

# ---------------------------------------------------------------------------
# Install the prime-rl inference-side vLLM plugin.
#
# The package ships a ``vllm.general_plugins`` entry-point whose hook applies
# prime-rl's monkey patches (LoRA adapter load, DP engine pause/resume
# deadlock, Qwen 3.5 LoRA, etc.) in every vLLM worker process, spawned
# subprocesses included.  Required for the prime-rl / Dynamo RL training
# integration.
#
# Reproducibility: the install is pinned to a full, immutable commit SHA
# rather than a tag, since tags can be re-pointed upstream.  PRIME_RL_REF is
# kept only to label the build logs for humans.
# Build-time override: --build-arg PRIME_RL_COMMIT=<full-sha>
# --no-deps is deliberate: prime-rl's complete dependency tree pulls in the
# trainer and wandb, while Dynamo only needs the inference-side plugin and
# the worker-extension classes.
# Python: prime-rl pins requires-python = "~=3.12.0" and Dynamo containers
# run Python 3.12, so no version override is needed.  (For 3.11 local dev
# venvs, use plain pip — not uv — with --ignore-requires-python.)
# ---------------------------------------------------------------------------
PRIME_RL_REF="${PRIME_RL_REF:-v0.5.1.dev101}"
PRIME_RL_COMMIT="${PRIME_RL_COMMIT:-d49f3939e7dca29bceb9ed515cc1782497b67e81}"
echo ""
echo "=== Installing prime-rl vLLM plugin (ref=${PRIME_RL_REF} commit=${PRIME_RL_COMMIT}) ==="
uv pip install --no-deps \
    "prime-rl @ git+https://github.com/PrimeIntellect-ai/prime-rl@${PRIME_RL_COMMIT}"

Comment thread
coderabbitai[bot] marked this conversation as resolved.
# Sanity-check: the vllm.general_plugins entry-point must now be registered,
# otherwise the plugin's monkey patches would silently never be applied.
python3 - <<'PY_SANITY'
from importlib.metadata import entry_points

# Collect every registered vLLM general plugin and require prime_rl among them.
plugin_names = [ep.name for ep in entry_points(group="vllm.general_plugins")]
assert "prime_rl" in plugin_names, (
    f"prime-rl plugin NOT registered; vllm.general_plugins={plugin_names}"
)
print(f"✓ prime-rl plugin registered (vllm.general_plugins={plugin_names})")
PY_SANITY

echo "\n✅ All installations completed successfully!"
Loading
Loading