Skip to content
Closed
Show file tree
Hide file tree
Changes from 9 commits
Commits
Show all changes
18 commits
Select commit Hold shift + click to select a range
0d74eca
feat(rl): import RL surface from bis/parity-tokenize-tcp as baseline …
biswapanda May 4, 2026
d295ebc
feat(rl): composite state endpoint, 3-mode pause, RL extras, drop dea…
biswapanda May 4, 2026
f034171
feat(rl): /v1/chat/completions absorbs TITO + full SamplingParams parity
biswapanda May 5, 2026
a2cc90d
fix(rl): typed Result on get_stop_token_ids + tests + relax mutual-ex…
biswapanda May 5, 2026
2cb5e60
rl api docs
biswapanda May 7, 2026
8e0e019
update
biswapanda May 7, 2026
dc62cb7
chore(rl): scrub internal review markers from PR-added comments + docs
biswapanda May 8, 2026
0ee6cbe
fix(rl): address blocking review items — unwrap/expect, structured tr…
biswapanda May 8, 2026
4aac7e8
fix(rl): address CodeRabbit review findings on RL surface
biswapanda May 8, 2026
8e08e32
chore(rl): address Tier 1/2/3/5 review issues — drop dead code, dedup…
biswapanda May 8, 2026
6007c77
refactor(rl): extract dynamo-rl crate at lib/rl (PR A — pure refactor…
biswapanda May 8, 2026
67058be
feat(rl/vllm): add WeightTransport trait + FilesystemTransport/NcclTr…
biswapanda May 8, 2026
93a7e41
style(llm): cargo fmt — strip trailing whitespace + reformat delta.rs…
biswapanda May 8, 2026
575afd9
feat(rl): PR C — drop legacy /v1/rl/{state,health,ready,liveness,weig…
biswapanda May 8, 2026
b6e471d
feat(rl): PR B — request-plane fan-out via Discovery+PushRouter (work…
biswapanda May 8, 2026
626d3e4
rl: add request-plane admin fanout
biswapanda May 8, 2026
7c15b6b
rl: allow lora unload without transport
biswapanda May 8, 2026
dbda27c
feat(vllm): gate RL endpoint on --enable-rl / DYN_ENABLE_RL
biswapanda May 11, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 39 additions & 0 deletions components/src/dynamo/frontend/vllm_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -633,6 +633,40 @@ async def _generate_and_stream(
break
choice = post.process_output(output)
if choice:
# ── RL logprobs injection ──────────────────────
# The vLLM worker sends log_probs/top_logprobs in
# the engine_response dict. Since we can't easily
# construct LogprobsLists for EngineCoreOutput, we
# inject them directly into the choice here.
worker_log_probs = engine_response.get("log_probs")
worker_top_logprobs = engine_response.get("top_logprobs")
if worker_log_probs is not None and choice.get("logprobs") is None:
oai_logprobs_content = []
new_tids = engine_response.get("token_ids", [])
for i, lp in enumerate(worker_log_probs):
# Always populate token/bytes so consumers never see a
# missing key. If top_logprobs is absent or the token
# string cannot be resolved we fall back to the numeric
# ID as a string — better than a KeyError / silent None.
tid_str = str(new_tids[i]) if i < len(new_tids) else ""
entry: dict = {
"logprob": lp,
"token": tid_str,
"bytes": None,
}
# Resolve the human-readable token string and top_logprobs
# from the engine's top_logprobs table when available.
if worker_top_logprobs and i < len(worker_top_logprobs):
tops = worker_top_logprobs[i]
entry["top_logprobs"] = tops
if i < len(new_tids):
for tp in tops:
if tp.get("token_id") == new_tids[i]:
entry["token"] = tp.get("token", tid_str)
break
oai_logprobs_content.append(entry)
choice["logprobs"] = {"content": oai_logprobs_content}

choices.append(choice)

if choices:
Expand All @@ -646,6 +680,11 @@ async def _generate_and_stream(
if usage := engine_response.get("completion_usage"):
dynamo_out["usage"] = usage

# ── RL: pass output token IDs for nvext.completion_token_ids ──
new_token_ids = engine_response.get("token_ids", [])
if new_token_ids:
dynamo_out["_completion_token_ids"] = new_token_ids

yield dynamo_out
_nvtx.end_range(rng_stream)
except Exception as e:
Expand Down
457 changes: 457 additions & 0 deletions components/src/dynamo/vllm/handlers.py

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions components/src/dynamo/vllm/publisher.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ def record(
*args: object,
**kwargs: object,
) -> None:
# scheduler_stats can be None right after a weight reload / cache reset.
if scheduler_stats is None:
return

Expand Down
35 changes: 32 additions & 3 deletions components/src/dynamo/vllm/worker_factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -366,7 +366,7 @@ async def _create_decode_worker(
component_name=config.component,
)

# Register engine routes
# Register engine routes (sleep/wake_up + RL weight-lifecycle + RL LoRA)
self.register_engine_routes(runtime, handler)

# Parse endpoint types from --endpoint-types flag
Expand Down Expand Up @@ -576,7 +576,7 @@ async def _create_prefill_worker(
component_name=config.component,
)

# Register engine routes
# Register engine routes (sleep/wake_up + RL weight-lifecycle + RL LoRA)
self.register_engine_routes(runtime, handler)

await self._maybe_wait_for_failover_lock(handler, runtime, config)
Expand Down Expand Up @@ -676,6 +676,35 @@ def register_engine_routes(
runtime.register_engine_route("wake_up", handler.wake_up)
runtime.register_engine_route("scale_elastic_ep", handler.scale_elastic_ep)

# RL weight-lifecycle routes — driven by the
# /v1/rl/{pause,resume,update_weights} bracket in the Rust frontend.
# Names line up with the SGLang RL admin routes so a single admin
# coordinator can talk to either backend.
runtime.register_engine_route("pause_generation", handler.pause_generation)
runtime.register_engine_route("resume_generation", handler.resume_generation)
runtime.register_engine_route("flush_cache", handler.flush_cache)
runtime.register_engine_route(
"update_weights_from_path", handler.update_weights_from_path
)
runtime.register_engine_route("get_weight_version", handler.get_weight_version)

# RL state + liveness — drive /v1/rl/state and /v1/rl/liveness in the
# Rust frontend. /v1/rl/state aggregates these per-worker snapshots
# into the composite RlStateResponse.
runtime.register_engine_route("get_state", handler.get_state)
runtime.register_engine_route("liveness_probe", handler.liveness_probe)

# RL LoRA adapter routes: filesystem-native hot-swap used by RL
# trainers every step to broadcast new adapter weights into the engine.
runtime.register_engine_route("load_lora_adapter", handler.load_lora_adapter)
runtime.register_engine_route(
"unload_lora_adapter", handler.unload_lora_adapter
)

logger.info(
"Registered engine routes: /engine/sleep, /engine/wake_up, /engine/scale_elastic_ep, /engine/start_profile, /engine/stop_profile"
"Registered engine routes: sleep, wake_up, scale_elastic_ep, "
"start_profile, stop_profile, pause_generation, resume_generation, "
"flush_cache, update_weights_from_path, get_weight_version, "
"get_state, liveness_probe, "
"load_lora_adapter, unload_lora_adapter"
)
37 changes: 36 additions & 1 deletion container/deps/vllm/install_vllm.sh
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@

set -euo pipefail

VLLM_VER="0.20.0"
VLLM_VER="0.19.1"
VLLM_REF="v${VLLM_VER}"
DEVICE="cuda"

Expand Down Expand Up @@ -300,4 +300,39 @@ if [ "$DEVICE" = "cuda" ]; then
# TODO we will be able to specify which pplx and deepep commit we want in future
TORCH_CUDA_ARCH_LIST="$TORCH_CUDA_ARCH_LIST" bash install_python_libraries.sh
fi

# ---------------------------------------------------------------------------
# Install the prime-rl inference-side vLLM plugin.
#
# The package ships a ``vllm.general_plugins`` entry-point whose hook applies
# prime-rl's monkey patches (LoRA adapter load, DP engine pause/resume
# deadlock, Qwen 3.5 LoRA, etc.) in every vLLM worker process, spawned
# subprocesses included.  Required for the prime-rl / Dynamo RL training
# integration.
#
# Reproducibility: the install is pinned to a full, immutable commit SHA
# rather than a tag, since tags can be re-pointed upstream.  PRIME_RL_REF is
# kept only to label the build logs for humans.
# Build-time override: --build-arg PRIME_RL_COMMIT=<full-sha>
# --no-deps is deliberate: prime-rl's complete dependency tree pulls in the
# trainer and wandb, while Dynamo only needs the inference-side plugin and
# the worker-extension classes.
# Python: prime-rl pins requires-python = "~=3.12.0" and Dynamo containers
# run Python 3.12, so no version override is needed.  (For 3.11 local dev
# venvs, use plain pip — not uv — with --ignore-requires-python.)
# ---------------------------------------------------------------------------
PRIME_RL_REF="${PRIME_RL_REF:-v0.5.1.dev101}"
PRIME_RL_COMMIT="${PRIME_RL_COMMIT:-d49f3939e7dca29bceb9ed515cc1782497b67e81}"
echo ""
echo "=== Installing prime-rl vLLM plugin (ref=${PRIME_RL_REF} commit=${PRIME_RL_COMMIT}) ==="
uv pip install --no-deps \
    "prime-rl @ git+https://github.com/PrimeIntellect-ai/prime-rl@${PRIME_RL_COMMIT}"

Comment thread
coderabbitai[bot] marked this conversation as resolved.
# Sanity-check: the vllm.general_plugins entry-point must now be registered,
# otherwise the plugin's monkey patches would silently never be applied.
python3 - <<'PY_SANITY'
from importlib.metadata import entry_points

# Collect every registered vLLM general plugin and require prime_rl among them.
plugin_names = [ep.name for ep in entry_points(group="vllm.general_plugins")]
assert "prime_rl" in plugin_names, (
    f"prime-rl plugin NOT registered; vllm.general_plugins={plugin_names}"
)
print(f"✓ prime-rl plugin registered (vllm.general_plugins={plugin_names})")
PY_SANITY

echo "\n✅ All installations completed successfully!"
Loading
Loading