ai-dynamo · biswapanda · May 4, 2026 · May 4, 2026 · May 5, 2026 · May 5, 2026
diff --git a/Cargo.lock b/Cargo.lock
@@ -24,6 +24,7 @@ members = [
     "lib/backend-common/examples/mocker",
     "lib/bindings/c",
     "lib/bindings/python/codegen",
+    "lib/rl",
 ]
 resolver = "3"
 
@@ -41,6 +42,7 @@ keywords = ["llm", "genai", "inference", "nvidia", "distributed"]
 # Local crates
 dynamo-runtime = { path = "lib/runtime", version = "1.2.0" }
 dynamo-llm = { path = "lib/llm", version = "1.2.0" }
+dynamo-rl = { path = "lib/rl", version = "1.2.0" }
 dynamo-config = { path = "lib/config", version = "1.2.0" }
 dynamo-tokenizers = { path = "lib/tokenizers", version = "1.2.0" }
 dynamo-tokens = { path = "lib/tokens", version = "1.2.0" }

@@ -56,6 +56,7 @@ class FrontendConfig(RouterConfigBase, KvRouterConfigBase, AicPerfConfigBase):
     kv_cache_block_size: Optional[int]
     http_host: str
     http_port: int
+    rl_port: int
     tls_cert_path: Optional[pathlib.Path]
     tls_key_path: Optional[pathlib.Path]
 
@@ -97,6 +98,8 @@ def validate(self) -> None:
             raise ValueError(
                 f"--migration-limit must be between 0 and {_U32_MAX} (0=disabled)"
             )
+        if self.rl_port < 0 or self.rl_port > 65535:
+            raise ValueError("--rl-port must be between 0 and 65535")
         if self.migration_max_seq_len is not None and (
             self.migration_max_seq_len < 1 or self.migration_max_seq_len > _U32_MAX
         ):
@@ -208,6 +211,14 @@ def add_arguments(self, parser) -> None:
             help="HTTP port for the engine (u16).",
             arg_type=int,
         )
+        add_argument(
+            g,
+            flag_name="--rl-port",
+            env_var="DYN_RL_PORT",
+            default=8002,
+            help="Dedicated HTTP port for RL admin endpoints (u16).",
+            arg_type=int,
+        )
         add_negatable_bool_argument(
             g,
             flag_name="--serve-indexer",

@@ -237,6 +237,7 @@ def signal_handler():
     kwargs: dict[str, Any] = {
         "http_host": config.http_host,
         "http_port": config.http_port,
+        "rl_port": config.rl_port,
         "kv_cache_block_size": config.kv_cache_block_size,
         "router_config": router_config,
         "migration_limit": config.migration_limit,

@@ -633,6 +633,40 @@ async def _generate_and_stream(
                         break
                     choice = post.process_output(output)
                     if choice:
+                        # ── RL logprobs injection ──────────────────────
+                        # The vLLM worker sends log_probs/top_logprobs in
+                        # the engine_response dict.  Since we can't easily
+                        # construct LogprobsLists for EngineCoreOutput, we
+                        # inject them directly into the choice here.
+                        worker_log_probs = engine_response.get("log_probs")
+                        worker_top_logprobs = engine_response.get("top_logprobs")
+                        if worker_log_probs is not None and choice.get("logprobs") is None:
+                            oai_logprobs_content = []
+                            new_tids = engine_response.get("token_ids", [])
+                            for i, lp in enumerate(worker_log_probs):
+                                # Always populate token/bytes so consumers never see a
+                                # missing key ("bytes" is always None here).  "token"
+                                # defaults to the numeric ID as a string ("" if token_ids
+                                # is shorter than log_probs) unless top_logprobs resolves it.
+                                tid_str = str(new_tids[i]) if i < len(new_tids) else ""
+                                entry: dict = {
+                                    "logprob": lp,
+                                    "token": tid_str,
+                                    "bytes": None,
+                                }
+                                # Resolve the human-readable token string and top_logprobs
+                                # from the engine's top_logprobs table when available.
+                                if worker_top_logprobs and i < len(worker_top_logprobs):
+                                    tops = worker_top_logprobs[i]
+                                    entry["top_logprobs"] = tops
+                                    if i < len(new_tids):
+                                        for tp in tops:
+                                            if tp.get("token_id") == new_tids[i]:
+                                                entry["token"] = tp.get("token", tid_str)
+                                                break
+                                oai_logprobs_content.append(entry)
+                            choice["logprobs"] = {"content": oai_logprobs_content}
+
                         choices.append(choice)
 
                 if choices:
@@ -646,6 +680,11 @@ async def _generate_and_stream(
                     if usage := engine_response.get("completion_usage"):
                         dynamo_out["usage"] = usage
 
+                    # ── RL: pass output token IDs for nvext.completion_token_ids ──
+                    new_token_ids = engine_response.get("token_ids", [])
+                    if new_token_ids:
+                        dynamo_out["_completion_token_ids"] = new_token_ids
+
                     yield dynamo_out
             _nvtx.end_range(rng_stream)
         except Exception as e: