Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
17 commits
Select commit Hold shift + click to select a range
0d74eca
feat(rl): import RL surface from bis/parity-tokenize-tcp as baseline …
biswapanda May 4, 2026
d295ebc
feat(rl): composite state endpoint, 3-mode pause, RL extras, drop dea…
biswapanda May 4, 2026
f034171
feat(rl): /v1/chat/completions absorbs TITO + full SamplingParams parity
biswapanda May 5, 2026
a2cc90d
fix(rl): typed Result on get_stop_token_ids + tests + relax mutual-ex…
biswapanda May 5, 2026
2cb5e60
rl api docs
biswapanda May 7, 2026
8e0e019
update
biswapanda May 7, 2026
dc62cb7
chore(rl): scrub internal review markers from PR-added comments + docs
biswapanda May 8, 2026
0ee6cbe
fix(rl): address blocking review items — unwrap/expect, structured tr…
biswapanda May 8, 2026
4aac7e8
fix(rl): address CodeRabbit review findings on RL surface
biswapanda May 8, 2026
8e08e32
chore(rl): address Tier 1/2/3/5 review issues — drop dead code, dedup…
biswapanda May 8, 2026
6007c77
refactor(rl): extract dynamo-rl crate at lib/rl (PR A — pure refactor…
biswapanda May 8, 2026
67058be
feat(rl/vllm): add WeightTransport trait + FilesystemTransport/NcclTr…
biswapanda May 8, 2026
93a7e41
style(llm): cargo fmt — strip trailing whitespace + reformat delta.rs…
biswapanda May 8, 2026
575afd9
feat(rl): PR C — drop legacy /v1/rl/{state,health,ready,liveness,weig…
biswapanda May 8, 2026
b6e471d
feat(rl): PR B — request-plane fan-out via Discovery+PushRouter (work…
biswapanda May 8, 2026
626d3e4
rl: add request-plane admin fanout
biswapanda May 8, 2026
7c15b6b
rl: allow lora unload without transport
biswapanda May 8, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ members = [
"lib/backend-common/examples/mocker",
"lib/bindings/c",
"lib/bindings/python/codegen",
"lib/rl",
]
resolver = "3"

Expand All @@ -41,6 +42,7 @@ keywords = ["llm", "genai", "inference", "nvidia", "distributed"]
# Local crates
dynamo-runtime = { path = "lib/runtime", version = "1.2.0" }
dynamo-llm = { path = "lib/llm", version = "1.2.0" }
dynamo-rl = { path = "lib/rl", version = "1.2.0" }
dynamo-config = { path = "lib/config", version = "1.2.0" }
dynamo-tokenizers = { path = "lib/tokenizers", version = "1.2.0" }
dynamo-tokens = { path = "lib/tokens", version = "1.2.0" }
Expand Down
11 changes: 11 additions & 0 deletions components/src/dynamo/frontend/frontend_args.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ class FrontendConfig(RouterConfigBase, KvRouterConfigBase, AicPerfConfigBase):
kv_cache_block_size: Optional[int]
http_host: str
http_port: int
rl_port: int
tls_cert_path: Optional[pathlib.Path]
tls_key_path: Optional[pathlib.Path]

Expand Down Expand Up @@ -97,6 +98,8 @@ def validate(self) -> None:
raise ValueError(
f"--migration-limit must be between 0 and {_U32_MAX} (0=disabled)"
)
if self.rl_port < 0 or self.rl_port > 65535:
raise ValueError("--rl-port must be between 0 and 65535")
if self.migration_max_seq_len is not None and (
self.migration_max_seq_len < 1 or self.migration_max_seq_len > _U32_MAX
):
Expand Down Expand Up @@ -208,6 +211,14 @@ def add_arguments(self, parser) -> None:
help="HTTP port for the engine (u16).",
arg_type=int,
)
add_argument(
g,
flag_name="--rl-port",
env_var="DYN_RL_PORT",
default=8002,
help="Dedicated HTTP port for RL admin endpoints (u16).",
arg_type=int,
)
add_negatable_bool_argument(
g,
flag_name="--serve-indexer",
Expand Down
1 change: 1 addition & 0 deletions components/src/dynamo/frontend/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -237,6 +237,7 @@ def signal_handler():
kwargs: dict[str, Any] = {
"http_host": config.http_host,
"http_port": config.http_port,
"rl_port": config.rl_port,
"kv_cache_block_size": config.kv_cache_block_size,
"router_config": router_config,
"migration_limit": config.migration_limit,
Expand Down
39 changes: 39 additions & 0 deletions components/src/dynamo/frontend/vllm_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -633,6 +633,40 @@ async def _generate_and_stream(
break
choice = post.process_output(output)
if choice:
# ── RL logprobs injection ──────────────────────
# The vLLM worker sends log_probs/top_logprobs in
# the engine_response dict. Since we can't easily
# construct LogprobsLists for EngineCoreOutput, we
# inject them directly into the choice here.
worker_log_probs = engine_response.get("log_probs")
worker_top_logprobs = engine_response.get("top_logprobs")
if worker_log_probs is not None and choice.get("logprobs") is None:
oai_logprobs_content = []
new_tids = engine_response.get("token_ids", [])
for i, lp in enumerate(worker_log_probs):
# Always populate token/bytes so consumers never see a
# missing key. If top_logprobs is absent or the token
# string cannot be resolved we fall back to the numeric
# ID as a string — better than a KeyError / silent None.
tid_str = str(new_tids[i]) if i < len(new_tids) else ""
entry: dict = {
"logprob": lp,
"token": tid_str,
"bytes": None,
}
# Resolve the human-readable token string and top_logprobs
# from the engine's top_logprobs table when available.
if worker_top_logprobs and i < len(worker_top_logprobs):
tops = worker_top_logprobs[i]
entry["top_logprobs"] = tops
if i < len(new_tids):
for tp in tops:
if tp.get("token_id") == new_tids[i]:
entry["token"] = tp.get("token", tid_str)
break
oai_logprobs_content.append(entry)
choice["logprobs"] = {"content": oai_logprobs_content}

choices.append(choice)

if choices:
Expand All @@ -646,6 +680,11 @@ async def _generate_and_stream(
if usage := engine_response.get("completion_usage"):
dynamo_out["usage"] = usage

# ── RL: pass output token IDs for nvext.completion_token_ids ──
new_token_ids = engine_response.get("token_ids", [])
if new_token_ids:
dynamo_out["_completion_token_ids"] = new_token_ids

yield dynamo_out
_nvtx.end_range(rng_stream)
except Exception as e:
Expand Down
Loading
Loading