diff --git a/CHANGES.txt b/CHANGES.txt
index 3fd711eb..7d8cecff 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -226,3 +226,5 @@ v<3.4.0>, <05/10/2026> -- ADEngine and MCP audit-cycle fixes from a UCI Ionosphe
 v<3.5.0>, <05/11/2026> -- Sustainable model persistence. New `pyod.utils.persistence` module with three additive helpers (`save`, `load`, `compat_load`); no breaking change to existing `joblib.dump` / `joblib.load` workflows. `save(clf, path, metadata=None)` writes a versioned envelope (`_pyod_persistence_version`, `pyod_version`, `sklearn_version`, `numpy_version`, `scipy_version`, `joblib_version`, `python_version`, `saved_at`, `model_class`, optional user metadata, model). `load(path, strict=False, return_metadata=False)` reads the envelope, compares the recorded dependency versions against the running environment, and emits a `UserWarning` on drift in any of sklearn, joblib, numpy, or scipy. Python-version drift is severity `info` and is diagnostic only: non-strict `load` does not warn and `strict=True` does not raise on `python_version`-only drift on the normal envelope path; after a compat repair, strict mode still refuses to return the repaired model, but the error follows the no-drift compat-repair branch and does not name `python_version`. `strict=True` escalates every `warn`-severity drift to `ValueError`, rejects raw legacy artifacts that have no envelope, and refuses to return a model that required a compatibility repair. `return_metadata=True` returns `(model, envelope_without_model_field)`. `compat_load(path, mmap_mode=None)` mirrors `joblib.load` with the BUILD-opcode dispatch entry patched on a subclass of `joblib.numpy_pickle.NumpyUnpickler`; when sklearn's `Tree.__setstate__` would raise `ValueError: node array from the pickle has an incompatible dtype`, the saved Tree-node state is realigned to the running sklearn dtype first. 
Realignment is allowlist-driven: `_TREE_NODE_FIELD_DEFAULTS` (currently `{"missing_go_to_left": 0}`, the pre-1.3 sklearn default) zero-fills documented missing fields, `_TREE_NODE_FIELD_RENAMES` (empty in v3.5.0) maps known renames with rename targets resolved BEFORE the missing-field-default check so a future rename does not also need a default entry, and any other dtype difference (unknown new field, kind change, signedness change, itemsize change, shape change) raises `ValueError` with a re-fit recommendation. Same-name byte-order-only differences realign safely. Current dtype is discovered dynamically from `sklearn.tree._tree.NODE_DTYPE` (no hardcoded layout). A single `UserWarning` recommending re-fit fires when at least one Tree was realigned; non-tree artifacts (ECOD, COPOD, HBOS, LOF, ...) pass through silently. `load()` falls through to `compat_load()` automatically when `joblib.load` raises the documented dtype prefix; the original exception is preserved via `raise ... from`, and a non-prefix `ValueError` from `joblib.load` propagates without invoking `compat_load`. Dependency floor: `requirements.txt` and `docs/requirements.txt` now pin `joblib>=1.5` because `compat_load` reuses `joblib.numpy_pickle._validate_fileobject_and_memmap` and the joblib 1.5 `NumpyUnpickler(filename, file_handle, ensure_native_byte_order, mmap_mode=...)` constructor; older joblib lacks both, and the import is guarded with a clear `ImportError` recommending an upgrade. Closes issue #519. 
Tests: 31 new in `test_persistence.py` covering Tree-dtype realignment (synthetic aged pickles produced by an `_OldDtypeTree` pickle-time shim), the committed binary fixture under `pyod/test/fixtures/iforest_sklearn_1_2_x.joblib` (a real sklearn 1.2.2 IsolationForest, regenerable via `regen_iforest_sklearn_1_2.py`), envelope round-trip, version-drift warnings including the `info`-only `python_version` silent case, strict-mode rejection paths, schema-version validation including a future-version reject, the strict-after-compat no-drift case, exception chaining, a synthetic rename test that proves `_TREE_NODE_FIELD_RENAMES` works without a paired `_TREE_NODE_FIELD_DEFAULTS` entry, and a monkey-patched `joblib.load` test that pins the exact-prefix fall-through gate (non-prefix `ValueError` propagates unchanged; prefix `ValueError` invokes `compat_load` exactly once). CI: new `persistence-nightly` job in `testing-cron.yml` installs pre-release `sklearn` / `numpy` / `scipy` / `joblib` (scientific-python nightly index) and runs only `test_persistence.py`; failure surfaces upstream dtype evolution before downstream users hit it and is not a release blocker. Docs: `docs/model_persistence.rst` rewritten with quick-start, trust-boundary, why-versioning, legacy-load decision tree, cross-sklearn-version compatibility section, troubleshooting table keyed on error text, strict-mode notes, and envelope-metadata-reading guidance. `docs/pyod.utils.rst` cross-references the new module. `examples/save_load_model_example.py` now leads with `persistence.save` / `persistence.load` and notes raw `joblib` as a secondary alternative. Deferred: a true header-only `inspect_artifact(path)` and `pyod inspect <path>` CLI require a `.pyod` zip container layout (metadata sidecar separate from the model payload) and remain Phase 3 work; deep-learning state-dict persistence stays scoped to its own future design. No breaking API changes.
 v<3.5.1>, <05/13/2026> -- External-contributor PR review pass (jbbqqf + tuanaiseo bundles) plus NSF funding acknowledgment. Bug fixes: LUNAR no longer shares its `MinMaxScaler` across instances because the constructor default was a mutable shared object; `LUNAR.__init__` now defaults `scaler=None`, `_resolve_scaler()` materializes a fresh `MinMaxScaler` per fit (or deep-copies a user-supplied instance, or disables scaling on `scaler=False`), and the fitted scaler lives on `self.scaler_` so `sklearn.base.clone()` round-trips (closes #502). DIF stops double-normalizing during fit: the inner `self.decision_function(X)` call that set `decision_scores_` was receiving an already-min-max-scaled `X`, and `decision_function` re-scales internally; the fix preserves the raw `X` and passes it to `decision_function`, so `decision_scores_` now matches `decision_function(X_train)` (closes #546). SOS perplexity inner loop replaces `np.sum(...)` with the ndarray `.sum()` method (closes #635); numerical equivalence test asserts bit-exact match. SUOD defers the optional `suod` import to the `SUOD()` constructor with an actionable `ImportError` instead of the old print-then-crash pattern at module top (closes #640). LOF docstring corrects the `novelty` default from `False` to `True` (matches the actual `__init__` default, which is required for PyOD's fit-then-predict contract because scikit-learn's `LocalOutlierFactor` only exposes `predict`/`decision_function` on unseen data in novelty mode); a regression test pins both the `inspect.signature` default and the docstring substring (closes #638). 
GAAL torch-optional handling: `pyod/models/gaal_base.py` (closes #660 via tuanaiseo), then a follow-up extends the same guarded-import + actionable `ImportError` pattern to `pyod/models/mo_gaal.py`, `pyod/models/so_gaal.py`, and `pyod/models/so_gaal_new.py` so user-visible imports `from pyod.models.mo_gaal import MO_GAAL` and `from pyod.models.so_gaal import SO_GAAL` no longer print-then-crash when torch is absent; all four GAAL files now raise the unified message pointing at `pip install pyod[torch]` or `pip install torch`. `pyod/models/__init__.py` adds an inline comment explaining why detector imports are deliberately omitted at the package level (several detectors require optional extras). Funding: README.rst gains an Acknowledgments section and docs/about.rst gains a Funding section, both citing NSF Award No. 2346158, "NSF POSE: Phase II: OpenAD: An Integrated Open-Source Ecosystem for Anomaly Detection," using the NSF PAPPG recipient-obligation form with the standard disclaimer; lead and sub-awardee organizations are listed separately from PI/co-PI names to avoid stale per-person affiliation claims. Tests: 6 new across `test_lof.py` (1), `test_dif.py` (1), `test_sos.py` (1), `test_lunar.py` (2), and `test_suod.py` (1). No breaking API changes.
 v<3.5.2>, <05/18/2026> -- Reproducibility and kwargs-forwarding bug fixes surfaced by the PyOD 3 paper (KDD 2027 ADS Cycle 1) §5 evidence work. Bug fixes: (1) Closes #685 (`ABOD`/`KNN`/`LUNAR`/`SOD` accepted arbitrary `**kwargs` and forwarded them unfiltered to `sklearn.neighbors.NearestNeighbors`, which crashed at fit time -- or, for KNN, at `__init__` time -- on any kwarg outside `NearestNeighbors`'s signature, including the sklearn-convention `random_state`, a `verbose` flag, or a typo like `n_neighbours`). The four detectors were introduced in commit b8f6c81 (fix for #654) with the over-forwarded `**kwargs`. The fix removes `**kwargs` from each `__init__` and stops forwarding `**self.kwargs` / `**kwargs` to `NearestNeighbors`; the six named forwarding parameters added in b8f6c81 (`algorithm`, `leaf_size`, `metric`, `p`, `metric_params`, `n_jobs`) still cover the use case #654 asked for. Unknown kwargs on ABOD / KNN / SOD now raise a clean `TypeError: <Detector>.__init__() got an unexpected keyword argument '...'` at construction time that points at the user's call site (the sklearn stack frame from the late-fit crash is gone); regression tests assert that the error message names the detector class and does NOT contain `NearestNeighbors` so a future regression that re-introduces the old shape is caught. 
LUNAR is the one #685 detector that is actually stochastic (it calls `train_test_split`, uses `np.random` in `generate_negative_samples`, and initializes plus trains a torch network), so it does not reject `random_state`; instead, `LUNAR.__init__` now declares an explicit `random_state=None` parameter that accepts either `int` or `numpy.random.RandomState` (sklearn convention; both forms go through `sklearn.utils.check_random_state`) and threads through (a) `torch.manual_seed` (and `torch.cuda.manual_seed_all` when CUDA is available) before the network is built, deriving a single int seed by drawing once from `check_random_state(random_state)`, (b) the numpy `RandomState` returned by the same `check_random_state` used as the `random_state` argument to `sklearn.model_selection.train_test_split`, and (c) the same `random_state` argument added to `generate_negative_samples`. After the fix, two `LUNAR(random_state=42)` instances fit on the same X produce identical `labels_` and `decision_scores_` (within 1e-6). Soft API removal: the accidental arbitrary-`**kwargs` surface added in b8f6c81 is gone. Code that relied on it -- for example `ABOD(some_unknown_kwarg=value)` -- now fails fast at the constructor call instead of at the `NearestNeighbors` constructor inside fit. The six named forwarding parameters still work; this is the only meaningful behavior change. (2) Closes #686 (`ADEngine.investigate` was non-deterministic on byte-identical input because no public API pinned `random_state`). The fix adds `random_state: int | None = None` to `ADEngine.__init__`; the engine stores the seed and passes it through `ADEngine.build_detector` -> `build_detector_from_plan(plan, kb, random_state=...)`. 
The factory then injects `random_state` into `plan['params']` only for detector classes whose `__init__` declares an explicit `random_state` parameter (verified via `inspect.signature`); detectors that do not declare it -- ABOD, KNN, SOD, and other deterministic classes -- are instantiated unchanged, so the v3.5.1 call shape for those classes is preserved bit-for-bit. A caller-supplied `plan['params']['random_state']` wins over the engine default to preserve explicit caller intent. The factory does `dict(plan.get('params', {}))` before injecting so the caller's plan is not mutated. `build_from_preset(...)` was likewise updated to forward the engine seed: `EmbeddingOD` presets `for_text` / `for_image` (called via `build_detector_from_plan` when `plan.get('preset')` is set) now receive `random_state` as a kwarg, `EmbeddingOD.__init__` accepts and stores it, and `resolve_detector(detector, contamination, random_state=...)` injects the seed into the inner shortcut detector (`'LUNAR'`, `'KNN'`, ...) when that detector class declares `random_state`. `EmbeddingOD._preprocess_fit` also passes `random_state=self.random_state` to the optional ``PCA(n_components=self.reduce_dim, ...)`` dimensionality-reducer so a preset plan with `reduce_dim` set is fully deterministic (PCA can otherwise pick a randomized SVD solver under `svd_solver='auto'` on high-dimensional embeddings, which would have left a stochastic preprocessing step before the seeded detector). The external encoder's own inference (sentence-transformers, DINOv2) is treated as deterministic given fixed weights and is NOT seeded by `EmbeddingOD.random_state`; the docstring documents this boundary. 
With this, `ADEngine(random_state=42).build_detector({'detector_name': 'EmbeddingOD', 'preset': 'for_text', 'params': {'quality': 'balanced'}})` now produces an `EmbeddingOD(detector='LUNAR', random_state=42)` and the inner `LUNAR` is seeded -- closing the round-2-flagged gap where `EmbeddingOD.for_text()` defaults to LUNAR and silently dropped the engine seed. With `ADEngine(random_state=42)`, repeated `investigate(X)` calls on the same X now produce byte-identical `state.consensus['labels']` and identical `state.analysis['consensus_analysis']['anomaly_ratio']`, and the engine seed propagates end-to-end through `detect()`, `investigate()` -> `run()`, post-recovery reruns, and the `EmbeddingOD` text / image preset path because every path instantiates through `self.build_detector()`. The previously-broken LUNAR direct-plan case is also covered: `ADEngine(random_state=42).run_detection(X, {'detector_name': 'LUNAR', 'params': {...}})` is now bit-stable across reruns. Backward compatibility: `ADEngine()` without a seed retains v3.5.1 behavior (no determinism guarantee). (3) Closes #469 (LODA results are not reproducible because `LODA.__init__` did not accept `random_state` and the inner `np.random.randn` + `np.random.permutation` calls fell back to numpy's module-level state). The fix adds `random_state: int | None = None` to `LODA.__init__`, threads it through `sklearn.utils.check_random_state`, and replaces the two `np.random.*` call sites with `rng.randn(...)` and `rng.permutation(...)` so two `LODA(random_state=42)` fits on the same X produce bit-identical `decision_scores_`. Because LODA now declares `random_state` in its signature, `ADEngine(random_state=42)` propagates the engine seed to LODA plans through the same `_accepts_random_state` factory path used for IForest / LUNAR. 
Tests: 31 new across `test_ad_engine.py::TestRandomStateDeterminism` (4 -- determinism + cross-seed + default + LUNAR-plan determinism), `test_ad_engine.py::TestRandomStateFactory` (11 -- IForest seed injection, plan-level override wins, KNN/ABOD/SOD not given a seed, plan dict not mutated, no-seed default unchanged, plus 3 preset-path tests for `EmbeddingOD.for_text` seed propagation, plan-level wins, and no-seed default, plus a monkeypatch test asserting `EmbeddingOD._preprocess_fit` constructs `PCA(random_state=...)` with the engine seed), `test_abod.py::TestABODKwargsRejection` (3 -- tightened to assert `ABOD` in the error message and `NearestNeighbors` not in it), `test_knn.py::TestKNNKwargsRejection` (3 -- same tightening for `KNN`), `test_lunar.py::TestLUNARKwargsAndRandomState` (4 -- unknown kwarg rejection with tightened message check + default construction + same-seed determinism + `RandomState` object input accepted), `test_sod.py::TestSODKwargsRejection` (3 -- same tightening for `SOD`), `test_loda.py::TestLODARandomState` (3 -- same-seed determinism + cross-seed differ + no-seed unchanged). Related progress on #599 (sklearn-style `random_state` across pyod): `ADEngine.__init__`, `LUNAR.__init__`, `LODA.__init__`, and `EmbeddingOD.__init__` now accept `random_state`; ABOD / KNN / SOD reject unknown kwargs cleanly at construction. Other detectors with internal stochasticity (e.g., deep-learning models that depend on torch state, `IForest` which already had `random_state`) are not in scope for v3.5.2 and remain follow-up work tracked under #599.
+v<3.5.3>, <05/19/2026> -- KB-tools API for agent-driven and LLM-API-driven routing. Surface 1 (agent tools): `ADEngine.get_kb_for_routing(profile, top_k=3, constraints=None)` returns a structured KB snapshot (every shipped detector with strengths, weaknesses, best_for, avoid_when, complexity, benchmark_rank, modality_match) filtered by `constraints.exclude_detectors` and `constraints.data_type_strict` (default True), sorted by benchmark rank for the profile modality. `ADEngine.make_plan(detector_choices, justifications=None, params=None)` validates the caller-chosen ordered detector list against the KB (case-sensitive; unknown / non-shipped names raise `ValueError`), overlays per-detector params with engine contamination resolution, and returns a closed-schema `DetectionPlan` consumable by `build_detector` / `run`. The pair lets agent runtimes (Claude Code, Codex CLI, MCP tool clients) reason over the KB directly and commit a routing decision without going through hand-coded rules. Surface 2 (programmatic API): `ADEngine.plan_detection(profile, llm_client=callable, top_k=3)` accepts a user-supplied `(prompt: str) -> str` callable wrapping any LLM SDK (Anthropic, OpenAI, vLLM, self-hosted). When `llm_client` is set, the engine builds the routing prompt internally via `pyod.utils._llm.build_routing_prompt`, invokes the callable, parses the response via `pyod.utils._llm.parse_routing_response`, and returns the same `DetectionPlan` shape. On LLM call failure or response parse failure, falls back to rule-driven routing with a `RuntimeWarning`; set `PYOD3_LLM_STRICT=1` to re-raise instead. `LLMCallable` is a Protocol -- PyOD ships no provider-specific adapter classes; users wrap their own SDK. The parser tolerates surrounding prose and markdown fences, skips unknown detector names with a logged warning, dedupes, and truncates to `top_k`; raises `RoutingParseError` if no JSON array is extractable or no valid detector survives KB validation. 
`top_k` generalization: `ADEngine.plan_detection(..., top_k=3)` exposes the previously hard-coded `valid[1:3]` alternatives slice as a parameter. Default 3 preserves v3.5.2 behavior; values < 1 are clamped to 1. Tests: 44 new in `test_kb_router_surface1.py` covering schema, filters, ordering, KB validation, top_k clamping, stub LLM client canned plan, top_k truncation of LLM response, malformed response fallback, `PYOD3_LLM_STRICT=1` re-raise, prose tolerance, markdown-fence tolerance, dedupe, and bare-string entries. All 205 existing ADEngine tests continue to pass. Backward compatibility: every v3.5.2 caller pattern (`plan_detection(profile)`, `plan_detection(profile, priority=...)`, `plan_detection(profile, constraints=...)`) produces identical output. The new `top_k=3` and `llm_client=None` parameters are keyword-only with backward-compatible defaults. Out of scope: `routing_rules.json` rule authoring (rules remain the offline fallback); LLM-decided `top_k` (caller decides); built-in CLI adapter classes for Codex / Claude Code (users wrap subscriptions themselves); async `llm_client`. No breaking API changes. Round 1 reviewer fixes (Codex via /implement-review auto): (a) High: `_plan_via_llm` now enforces the constrained KB context after parsing -- if the LLM returns a detector excluded by `constraints.exclude_detectors` or filtered by `data_type_strict`, the engine raises `RoutingParseError` and falls back to rule routing with a `RuntimeWarning`. Previously the LLM path validated only against the global KB and could bypass hard `exclude_detectors` constraints. (b) Medium: `get_kb_for_routing` now consults modality-specific benchmark-rank keys instead of `{modality}.title() + '_overall'` only -- `time_series` uses `TSB_AD_overall` / `TSB_AD_overall_iforest`, `graph` uses `BOND_deep` / `BOND_overall`, `text` uses `NLP_ADBench_overall`, `image` uses `MVTec_overall`, all with `ADBench_overall` as the universal fallback. 
Previously non-tabular modalities effectively sorted alphabetically because the legacy key form did not match the KB's actual rank fields. (c) Medium: new per-call kwarg `plan_detection(..., llm_strict: bool | None = None)`. Precedence: explicit `True` re-raises on LLM/parse failure; explicit `False` falls back with `RuntimeWarning`; `None` defers to `PYOD3_LLM_STRICT` env var. The env-only switch was process-global and incorrect for concurrent callers in the same process. Six additional regression tests cover the constraint bypass, modality rank-key ordering for time_series and graph, and the three-way llm_strict precedence (True/False/None). Round 2 reviewer fixes (Codex via /implement-review auto): (d) Medium: `plan_detection`'s new `top_k`, `llm_client`, and `llm_strict` parameters are now actually keyword-only via a `*` separator before them in the signature, matching the release notes claim. (e) Medium: `get_kb_for_routing` now stamps each returned detector entry with `resolved_rank` and `resolved_rank_key` fields carrying the modality-specific benchmark rank it used for sorting; `build_routing_prompt` reads those fields so the LLM-facing prompt now shows e.g. `rank=10 (TSB_AD_overall)` for time-series detectors instead of the empty `rank=` it previously rendered (because the prompt had hard-coded the legacy `{modality}.title() + '_overall'` key). Three additional regression tests cover (i) the keyword-only signature contract, (ii) prompt rank annotation under time-series, and (iii) the text-modality fallback path when the KB has no rank data.
+v<3.5.4>, <06/03/2026> -- Claims-honesty and framing-consistency remediation of the v3 agentic layer from an internal audit (no detector behavior change). Determinism: `ADEngine.random_state` docstring upgraded from the vague "deterministic-up-to-numpy-module-state" hedge to the audited guarantee (a run-to-run audit of the shipped shallow detectors found every one either honors the seed or is deterministic by construction; deep detectors additionally depend on framework seeding). Counts: every public surface now reports 60 buildable detectors instead of 60+/61/50+; `scripts/regen_skill.py` and `pyod/cli.py` exclude `status == "planned"` so the non-buildable `LLMAD` no longer inflates the od-expert skill's counts/lists or `pyod info` (now `60 total (43 tabular, 7 time-series, 8 graph, 2 text, 2 image, 1 multimodal)`); `LLMAD` stays in the raw KB as a roadmap entry. Expert-level: `docs/index.rst`, `od_expert/SKILL.md`, `docs/skill_maintenance.rst`, and `docs/examples/agentic.rst` reword "expert-level/expert-quality results" to a complete-workflow/accessibility claim. Trust verdict: `docs/examples/adengine.rst` demotes the quality verdict to descriptive diagnostics with a "heuristic, not a guarantee" note and corrects the stale "Jaccard" stability description to the cutoff-gap formula; `_quality_metrics.compute_quality` docstring documents `separation` as circular (computed from the run's own predicted labels, near-always high, and not independent of the majority-vote consensus labels); the od-expert skill's Trigger 4 is reframed to cutoff-instability on `stability` only, and its result-interpretation, per-modality confidence lines, and examples route confidence through low `agreement` plus label-free caveats instead of `separation`/`overall`/`verdict`. Consensus: skill guidance softened from "never report from a single detector" to "prefer consensus for robustness; about as accurate as the best single pick." 
Framing consistency: ADEngine is described as a "lifecycle orchestration" engine rather than "intelligent orchestration" across README, docs, the API reference, and the module docstring, matching the finding that the layer's value is the drivable, reproducible workflow rather than selection intelligence. Tests: 2 new count-locking regression tests (`test_cli.py::test_pyod_info_excludes_planned_detectors`, `test_skill_kb_consistency.py::test_skill_count_prose_matches_kb`) compute expected buildable counts from the KB and fail on regression. Reviewed via /implement-review (Codex, 4 rounds): R1 raised 3 High + 2 Medium + 1 Low, R2 verified 5/6 and flagged trust-gate residue, R3 cleared it, R4 confirmed commit-ready. No breaking API changes.
diff --git a/README.rst b/README.rst
index 94ff2d49..c6bc5d37 100644
--- a/README.rst
+++ b/README.rst
@@ -62,7 +62,7 @@ PyOD 3 is the most comprehensive Python library for anomaly detection. Four pill
 ===========================  ========================================================================================
 Pillar                       What it means
 ===========================  ========================================================================================
-Multi-Modal                  60+ detectors across **tabular, time series, graph, text, and image** data, one API
+Multi-Modal                  60 detectors across **tabular, time series, graph, text, and image** data, one API
 Full Lifecycle               From raw data to explained anomalies and next-step guidance in a single call
 Agentic                      ``od-expert`` turns natural-language requests into ADEngine workflows; MCP exposes structured tools for other agents
 Most Used                    38+ million downloads; benchmark-backed routing (ADBench, TSB-AD, BOND, NLP-ADBench)
@@ -122,7 +122,7 @@ Layer      Name                   When to use
 3          Agentic Investigation  You want an AI agent to drive OD through natural conversation           `Layer 3 walkthrough <https://pyod.readthedocs.io/en/latest/examples/agentic.html>`__
 =========  =====================  ======================================================================  =======================================
 
-Layers 2 and 3 are powered by ``ADEngine``, PyOD's intelligent orchestration core. The full multi-turn Layer 3 investigation flow is available through the ``od-expert`` skill for Claude Code and Codex. The MCP server (``python -m pyod.mcp_server``) exposes ten stateless tools for MCP-compatible LLMs, spanning knowledge queries (``list_detectors``, ``explain_detector``, ``compare_detectors``, ``get_benchmarks``), planning (``profile_data``, ``plan_detection``, ``build_detector``), and detection (``run_detection``, ``analyze_results``, ``explain_findings``); stateful ``investigate`` / ``iterate`` MCP tools are deferred.
+Layers 2 and 3 are powered by ``ADEngine``, PyOD's lifecycle orchestration core. The full multi-turn Layer 3 investigation flow is available through the ``od-expert`` skill for Claude Code and Codex. The MCP server (``python -m pyod.mcp_server``) exposes ten stateless tools for MCP-compatible LLMs, spanning knowledge queries (``list_detectors``, ``explain_detector``, ``compare_detectors``, ``get_benchmarks``), planning (``profile_data``, ``plan_detection``, ``build_detector``), and detection (``run_detection``, ``analyze_results``, ``explain_findings``); stateful ``investigate`` / ``iterate`` MCP tools are deferred.
 
 .. image:: https://raw.githubusercontent.com/yzhao062/pyod/development/docs/figs/agentic-demo.png
    :alt: PyOD 3 agentic investigation demo on cardiotocography dataset
@@ -142,7 +142,7 @@ About PyOD
 
 PyOD, established in 2017, is the longest-running and most widely used Python library for anomaly detection. With `38+ million downloads <https://pepy.tech/project/pyod>`__, it serves both academic research (featured in `Analytics Vidhya <https://www.analyticsvidhya.com/blog/2019/02/outlier-detection-python-pyod/>`__, `KDnuggets <https://www.kdnuggets.com/2019/02/outlier-detection-methods-cheat-sheet.html>`__, and `Towards Data Science <https://towardsdatascience.com/anomaly-detection-for-dummies-15f148e559c1>`__) and commercial products.
 
-V3 extends the library with ``ADEngine`` (intelligent orchestration) and the ``od-expert`` skill (agentic workflow), while keeping the classic ``fit``/``predict`` API fully backward-compatible. V3 is built on SUOD [#Zhao2021SUOD]_ for fast parallel training and numba JIT for per-model speedups.
+V3 extends the library with ``ADEngine`` (lifecycle orchestration) and the ``od-expert`` skill (agentic workflow), while keeping the classic ``fit``/``predict`` API fully backward-compatible. V3 is built on SUOD [#Zhao2021SUOD]_ for fast parallel training and numba JIT for per-model speedups.
 
 **Impact & Recognition**:
 
@@ -253,7 +253,7 @@ Additional Topics
 Implemented Algorithms
 ^^^^^^^^^^^^^^^^^^^^^^
 
-PyOD is organized into two functional groups: **(i) Detection Algorithms**, with dedicated subsections for tabular, time series, and graph data (EmbeddingOD inside the tabular table adds multi-modal support for text and image via foundation model encoders); and **(ii) Utility Functions** for data generation, evaluation, and intelligent orchestration.
+PyOD is organized into two functional groups: **(i) Detection Algorithms**, with dedicated subsections for tabular, time series, and graph data (EmbeddingOD inside the tabular table adds multi-modal support for text and image via foundation model encoders); and **(ii) Utility Functions** for data generation, evaluation, and lifecycle orchestration.
 
 **(i-a) Tabular & Multi-Modal Detection Algorithms** :
 
diff --git a/docs/examples/adengine.rst b/docs/examples/adengine.rst
index 3c51f42c..297c0674 100644
--- a/docs/examples/adengine.rst
+++ b/docs/examples/adengine.rst
@@ -1,7 +1,7 @@
-Layer 2: ADEngine Intelligent Orchestration
+Layer 2: ADEngine Lifecycle Orchestration
 ============================================
 
-ADEngine is PyOD's intelligent anomaly detection engine. It profiles your data, selects benchmark-backed detectors from PyOD's 60+ catalog, runs multiple detectors in parallel, computes consensus scores, and assesses result quality, all in one call.
+ADEngine is PyOD's anomaly detection lifecycle engine. It profiles your data, selects benchmark-backed detectors from PyOD's 60-detector catalog, runs multiple detectors in parallel, computes consensus scores, and reports descriptive diagnostics, all in one call.
 
 Use Layer 2 when you are not sure which detector to pick.
 
@@ -61,11 +61,13 @@ ADEngine runs the top-3 detectors from PyOD's knowledge base and computes a cons
 Quality Assessment
 ------------------
 
-ADEngine quantifies how trustworthy the results are through three metrics:
+ADEngine reports three descriptive diagnostics of a run. They summarize the
+score distribution and cross-detector behavior; in the absence of labels,
+they do not guarantee that the results are correct (see the note below):
 
-* **Separation** -- ratio of anomaly scores to inlier scores ([0, 1])
-* **Agreement** -- mean pairwise Spearman correlation between detectors ([0, 1])
-* **Stability** -- Jaccard index of top-k sets under +/- 20% contamination ([0, 1])
+* **Separation** -- relative mean score gap between the run's flagged set and the rest ([0, 1]). It is computed from the run's own predicted labels, so it is descriptive only; it does not show that the cutoff or the vote is correct.
+* **Agreement** -- mean pairwise Spearman correlation between detectors ([0, 1]). The most useful of the three: low agreement flags inputs with no shared structure (near-noise), where the detectors rank points differently.
+* **Stability** -- standardized score gap at the rank-k cutoff ([0, 1]). Low values mean many tied scores near the threshold, so the flagged set is sensitive to the contamination value.
 
 .. code-block:: python
 
@@ -78,6 +80,15 @@ ADEngine quantifies how trustworthy the results are through three metrics:
 
 Verdicts are ``'high'`` (>=0.7), ``'medium'`` (>=0.4), or ``'low'`` (<0.4).
 
+.. note::
+
+   The verdict is a heuristic summary of the score distribution and
+   cross-detector behavior, not a guarantee that the results are correct. Use
+   it as a rough signal, not as a basis for trusting results without labels:
+   low ``agreement`` is the most reliable component and flags near-noise
+   inputs, while ``separation`` is descriptive only. To judge correctness,
+   validate against held-out labels or a domain review.
+
 ----
 
 Session API (Step by Step)
diff --git a/docs/examples/agentic.rst b/docs/examples/agentic.rst
index 4d892438..5511ba88 100644
--- a/docs/examples/agentic.rst
+++ b/docs/examples/agentic.rst
@@ -1,7 +1,7 @@
 Layer 3: Agentic Investigation
 ===============================
 
-PyOD 3's ``od-expert`` skill lets any AI agent drive a full anomaly detection investigation through natural conversation. The agent handles benchmark-backed detector selection, multi-detector consensus, quality assessment, adaptive escalation, and iteration on user feedback, all without requiring the user to be an OD expert.
+PyOD 3's ``od-expert`` skill lets any AI agent drive a full anomaly detection investigation through natural conversation. The agent handles benchmark-backed detector selection, multi-detector consensus, quality diagnostics, adaptive escalation, and iteration on user feedback, all without requiring the user to be an OD expert.
 
 .. figure:: ../figs/agentic-demo.png
    :alt: PyOD 3 agentic investigation demo on a diabetes screening dataset
@@ -43,9 +43,9 @@ When a user asks about anomalies in their data, PyOD's ``od-expert`` skill auto-
 1. **Walks the master decision tree** -- timestamps, graph structure, text/image, or tabular? Load the matching ``references/<modality>.md``.
 2. **Walks the top-10 pitfall checklist** -- is any pitfall active for this data? Example: feature scale ratio > 100 triggers Pitfall 1 (unscaled features for distance-based detectors) and the agent recommends a pre-scaling step or flags it in the report.
 3. **Walks the 11 escalation triggers** -- does anything about the request call for a pause? Example: "medical screening" fires Trigger 8 (high-stakes domain) and the agent commits to dual-detector validation and a confidence caveat.
-4. **Selects detectors** -- calls ``engine.plan(state)`` to pick the top-3 from PyOD's 61-detector catalog based on benchmark evidence (ADBench, TSB-AD, BOND). Each plan entry in ``state.plans`` has ``detector_name``, ``confidence``, ``reason``, ``evidence``.
+4. **Selects detectors** -- calls ``engine.plan(state)`` to pick the top-3 from PyOD's 60-detector catalog based on benchmark evidence (ADBench, TSB-AD, BOND). The benchmark ranks seed the plan; the agent may override them based on its own judgment or the user's constraints. Each plan entry in ``state.plans`` has ``detector_name``, ``confidence``, ``reason``, ``evidence``.
 5. **Runs in parallel** -- executes all selected detectors and builds a rank-normalized consensus in ``state.consensus``.
-6. **Re-walks a subset of triggers post-run** -- detector disagreement (T3), weak quality (T4), suspiciously clean results (T10). If any fire, the agent hedges the report or iterates.
+6. **Re-walks a subset of triggers post-run** -- detector disagreement (T3), cutoff instability (T4), suspiciously clean results (T10). If any fire, the agent hedges the report or iterates.
 7. **Generates a report** -- Markdown or JSON, always including a "what I assumed and why" block that lists the contamination rate, the detectors used, the best detector, and any caveats the trigger/pitfall walk surfaced.
 
 The agent's decisions at each of these steps are visible in the interactive demo's dark "od-expert" panels.
@@ -122,9 +122,10 @@ Why this dataset? It exercises the skill's machinery: the feature scale ratio is
            low-dim small datasets. Scale mismatch noted for
            the final report.
 
-           Results: 62 flagged (8.1%), separation 0.96,
-           agreement 0.59, quality HIGH (0.79). Top case:
-           patient #13. KNN strongest individually.
+           Results: 62 flagged (8.1%), agreement 0.59
+           (label-free; separation and the quality verdict
+           are descriptive only). Top case: patient #13.
+           KNN strongest individually.
 
 Behind the scenes:
 
@@ -281,4 +282,4 @@ With PyOD 3 and the v3.2.0 ``od-expert`` skill:
 6. Re-checks quality-related triggers post-run and hedges the report accordingly.
 7. Always reports the assumptions and caveats, including the scale mismatch, contamination, and any triggered escalations.
 
-The agent becomes an OD expert through the library, not despite it.
+The agent follows an OD expert's workflow through the library, not despite it.
diff --git a/docs/examples/index.rst b/docs/examples/index.rst
index 27c84f85..5a97e3ef 100644
--- a/docs/examples/index.rst
+++ b/docs/examples/index.rst
@@ -54,7 +54,7 @@ following ``state.next_action`` at each step. See :doc:`agentic` for the full wa
 Examples by Data Type
 ---------------------
 
-* :doc:`tabular`: 50+ detectors for tabular data (ECOD, IForest, KNN, LOF, ...)
+* :doc:`tabular`: 43 detectors for tabular data (ECOD, IForest, KNN, LOF, ...)
 * :doc:`timeseries`: 5 shipped + 2 experimental time series detectors (KShape, MatrixProfile, SpectralResidual, ...)
 * :doc:`graph`: 8 graph detectors (DOMINANT, CoLA, CONAD, ...)
 * :doc:`embedding`: Text and image detection via foundation model embeddings
diff --git a/docs/examples/tabular.rst b/docs/examples/tabular.rst
index 8e24b510..8ed7e770 100644
--- a/docs/examples/tabular.rst
+++ b/docs/examples/tabular.rst
@@ -1,7 +1,7 @@
 Layer 1: Tabular Anomaly Detection
 ====================================
 
-PyOD has 50+ tabular detectors covering probabilistic, linear, proximity, ensemble, and deep learning approaches. All use the same ``fit``/``predict``/``decision_function`` API.
+PyOD has 43 tabular detectors covering probabilistic, linear, proximity, ensemble, and deep learning approaches. All use the same ``fit``/``predict``/``decision_function`` API.
 
 .. code-block:: python
 
diff --git a/docs/index.rst b/docs/index.rst
index ddc764b7..250f4f89 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -64,16 +64,16 @@ Welcome to PyOD 3 documentation!
 
 .. note::
 
-   **New in V3.** Any AI agent can now run expert-level anomaly detection on your data. Just ask.
+   **New in V3.** Any AI agent can now run a complete anomaly detection workflow on your data. Just ask.
 
 PyOD 3 is the most comprehensive Python library for anomaly detection. Four pillars:
 
 ===========================  ========================================================================================
 Pillar                       What it means
 ===========================  ========================================================================================
-Multi-Modal                  60+ detectors across **tabular, time series, graph, text, and image** data, one API
+Multi-Modal                  60 detectors across **tabular, time series, graph, text, and image** data, one API
 Full Lifecycle               From raw data to explained anomalies and next-step guidance in a single call
-Agentic                      Ask in plain English, and AI agents run expert-level detection without OD expertise
+Agentic                      Ask in plain English, and AI agents run the full detection workflow without OD expertise
 Most Used                    `38+ million downloads <https://pepy.tech/project/pyod>`_; benchmark-backed routing (ADBench, TSB-AD, BOND, NLP-ADBench)
 ===========================  ========================================================================================
 
@@ -131,7 +131,7 @@ Layer      Name                   When to use
 3          Agentic Investigation  You want an AI agent to drive OD through natural conversation           :doc:`examples/agentic`
 =========  =====================  ======================================================================  ============================
 
-Layers 2 and 3 are powered by :class:`~pyod.utils.ad_engine.ADEngine`, PyOD's intelligent orchestration core. Layer 3 adds the ``od-expert`` skill that auto-activates in Claude Code, Codex, and MCP-compatible agents.
+Layers 2 and 3 are powered by :class:`~pyod.utils.ad_engine.ADEngine`, PyOD's lifecycle orchestration core. Layer 3 adds the ``od-expert`` skill that auto-activates in Claude Code, Codex, and MCP-compatible agents.
 
 .. figure:: figs/agentic-demo.png
    :alt: PyOD 3 agentic investigation demo on cardiotocography dataset
@@ -157,7 +157,7 @@ About PyOD
 
 PyOD, established in 2017, is the longest-running and most widely used Python library for `anomaly detection <https://en.wikipedia.org/wiki/Anomaly_detection>`_. With `38+ million downloads <https://pepy.tech/project/pyod>`_, it serves both academic research and commercial products worldwide.
 
-V3 extends the library with :class:`~pyod.utils.ad_engine.ADEngine` (intelligent orchestration) and the ``od-expert`` skill (agentic workflow), while keeping the classic ``fit``/``predict`` API fully backward-compatible. V3 is built on SUOD :cite:`a-zhao2021suod` for fast parallel training and numba JIT for per-model speedups.
+V3 extends the library with :class:`~pyod.utils.ad_engine.ADEngine` (lifecycle orchestration) and the ``od-expert`` skill (agentic workflow), while keeping the classic ``fit``/``predict`` API fully backward-compatible. V3 is built on SUOD :cite:`a-zhao2021suod` for fast parallel training and numba JIT for per-model speedups.
 
 **Citing PyOD**:
 
@@ -208,7 +208,7 @@ Benchmarks
 Implemented Algorithms
 ======================
 
-PyOD is organized into two functional groups: **(i) Detection Algorithms**, with dedicated subsections for tabular, time series, and graph data (EmbeddingOD inside the tabular table adds multi-modal support for text and image via foundation model encoders); and **(ii) Utility Functions** for data generation, evaluation, and intelligent orchestration.
+PyOD is organized into two functional groups: **(i) Detection Algorithms**, with dedicated subsections for tabular, time series, and graph data (EmbeddingOD inside the tabular table adds multi-modal support for text and image via foundation model encoders); and **(ii) Utility Functions** for data generation, evaluation, and lifecycle orchestration.
 
 **(i-a) Tabular & Multi-Modal Detection Algorithms** :
 
@@ -390,7 +390,7 @@ Encoding             :func:`~pyod.utils.encoders.resolve_encoder`     Resolve an
 Encoding             SentenceTransformerEncoder                       Encode text via sentence-transformers models (see :doc:`pyod.utils <pyod.utils>`)
 Encoding             OpenAIEncoder                                    Encode text via OpenAI Embeddings API (see :doc:`pyod.utils <pyod.utils>`)
 Encoding             HuggingFaceEncoder                               Encode text or images via HuggingFace transformers (see :doc:`pyod.utils <pyod.utils>`)
-Intelligence         :class:`~pyod.utils.ad_engine.ADEngine`          Intelligent anomaly detection lifecycle engine: profiling, planning, execution, analysis, and reporting
+Orchestration        :class:`~pyod.utils.ad_engine.ADEngine`          Anomaly detection lifecycle engine: profiling, planning, execution, analysis, and reporting
 ===================  ===============================================  =====================================================================================================================================================
 
 
diff --git a/docs/install.rst b/docs/install.rst
index a63d1d35..696bb42b 100644
--- a/docs/install.rst
+++ b/docs/install.rst
@@ -114,7 +114,7 @@ Example output:
 .. code-block:: text
 
     PyOD version:          3.1.0
-    Detectors (ADEngine):  61 total (44 tabular, 7 time-series, 8 graph, 3 text, 2 image, 1 multimodal)
+    Detectors (ADEngine):  60 total (43 tabular, 7 time-series, 8 graph, 2 text, 2 image, 1 multimodal)
     Classic API:           OK
     ADEngine (Layer 2):    OK
     MCP extra:             OK (run: pyod mcp serve)
diff --git a/docs/pyod.ad_engine.rst b/docs/pyod.ad_engine.rst
index 060f1aa1..768db791 100644
--- a/docs/pyod.ad_engine.rst
+++ b/docs/pyod.ad_engine.rst
@@ -1,7 +1,7 @@
 ADEngine
 ========
 
-:class:`pyod.utils.ad_engine.ADEngine` is PyOD's intelligent anomaly detection engine. It provides three layers of capability:
+:class:`pyod.utils.ad_engine.ADEngine` is PyOD's anomaly detection lifecycle engine. It provides three layers of capability:
 
 * **Knowledge queries** -- list detectors, explain detectors, get benchmarks
 * **Detection lifecycle** -- profile, plan, run, analyze, explain, iterate, report
diff --git a/docs/skill_maintenance.rst b/docs/skill_maintenance.rst
index 504dabd2..5c01aec9 100644
--- a/docs/skill_maintenance.rst
+++ b/docs/skill_maintenance.rst
@@ -6,7 +6,7 @@ PyOD ships agent skills (currently ``od-expert``) as packaged Markdown that Clau
 What makes a skill "real"
 -------------------------
 
-A "real" skill encodes domain expertise so a non-expert user gets expert-quality results without driving every decision. The four criteria:
+A "real" skill encodes domain expertise so a non-expert user can run a complete, auditable workflow without driving every decision. The four criteria:
 
 1. **Drives the agent autonomously through a complete workflow.** From data to profile to detector selection to run to analyze to iterate to report, the agent makes informed decisions on the user's behalf and only pauses when uncertain (adaptive escalation).
 2. **Encodes domain knowledge a non-expert lacks.** Decision rules, pitfalls, result interpretation patterns, and worked examples, all distilled from real literature and practice.
diff --git a/examples/agentic_demo.html b/examples/agentic_demo.html
index 7ec80c3c..150621f2 100644
--- a/examples/agentic_demo.html
+++ b/examples/agentic_demo.html
@@ -574,7 +574,7 @@ <h1>Any AI Agent Becomes an <em>OD Expert</em></h1>
         <div class="sp-section">
           <div class="sp-title">Post-run triggers (T3, T4, T10)</div>
           <span class="sp-pass">&check; T3</span> agreement = <b>0.59</b> &gt; 0.4 floor.<br>
-          <span class="sp-pass">&check; T4</span> separation = <b>0.96</b>, stability = <b>0.81</b>, overall = <b>0.79</b> (verdict: <b>high</b>).<br>
+          <span class="sp-pass">&check; T4</span> stability = <b>0.81</b> (cutoff stable); separation = <b>0.96</b> is descriptive only; overall quality is a diagnostic summary, not correctness evidence.<br>
           <span class="sp-pass">&check; T10</span> not over-tight.<br>
           <span class="sp-kv"><b>state.next_action['action']</b> = 'report_to_user'.</span>
         </div>
@@ -705,7 +705,7 @@ <h1>Any AI Agent Becomes an <em>OD Expert</em></h1>
           <strong>Detectors:</strong> KNN, IForest, LOF (3 / 3 converged)<br>
           <strong>Best detector:</strong> KNN (highest correlation with consensus)<br>
           <strong>Final flagged (consensus):</strong> 62 patients (8.1%)<br>
-          <strong>Quality:</strong> <span class="verdict high">HIGH (0.79)</span>
+          <strong>Diagnostics:</strong> <span class="verdict high">internally consistent; validate clinically</span>
         </div>
 
         <hr class="sep">
@@ -731,7 +731,7 @@ <h1>Any AI Agent Becomes an <em>OD Expert</em></h1>
   </div>
 
   <div class="footer">
-    <strong>PyOD V3.2.0</strong> &middot; 61 detectors &middot; 5 modalities &middot; od-expert skill (1000 lines: SKILL.md + 6 references)<br>
+    <strong>PyOD V3.2.0</strong> &middot; 60 detectors &middot; 5 modalities &middot; od-expert skill (1000 lines: SKILL.md + 6 references)<br>
     <a href="https://github.com/yzhao062/pyod">github.com/yzhao062/pyod</a> &middot;
     <a href="https://pyod.readthedocs.io">pyod.readthedocs.io</a> &middot;
     <code style="font-size: 12px;">pip install pyod &amp;&amp; pyod install skill</code>
diff --git a/examples/agentic_example.py b/examples/agentic_example.py
index 91783565..da783173 100644
--- a/examples/agentic_example.py
+++ b/examples/agentic_example.py
@@ -2,7 +2,7 @@
 """Example: Agent-driven anomaly detection with PyOD 3 (Layer 3).
 
 Demonstrates what makes PyOD distinctive in the agentic workflow:
-    - 60+ detectors across 5 data modalities
+    - 60 detectors across 5 data modalities
     - Benchmark-backed detector selection (ADBench, TSB-AD, BOND)
     - Multi-detector consensus with per-detector scores
     - Result quality assessment
@@ -80,7 +80,7 @@ def show_detectors_with_benchmark(engine, data_type='tabular'):
     print("       Running all 3 in parallel...")
     print("       Consensus: %d anomalies (%.1f%%), agreement %.2f."
           % (n_anom, 100.0 * n_anom / X.shape[0], agreement))
-    print("       Result quality: %s (%.2f)."
+    print("       Diagnostics: %s (%.2f), label-free."
           % (state.quality['verdict'], state.quality['overall']))
     print()
 
@@ -117,7 +117,7 @@ def show_detectors_with_benchmark(engine, data_type='tabular'):
     print("       Each detector recomputes its threshold; consensus")
     print("       is majority vote on labels.")
     print()
-    print("       %d cases flagged (%.1f%%). Quality: %s (%.2f)."
+    print("       %d cases flagged (%.1f%%). Diagnostics: %s (%.2f), label-free."
           % (n_anom2, 100.0 * n_anom2 / X.shape[0],
              state.quality['verdict'], state.quality['overall']))
     print()
diff --git a/examples/agentic_hindsight_real_data.py b/examples/agentic_hindsight_real_data.py
index c8770eae..e4c45067 100644
--- a/examples/agentic_hindsight_real_data.py
+++ b/examples/agentic_hindsight_real_data.py
@@ -123,15 +123,16 @@ def skill_postrun_triggers(state):
     """Return od-expert adaptive triggers fired after analysis."""
     triggers = []
     agreement = state.quality['agreement']
-    separation = state.quality['separation']
     stability = state.quality['stability']
+    # `separation` is computed from the run's own predicted labels and is
+    # nearly always high, so it is descriptive only and is NOT used as a gate.
     if agreement < 0.4:
         triggers.append('Trigger 3: detector disagreement '
                         '(agreement %.2f < 0.40).' % agreement)
-    if separation < 0.1 or stability < 0.5:
-        triggers.append('Trigger 4: weak cutoff diagnostics '
-                        '(separation %.2f, stability %.2f).'
-                        % (separation, stability))
+    if stability < 0.5:
+        triggers.append('Trigger 4: cutoff instability '
+                        '(stability %.2f < 0.50; flagged set is '
+                        'contamination-sensitive).' % stability)
     if agreement > 0.9:
         triggers.append('Trigger 10: very high agreement %.2f; '
                         'sanity-check top flagged points.' % agreement)
@@ -196,7 +197,7 @@ def print_state(label, state, y):
                   % (result['detector_name'], result.get('error')))
 
     print("Agent analysis: %s" % state.analysis['summary'])
-    print("Quality: %s (%.2f); %s"
+    print("Diagnostics (label-free): %s (%.2f); %s"
           % (state.quality['verdict'], state.quality['overall'],
              state.quality['explanation']))
     triggers = skill_postrun_triggers(state)
diff --git a/pyod/cli.py b/pyod/cli.py
index 3a8636fd..d1441eaa 100644
--- a/pyod/cli.py
+++ b/pyod/cli.py
@@ -46,10 +46,16 @@ def _cmd_info(args: argparse.Namespace) -> int:
         from pyod.utils.ad_engine import ADEngine
         engine = ADEngine()
         counts: Counter = Counter()
-        for algo in engine.kb.algorithms.values():
+        # Count only buildable detectors: a `planned` entry (e.g., LLMAD) has
+        # no backing module, so it must not inflate the reported total.
+        buildable = {
+            name: algo for name, algo in engine.kb.algorithms.items()
+            if algo.get("status") != "planned"
+        }
+        for algo in buildable.values():
             for dt in algo.get("data_types", []):
                 counts[dt] += 1
-        total = len(engine.kb.algorithms)
+        total = len(buildable)
         ad_ok = True
     except Exception:
         counts = Counter()
diff --git a/pyod/skills/od_expert/SKILL.md b/pyod/skills/od_expert/SKILL.md
index 25cb3ddc..7c1a808e 100644
--- a/pyod/skills/od_expert/SKILL.md
+++ b/pyod/skills/od_expert/SKILL.md
@@ -1,6 +1,6 @@
 ---
 name: od-expert
-description: Anomaly detection expert backed by PyOD's ADEngine. Drives autonomous detection workflows on tabular, time series, graph, text, and image data — profiling, planning, multi-detector comparison, quality assessment, iteration, and reporting. Encodes deep OD knowledge so non-expert users get expert-quality results without driving every decision.
+description: Anomaly detection expert backed by PyOD's ADEngine. Drives autonomous detection workflows on tabular, time series, graph, text, and image data: profiling, planning, multi-detector comparison, quality assessment, iteration, and reporting. Encodes deep OD knowledge so non-expert users can run the full workflow without driving every decision.
 ---
 
 You are an anomaly detection expert backed by PyOD's ADEngine. Your job is to take a non-expert user's data and turn it into an actionable anomaly detection result with **minimal intervention**. Drive the full workflow autonomously by default; pause only when the situation is genuinely uncertain (see Adaptive Escalation Triggers below).
@@ -19,7 +19,7 @@ Fire this skill when:
 
 ## What you have access to
 
-PyOD ships <!-- KB-snapshot count -->61<!-- /KB-snapshot --> detectors across five modalities (44 tabular, 7 time series, 8 graph, 3 text, 2 image, 1 multimodal). Use the `ADEngine` session API to drive the full workflow:
+PyOD ships <!-- KB-snapshot count -->60<!-- /KB-snapshot --> detectors across five modalities (43 tabular, 7 time series, 8 graph, 2 text, 2 image, 1 multimodal). Use the `ADEngine` session API to drive the full workflow:
 
 ```python
 from pyod.utils.ad_engine import ADEngine
@@ -75,8 +75,8 @@ These are pitfalls that silently produce wrong results if ignored. The agent mus
 3. **Deep learning detector on tiny data.** Do not run `AutoEncoder`, `VAE`, `DeepSVDD`, or `AnoGAN` on datasets with fewer than 1000 rows. They overfit immediately. Trigger 6 (escalation) catches this; recommend `ECOD` / `IForest` / `HBOS` instead.
 4. **Graph detector without PyG installed.** `DOMINANT`, `CoLA`, `CONAD`, `AnomalyDAE`, `GUIDE`, `Radar`, `ANOMALOUS` require `pyod[graph]`. Check with `importlib.util.find_spec("torch_geometric")` before recommending. Trigger 7 catches this.
 5. **Mixing categorical and numerical without encoding.** PyOD detectors expect numeric input. Categorical columns must be one-hot or label encoded first. `engine.profile_data` will fail or produce nonsense if string columns are present.
-6. **Ignoring `state.quality.separation`.** Separation < 0.1 means the consensus is essentially noise. Do NOT report "found anomalies" with high confidence in that case. Trigger 4 catches this.
-7. **Single-detector runs.** Never report from a single detector. Always run the top-3 from `engine.plan` and use consensus. The exception is when the user explicitly requested a specific detector via the `detectors=` argument.
+6. **Ignoring low cross-detector agreement.** `state.quality.agreement` near 0 means the detectors disagree on what to flag, i.e., the input likely has no usable structure (near-noise). Do NOT report "found anomalies" with high confidence in that case. Note that `state.quality.separation` is computed from the run's own predicted labels and is nearly always high, so it does not indicate trustworthiness; agreement is the diagnostic that actually catches noise. Trigger 3 catches this.
+7. **Single-detector runs.** Prefer the top-3 from `engine.plan` with consensus for robustness against a single detector's blind spots. Consensus is about as accurate as the single best pick on benchmarks, not reliably better, so reporting one strong detector is acceptable when the plan's top pick is clearly best or the user requested a specific detector via the `detectors=` argument.
 8. **Time series treated as tabular.** If the data has a timestamp column AND row order matters, it is time series, not tabular. Tabular detectors will report most boundary points as anomalies. Trigger 1 catches modality ambiguity.
 9. **Reporting raw scores instead of percentiles or labels.** Raw `decision_function` scores are not interpretable across detectors. Always report `decision_scores_` ranks, percentiles, or `labels_` (binary). The result interpretation patterns in `references/workflow.md` show the right phrasings.
 10. **Missing the requires-extra check.** Some detectors require optional extras (`pyod[xgboost]` for `XGBOD`, `pyod[suod]` for `SUOD`, `pyod[combo]` for `FeatureBagging`). Check `engine.explain_detector(name)` before recommending; if the extra is missing, suggest the install command and pick a substitute.
@@ -88,7 +88,7 @@ Run autonomously by default. Pause and ask the user **only** when one of these t
 1. **Modality ambiguity** — data has timestamps but also feature columns
 2. **Contamination uncertainty** — heuristic range > 5x (e.g., 1%-25%)
 3. **Detector disagreement** — `state.quality.agreement < 0.4` after running
-4. **Quality assessment weak** — `state.quality.separation < 0.1` OR `state.quality.stability < 0.5`
+4. **Cutoff instability**: `state.quality.stability < 0.5` (many tied scores near the threshold; the flagged set is contamination-sensitive, not a sign the labels are wrong). `separation` is descriptive only; use low `state.quality.agreement` (Trigger 3) as the label-free near-noise signal
 5. **Labels mentioned but not provided** — user said "I have known fraud cases" but did not pass labels
 6. **Heavy detector + small data** — DL detector requested, n < 1000
 7. **Missing optional extra** — graph requested but `pyod[graph]` not installed
diff --git a/pyod/skills/od_expert/references/graph.md b/pyod/skills/od_expert/references/graph.md
index 96bba319..52bdaa30 100644
--- a/pyod/skills/od_expert/references/graph.md
+++ b/pyod/skills/od_expert/references/graph.md
@@ -110,7 +110,7 @@ state = engine.analyze(state)
 # state.analysis['best_detector']: 'DOMINANT'
 ```
 
-Note: agreement is below the 0.4 floor would NOT trigger here (0.55 > 0.4). Separation 0.22 is just above the 0.1 critical floor — moderate confidence. Report with a hedge.
+Note: agreement is above the 0.4 floor (0.55), so the detector rankings are moderately consistent. Separation 0.22 is descriptive only and is not a confidence floor. Report with a label-free caveat and recommend domain review.
 
 ### What the agent reports
 
@@ -118,7 +118,7 @@ Note: agreement is below the 0.4 floor would NOT trigger here (0.55 > 0.4). Sepa
 >
 > Top-5 most suspicious accounts: nodes [42, 891, 2103, 5566, 7891]. The dominant pattern was unusual follower-to-following ratios combined with rare account-age values.
 >
-> Confidence: medium. Separation is 0.22 (above the 0.1 floor, but not strong), so I recommend manually reviewing the top-20 against your trust-and-safety database before acting on these flags.
+> Confidence: label-free. The detectors agreed moderately (agreement 0.55), so the result is internally consistent; I recommend manually reviewing the top-20 against your trust-and-safety database before acting on these flags.
 >
 > **What I assumed**:
 > - Graph, node-attributed, homogeneous
diff --git a/pyod/skills/od_expert/references/tabular.md b/pyod/skills/od_expert/references/tabular.md
index a9e22c13..fb1bf0de 100644
--- a/pyod/skills/od_expert/references/tabular.md
+++ b/pyod/skills/od_expert/references/tabular.md
@@ -1,6 +1,6 @@
 # Tabular anomaly detection reference
 
-PyOD's largest modality (44 of 61 detectors). The agent loads this file when the master decision tree (in SKILL.md) routes to tabular.
+PyOD's largest modality (43 of 60 buildable detectors). The agent loads this file when the master decision tree (in SKILL.md) routes to tabular.
 
 ## Decision table by data shape (expert heuristics)
 
@@ -41,7 +41,6 @@ These are rules of thumb for reasoning about which detectors a non-expert would
 - **KDE** (Kernel Density Estimation) — complexity: time O(n^2 * d), space O(n * d); best for: Low-to-moderate dimensional data where non-parametric density estimation is desired; avoid when: Data is high-dimensional or dataset is very large; paper: Latecki et al., SDM 2007
 - **KNN** (K-Nearest Neighbors Outlier Detection) — complexity: time O(n^2 * d), space O(n * d); best for: General-purpose distance-based outlier detection on moderate-sized datasets; avoid when: Dataset is very large or has highly variable local densities; paper: Ramaswamy et al., SIGMOD 2000
 - **KPCA** (Kernel Principal Component Analysis) — complexity: time O(n^2 * d), space O(n^2); best for: Moderately sized datasets with nonlinear structure; avoid when: Dataset is very large due to quadratic kernel matrix or a linear model suffices; paper: Hoffmann, 2007
-- **LLMAD** (LLM-Based Anomaly Detection) — complexity: time varies, space varies; best for: Zero-shot or few-shot anomaly detection leveraging LLM world knowledge; avoid when: Feature is needed before release or LLM API costs are prohibitive; paper: TBD
 - **LMDD** (Linear Model Deviation-based Detection) — complexity: time O(n_iter * n * d), space O(n * d); best for: Multivariate data where anomalies are detectable through linear projections; avoid when: Anomalies require nonlinear feature combinations to detect; paper: Arning et al., KDD 1996
 - **LOCI** (Local Correlation Integral) — complexity: time O(n^2 * d), space O(n^2); best for: Small to medium datasets where automatic threshold selection is valued; avoid when: Dataset is large or faster LOF-based methods are sufficient; paper: Papadimitriou et al., ICDE 2003
 - **LODA** (Lightweight Online Detector of Anomalies) — complexity: time O(n * n_cuts * d), space O(n_bins * n_cuts); best for: Streaming or online anomaly detection with limited computational resources; avoid when: Batch setting with enough time for more powerful methods; paper: Pevny, 2016
@@ -112,7 +111,7 @@ report = engine.report(state)
 >
 > Top-5 most suspicious: rows [42, 77, 153, 891, 1240]. The dominant features driving their flagging were transaction amount (4 of 5) and time-of-day (3 of 5).
 >
-> Confidence: medium. Separation is moderate (0.28), so I recommend manually checking the top-5 against your fraud database.
+> Confidence: label-free. The detectors agreed (agreement 0.71), so the result is internally consistent, but I recommend manually checking the top-5 against your fraud database to confirm.
 >
 > **What I assumed**:
 > - Tabular, unlabeled
@@ -136,7 +135,7 @@ Some datasets have outliers concentrated in 1-2 columns (e.g., latency monitorin
 
 ### T4. Ensemble combination defaults
 
-The default consensus combination is `mean`. For very imbalanced detector quality, `weighted_mean` (weighted by separation) can be better. Mitigation: ADEngine handles this if you use `engine.run` directly. Do not bypass.
+The default consensus combination is `mean`. Do not weight detectors by `separation`; it is computed from each detector's own predicted labels and is not a cross-detector quality measure. Use the rank-normalized consensus from `state.consensus['scores']`, and inspect low `agreement` when detector rankings conflict.
 
 ### T5. ID-only embeddings without numerics
 
diff --git a/pyod/skills/od_expert/references/text_image.md b/pyod/skills/od_expert/references/text_image.md
index 8211ff0e..814db80f 100644
--- a/pyod/skills/od_expert/references/text_image.md
+++ b/pyod/skills/od_expert/references/text_image.md
@@ -14,7 +14,6 @@ The two-step pattern (embed first, then run a classical detector) won the NLP-AD
 
 <!-- BEGIN KB-DERIVED: text-image-detector-list -->
 - **EmbeddingOD** (Embedding-Based Outlier Detection) — complexity: time O(n * embedding_cost + detector_cost), space O(n * embedding_dim); best for: Anomaly detection on unstructured data (text, images) via foundation model representations; avoid when: Data is already tabular or a suitable encoder is not available; requires: pyod[torch]; paper: Zhao et al., 2025
-- **LLMAD** (LLM-Based Anomaly Detection) — complexity: time varies, space varies; best for: Zero-shot or few-shot anomaly detection leveraging LLM world knowledge; avoid when: Feature is needed before release or LLM API costs are prohibitive; paper: TBD
 - **MultiModalOD** (Multi-Modal Outlier Detection) — complexity: time O(n * n_modalities * embedding_cost + detector_cost), space O(n * n_modalities * embedding_dim); best for: Anomaly detection on multi-modal data combining text, image, or other modalities; avoid when: Only a single modality is available or data is purely tabular; requires: pyod[torch]; paper: Zhao et al., 2025
 <!-- END KB-DERIVED: text-image-detector-list -->
 
@@ -111,7 +110,7 @@ report = engine.report(state)
 >
 > Top-5 most unusual descriptions: rows [42, 815, 2200, 14501, 31200]. The dominant pattern was descriptions in non-English languages mixed into the catalog, plus descriptions consisting only of model numbers without prose.
 >
-> Confidence: medium. Separation is 0.24 (above the 0.1 floor, but moderate), so I recommend manually reviewing the top-50 to validate the categories of anomaly.
+> Confidence: label-free, and only one detector ran, so there is no cross-detector agreement to lean on and `separation` (0.24) is descriptive only. I recommend manually reviewing the top-50 to validate the categories of anomaly.
 >
 > **What I assumed**:
 > - Text data, English (auto-detected from sample)
diff --git a/pyod/skills/od_expert/references/time_series.md b/pyod/skills/od_expert/references/time_series.md
index 1582e102..0373ffb2 100644
--- a/pyod/skills/od_expert/references/time_series.md
+++ b/pyod/skills/od_expert/references/time_series.md
@@ -105,7 +105,7 @@ report = engine.report(state)
 >
 > Top-5 most anomalous intervals: minutes [842, 1503, 4221, 5917, 8330]. The dominant pattern in 4 of 5 was a sharp temperature spike followed by a slow recovery — characteristic of a sensor glitch or cooling failure.
 >
-> Confidence: medium-high. Separation is 0.31 (above the 0.2 floor), agreement is 0.62 (above the 0.4 floor), stability is around 0.X (cutoff-gap-based).
+> Confidence: label-free. Agreement is 0.62 (above the 0.4 floor), so the detectors are consistent; stability is around 0.X (cutoff-gap-based, a threshold-sensitivity signal), and `separation` (0.31) is descriptive only. Validate the top intervals against domain knowledge or held-out labels.
 >
 > **What I assumed**:
 > - Time series, univariate, regular sampling (passed `data_type='time_series'` explicitly)
diff --git a/pyod/skills/od_expert/references/workflow.md b/pyod/skills/od_expert/references/workflow.md
index 7327e43b..14be9f86 100644
--- a/pyod/skills/od_expert/references/workflow.md
+++ b/pyod/skills/od_expert/references/workflow.md
@@ -72,9 +72,9 @@ When a trigger fires, the agent pauses and asks the user. These phrasings are in
 
 > "The top-3 detectors I ran ([A], [B], [C]) disagree on which points are anomalies (consensus = [X], where 1.0 is full agreement). I can iterate with a different set of detectors, or report what I have with a clear caveat. **Recommended: iterate** — say 'report' to skip iteration."
 
-### Trigger 4: Quality assessment weak
+### Trigger 4: Cutoff instability
 
-> "Quality diagnostics fired: separation = [X], stability = [Y]. These measure two different things and have different remedies. Low separation (< 0.1) means the detectors did not produce a usable ranking; recommended action is to iterate with a different detector mix. Low stability (< 0.5) means the cutoff is fragile (many scores tied near the threshold) but the ranking itself may still be useful; recommended action is `engine.iterate(state, {'action': 'adjust_contamination', 'value': <rate>})`. If both metrics are low, treat as the separation case first because adjusting the cutoff cannot fix a noisy ranking. I'll iterate accordingly unless you say 'report'."
+> "Cutoff diagnostic fired: stability = [Y]. Many scores are tied near the threshold, so the flagged set is sensitive to the contamination value. This describes threshold sensitivity, not whether the labels are correct; the remedy is `engine.iterate(state, {'action': 'adjust_contamination', 'value': <rate>})`, or report with a threshold-sensitivity caveat. Note that `separation` is computed from the run's own predicted labels and is nearly always high, so it is descriptive only and not a quality gate. If the detectors disagree (low `agreement`), handle Trigger 3 first, because adjusting the cutoff cannot fix a ranking the detectors do not agree on. I will iterate accordingly unless you say 'report'."
 
 ### Trigger 5: Labels mentioned but not provided
 
@@ -102,7 +102,7 @@ When a trigger fires, the agent pauses and asks the user. These phrasings are in
 
 ### Trigger 11: Iteration loop deadlock
 
-> "I've iterated twice with no quality improvement. The result is still [X separation, Y stability]. I can keep going with a fresh detector mix, or settle on what we have and report. Recommended: report. Say 'iterate' to keep going."
+> "I have iterated twice without improving agreement or cutoff stability. I can keep going with a fresh detector mix, or settle on what we have and report with the remaining caveats. Recommended: report. Say 'iterate' to keep going."
 
 ## The cardio canonical worked example
 
@@ -174,7 +174,7 @@ No triggers fire. Move toward report.
 
 Agent reports to user:
 
-> "Detection complete. The three detectors converged 3/3 with agreement 0.68 and a clean cutoff (stability around 0.X on this synthetic example), overall quality high. `IForest` had the highest correlation with the consensus. Found 172 cases flagged out of 1,831 at the default contamination rate of 0.1."
+> "Detection complete. The three detectors converged 3/3 with agreement 0.68 and a clean cutoff (stability around 0.X on this synthetic example); this is label-free internal consistency, not a correctness guarantee. `IForest` had the highest correlation with the consensus. Found 172 cases flagged out of 1,831 at the default contamination rate of 0.1."
 
 ### Turn 3: Report
 
@@ -188,7 +188,7 @@ Agent translates the report into user-facing language using the result interpret
 >
 > - **172 of 1,831 recordings (9.4%) flagged as unusual.** The agent ran a consensus of 3 detectors (`IForest`, `ECOD`, `KNN`) using ADEngine's default contamination of 0.1. If that rate looks wrong for the clinical context, re-run via `engine.iterate(state, {"action": "adjust_contamination", "value": <rate>})`.
 > - **Top case**: row #1656 (analyzed via `IForest`, the detector with the highest Spearman correlation with the consensus). Drill into `state.analysis['consensus_analysis']['top_anomalies']` for the ranked index list, then call `engine.explain_findings(state.results[best_idx])` for per-feature z-scores.
-> - **Confidence**: high. Overall quality is high. Agreement across detectors is 0.68 and stability is around 0.X (cutoff-gap-based). For reference only (not seen by the agent during detection): validation against the cardio ground truth gave precision 49.4% (85/172) at recall 48.3% (85/176).
+> - **Confidence**: label-free. Agreement across detectors is 0.68 and stability is around 0.X (cutoff-gap-based). For reference only (not seen by the agent during detection): validation against the cardio ground truth gave precision 49.4% (85/172) at recall 48.3% (85/176).
 >
 > **What I assumed**:
 > - Data is tabular and unlabeled
@@ -202,21 +202,21 @@ Agent translates the report into user-facing language using the result interpret
 
 When translating `state.consensus['scores']`, `state.consensus['labels']`, and `state.quality` into user-facing language:
 
-### High agreement + high separation → confident report
+### High agreement → internally consistent report
 
-Phrasing: "Found N anomalies. The detectors agreed well (consensus X) and the separation is strong (Y). Confidence: high."
+Phrasing: "Found N anomalies. The detectors agreed well (agreement X). The result is internally consistent; confidence is still label-free, so validate the top cases against held-out labels or domain review."
 
-### High agreement + low separation → calibration warning
+### Low agreement → hedged report or iterate
 
-Phrasing: "The detectors agree on the flagged points, but the gap between flagged and normal is small (separation Y < 0.2). The result is internally consistent but may reflect a dataset with no strong outliers. Confidence: medium-low."
+Phrasing: "The detectors disagree (agreement X < 0.4), which can indicate an input with no shared structure (near-noise). Iterating with a different mix is recommended. If reporting now: confidence is low."
 
-### Low agreement + any separation → hedged report or iterate
+### Fragile cutoff (low stability) → threshold-sensitivity note
 
-Phrasing: "The detectors disagree (consensus X < 0.4). Iterating with a different mix is recommended. If reporting now: confidence is low."
+Phrasing: "The flagged set has a fragile cutoff (stability Y): many scores are tied near the threshold, so the exact count is sensitive to the contamination value. This describes threshold sensitivity, not correctness."
 
-### Single-detector consensus (one detector clearly best)
+### Single strong detector (one detector clearly preferred by the plan)
 
-Phrasing: "`ECOD` performed substantially better than `IForest` and `LOF` on this dataset (separation 0.34 vs 0.12 and 0.18). Reporting `ECOD`'s flagged points as the primary result."
+Phrasing: "`ECOD` is the strongest planned detector for this setting. Reporting its flagged points is acceptable when the plan clearly prefers it or the user requested it; otherwise prefer consensus for robustness." Do not rank detectors by `separation`: it is computed from each detector's own predicted labels and is not a cross-detector quality measure.
 
 ### Result with labels (supervised mode via XGBOD)
 
diff --git a/pyod/test/test_cli.py b/pyod/test/test_cli.py
index f1595bcb..47b90462 100644
--- a/pyod/test/test_cli.py
+++ b/pyod/test/test_cli.py
@@ -31,6 +31,37 @@ def test_pyod_info_runs():
     assert "detectors" in result.stdout.lower() or "Detectors" in result.stdout
 
 
+def test_pyod_info_excludes_planned_detectors():
+    """`pyod info` counts only buildable detectors (status != planned).
+
+    A planned entry (e.g., LLMAD) has no backing module and must not inflate
+    the reported total. Expected counts are computed from the live KB so the
+    test does not hardcode a number that legitimate detector additions change;
+    it only locks the invariant that planned entries are excluded.
+    """
+    from pyod.utils.ad_engine import ADEngine
+
+    kb = ADEngine().kb.algorithms
+    buildable = {n: a for n, a in kb.items() if a.get("status") != "planned"}
+    expected_total = len(buildable)
+    expected_tabular = sum(
+        1 for a in buildable.values() if "tabular" in a.get("data_types", []))
+    assert expected_total < len(kb), (
+        "test precondition: KB should have at least one planned entry to "
+        "exercise the exclusion; if none remain, drop this assertion")
+
+    result = subprocess.run(
+        [sys.executable, "-m", "pyod.cli", "info"],
+        capture_output=True, text=True,
+    )
+    assert result.returncode == 0, f"stderr={result.stderr}"
+    assert f"{expected_total} total" in result.stdout, (
+        f"expected '{expected_total} total' (buildable count) in: "
+        f"{result.stdout}")
+    assert f"{expected_tabular} tabular" in result.stdout, (
+        f"expected '{expected_tabular} tabular' in: {result.stdout}")
+
+
 def test_pyod_info_does_not_exit_without_mcp():
     """`pyod info` must not crash in a core install without the mcp extra.
 
diff --git a/pyod/test/test_kb_router_surface1.py b/pyod/test/test_kb_router_surface1.py
new file mode 100644
index 00000000..e8efd48b
--- /dev/null
+++ b/pyod/test/test_kb_router_surface1.py
@@ -0,0 +1,490 @@
+# -*- coding: utf-8 -*-
+"""Tests for pyod 3.5.3 Surface 1 (KB-tools for agent-driven routing).
+
+Covers:
+- ADEngine.get_kb_for_routing(profile, top_k, constraints)
+- ADEngine.make_plan(detector_choices, justifications, params)
+- ADEngine.plan_detection(..., top_k=K, llm_client=..., llm_strict=...) routing
+"""
+
+import os
+import sys
+import unittest
+
+import numpy as np
+
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
+
+from pyod.utils.ad_engine import ADEngine
+
+
+class TestGetKbForRouting(unittest.TestCase):
+    def setUp(self):
+        self.engine = ADEngine(random_state=42)
+        rng = np.random.RandomState(0)
+        self.X = rng.normal(size=(200, 8))
+        self.X[-10:] += 4 * rng.normal(size=(10, 8))
+        self.profile = self.engine.profile_data(self.X, data_type='tabular')
+
+    def test_returns_expected_keys(self):
+        kb = self.engine.get_kb_for_routing(self.profile, top_k=3)
+        for k in ('task_profile', 'available_detectors',
+                  'top_k_requested', 'response_format_hint',
+                  'n_available'):
+            assert k in kb, f"missing key {k}"
+
+    def test_top_k_field_preserved(self):
+        kb = self.engine.get_kb_for_routing(self.profile, top_k=5)
+        assert kb['top_k_requested'] == 5
+        # Non-positive top_k is clamped to 1
+        kb1 = self.engine.get_kb_for_routing(self.profile, top_k=0)
+        assert kb1['top_k_requested'] == 1
+
+    def test_detector_entries_have_kb_fields(self):
+        kb = self.engine.get_kb_for_routing(self.profile)
+        assert kb['n_available'] > 0
+        sample = kb['available_detectors'][0]
+        for k in ('name', 'category', 'complexity_time', 'complexity_space',
+                  'strengths', 'weaknesses', 'best_for', 'avoid_when',
+                  'benchmark_rank', 'modality_match'):
+            assert k in sample, f"missing detector field {k}"
+        assert isinstance(sample['strengths'], list)
+        assert isinstance(sample['weaknesses'], list)
+
+    def test_exclude_constraint(self):
+        kb = self.engine.get_kb_for_routing(
+            self.profile, constraints={'exclude_detectors': ['IForest', 'KNN']})
+        names = [d['name'] for d in kb['available_detectors']]
+        assert 'IForest' not in names
+        assert 'KNN' not in names
+
+    def test_data_type_strict_filter(self):
+        # Default data_type_strict=True drops detectors whose data_types do
+        # not include the profile's modality.
+        kb_strict = self.engine.get_kb_for_routing(self.profile)
+        kb_lax = self.engine.get_kb_for_routing(
+            self.profile, constraints={'data_type_strict': False})
+        assert kb_strict['n_available'] <= kb_lax['n_available']
+
+    def test_pure_function(self):
+        # No state mutation; profile dict is not modified.
+        before = dict(self.profile)
+        self.engine.get_kb_for_routing(self.profile, top_k=3)
+        assert self.profile == before
+
+    def test_bad_profile_raises(self):
+        with self.assertRaises(ValueError):
+            self.engine.get_kb_for_routing("not a dict")
+
+    def test_ranking_order_iforest_first_on_tabular(self):
+        # KB benchmark_rank should place IForest near the top on ADBench.
+        kb = self.engine.get_kb_for_routing(self.profile)
+        names = [d['name'] for d in kb['available_detectors'][:5]]
+        assert 'IForest' in names
+
+    def test_ranking_uses_modality_specific_keys_time_series(self):
+        """Med-2 (Codex Round 1): time_series should sort by TSB_AD_overall.
+
+        Asserts the modality-specific rank key is consulted by checking
+        that the available_detectors list is sorted by TSB_AD_overall
+        (ascending) for any pair of detectors that both declare it.
+        """
+        ts_profile = dict(self.profile)
+        ts_profile['data_type'] = 'time_series'
+        kb = self.engine.get_kb_for_routing(ts_profile)
+        ts_ranked = [
+            (d['name'], (d.get('benchmark_rank') or {}).get('TSB_AD_overall'))
+            for d in kb['available_detectors']
+            if (d.get('benchmark_rank') or {}).get('TSB_AD_overall') is not None
+        ]
+        # Any pair with TSB_AD_overall must be in non-decreasing rank order.
+        ranks = [r for _, r in ts_ranked]
+        assert ranks == sorted(ranks), (
+            f"time_series ordering ignored TSB_AD_overall: {ts_ranked}")
+
+    def test_ranking_uses_modality_specific_keys_graph(self):
+        """Med-2 (Codex Round 1): graph should sort by BOND_deep / BOND_overall."""
+        g_profile = dict(self.profile)
+        g_profile['data_type'] = 'graph'
+        kb = self.engine.get_kb_for_routing(g_profile)
+        g_ranked = []
+        for d in kb['available_detectors']:
+            br = d.get('benchmark_rank') or {}
+            r = br.get('BOND_deep') or br.get('BOND_overall')
+            if r is not None:
+                g_ranked.append((d['name'], r))
+        ranks = [r for _, r in g_ranked]
+        assert ranks == sorted(ranks), (
+            f"graph ordering ignored BOND keys: {g_ranked}")
+
+
+class TestMakePlan(unittest.TestCase):
+    def setUp(self):
+        self.engine = ADEngine(random_state=42)
+
+    def test_single_detector(self):
+        plan = self.engine.make_plan(['IForest'])
+        assert plan['detector_name'] == 'IForest'
+        assert plan['confidence'] == 0.7
+        assert plan['alternatives'] == []
+
+    def test_three_detectors_with_justifications(self):
+        plan = self.engine.make_plan(
+            ['IForest', 'KNN', 'ECOD'],
+            justifications=['rank', 'proximity', 'parameter-free'])
+        assert plan['detector_name'] == 'IForest'
+        assert plan['reason'] == 'rank'
+        assert len(plan['alternatives']) == 2
+        assert plan['alternatives'][0]['detector_name'] == 'KNN'
+        assert plan['alternatives'][0]['confidence'] == 0.5
+        assert plan['alternatives'][1]['detector_name'] == 'ECOD'
+
+    def test_default_justification(self):
+        plan = self.engine.make_plan(['IForest', 'KNN'])
+        assert 'caller-selected' in plan['reason']
+        assert 'caller-selected' in plan['alternatives'][0]['reason']
+
+    def test_unknown_detector_raises(self):
+        with self.assertRaises(ValueError) as cm:
+            self.engine.make_plan(['DoesNotExist'])
+        assert 'DoesNotExist' in str(cm.exception)
+        assert 'case-sensitive' in str(cm.exception)
+
+    def test_empty_choices_raises(self):
+        with self.assertRaises(ValueError):
+            self.engine.make_plan([])
+
+    def test_non_list_raises(self):
+        with self.assertRaises(ValueError):
+            self.engine.make_plan('IForest')
+
+    def test_params_overlay_keeps_contamination(self):
+        plan = self.engine.make_plan(
+            ['IForest'],
+            params=[{'n_estimators': 50}])
+        assert plan['params'].get('n_estimators') == 50
+        # _with_contamination should add contamination if KB has a default
+        # (it does for IForest); we check it's present, not the exact value.
+        assert 'contamination' in plan['params']
+
+    def test_build_detector_consumes_plan(self):
+        # The plan returned by make_plan must be consumable by build_detector
+        # so existing detector-construction code keeps working.
+        plan = self.engine.make_plan(['IForest'])
+        detector = self.engine.build_detector(plan)
+        assert detector is not None
+        # And it should be fittable
+        rng = np.random.RandomState(7)
+        X = rng.normal(size=(100, 5))
+        detector.fit(X)
+        assert hasattr(detector, 'decision_scores_')
+
+
+class TestPlanDetectionSignature(unittest.TestCase):
+    """Med-1 (Codex Round 2): new params must be keyword-only."""
+
+    def test_new_params_are_keyword_only(self):
+        import inspect
+        sig = inspect.signature(ADEngine.plan_detection)
+        for name in ('top_k', 'llm_client', 'llm_strict'):
+            kind = sig.parameters[name].kind
+            assert kind is inspect.Parameter.KEYWORD_ONLY, (
+                f"{name!r} should be KEYWORD_ONLY, got {kind!r}; "
+                "CHANGES.txt advertises keyword-only and callers must "
+                "not bind these positionally.")
+        # The v3.5.2 params remain positional-or-keyword for backward compat.
+        for name in ('profile', 'priority', 'constraints'):
+            kind = sig.parameters[name].kind
+            assert kind is inspect.Parameter.POSITIONAL_OR_KEYWORD, (
+                f"v3.5.2 param {name!r} must stay POSITIONAL_OR_KEYWORD; "
+                f"got {kind!r}")
+
+
+class TestPlanDetectionTopK(unittest.TestCase):
+    """plan_detection(..., top_k=K) generalizes the v3.5.2 valid[1:3] cap."""
+
+    def setUp(self):
+        self.engine = ADEngine(random_state=42)
+        rng = np.random.RandomState(0)
+        self.X = rng.normal(size=(300, 8))
+        self.profile = self.engine.profile_data(self.X, data_type='tabular')
+
+    def test_default_top_k_matches_v352_behavior(self):
+        # v3.5.2 used valid[1:3] which gave up to 2 alternatives.
+        plan = self.engine.plan_detection(self.profile)
+        assert len(plan.get('alternatives', [])) <= 2
+
+    def test_top_k_1(self):
+        plan = self.engine.plan_detection(self.profile, top_k=1)
+        assert plan.get('alternatives', []) == []
+
+    def test_top_k_5(self):
+        plan = self.engine.plan_detection(self.profile, top_k=5)
+        # plan returns primary + up to top_k-1 alternatives.
+        assert len(plan.get('alternatives', [])) <= 4
+
+    def test_top_k_clamped_to_1(self):
+        plan_neg = self.engine.plan_detection(self.profile, top_k=-1)
+        plan_zero = self.engine.plan_detection(self.profile, top_k=0)
+        plan_one = self.engine.plan_detection(self.profile, top_k=1)
+        # All three should yield the same single-primary plan (no alternatives)
+        assert plan_neg.get('alternatives') == plan_one.get('alternatives')
+        assert plan_zero.get('alternatives') == plan_one.get('alternatives')
+
+
+class TestSurface2LlmClient(unittest.TestCase):
+    """plan_detection(llm_client=...) Surface 2 routing."""
+
+    def setUp(self):
+        self.engine = ADEngine(random_state=42)
+        rng = np.random.RandomState(0)
+        self.X = rng.normal(size=(200, 8))
+        self.profile = self.engine.profile_data(self.X, data_type='tabular')
+
+    def test_stub_llm_returns_canned_plan(self):
+        def stub_llm(prompt: str) -> str:
+            return ('[{"detector":"IForest","justification":"top rank"},'
+                    '{"detector":"ECOD","justification":"parameter-free"},'
+                    '{"detector":"KNN","justification":"proximity"}]')
+        plan = self.engine.plan_detection(self.profile, llm_client=stub_llm)
+        assert plan['detector_name'] == 'IForest'
+        assert plan.get('note') == 'llm-driven via plan_detection(llm_client=...)'
+        assert plan.get('evidence') == ['llm_routing']
+        alts = [a['detector_name'] for a in plan.get('alternatives', [])]
+        assert alts == ['ECOD', 'KNN']
+
+    def test_top_k_truncates_llm_response(self):
+        def stub_llm(prompt: str) -> str:
+            return ('[{"detector":"IForest","justification":"a"},'
+                    '{"detector":"ECOD","justification":"b"},'
+                    '{"detector":"KNN","justification":"c"},'
+                    '{"detector":"LOF","justification":"d"},'
+                    '{"detector":"HBOS","justification":"e"}]')
+        plan = self.engine.plan_detection(self.profile, llm_client=stub_llm,
+                                          top_k=2)
+        assert plan['detector_name'] == 'IForest'
+        assert len(plan.get('alternatives', [])) == 1
+
+    def test_llm_response_cannot_select_excluded_detector(self):
+        """High-1 (Codex Round 1): LLM output must not bypass exclude_detectors."""
+        import warnings
+
+        def excluded_llm(prompt: str) -> str:
+            return '[{"detector":"IForest","justification":"trying to bypass"}]'
+
+        with warnings.catch_warnings(record=True) as w:
+            warnings.simplefilter("always")
+            plan = self.engine.plan_detection(
+                self.profile,
+                constraints={'exclude_detectors': ['IForest']},
+                llm_client=excluded_llm)
+        # Must have fallen back to rule routing.
+        assert plan['detector_name'] != 'IForest'
+        assert plan.get('evidence') != ['llm_routing']
+        # And a RuntimeWarning must have explained the fallback.
+        assert any(issubclass(x.category, RuntimeWarning) for x in w)
+
+    def test_per_call_llm_strict_true_reraises(self):
+        """Med-3 (Codex Round 1): explicit llm_strict=True re-raises."""
+        from pyod.utils._llm import RoutingParseError
+        def bad_llm(prompt: str) -> str:
+            return "I cannot help with that."
+        with self.assertRaises(RoutingParseError):
+            self.engine.plan_detection(self.profile,
+                                        llm_client=bad_llm,
+                                        llm_strict=True)
+
+    def test_per_call_llm_strict_false_overrides_env(self):
+        """Med-3 (Codex Round 1): explicit llm_strict=False overrides env var."""
+        import os, warnings
+        os.environ['PYOD3_LLM_STRICT'] = '1'
+        try:
+            def bad_llm(prompt: str) -> str:
+                return "no json"
+            with warnings.catch_warnings(record=True) as w:
+                warnings.simplefilter("always")
+                plan = self.engine.plan_detection(
+                    self.profile, llm_client=bad_llm, llm_strict=False)
+            # Did not raise -- explicit kwarg won over env var.
+            assert plan['detector_name']
+            assert any(issubclass(x.category, RuntimeWarning) for x in w)
+        finally:
+            del os.environ['PYOD3_LLM_STRICT']
+
+    def test_per_call_llm_strict_none_defers_to_env(self):
+        """Med-3 (Codex Round 1): llm_strict=None reads PYOD3_LLM_STRICT."""
+        import os
+        from pyod.utils._llm import RoutingParseError
+        os.environ['PYOD3_LLM_STRICT'] = '1'
+        try:
+            def bad_llm(prompt: str) -> str:
+                return "no json"
+            with self.assertRaises(RoutingParseError):
+                self.engine.plan_detection(self.profile,
+                                           llm_client=bad_llm,
+                                           llm_strict=None)
+        finally:
+            del os.environ['PYOD3_LLM_STRICT']
+
+    def test_malformed_response_falls_back_to_rules(self):
+        import warnings
+        def bad_llm(prompt: str) -> str:
+            return "Sorry, I cannot help with that."
+        with warnings.catch_warnings(record=True) as w:
+            warnings.simplefilter("always")
+            plan = self.engine.plan_detection(self.profile,
+                                              llm_client=bad_llm)
+        assert plan['detector_name']  # rule-driven primary
+        assert plan.get('evidence') != ['llm_routing']
+        # A RuntimeWarning should have been emitted
+        runtime_warnings = [x for x in w
+                            if issubclass(x.category, RuntimeWarning)]
+        assert len(runtime_warnings) >= 1
+
+    def test_strict_mode_reraises(self):
+        import os
+        from pyod.utils._llm import RoutingParseError
+        def bad_llm(prompt: str) -> str:
+            return "no JSON here"
+        os.environ['PYOD3_LLM_STRICT'] = '1'
+        try:
+            with self.assertRaises(RoutingParseError):
+                self.engine.plan_detection(self.profile, llm_client=bad_llm)
+        finally:
+            del os.environ['PYOD3_LLM_STRICT']
+
+    def test_llm_client_none_preserves_rule_routing(self):
+        # Passing llm_client=None must match the default (rule-routed) plan.
+        plan_default = self.engine.plan_detection(self.profile)
+        plan_none = self.engine.plan_detection(self.profile, llm_client=None)
+        assert plan_default == plan_none
+
+
+class TestPromptBuilderModalityRank(unittest.TestCase):
+    """Med-2 (Codex Round 2): the LLM prompt must carry the modality-
+    specific benchmark rank, not just the modality-title-overall key.
+    """
+
+    def setUp(self):
+        self.engine = ADEngine(random_state=42)
+        rng = np.random.RandomState(0)
+        self.X = rng.normal(size=(200, 8))
+
+    def _profile_for(self, data_type: str) -> dict:
+        # profile_data only inspects X; data_type just rides through.
+        prof = self.engine.profile_data(self.X, data_type=data_type)
+        return prof
+
+    def test_text_modality_prompt_falls_back_when_no_rank_data(self):
+        # text detectors currently ship without benchmark_rank entries
+        # in the KB. The prompt must still render successfully (no rank
+        # annotation, no crash) and must NOT use the old broken
+        # `Text_overall` key name (which never existed in the KB).
+        from pyod.utils._llm import build_routing_prompt
+        prof = self._profile_for('text')
+        kb = self.engine.get_kb_for_routing(prof)
+        prompt = build_routing_prompt(kb, top_k=3)
+        assert 'TASK PROFILE' in prompt
+        assert 'Text_overall' not in prompt, (
+            "build_routing_prompt should not emit the legacy "
+            "`Text_overall` key form -- it was never a real KB field")
+
+    def test_time_series_modality_prompt_shows_tsb_ad_rank(self):
+        from pyod.utils._llm import build_routing_prompt
+        prof = self._profile_for('time_series')
+        kb = self.engine.get_kb_for_routing(prof)
+        prompt = build_routing_prompt(kb, top_k=3)
+        # TSB_AD_overall is the documented primary key for time_series.
+        assert 'TSB_AD_overall' in prompt, (
+            "time_series prompt should annotate the rank with TSB_AD_overall")
+
+    def test_kb_entries_stamp_resolved_rank(self):
+        # The contract: get_kb_for_routing places `resolved_rank` and
+        # `resolved_rank_key` on each detector entry so downstream tools
+        # do not have to re-do the lookup.
+        prof = self._profile_for('time_series')
+        kb = self.engine.get_kb_for_routing(prof)
+        any_ranked = [d for d in kb['available_detectors']
+                      if d.get('resolved_rank') is not None]
+        assert any_ranked, (
+            "time_series KB should expose resolved_rank on at least "
+            "one entry")
+        for d in any_ranked:
+            assert d.get('resolved_rank_key') is not None, (
+                f"detector {d['name']} has resolved_rank but no "
+                "resolved_rank_key -- contract violation")
+
+
+class TestRoutingResponseParser(unittest.TestCase):
+    def setUp(self):
+        self.engine = ADEngine(random_state=42)
+        self.kb = self.engine.kb
+
+    def test_parse_plain_json_array(self):
+        from pyod.utils._llm import parse_routing_response
+        resp = '[{"detector":"IForest","justification":"x"}]'
+        choices, justs = parse_routing_response(resp, self.kb, top_k=3)
+        assert choices == ['IForest']
+        assert justs == ['x']
+
+    def test_parse_tolerates_prose(self):
+        from pyod.utils._llm import parse_routing_response
+        resp = ('I recommend the following three detectors:\n'
+                '[{"detector":"IForest","justification":"x"},'
+                '{"detector":"ECOD","justification":"y"}]\n'
+                'Hope this helps.')
+        choices, _ = parse_routing_response(resp, self.kb, top_k=3)
+        assert choices == ['IForest', 'ECOD']
+
+    def test_parse_tolerates_markdown_fences(self):
+        from pyod.utils._llm import parse_routing_response
+        resp = ('```json\n'
+                '[{"detector":"IForest","justification":"x"}]\n'
+                '```')
+        choices, _ = parse_routing_response(resp, self.kb, top_k=3)
+        assert choices == ['IForest']
+
+    def test_parse_skips_unknown_detectors(self):
+        from pyod.utils._llm import parse_routing_response
+        resp = ('[{"detector":"BogusDetector","justification":"x"},'
+                '{"detector":"IForest","justification":"y"}]')
+        choices, _ = parse_routing_response(resp, self.kb, top_k=3)
+        assert choices == ['IForest']
+
+    def test_parse_dedupes(self):
+        from pyod.utils._llm import parse_routing_response
+        resp = ('[{"detector":"IForest","justification":"a"},'
+                '{"detector":"IForest","justification":"b"}]')
+        choices, _ = parse_routing_response(resp, self.kb, top_k=3)
+        assert choices == ['IForest']
+
+    def test_parse_truncates_to_top_k(self):
+        from pyod.utils._llm import parse_routing_response
+        resp = ('[{"detector":"IForest"},'
+                '{"detector":"ECOD"},'
+                '{"detector":"KNN"},'
+                '{"detector":"LOF"}]')
+        choices, _ = parse_routing_response(resp, self.kb, top_k=2)
+        assert choices == ['IForest', 'ECOD']
+
+    def test_parse_raises_on_no_array(self):
+        from pyod.utils._llm import parse_routing_response, RoutingParseError
+        with self.assertRaises(RoutingParseError):
+            parse_routing_response("no json here", self.kb)
+
+    def test_parse_raises_on_all_invalid(self):
+        from pyod.utils._llm import parse_routing_response, RoutingParseError
+        resp = '[{"detector":"BogusOne"},{"detector":"BogusTwo"}]'
+        with self.assertRaises(RoutingParseError):
+            parse_routing_response(resp, self.kb)
+
+    def test_parse_accepts_bare_string_entries(self):
+        from pyod.utils._llm import parse_routing_response
+        resp = '["IForest", "ECOD"]'
+        choices, justs = parse_routing_response(resp, self.kb, top_k=3)
+        assert choices == ['IForest', 'ECOD']
+        assert justs == ['', '']
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/pyod/test/test_skill_kb_consistency.py b/pyod/test/test_skill_kb_consistency.py
index 45d03f1c..d8b9353c 100644
--- a/pyod/test/test_skill_kb_consistency.py
+++ b/pyod/test/test_skill_kb_consistency.py
@@ -319,6 +319,57 @@ def test_kb_loads_cleanly():
     )
 
 
+def test_skill_count_prose_matches_kb():
+    """Detector counts written by hand in skill prose must agree with the KB.
+
+    The ``KB-snapshot`` marker in SKILL.md and the modality header in
+    references/tabular.md sit OUTSIDE the KB-DERIVED blocks, so the
+    byte-identical regen check never looks at them. Entries with
+    ``status == 'planned'`` (e.g. LLMAD) have no backing module and are
+    left out of the buildable count. Guards against the stale-inflation
+    regression fixed in the claim audit (counts had read 61 / "44 of 61").
+    """
+    import re
+
+    od_expert = SKILLS_DIR / "od_expert"
+    buildable = [a for a in _load_kb().values() if a.get("status") != "planned"]
+    total = len(buildable)
+    tabular = sum("tabular" in a.get("data_types", []) for a in buildable)
+
+    skill_md = (od_expert / "SKILL.md").read_text(encoding="utf-8")
+    snapshot = re.search(r"<!-- KB-snapshot count -->(\d+)<!-- /KB-snapshot -->", skill_md)
+    assert snapshot, "KB-snapshot count marker not found in od_expert/SKILL.md"
+    assert int(snapshot.group(1)) == total, (
+        f"SKILL.md KB-snapshot count {snapshot.group(1)} != buildable total {total} "
+        f"(planned detectors are excluded); update the count line."
+    )
+
+    tab_ref = (od_expert / "references" / "tabular.md").read_text(
+        encoding="utf-8")
+    phrase = re.search(r"(\d+) of (\d+) buildable detectors", tab_ref)
+    assert phrase, "'<N> of <M> buildable detectors' phrase not found in tabular.md"
+    said_tab, said_total = phrase.groups()
+    assert (int(said_tab), int(said_total)) == (tabular, total), (
+        f"tabular.md says {said_tab} of {said_total} buildable detectors; "
+        f"KB has {tabular} tabular of {total} buildable."
+    )
+
+    # Scan every skill file, line by line, for a known stale inflated count.
+    stale = re.compile(
+        r"\b61 detectors\b|60\+\s*detectors|\b44 of 61\b|50\+\s*tabular")
+    offenders = [
+        f"{f.relative_to(REPO_ROOT)}:{lineno}: {line.strip()}"
+        for f in _all_skill_files()
+        for lineno, line in enumerate(
+            f.read_text(encoding="utf-8").splitlines(), 1)
+        if stale.search(line)
+    ]
+    assert not offenders, (
+        "Stale detector-count claims in skill files:\n  "
+        + "\n  ".join(offenders)
+    )
+
+
 def test_allowlist_does_not_shadow_kb_keys():
     """The backtick allowlist must not contain any live KB detector name.
 
diff --git a/pyod/utils/_llm.py b/pyod/utils/_llm.py
new file mode 100644
index 00000000..530c011f
--- /dev/null
+++ b/pyod/utils/_llm.py
@@ -0,0 +1,267 @@
+"""LLM-client Protocol, prompt builder, and routing-response parser.
+
+This module powers pyod 3.5.3's :meth:`ADEngine.plan_detection` Surface 2
+extension. When a user passes ``llm_client=callable``, the engine
+invokes :func:`build_routing_prompt` and :func:`parse_routing_response`
+through this module; when ``llm_client=None``, the rules path is
+unchanged.
+
+Public:
+    LLMCallable -- typing.Protocol; any (prompt: str) -> str
+    RoutingParseError -- raised when the parser cannot extract a plan
+    build_routing_prompt(kb_context, top_k) -> str
+    parse_routing_response(response, kb, top_k) -> (list[str], list[str])
+
+No optional dependencies are imported at module load; PyOD does not ship
+any provider-specific adapters.
+"""
+
+from __future__ import annotations
+
+import json
+import logging
+import re
+from typing import Any, Protocol
+
+
+logger = logging.getLogger(__name__)
+
+
+class LLMCallable(Protocol):
+    """Any ``(prompt: str) -> str`` callable.
+
+    Users wrap their preferred LLM SDK. Example (Anthropic SDK):
+
+    .. code-block:: python
+
+        import os
+        from anthropic import Anthropic
+        client = Anthropic(api_key=os.environ['ANTHROPIC_API_KEY'])
+
+        def my_llm(prompt: str) -> str:
+            return client.messages.create(
+                model='claude-opus-4-7',
+                max_tokens=4096,
+                messages=[{'role': 'user', 'content': prompt}],
+            ).content[0].text
+
+        plan = engine.plan_detection(profile, llm_client=my_llm)
+
+    PyOD ships no provider-specific adapter classes; users wrap their
+    own SDK in a ``(prompt) -> str`` callable.
+    """
+
+    def __call__(self, prompt: str) -> str: ...
+
+
+class RoutingParseError(ValueError):
+    """Raised when :func:`parse_routing_response` cannot extract a plan.
+
+    The engine falls back to rule-driven routing on this error unless
+    ``llm_strict=True`` or, when that is unset, ``PYOD3_LLM_STRICT=1``.
+    """
+
+
+def build_routing_prompt(kb_context: dict, top_k: int = 3) -> str:
+    """Build the LLM routing prompt for a knowledge-base context.
+
+    Parameters
+    ----------
+    kb_context : dict
+        Dict as returned by :meth:`ADEngine.get_kb_for_routing`; the
+        ``task_profile`` and ``available_detectors`` keys are read.
+    top_k : int
+        How many detectors the LLM is asked to pick. Default 3.
+
+    Returns
+    -------
+    str
+        Self-contained prompt asking for a JSON array of
+        ``{"detector": ..., "justification": ...}`` objects.
+
+    Notes
+    -----
+    No chain-of-thought scaffolding is included, so one prompt can
+    serve diverse LLMs (Claude, GPT, Gemini, open-weight).
+    """
+    task = kb_context.get("task_profile", {})
+    catalog = kb_context.get("available_detectors", [])
+    data_type = task.get("data_type", "tabular")
+    fallback_key = f"{str(data_type).title()}_overall"
+
+    def _describe(det: dict) -> str:
+        # Prefer the rank stamped by get_kb_for_routing; older callers that
+        # bypass it fall back to the modality-title key, then ADBench_overall.
+        rank = det.get("resolved_rank")
+        rank_key = det.get("resolved_rank_key")
+        if rank is None:
+            bench = det.get("benchmark_rank") or {}
+            rank = bench.get(fallback_key) or bench.get("ADBench_overall")
+            rank_key = None
+        if rank is None:
+            rank_str = ""
+        elif rank_key:
+            rank_str = f" rank={rank} ({rank_key})"
+        else:
+            rank_str = f" rank={rank}"
+        strengths = "; ".join((det.get("strengths") or [])[:2])
+        weaknesses = "; ".join((det.get("weaknesses") or [])[:2])
+        best_for = det.get("best_for") or ""
+        avoid_when = det.get("avoid_when") or ""
+        return (
+            f"- {det['name']} ({det.get('category', 'unknown')}{rank_str}): "
+            f"best_for={best_for!r}; avoid_when={avoid_when!r}; "
+            f"strengths=[{strengths}]; weaknesses=[{weaknesses}]")
+
+    profile_str = ", ".join([
+        f"data_type={data_type}",
+        f"n_samples={task.get('n_samples', '?')}",
+        f"n_features={task.get('n_features', '?')}",
+        f"contamination_estimate={task.get('contamination_estimate', '?')}",
+    ])
+
+    return (
+        "You are an anomaly-detection routing expert. Given the task "
+        "profile and a list of available detectors annotated with "
+        "strengths, weaknesses, best_for, avoid_when, and benchmark "
+        "rank, choose the ordered top-K detectors most likely to "
+        "succeed on this task.\n\n"
+        f"TASK PROFILE: {profile_str}\n\n"
+        f"AVAILABLE DETECTORS ({len(catalog)}):\n"
+        + "\n".join(map(_describe, catalog)) + "\n\n"
+        f"Return exactly {top_k} detectors as a JSON array of objects, "
+        'each shaped {"detector": "<name>", "justification": "<one '
+        'sentence>"}. Detector names are case-sensitive and must come '
+        "from the list above. Return ONLY the JSON array (no prose, "
+        "no markdown fences).\n"
+    )
+
+
+# Flat ``[...]`` matcher (no nested brackets). No longer used by
+# ``_extract_first_array``; kept so existing imports keep working.
+_JSON_ARRAY_RE = re.compile(r"\[[^\[\]]*?\]", re.DOTALL)
+
+
+def _extract_first_array(response: str) -> str | None:
+    """Return the first ``[...]`` substring that parses as a JSON list.
+
+    Tolerates surrounding prose, markdown fences, or repeated arrays.
+    Returns ``None`` if no parseable array is found.
+
+    Notes
+    -----
+    Every ``[`` is tried as a start offset with
+    :meth:`json.JSONDecoder.raw_decode`, which stops at the end of the
+    first complete JSON value. Unlike a first-``[``-to-last-``]`` slice
+    or a flat regex, this still finds the array when brackets occur
+    both in the surrounding prose and inside string values, e.g.
+    ``Picks [ranked]: [{"justification": "fast [approx]"}]``.
+    """
+    # Drop markdown code fences (with or without a ``json`` tag).
+    text = re.sub(r"```(?:json)?\s*", "", response)
+
+    decoder = json.JSONDecoder()
+    start = text.find("[")
+    while start != -1:
+        try:
+            value, end = decoder.raw_decode(text, start)
+        except json.JSONDecodeError:
+            pass  # not valid JSON from this bracket; try the next one
+        else:
+            if isinstance(value, list):
+                # Return the source slice so callers can re-parse it.
+                return text[start:end]
+        start = text.find("[", start + 1)
+    return None
+
+
+def parse_routing_response(response: str, kb: Any,
+                           top_k: int = 3) -> tuple[list[str], list[str]]:
+    """Turn raw LLM routing text into ``(detector_choices, justifications)``.
+
+    Parameters
+    ----------
+    response : str
+        Raw LLM output. A JSON array of
+        ``{"detector": str, "justification": str}`` objects is
+        expected; surrounding prose and markdown fences are tolerated.
+    kb : pyod.utils.knowledge.KnowledgeBase
+        Source of truth for detector names via ``get_algorithm``.
+        Unknown or non-shipped names are skipped with a warning.
+    top_k : int
+        Truncate to at most this many detectors. Default 3.
+
+    Returns
+    -------
+    detector_choices : list[str]
+        Validated names in response order, duplicates dropped
+        (length >= 1). Trimmed to ``top_k``.
+    justifications : list[str]
+        Parallel list; empty string where the LLM gave no reason.
+
+    Raises
+    ------
+    RoutingParseError
+        If ``response`` is not a string, no JSON array can be
+        extracted, or no detector survives validation against ``kb``.
+    """
+    if not isinstance(response, str):
+        raise RoutingParseError(
+            f"response must be a string; got {type(response).__name__}")
+
+    array_text = _extract_first_array(response)
+    if array_text is None:
+        raise RoutingParseError(
+            "no JSON array found in LLM response (response head: "
+            f"{response[:120]!r})")
+
+    # Re-parse the extracted text; failures surface as RoutingParseError.
+    try:
+        entries = json.loads(array_text)
+    except json.JSONDecodeError as ex:
+        raise RoutingParseError(f"JSON parse error: {ex}") from ex
+
+    if not isinstance(entries, list):
+        raise RoutingParseError(
+            f"expected JSON array; got {type(entries).__name__}")
+
+    # Insertion-ordered name -> justification map; the keys double as
+    # the "already chosen" set used for de-duplication.
+    picked: dict[str, str] = {}
+    for entry in entries:
+        if isinstance(entry, dict):
+            name = entry.get("detector") or entry.get("name") or ""
+            just = (entry.get("justification") or entry.get("reason")
+                    or "")
+        elif isinstance(entry, str):
+            name, just = entry, ""
+        else:
+            continue
+        # Ignore blank or non-string names, and repeats, so the LLM
+        # cannot pad top_k with duplicates.
+        if not isinstance(name, str) or not name or name in picked:
+            continue
+        algo = kb.get_algorithm(name)
+        if algo is None:
+            logger.warning(
+                "parse_routing_response: skipping unknown detector %r "
+                "(not in KB)", name)
+            continue
+        status = algo.get("status")
+        if status != "shipped":
+            logger.warning(
+                "parse_routing_response: skipping non-shipped detector "
+                "%r (status=%r)", name, status)
+            continue
+        picked[name] = just if isinstance(just, str) else ""
+        if len(picked) >= top_k:
+            break
+
+    if not picked:
+        raise RoutingParseError(
+            "no valid detector names in LLM response after KB "
+            f"validation (raw array: {array_text[:200]!r})")
+
+    detector_choices = list(picked)
+    justifications = list(picked.values())
+    return detector_choices, justifications
diff --git a/pyod/utils/_quality_metrics.py b/pyod/utils/_quality_metrics.py
index 395c44a3..97efa55d 100644
--- a/pyod/utils/_quality_metrics.py
+++ b/pyod/utils/_quality_metrics.py
@@ -46,8 +46,13 @@ def compute_quality(
     Each metric diagnoses one independent failure mode and drives
     one branch of `ADEngine.iterate()`:
 
-    - ``separation``: anomaly-vs-inlier mean score gap (global).
-      Low value indicates the detector did not produce a usable signal.
+    - ``separation``: relative mean score gap between the samples
+      flagged by the run and the rest, computed from the run's OWN
+      predicted labels. In ADEngine consensus, labels come from
+      detector votes while scores are rank-averaged, so this stays
+      descriptive and circular: it is not independent correctness
+      evidence. Treat it as descriptive, not as a label-free quality
+      signal.
     - ``agreement``: pairwise Spearman rank correlation across base
       detectors (cross-detector). Low value indicates detectors
       disagree on which samples are anomalous.
diff --git a/pyod/utils/ad_engine.py b/pyod/utils/ad_engine.py
index 5aeb4e00..4f72c8f8 100644
--- a/pyod/utils/ad_engine.py
+++ b/pyod/utils/ad_engine.py
@@ -1,5 +1,5 @@
 # -*- coding: utf-8 -*-
-"""ADEngine: Intelligent anomaly detection lifecycle engine.
+"""ADEngine: anomaly detection lifecycle engine.
 
 Handles data profiling, detection planning, detector construction,
 and knowledge queries. Works as a standalone Python API (no LLM
@@ -57,10 +57,15 @@ class ADEngine:
         Random seed forwarded to every detector that declares an
         explicit ``random_state`` parameter when the engine instantiates
         it from a plan. Detectors without ``random_state`` in their
-        signature (e.g., ABOD, KNN, LOF, SOD) are unaffected and remain
-        deterministic-up-to-numpy-module-state. Set this to a fixed
-        integer for reproducible flagged sets across re-runs on the
-        same input.
+        signature (e.g., ABOD, KNN, LOF, SOD) are deterministic by
+        construction (distance, angle, or density based, with no internal
+        sampling) and need no seed. With this set, the shallow-detector
+        pipeline is reproducible: a run-to-run audit of the shipped
+        shallow detectors found every one either honors the seed or is
+        deterministic by construction, with no nondeterministic cases.
+        Deep detectors additionally depend on framework-level seeding
+        (e.g., ``torch.manual_seed``). Set this to a fixed integer for
+        byte-identical flagged sets across re-runs on the same input.
     """
 
     def __init__(self, knowledge_dir: str | None = None,
@@ -188,7 +193,10 @@ def _with_contamination(self, detector_name: str,
         return out
 
     def plan_detection(self, profile: dict, priority: str = 'balanced',
-                       constraints: dict | None = None) -> dict:
+                       constraints: dict | None = None, *,
+                       top_k: int = 3,
+                       llm_client=None,
+                       llm_strict: bool | None = None) -> dict:
         """Plan a detection pipeline.
 
         Parameters
@@ -199,12 +207,56 @@ def plan_detection(self, profile: dict, priority: str = 'balanced',
             'speed', 'accuracy', or 'balanced'.
         constraints : dict or None
             Optional: {'exclude_detectors': [...]}
+        top_k : int, default 3
+            Number of detectors in the returned plan (primary + ``top_k - 1``
+            alternatives). Default ``3`` preserves the v3.5.2 behaviour
+            (``valid[1:3]`` produced two alternatives plus the primary).
+            Values < 1 are clamped to 1.
+        llm_client : callable or None, default None
+            Optional ``(prompt: str) -> str`` callable (see
+            :class:`pyod.utils._llm.LLMCallable`). When provided, routing
+            consults the LLM with the KB context and parses its response
+            into a plan via :func:`pyod.utils._llm.parse_routing_response`.
+            If the LLM call or parser raises, falls back to rule routing
+            with a :class:`RuntimeWarning` (see ``llm_strict``). When
+            ``None`` (default), v3.5.2 rule routing is unchanged.
+        llm_strict : bool or None, default None
+            Per-call control for LLM-routing failure mode. ``True``
+            re-raises any exception from ``llm_client`` or the response
+            parser; ``False`` falls back to rule routing with a
+            :class:`RuntimeWarning`; ``None`` defers to the
+            ``PYOD3_LLM_STRICT`` environment variable
+            (``"1"`` re-raises, anything else falls back). The explicit
+            kwarg takes precedence so concurrent callers in the same
+            process can choose independently.
 
         Returns
         -------
         plan : dict (DetectionPlan, closed schema)
         """
         constraints = constraints or {}
+        top_k = max(1, int(top_k))
+
+        if llm_client is not None:
+            try:
+                return self._plan_via_llm(profile, top_k, llm_client,
+                                          constraints)
+            except Exception as ex:  # noqa: BLE001
+                if llm_strict is None:
+                    import os
+                    strict = os.environ.get('PYOD3_LLM_STRICT') == '1'
+                else:
+                    strict = bool(llm_strict)
+                if strict:
+                    raise
+                import warnings
+                warnings.warn(
+                    f"plan_detection: llm_client routing failed "
+                    f"({type(ex).__name__}: {ex}); falling back to "
+                    "rule routing. Pass llm_strict=True (or set "
+                    "PYOD3_LLM_STRICT=1) to re-raise.",
+                    RuntimeWarning, stacklevel=2)
+
         exclude = set(constraints.get('exclude_detectors', []))
 
         matched = evaluate_rules(profile, priority, self.kb)
@@ -260,7 +312,7 @@ def plan_detection(self, profile: dict, priority: str = 'balanced',
             reason=r.get('_reason', ''),
             evidence=r.get('_evidence', []),
             confidence=r.get('confidence', 0.5),
-            alternatives=[]) for r in valid[1:3]]
+            alternatives=[]) for r in valid[1:top_k]]
 
         return make_plan(
             detector_name=best['detector'],
@@ -272,6 +324,285 @@ def plan_detection(self, profile: dict, priority: str = 'balanced',
             confidence=best.get('confidence', 0.7),
             alternatives=alternatives)
 
+    # ------------------------------------------------------------------
+    # Surface 1: KB exposure for caller-driven (agent / LLM) routing
+    # ------------------------------------------------------------------
+
+    def get_kb_for_routing(self, profile: dict, top_k: int = 3,
+                           constraints: dict | None = None) -> dict:
+        """Return a structured KB snapshot for caller-driven selection.
+
+        This is the agent-facing companion to :meth:`plan_detection`.
+        ``plan_detection`` consumes the KB through hand-coded rules and
+        returns a single plan; ``get_kb_for_routing`` exposes the KB
+        directly so a caller (LLM agent, MCP tool client, ...) can
+        reason over each detector's strengths, weaknesses, complexity,
+        and benchmark rank, then call :meth:`make_plan` to commit a
+        plan.
+
+        Parameters
+        ----------
+        profile : dict
+            Output of :meth:`profile_data`. ``data_type`` selects the
+            modality (``'tabular'`` when absent); every key except
+            ``'data'`` is echoed back in ``task_profile``.
+        top_k : int, default 3
+            The number of detectors the caller intends to select. The KB
+            snapshot itself is returned in full (filtered + sorted); the
+            value (clamped to >= 1) only feeds ``top_k_requested`` and
+            the response-format hint.
+        constraints : dict or None, optional
+            ``{'exclude_detectors': list[str], 'data_type_strict': bool}``.
+            ``exclude_detectors`` is a hard filter. ``data_type_strict``
+            (default ``True``) drops detectors whose KB ``data_types``
+            field does not include ``profile['data_type']``.
+
+        Returns
+        -------
+        dict
+            ``{'task_profile': {...}, 'available_detectors': [...],
+            'top_k_requested': int, 'response_format_hint': str,
+            'n_available': int}``.
+
+        Raises
+        ------
+        ValueError
+            If ``profile`` is not a dict.
+
+        Notes
+        -----
+        Pure function; no LLM calls, no state mutation.
+        """
+        if not isinstance(profile, dict):
+            raise ValueError("profile must be a dict from profile_data()")
+        top_k = max(1, int(top_k))
+        constraints = constraints or {}
+        exclude = set(constraints.get('exclude_detectors') or [])
+        data_type_strict = constraints.get('data_type_strict', True)
+        target_modality = profile.get('data_type', 'tabular')
+
+        # Modality-aware benchmark-rank keys. Each modality lists its
+        # preferred KB rank fields in priority order; the first non-None
+        # value sets the sort key. `ADBench_overall` is the universal
+        # fallback because the KB ships rank for nearly every tabular
+        # detector there. Detectors missing every key sort last (999).
+        _MODALITY_RANK_KEYS = {
+            'tabular': ['ADBench_overall'],
+            'time_series': ['TSB_AD_overall', 'TSB_AD_overall_iforest',
+                            'ADBench_overall'],
+            'timeseries': ['TSB_AD_overall', 'TSB_AD_overall_iforest',
+                           'ADBench_overall'],
+            'graph': ['BOND_deep', 'BOND_overall', 'ADBench_overall'],
+            'text': ['NLP_ADBench_overall', 'ADBench_overall'],
+            'image': ['MVTec_overall', 'ADBench_overall'],
+            'synthetic': ['ADBench_overall'],
+        }
+        rank_key_candidates = _MODALITY_RANK_KEYS.get(
+            str(target_modality).lower(),
+            [f"{str(target_modality).title()}_overall", 'ADBench_overall'])
+
+        def _resolve_rank(benchmark_rank):
+            # First candidate key with a non-None value wins.
+            for key in rank_key_candidates:
+                value = benchmark_rank.get(key)
+                if value is not None:
+                    return value, key
+            return None, None
+
+        catalog = self.list_detectors(data_type=None, status='shipped')
+        available: list[dict] = []
+        for entry in catalog:
+            # A non-dict entry is taken as a bare detector name with no
+            # metadata, so every field lookup below stays safe.
+            meta = entry if isinstance(entry, dict) else {'name': str(entry)}
+            name = meta.get('name')
+            if name in exclude:
+                continue
+            dts = meta.get('data_types') or []
+            modality_match = (target_modality in dts) if dts else True
+            if data_type_strict and not modality_match:
+                continue
+            complexity = meta.get('complexity') or {}
+            benchmark_rank = meta.get('benchmark_rank') or {}
+            # Resolve once; build_routing_prompt reads these stamped fields
+            # instead of repeating the lookup (None when no key has a rank).
+            resolved_rank, resolved_rank_key = _resolve_rank(benchmark_rank)
+            available.append({
+                'name': name,
+                'category': meta.get('category', 'unknown'),
+                'complexity_time': complexity.get('time'),
+                'complexity_space': complexity.get('space'),
+                'strengths': meta.get('strengths') or [],
+                'weaknesses': meta.get('weaknesses') or [],
+                'best_for': meta.get('best_for'),
+                'avoid_when': meta.get('avoid_when'),
+                'benchmark_rank': benchmark_rank,
+                'modality_match': modality_match,
+                'resolved_rank': resolved_rank,
+                'resolved_rank_key': resolved_rank_key,
+            })
+
+        # Ascending rank; unranked entries sort last, ties break on name.
+        available.sort(key=lambda d: (
+            999 if d['resolved_rank'] is None else d['resolved_rank'],
+            d['name']))
+
+        # Strip non-JSON-safe fields from the profile copy
+        profile_safe = {k: v for k, v in profile.items() if k != 'data'}
+
+        return {
+            'task_profile': profile_safe,
+            'available_detectors': available,
+            'top_k_requested': top_k,
+            'response_format_hint': (
+                "To commit your selection, call ADEngine.make_plan with "
+                "detector_choices=['detName1', ...] (ordered list of "
+                f"top-{top_k} names from available_detectors[*].name; "
+                "case-sensitive) and justifications=['why1', ...] "
+                "(parallel list, one short sentence each)."
+            ),
+            'n_available': len(available),
+        }
+
+    def make_plan(self, detector_choices: list,
+                  justifications: list | None = None,
+                  params: list | None = None) -> dict:
+        """Commit a caller-driven detector plan and return a DetectionPlan.
+
+        Companion to :meth:`get_kb_for_routing`. The caller (LLM agent,
+        rule engine, human script) selects ``len(detector_choices)``
+        detectors and this method validates names against the KB, fills
+        per-detector defaults, and packages the result as a
+        :func:`pyod.utils._kb_router.make_plan`-shaped dict so existing
+        consumers (``build_detector``, ``run``, downstream MCP clients)
+        keep working unchanged.
+
+        Parameters
+        ----------
+        detector_choices : list of str
+            Ordered list of detector class names. ``detector_choices[0]``
+            is the primary; the rest become ``alternatives`` in plan
+            order. Length must be >= 1. Names must match KB entries
+            (case-sensitive) with ``status='shipped'``; otherwise
+            ``ValueError`` is raised.
+        justifications : list of str, optional
+            Parallel to ``detector_choices``. One short sentence per
+            choice. ``None`` is accepted and yields autogenerated
+            reasons.
+        params : list of dict, optional
+            Parallel to ``detector_choices``. Per-detector constructor
+            kwargs. ``None`` -> KB defaults overlaid with the
+            engine's contamination resolution.
+
+        Returns
+        -------
+        dict
+            Closed-schema DetectionPlan: ``{'detector_name',
+            'params', 'reason', 'evidence', 'confidence',
+            'alternatives', 'note'}``.
+
+        Raises
+        ------
+        ValueError
+            If ``detector_choices`` is not a non-empty list, or any name
+            is unknown / not ``status='shipped'`` in the KB.
+        """
+        # Check the type before emptiness so ``None`` or a tuple is not
+        # misreported as "an empty list".
+        if not isinstance(detector_choices, list):
+            raise ValueError(
+                "detector_choices must be a list of strings; "
+                f"got {type(detector_choices).__name__}")
+        if not detector_choices:
+            raise ValueError(
+                "detector_choices must be non-empty; got an empty list")
+
+        # Pad the parallel lists so index i is always valid below.
+        n_choices = len(detector_choices)
+        justifications = list(justifications or [])
+        justifications.extend([''] * (n_choices - len(justifications)))
+        params_list = list(params or [])
+        params_list.extend({} for _ in range(n_choices - len(params_list)))
+
+        unknown = []
+        not_shipped = []
+        for name in detector_choices:
+            # Non-string entries can never match a KB key.
+            algo = (self.kb.get_algorithm(name)
+                    if isinstance(name, str) else None)
+            if algo is None:
+                unknown.append(name)
+            elif algo.get('status') != 'shipped':
+                not_shipped.append(name)
+        if unknown:
+            raise ValueError(
+                "Unknown detector name(s) (case-sensitive). Names must "
+                "match KB entries from ADEngine.list_detectors(): "
+                f"{unknown!r}")
+        if not_shipped:
+            raise ValueError(
+                f"Detector(s) not shipped (cannot be built): {not_shipped!r}")
+
+        default_reason = 'caller-selected via make_plan'
+        primary = detector_choices[0]
+        primary_params = self._with_contamination(
+            primary, params_list[0] or {})
+        # Every choice after the first becomes an alternative, in order.
+        alternatives = [
+            make_plan(
+                detector_name=det,
+                params=self._with_contamination(det, params_list[i] or {}),
+                reason=justifications[i] or default_reason,
+                evidence=['caller_selection'],
+                confidence=0.5,
+                alternatives=[])
+            for i, det in enumerate(detector_choices[1:], start=1)]
+
+        return make_plan(
+            detector_name=primary,
+            params=primary_params,
+            reason=justifications[0] or default_reason,
+            evidence=['caller_selection'],
+            confidence=0.7,
+            alternatives=alternatives,
+            note='caller-driven via make_plan')
+
+    def _plan_via_llm(self, profile: dict, top_k: int, llm_client,
+                      constraints: dict | None = None) -> dict:
+        """Route via an LLM client (internal; see plan_detection)."""
+        from ._llm import (
+            RoutingParseError,
+            build_routing_prompt,
+            parse_routing_response,
+        )
+        # The signature allows None, but the error path reads constraints.
+        constraints = constraints or {}
+        kb_context = self.get_kb_for_routing(
+            profile, top_k=top_k, constraints=constraints)
+        prompt = build_routing_prompt(kb_context, top_k=top_k)
+        response = llm_client(prompt)
+        detector_choices, justifications = parse_routing_response(
+            response, self.kb, top_k=top_k)
+        # LLM output is untrusted: parse_routing_response only validates
+        # against the global KB, so re-check the picks against the
+        # constrained context (exclude_detectors + data_type_strict).
+        allowed = {d['name'] for d in kb_context.get(
+            'available_detectors', [])}
+        blocked = [name for name in detector_choices if name not in allowed]
+        if blocked:
+            raise RoutingParseError(
+                "LLM selected detector(s) outside the constrained KB "
+                f"context: {blocked!r}. The constrained context "
+                f"excluded {sorted(constraints.get('exclude_detectors') or [])!r}.")
+        plan = self.make_plan(
+            detector_choices=detector_choices,
+            justifications=justifications)
+        # Tag the plan so downstream code can distinguish LLM-sourced
+        # plans from caller-driven or rule-driven ones.
+        plan['note'] = 'llm-driven via plan_detection(llm_client=...)'
+        plan['evidence'] = ['llm_routing']
+        return plan
+
     # ------------------------------------------------------------------
     # Detector construction
     # ------------------------------------------------------------------
diff --git a/pyod/version.py b/pyod/version.py
index 3a0dbdae..ab5ffa78 100644
--- a/pyod/version.py
+++ b/pyod/version.py
@@ -1,23 +1,23 @@
-"""
-``pyod`` is a python toolbox for scalable outlier detection
-"""
-# Based on NiLearn package
-# License: simplified BSD
-
-# PEP0440 compatible formatted version, see:
-# https://www.python.org/dev/peps/pep-0440/
-#
-# Generic release markers:
-# X.Y
-# X.Y.Z # For bug fix releases
-#
-# Admissible pre-release markers:
-# X.YaN # Alpha release
-# X.YbN # Beta release
-# X.YrcN # Release Candidate
-# X.Y # Final release
-#
-# Dev branch marker is: 'X.Y.dev' or 'X.Y.devN' where N is an integer.
-# 'X.Y.dev0' is the canonical version of 'X.Y.dev'
-#
-__version__ = '3.5.2'  # pragma: no cover
+"""
+``pyod`` is a Python toolbox for scalable outlier detection
+"""
+# Based on NiLearn package
+# License: simplified BSD
+
+# PEP 440 compatible version string, see:
+# https://peps.python.org/pep-0440/
+#
+# Generic release markers:
+# X.Y
+# X.Y.Z # For bug fix releases
+#
+# Admissible pre-release markers:
+# X.YaN # Alpha release
+# X.YbN # Beta release
+# X.YrcN # Release Candidate
+# X.Y # Final release
+#
+# Dev branch marker is: 'X.Y.dev' or 'X.Y.devN' where N is an integer.
+# 'X.Y.dev0' is the canonical version of 'X.Y.dev'
+#
+__version__ = '3.5.4'  # pragma: no cover
diff --git a/pyproject.toml b/pyproject.toml
index d9cc7d5a..d9e6e5dd 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -5,7 +5,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "pyod"
 dynamic = ["version", "dependencies"]
-description = "A Python library for anomaly detection across tabular, time series, graph, text, and image data. 60+ detectors, benchmark-backed ADEngine orchestration, and an agentic workflow for AI agents."
+description = "A Python library for anomaly detection across tabular, time series, graph, text, and image data. 60 detectors, benchmark-backed ADEngine orchestration, and an agentic workflow for AI agents."
 readme = "README.rst"
 requires-python = ">=3.9"
 license = "BSD-2-Clause"
diff --git a/scripts/regen_skill.py b/scripts/regen_skill.py
index b73e0c56..e6a2e30d 100644
--- a/scripts/regen_skill.py
+++ b/scripts/regen_skill.py
@@ -46,9 +46,18 @@
 
 
 def _load_kb():
-    """Load pyod.utils.knowledge.algorithms once."""
+    """Return the buildable entries of pyod.utils.knowledge.algorithms.
+
+    Entries whose metadata is a dict with ``status == "planned"`` are
+    filtered out: a planned detector (for example ``LLMAD``) has no
+    backing module yet, so the skill must neither count nor recommend
+    it. The raw KB is left untouched and keeps it as a roadmap entry.
+    """
     from pyod.utils.ad_engine import ADEngine
-    return ADEngine().kb.algorithms
+    catalogue = ADEngine().kb.algorithms
+    return {key: entry for key, entry in catalogue.items()
+            if not isinstance(entry, dict)
+            or entry.get("status") != "planned"}
 
 
 def _format_complexity(complexity):