From 868d6c30fc5e05c8496c667e9761b9bfac50d4dc Mon Sep 17 00:00:00 2001
From: Yue Zhao <yzhao062@gmail.com>
Date: Tue, 19 May 2026 00:14:30 -0700
Subject: [PATCH] v3.5.3: KB-tools API for agent-driven and LLM-driven routing

Surface 1 (agent tools):
- ADEngine.get_kb_for_routing(profile, top_k=3, constraints=None) returns a
  structured KB snapshot (every shipped detector with strengths, weaknesses,
  best_for, avoid_when, complexity, benchmark_rank, modality_match) filtered
  by exclude_detectors + data_type_strict and sorted by modality-specific
  benchmark rank keys (TSB_AD_overall for time-series, BOND_deep for graph,
  NLP_ADBench_overall for text, MVTec_overall for image, ADBench_overall
  fallback). Each entry carries resolved_rank + resolved_rank_key for
  downstream tools.
- ADEngine.make_plan(detector_choices, justifications=None, params=None)
  validates names against the KB (case-sensitive, must be shipped) and
  returns a closed-schema DetectionPlan consumable by build_detector / run.

Surface 2 (programmatic API):
- ADEngine.plan_detection(profile, priority='balanced', constraints=None, *,
  top_k=3, llm_client=None, llm_strict=None): llm_client accepts a
  (prompt: str) -> str callable. When it is set, the engine builds the
  routing prompt internally, invokes the callable, parses the response,
  enforces the constrained KB context post-parse (the LLM cannot bypass
  exclude_detectors / data_type_strict), and returns a DetectionPlan with
  note='llm-driven via plan_detection(...)' + evidence=['llm_routing']. On
  LLM call or parse failure, it falls back to rule routing with a
  RuntimeWarning; in strict mode it re-raises instead. The per-call
  llm_strict kwarg (True / False) overrides the PYOD3_LLM_STRICT env var;
  precedence is explicit kwarg > env > default (non-strict).
- pyod/utils/_llm.py: new module with LLMCallable Protocol, RoutingParseError,
  build_routing_prompt(kb_context, top_k), parse_routing_response(response,
  kb, top_k). Parser tolerates surrounding prose, markdown fences, BOM/CRLF,
  skips unknown / non-shipped detectors with a logged warning, dedupes,
  truncates to top_k.

top_k generalization: plan_detection's previous valid[1:3] hard cap is now
valid[1:top_k]. Default 3 preserves v3.5.2 behavior. Values < 1 clamp to 1.

Tests: 44 new in pyod/test/test_kb_router_surface1.py covering Surface 1
schema / filters / KB validation / ordering / clamp, make_plan single + multi
detector / unknown / non-list / contamination overlay / build_detector
consumption, plan_detection top_k variants, llm_client stub canned plan /
top_k truncation / malformed fallback / strict re-raise / None preserves
rule plan, parser robustness (prose / markdown / dedupe / truncate / no-array
raise / all-invalid raise / bare-string), the post-parse constrained KB
guard (catches both exclude_detectors and data_type_strict bypass), per-call
llm_strict three-way precedence, modality-specific rank-key ordering
(time-series + graph), keyword-only signature, prompt builder modality
annotation. All 205 existing ADEngine-related tests still pass (171 in
test_ad_engine.py + test_ad_engine_v3.py + test_ad_engine_compare.py).

Backward compatibility: every v3.5.2 caller pattern produces identical
output. The new top_k, llm_client, and llm_strict parameters are
keyword-only via a * separator before them in the signature.

Reviewed via /implement-review auto across 4 Codex rounds (3 substantive
fixes + 2 nit fixes); Round 4 verdict: zero new findings, commit-ready.

Out of scope: routing_rules.json rule authoring; LLM-decided top_k; built-in
CLI adapters for Codex / Claude Code; async llm_client. No breaking API
changes.
---
 CHANGES.txt                          |   1 +
 pyod/test/test_kb_router_surface1.py | 490 +++++++++++++++++++++++++++
 pyod/utils/_llm.py                   | 267 +++++++++++++++
 pyod/utils/ad_engine.py              | 330 +++++++++++++++++-
 pyod/version.py                      |  46 +--
 5 files changed, 1109 insertions(+), 25 deletions(-)
 create mode 100644 pyod/test/test_kb_router_surface1.py
 create mode 100644 pyod/utils/_llm.py

diff --git a/CHANGES.txt b/CHANGES.txt
index 3fd711eb..e8ff7e71 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -226,3 +226,4 @@ v<3.4.0>, <05/10/2026> -- ADEngine and MCP audit-cycle fixes from a UCI Ionosphe
 v<3.5.0>, <05/11/2026> -- Sustainable model persistence. New `pyod.utils.persistence` module with three additive helpers (`save`, `load`, `compat_load`); no breaking change to existing `joblib.dump` / `joblib.load` workflows. `save(clf, path, metadata=None)` writes a versioned envelope (`_pyod_persistence_version`, `pyod_version`, `sklearn_version`, `numpy_version`, `scipy_version`, `joblib_version`, `python_version`, `saved_at`, `model_class`, optional user metadata, model). `load(path, strict=False, return_metadata=False)` reads the envelope, compares the recorded dependency versions against the running environment, and emits a `UserWarning` on drift in any of sklearn, joblib, numpy, or scipy. Python-version drift is severity `info` and is diagnostic only: non-strict `load` does not warn and `strict=True` does not raise on `python_version`-only drift on the normal envelope path; after a compat repair, strict mode still refuses to return the repaired model, but the error follows the no-drift compat-repair branch and does not name `python_version`. `strict=True` escalates every `warn`-severity drift to `ValueError`, rejects raw legacy artifacts that have no envelope, and refuses to return a model that required a compatibility repair. `return_metadata=True` returns `(model, envelope_without_model_field)`. `compat_load(path, mmap_mode=None)` mirrors `joblib.load` with the BUILD-opcode dispatch entry patched on a subclass of `joblib.numpy_pickle.NumpyUnpickler`; when sklearn's `Tree.__setstate__` would raise `ValueError: node array from the pickle has an incompatible dtype`, the saved Tree-node state is realigned to the running sklearn dtype first. 
Realignment is allowlist-driven: `_TREE_NODE_FIELD_DEFAULTS` (currently `{"missing_go_to_left": 0}`, the pre-1.3 sklearn default) zero-fills documented missing fields, `_TREE_NODE_FIELD_RENAMES` (empty in v3.5.0) maps known renames with rename targets resolved BEFORE the missing-field-default check so a future rename does not also need a default entry, and any other dtype difference (unknown new field, kind change, signedness change, itemsize change, shape change) raises `ValueError` with a re-fit recommendation. Same-name byte-order-only differences realign safely. Current dtype is discovered dynamically from `sklearn.tree._tree.NODE_DTYPE` (no hardcoded layout). A single `UserWarning` recommending re-fit fires when at least one Tree was realigned; non-tree artifacts (ECOD, COPOD, HBOS, LOF, ...) pass through silently. `load()` falls through to `compat_load()` automatically when `joblib.load` raises the documented dtype prefix; the original exception is preserved via `raise ... from`, and a non-prefix `ValueError` from `joblib.load` propagates without invoking `compat_load`. Dependency floor: `requirements.txt` and `docs/requirements.txt` now pin `joblib>=1.5` because `compat_load` reuses `joblib.numpy_pickle._validate_fileobject_and_memmap` and the joblib 1.5 `NumpyUnpickler(filename, file_handle, ensure_native_byte_order, mmap_mode=...)` constructor; older joblib lacks both, and the import is guarded with a clear `ImportError` recommending an upgrade. Closes issue #519. 
Tests: 31 new in `test_persistence.py` covering Tree-dtype realignment (synthetic aged pickles produced by an `_OldDtypeTree` pickle-time shim), the committed binary fixture under `pyod/test/fixtures/iforest_sklearn_1_2_x.joblib` (a real sklearn 1.2.2 IsolationForest, regenerable via `regen_iforest_sklearn_1_2.py`), envelope round-trip, version-drift warnings including the `info`-only `python_version` silent case, strict-mode rejection paths, schema-version validation including a future-version reject, the strict-after-compat no-drift case, exception chaining, a synthetic rename test that proves `_TREE_NODE_FIELD_RENAMES` works without a paired `_TREE_NODE_FIELD_DEFAULTS` entry, and a monkey-patched `joblib.load` test that pins the exact-prefix fall-through gate (non-prefix `ValueError` propagates unchanged; prefix `ValueError` invokes `compat_load` exactly once). CI: new `persistence-nightly` job in `testing-cron.yml` installs pre-release `sklearn` / `numpy` / `scipy` / `joblib` (scientific-python nightly index) and runs only `test_persistence.py`; failure surfaces upstream dtype evolution before downstream users hit it and is not a release blocker. Docs: `docs/model_persistence.rst` rewritten with quick-start, trust-boundary, why-versioning, legacy-load decision tree, cross-sklearn-version compatibility section, troubleshooting table keyed on error text, strict-mode notes, and envelope-metadata-reading guidance. `docs/pyod.utils.rst` cross-references the new module. `examples/save_load_model_example.py` now leads with `persistence.save` / `persistence.load` and notes raw `joblib` as a secondary alternative. Deferred: a true header-only `inspect_artifact(path)` and `pyod inspect <path>` CLI require a `.pyod` zip container layout (metadata sidecar separate from the model payload) and remain Phase 3 work; deep-learning state-dict persistence stays scoped to its own future design. No breaking API changes.
 v<3.5.1>, <05/13/2026> -- External-contributor PR review pass (jbbqqf + tuanaiseo bundles) plus NSF funding acknowledgment. Bug fixes: LUNAR no longer shares its `MinMaxScaler` across instances because the constructor default was a mutable shared object; `LUNAR.__init__` now defaults `scaler=None`, `_resolve_scaler()` materializes a fresh `MinMaxScaler` per fit (or deep-copies a user-supplied instance, or disables scaling on `scaler=False`), and the fitted scaler lives on `self.scaler_` so `sklearn.base.clone()` round-trips (closes #502). DIF stops double-normalizing during fit: the inner `self.decision_function(X)` call that set `decision_scores_` was receiving an already-min-max-scaled `X`, and `decision_function` re-scales internally; the fix preserves the raw `X` and passes it to `decision_function`, so `decision_scores_` now matches `decision_function(X_train)` (closes #546). SOS perplexity inner loop replaces `np.sum(...)` with the ndarray `.sum()` method (closes #635); numerical equivalence test asserts bit-exact match. SUOD defers the optional `suod` import to the `SUOD()` constructor with an actionable `ImportError` instead of the old print-then-crash pattern at module top (closes #640). LOF docstring corrects the `novelty` default from `False` to `True` (matches the actual `__init__` default, which is required for PyOD's fit-then-predict contract because scikit-learn's `LocalOutlierFactor` only exposes `predict`/`decision_function` on unseen data in novelty mode); a regression test pins both the `inspect.signature` default and the docstring substring (closes #638). 
GAAL torch-optional handling: `pyod/models/gaal_base.py` (closes #660 via tuanaiseo), then a follow-up extends the same guarded-import + actionable `ImportError` pattern to `pyod/models/mo_gaal.py`, `pyod/models/so_gaal.py`, and `pyod/models/so_gaal_new.py` so user-visible imports `from pyod.models.mo_gaal import MO_GAAL` and `from pyod.models.so_gaal import SO_GAAL` no longer print-then-crash when torch is absent; all four GAAL files now raise the unified message pointing at `pip install pyod[torch]` or `pip install torch`. `pyod/models/__init__.py` adds an inline comment explaining why detector imports are deliberately omitted at the package level (several detectors require optional extras). Funding: README.rst gains an Acknowledgments section and docs/about.rst gains a Funding section, both citing NSF Award No. 2346158, "NSF POSE: Phase II: OpenAD: An Integrated Open-Source Ecosystem for Anomaly Detection," using the NSF PAPPG recipient-obligation form with the standard disclaimer; lead and sub-awardee organizations are listed separately from PI/co-PI names to avoid stale per-person affiliation claims. Tests: 6 new across `test_lof.py` (1), `test_dif.py` (1), `test_sos.py` (1), `test_lunar.py` (2), and `test_suod.py` (1). No breaking API changes.
 v<3.5.2>, <05/18/2026> -- Reproducibility and kwargs-forwarding bug fixes surfaced by the PyOD 3 paper (KDD 2027 ADS Cycle 1) §5 evidence work. Bug fixes: (1) Closes #685 (`ABOD`/`KNN`/`LUNAR`/`SOD` accepted arbitrary `**kwargs` and forwarded them unfiltered to `sklearn.neighbors.NearestNeighbors`, which crashed at fit time -- or, for KNN, at `__init__` time -- on any kwarg outside `NearestNeighbors`'s signature, including the sklearn-convention `random_state`, a `verbose` flag, or a typo like `n_neighbours`). The four detectors were introduced in commit b8f6c81 (fix for #654) with the over-forwarded `**kwargs`. The fix removes `**kwargs` from each `__init__` and stops forwarding `**self.kwargs` / `**kwargs` to `NearestNeighbors`; the six named forwarding parameters added in b8f6c81 (`algorithm`, `leaf_size`, `metric`, `p`, `metric_params`, `n_jobs`) still cover the use case #654 asked for. Unknown kwargs on ABOD / KNN / SOD now raise a clean `TypeError: <Detector>.__init__() got an unexpected keyword argument '...'` at construction time that points at the user's call site (the sklearn stack frame from the late-fit crash is gone); regression tests assert that the error message names the detector class and does NOT contain `NearestNeighbors` so a future regression that re-introduces the old shape is caught. 
LUNAR is the one #685 detector that is actually stochastic (it calls `train_test_split`, uses `np.random` in `generate_negative_samples`, and initializes plus trains a torch network), so it does not reject `random_state`; instead, `LUNAR.__init__` now declares an explicit `random_state=None` parameter that accepts either `int` or `numpy.random.RandomState` (sklearn convention; both forms go through `sklearn.utils.check_random_state`) and threads through (a) `torch.manual_seed` (and `torch.cuda.manual_seed_all` when CUDA is available) before the network is built, deriving a single int seed by drawing once from `check_random_state(random_state)`, (b) the numpy `RandomState` returned by the same `check_random_state` used as the `random_state` argument to `sklearn.model_selection.train_test_split`, and (c) the same `random_state` argument added to `generate_negative_samples`. After the fix, two `LUNAR(random_state=42)` instances fit on the same X produce identical `labels_` and `decision_scores_` (within 1e-6). Soft API removal: the accidental arbitrary-`**kwargs` surface added in b8f6c81 is gone. Code that relied on it -- for example `ABOD(some_unknown_kwarg=value)` -- now fails fast at the constructor call instead of at the `NearestNeighbors` constructor inside fit. The six named forwarding parameters still work; this is the only meaningful behavior change. (2) Closes #686 (`ADEngine.investigate` was non-deterministic on byte-identical input because no public API pinned `random_state`). The fix adds `random_state: int | None = None` to `ADEngine.__init__`; the engine stores the seed and passes it through `ADEngine.build_detector` -> `build_detector_from_plan(plan, kb, random_state=...)`. 
The factory then injects `random_state` into `plan['params']` only for detector classes whose `__init__` declares an explicit `random_state` parameter (verified via `inspect.signature`); detectors that do not declare it -- ABOD, KNN, SOD, and other deterministic classes -- are instantiated unchanged, so the v3.5.1 call shape for those classes is preserved bit-for-bit. A caller-supplied `plan['params']['random_state']` wins over the engine default to preserve explicit caller intent. The factory does `dict(plan.get('params', {}))` before injecting so the caller's plan is not mutated. `build_from_preset(...)` was likewise updated to forward the engine seed: `EmbeddingOD` presets `for_text` / `for_image` (called via `build_detector_from_plan` when `plan.get('preset')` is set) now receive `random_state` as a kwarg, `EmbeddingOD.__init__` accepts and stores it, and `resolve_detector(detector, contamination, random_state=...)` injects the seed into the inner shortcut detector (`'LUNAR'`, `'KNN'`, ...) when that detector class declares `random_state`. `EmbeddingOD._preprocess_fit` also passes `random_state=self.random_state` to the optional ``PCA(n_components=self.reduce_dim, ...)`` dimensionality-reducer so a preset plan with `reduce_dim` set is fully deterministic (PCA can otherwise pick a randomized SVD solver under `svd_solver='auto'` on high-dimensional embeddings, which would have left a stochastic preprocessing step before the seeded detector). The external encoder's own inference (sentence-transformers, DINOv2) is treated as deterministic given fixed weights and is NOT seeded by `EmbeddingOD.random_state`; the docstring documents this boundary. 
With this, `ADEngine(random_state=42).build_detector({'detector_name': 'EmbeddingOD', 'preset': 'for_text', 'params': {'quality': 'balanced'}})` now produces an `EmbeddingOD(detector='LUNAR', random_state=42)` and the inner `LUNAR` is seeded -- closing the round-2-flagged gap where `EmbeddingOD.for_text()` defaults to LUNAR and silently dropped the engine seed. With `ADEngine(random_state=42)`, repeated `investigate(X)` calls on the same X now produce byte-identical `state.consensus['labels']` and identical `state.analysis['consensus_analysis']['anomaly_ratio']`, and the engine seed propagates end-to-end through `detect()`, `investigate()` -> `run()`, post-recovery reruns, and the `EmbeddingOD` text / image preset path because every path instantiates through `self.build_detector()`. The previously-broken LUNAR direct-plan case is also covered: `ADEngine(random_state=42).run_detection(X, {'detector_name': 'LUNAR', 'params': {...}})` is now bit-stable across reruns. Backward compatibility: `ADEngine()` without a seed retains v3.5.1 behavior (no determinism guarantee). (3) Closes #469 (LODA results are not reproducible because `LODA.__init__` did not accept `random_state` and the inner `np.random.randn` + `np.random.permutation` calls fell back to numpy's module-level state). The fix adds `random_state: int | None = None` to `LODA.__init__`, threads it through `sklearn.utils.check_random_state`, and replaces the two `np.random.*` call sites with `rng.randn(...)` and `rng.permutation(...)` so two `LODA(random_state=42)` fits on the same X produce bit-identical `decision_scores_`. Because LODA now declares `random_state` in its signature, `ADEngine(random_state=42)` propagates the engine seed to LODA plans through the same `_accepts_random_state` factory path used for IForest / LUNAR. 
Tests: 31 new across `test_ad_engine.py::TestRandomStateDeterminism` (4 -- determinism + cross-seed + default + LUNAR-plan determinism), `test_ad_engine.py::TestRandomStateFactory` (11 -- IForest seed injection, plan-level override wins, KNN/ABOD/SOD not given a seed, plan dict not mutated, no-seed default unchanged, plus 3 preset-path tests for `EmbeddingOD.for_text` seed propagation, plan-level wins, and no-seed default, plus a monkeypatch test asserting `EmbeddingOD._preprocess_fit` constructs `PCA(random_state=...)` with the engine seed), `test_abod.py::TestABODKwargsRejection` (3 -- tightened to assert `ABOD` in the error message and `NearestNeighbors` not in it), `test_knn.py::TestKNNKwargsRejection` (3 -- same tightening for `KNN`), `test_lunar.py::TestLUNARKwargsAndRandomState` (4 -- unknown kwarg rejection with tightened message check + default construction + same-seed determinism + `RandomState` object input accepted), `test_sod.py::TestSODKwargsRejection` (3 -- same tightening for `SOD`), `test_loda.py::TestLODARandomState` (3 -- same-seed determinism + cross-seed differ + no-seed unchanged). Related progress on #599 (sklearn-style `random_state` across pyod): `ADEngine.__init__`, `LUNAR.__init__`, `LODA.__init__`, and `EmbeddingOD.__init__` now accept `random_state`; ABOD / KNN / SOD reject unknown kwargs cleanly at construction. Other detectors with internal stochasticity (e.g., deep-learning models that depend on torch state, `IForest` which already had `random_state`) are not in scope for v3.5.2 and remain follow-up work tracked under #599.
+v<3.5.3>, <05/19/2026> -- KB-tools API for agent-driven and LLM-API-driven routing. Surface 1 (agent tools): `ADEngine.get_kb_for_routing(profile, top_k=3, constraints=None)` returns a structured KB snapshot (every shipped detector with strengths, weaknesses, best_for, avoid_when, complexity, benchmark_rank, modality_match) filtered by `constraints.exclude_detectors` and `constraints.data_type_strict` (default True), sorted by benchmark rank for the profile modality. `ADEngine.make_plan(detector_choices, justifications=None, params=None)` validates the caller-chosen ordered detector list against the KB (case-sensitive; unknown / non-shipped names raise `ValueError`), overlays per-detector params with engine contamination resolution, and returns a closed-schema `DetectionPlan` consumable by `build_detector` / `run`. The pair lets agent runtimes (Claude Code, Codex CLI, MCP tool clients) reason over the KB directly and commit a routing decision without going through hand-coded rules. Surface 2 (programmatic API): `ADEngine.plan_detection(profile, llm_client=callable, top_k=3)` accepts a user-supplied `(prompt: str) -> str` callable wrapping any LLM SDK (Anthropic, OpenAI, vLLM, self-hosted). When `llm_client` is set, the engine builds the routing prompt internally via `pyod.utils._llm.build_routing_prompt`, invokes the callable, parses the response via `pyod.utils._llm.parse_routing_response`, and returns the same `DetectionPlan` shape. On LLM call failure or response parse failure, falls back to rule-driven routing with a `RuntimeWarning`; set `PYOD3_LLM_STRICT=1` to re-raise instead. `LLMCallable` is a Protocol -- PyOD ships no provider-specific adapter classes; users wrap their own SDK. The parser tolerates surrounding prose and markdown fences, skips unknown detector names with a logged warning, dedupes, and truncates to `top_k`; raises `RoutingParseError` if no JSON array is extractable or no valid detector survives KB validation. 
`top_k` generalization: `ADEngine.plan_detection(..., top_k=3)` exposes the previously hard-coded `valid[1:3]` alternatives slice as a parameter. Default 3 preserves v3.5.2 behavior; values < 1 are clamped to 1. Tests: 44 new in `test_kb_router_surface1.py` covering schema, filters, ordering, KB validation, top_k clamping, stub LLM client canned plan, top_k truncation of LLM response, malformed response fallback, `PYOD3_LLM_STRICT=1` re-raise, prose tolerance, markdown-fence tolerance, dedupe, and bare-string entries. All 205 existing ADEngine tests continue to pass. Backward compatibility: every v3.5.2 caller pattern (`plan_detection(profile)`, `plan_detection(profile, priority=...)`, `plan_detection(profile, constraints=...)`) produces identical output. The new `top_k=3` and `llm_client=None` parameters are keyword-only with backward-compatible defaults. Out of scope: `routing_rules.json` rule authoring (rules remain the offline fallback); LLM-decided `top_k` (caller decides); built-in CLI adapter classes for Codex / Claude Code (users wrap subscriptions themselves); async `llm_client`. No breaking API changes. Round 1 reviewer fixes (Codex via /implement-review auto): (a) High: `_plan_via_llm` now enforces the constrained KB context after parsing -- if the LLM returns a detector excluded by `constraints.exclude_detectors` or filtered by `data_type_strict`, the engine raises `RoutingParseError` and falls back to rule routing with a `RuntimeWarning`. Previously the LLM path validated only against the global KB and could bypass hard `exclude_detectors` constraints. (b) Medium: `get_kb_for_routing` now consults modality-specific benchmark-rank keys instead of `{modality}.title() + '_overall'` only -- `time_series` uses `TSB_AD_overall` / `TSB_AD_overall_iforest`, `graph` uses `BOND_deep` / `BOND_overall`, `text` uses `NLP_ADBench_overall`, `image` uses `MVTec_overall`, all with `ADBench_overall` as the universal fallback. 
Previously non-tabular modalities effectively sorted alphabetically because the legacy key form did not match the KB's actual rank fields. (c) Medium: new per-call kwarg `plan_detection(..., llm_strict: bool | None = None)`. Precedence: explicit `True` re-raises on LLM/parse failure; explicit `False` falls back with `RuntimeWarning`; `None` defers to `PYOD3_LLM_STRICT` env var. The env-only switch was process-global and incorrect for concurrent callers in the same process. Six additional regression tests cover the constraint bypass, modality rank-key ordering for time_series and graph, and the three-way llm_strict precedence (True/False/None). Round 2 reviewer fixes (Codex via /implement-review auto): (d) Med: `plan_detection`'s new `top_k`, `llm_client`, and `llm_strict` parameters are now actually keyword-only via a `*` separator before them in the signature, matching the release notes claim. (e) Med: `get_kb_for_routing` now stamps each returned detector entry with `resolved_rank` and `resolved_rank_key` fields carrying the modality-specific benchmark rank it used for sorting; `build_routing_prompt` reads those fields so the LLM-facing prompt now shows e.g. `rank=10 (TSB_AD_overall)` for time-series detectors instead of the empty `rank=` it previously rendered (because the prompt had hard-coded the legacy `{modality}.title() + '_overall'` key). Three additional regression tests cover (a) the keyword-only signature contract, (b) prompt rank annotation under time-series, and (c) the text-modality fallback path when the KB has no rank data.
diff --git a/pyod/test/test_kb_router_surface1.py b/pyod/test/test_kb_router_surface1.py
new file mode 100644
index 00000000..e8efd48b
--- /dev/null
+++ b/pyod/test/test_kb_router_surface1.py
@@ -0,0 +1,490 @@
+# -*- coding: utf-8 -*-
+"""Tests for pyod 3.5.3 Surface 1 (KB-tools for agent-driven routing).
+
+Covers:
+- ADEngine.get_kb_for_routing(profile, top_k, constraints)
+- ADEngine.make_plan(detector_choices, justifications, params)
+- ADEngine.plan_detection(...): top_k generalization, keyword-only new params
+"""
+
+import os
+import sys
+import unittest
+
+import numpy as np
+
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
+
+from pyod.utils.ad_engine import ADEngine
+
+
+class TestGetKbForRouting(unittest.TestCase):
+    def setUp(self):
+        self.engine = ADEngine(random_state=42)
+        rng = np.random.RandomState(0)
+        self.X = rng.normal(size=(200, 8))
+        self.X[-10:] += 4 * rng.normal(size=(10, 8))
+        self.profile = self.engine.profile_data(self.X, data_type='tabular')
+
+    def test_returns_expected_keys(self):
+        kb = self.engine.get_kb_for_routing(self.profile, top_k=3)
+        for k in ('task_profile', 'available_detectors',
+                  'top_k_requested', 'response_format_hint',
+                  'n_available'):
+            assert k in kb, f"missing key {k}"
+
+    def test_top_k_field_preserved(self):
+        kb = self.engine.get_kb_for_routing(self.profile, top_k=5)
+        assert kb['top_k_requested'] == 5
+        # Non-positive top_k is clamped to 1
+        kb1 = self.engine.get_kb_for_routing(self.profile, top_k=0)
+        assert kb1['top_k_requested'] == 1
+
+    def test_detector_entries_have_kb_fields(self):
+        kb = self.engine.get_kb_for_routing(self.profile)
+        assert kb['n_available'] > 0
+        sample = kb['available_detectors'][0]
+        for k in ('name', 'category', 'complexity_time', 'complexity_space',
+                  'strengths', 'weaknesses', 'best_for', 'avoid_when',
+                  'benchmark_rank', 'modality_match'):
+            assert k in sample, f"missing detector field {k}"
+        assert isinstance(sample['strengths'], list)
+        assert isinstance(sample['weaknesses'], list)
+
+    def test_exclude_constraint(self):
+        kb = self.engine.get_kb_for_routing(
+            self.profile, constraints={'exclude_detectors': ['IForest', 'KNN']})
+        names = [d['name'] for d in kb['available_detectors']]
+        assert 'IForest' not in names
+        assert 'KNN' not in names
+
+    def test_data_type_strict_filter(self):
+        # Default data_type_strict=True drops detectors whose data_types do
+        # not include the profile's modality.
+        kb_strict = self.engine.get_kb_for_routing(self.profile)
+        kb_lax = self.engine.get_kb_for_routing(
+            self.profile, constraints={'data_type_strict': False})
+        assert kb_strict['n_available'] <= kb_lax['n_available']
+
+    def test_pure_function(self):
+        # Top-level profile entries must be unchanged (shallow snapshot only).
+        before = dict(self.profile)
+        self.engine.get_kb_for_routing(self.profile, top_k=3)
+        assert self.profile == before
+
+    def test_bad_profile_raises(self):
+        with self.assertRaises(ValueError):
+            self.engine.get_kb_for_routing("not a dict")
+
+    def test_ranking_order_iforest_first_on_tabular(self):
+        # IForest must appear within the top 5 entries (not strictly first).
+        kb = self.engine.get_kb_for_routing(self.profile)
+        names = [d['name'] for d in kb['available_detectors'][:5]]
+        assert 'IForest' in names
+
+    def test_ranking_uses_modality_specific_keys_time_series(self):
+        """Med-2 (Codex Round 1): time_series should sort by TSB_AD_overall.
+
+        Asserts the modality-specific rank key is consulted by checking
+        that the available_detectors list is sorted by TSB_AD_overall
+        (ascending) for any pair of detectors that both declare it.
+        """
+        ts_profile = dict(self.profile)
+        ts_profile['data_type'] = 'time_series'
+        kb = self.engine.get_kb_for_routing(ts_profile)
+        ts_ranked = [
+            (d['name'], (d.get('benchmark_rank') or {}).get('TSB_AD_overall'))
+            for d in kb['available_detectors']
+            if (d.get('benchmark_rank') or {}).get('TSB_AD_overall') is not None
+        ]
+        # Any pair with TSB_AD_overall must be in non-decreasing rank order.
+        ranks = [r for _, r in ts_ranked]
+        assert ranks == sorted(ranks), (
+            f"time_series ordering ignored TSB_AD_overall: {ts_ranked}")
+
+    def test_ranking_uses_modality_specific_keys_graph(self):
+        """Med-2 (Codex Round 1): graph should sort by BOND_deep / BOND_overall."""
+        g_profile = dict(self.profile)
+        g_profile['data_type'] = 'graph'
+        kb = self.engine.get_kb_for_routing(g_profile)
+        g_ranked = []
+        for d in kb['available_detectors']:
+            br = d.get('benchmark_rank') or {}
+            r = br.get('BOND_deep') or br.get('BOND_overall')
+            if r is not None:
+                g_ranked.append((d['name'], r))
+        ranks = [r for _, r in g_ranked]
+        assert ranks == sorted(ranks), (
+            f"graph ordering ignored BOND keys: {g_ranked}")
+
+
+class TestMakePlan(unittest.TestCase):
+    def setUp(self):
+        self.engine = ADEngine(random_state=42)
+
+    def test_single_detector(self):
+        result = self.engine.make_plan(['IForest'])
+        assert result['detector_name'] == 'IForest'
+        assert result['confidence'] == 0.7
+        assert result['alternatives'] == []
+
+    def test_three_detectors_with_justifications(self):
+        reasons = ['rank', 'proximity', 'parameter-free']
+        result = self.engine.make_plan(['IForest', 'KNN', 'ECOD'],
+                                       justifications=reasons)
+        alts = result['alternatives']
+        assert result['detector_name'] == 'IForest'
+        assert result['reason'] == 'rank'
+        assert len(alts) == 2
+        assert (alts[0]['detector_name'], alts[0]['confidence']) == ('KNN', 0.5)
+        assert alts[1]['detector_name'] == 'ECOD'
+
+    def test_default_justification(self):
+        result = self.engine.make_plan(['IForest', 'KNN'])
+        reasons = (result['reason'], result['alternatives'][0]['reason'])
+        assert all('caller-selected' in text for text in reasons)
+
+    def test_unknown_detector_raises(self):
+        with self.assertRaises(ValueError) as caught:
+            self.engine.make_plan(['DoesNotExist'])
+        message = str(caught.exception)
+        assert 'DoesNotExist' in message and 'case-sensitive' in message
+
+    def test_empty_choices_raises(self):
+        # make_plan([]) is expected to be rejected with ValueError.
+        self.assertRaises(ValueError, self.engine.make_plan, [])
+
+    def test_non_list_raises(self):
+        # A bare string is not accepted in place of a list of names.
+        self.assertRaises(ValueError, self.engine.make_plan, 'IForest')
+
+    def test_params_overlay_keeps_contamination(self):
+        overrides = [{'n_estimators': 50}]
+        result = self.engine.make_plan(['IForest'], params=overrides)
+        merged = result['params']
+        assert merged.get('n_estimators') == 50
+        # The KB default contamination for IForest should be merged in as
+        # well; only its presence is checked, not the exact value.
+        assert 'contamination' in merged
+
+    def test_build_detector_consumes_plan(self):
+        # make_plan output has to stay consumable by build_detector so
+        # existing detector-construction code keeps working.
+        plan = self.engine.make_plan(['IForest'])
+        built = self.engine.build_detector(plan)
+        assert built is not None
+        # The constructed detector must also be fittable.
+        seeded = np.random.RandomState(7)
+        samples = seeded.normal(size=(100, 5))
+        built.fit(samples)
+        assert hasattr(built, 'decision_scores_')
+
+
+class TestPlanDetectionSignature(unittest.TestCase):
+    """Pin the parameter kinds of ``ADEngine.plan_detection``."""
+
+    def test_new_params_are_keyword_only(self):
+        from inspect import Parameter, signature
+        params = signature(ADEngine.plan_detection).parameters
+        # Parameters introduced alongside llm_client are keyword-only.
+        for added in ('top_k', 'llm_client', 'llm_strict'):
+            assert params[added].kind is Parameter.KEYWORD_ONLY, (
+                f"{added!r} should be KEYWORD_ONLY, got {params[added].kind!r}; "
+                "CHANGES.txt advertises keyword-only and callers must "
+                "not bind these positionally.")
+        # The v3.5.2 parameters stay bindable by position or keyword.
+        for legacy in ('profile', 'priority', 'constraints'):
+            legacy_kind = params[legacy].kind
+            assert legacy_kind is Parameter.POSITIONAL_OR_KEYWORD, (
+                f"v3.5.2 param {legacy!r} must stay POSITIONAL_OR_KEYWORD; "
+                f"got {legacy_kind!r}")
+
+
+class TestPlanDetectionTopK(unittest.TestCase):
+    """top_k caps a plan at one primary plus ``top_k - 1`` alternatives."""
+
+    def setUp(self):
+        self.engine = ADEngine(random_state=42)
+        seeded = np.random.RandomState(0)
+        self.X = seeded.normal(size=(300, 8))
+        self.profile = self.engine.profile_data(self.X, data_type='tabular')
+
+    def test_default_top_k_matches_v352_behavior(self):
+        # The v3.5.2 slice valid[1:3] allowed at most two alternatives.
+        alts = self.engine.plan_detection(self.profile).get('alternatives', [])
+        assert len(alts) <= 2
+
+    def test_top_k_1(self):
+        single = self.engine.plan_detection(self.profile, top_k=1)
+        assert single.get('alternatives', []) == []
+
+    def test_top_k_5(self):
+        # A plan holds the primary plus at most top_k - 1 alternatives.
+        wide = self.engine.plan_detection(self.profile, top_k=5)
+        assert len(wide.get('alternatives', [])) <= 4
+
+    def test_top_k_clamped_to_1(self):
+        # Non-positive values are clamped, so -1 and 0 must produce the
+        # same alternatives as top_k=1.
+        alts = {k: self.engine.plan_detection(self.profile, top_k=k)
+                .get('alternatives') for k in (-1, 0, 1)}
+        assert alts[-1] == alts[1]
+        assert alts[0] == alts[1]
+
+
+class TestSurface2LlmClient(unittest.TestCase):
+    """plan_detection(llm_client=...) Surface 2 routing."""
+
+    def setUp(self):
+        import os
+        from unittest import mock
+        self.engine = ADEngine(random_state=42)
+        rng = np.random.RandomState(0)
+        self.X = rng.normal(size=(200, 8))
+        self.profile = self.engine.profile_data(self.X, data_type='tabular')
+        # Snapshot os.environ and restore it after every test, then drop any
+        # ambient PYOD3_LLM_STRICT so the fallback tests are not forced strict.
+        env_guard = mock.patch.dict(os.environ)
+        env_guard.start()
+        self.addCleanup(env_guard.stop)
+        os.environ.pop('PYOD3_LLM_STRICT', None)
+
+    def test_stub_llm_returns_canned_plan(self):
+        def stub_llm(prompt: str) -> str:
+            return ('[{"detector":"IForest","justification":"top rank"},'
+                    '{"detector":"ECOD","justification":"parameter-free"},'
+                    '{"detector":"KNN","justification":"proximity"}]')
+        plan = self.engine.plan_detection(self.profile, llm_client=stub_llm)
+        assert plan['detector_name'] == 'IForest'
+        assert plan.get('note') == 'llm-driven via plan_detection(llm_client=...)'
+        assert plan.get('evidence') == ['llm_routing']
+        alts = [a['detector_name'] for a in plan.get('alternatives', [])]
+        assert alts == ['ECOD', 'KNN']
+
+    def test_top_k_truncates_llm_response(self):
+        def stub_llm(prompt: str) -> str:
+            return ('[{"detector":"IForest","justification":"a"},'
+                    '{"detector":"ECOD","justification":"b"},'
+                    '{"detector":"KNN","justification":"c"},'
+                    '{"detector":"LOF","justification":"d"},'
+                    '{"detector":"HBOS","justification":"e"}]')
+        plan = self.engine.plan_detection(self.profile, llm_client=stub_llm,
+                                          top_k=2)
+        assert plan['detector_name'] == 'IForest'
+        assert len(plan.get('alternatives', [])) == 1
+
+    def test_llm_response_cannot_select_excluded_detector(self):
+        """High-1 (Codex Round 1): LLM output must not bypass exclude_detectors."""
+        import warnings
+
+        def excluded_llm(prompt: str) -> str:
+            return '[{"detector":"IForest","justification":"trying to bypass"}]'
+
+        with warnings.catch_warnings(record=True) as w:
+            warnings.simplefilter("always")
+            plan = self.engine.plan_detection(
+                self.profile,
+                constraints={'exclude_detectors': ['IForest']},
+                llm_client=excluded_llm)
+        # Must have fallen back to rule routing.
+        assert plan['detector_name'] != 'IForest'
+        assert plan.get('evidence') != ['llm_routing']
+        # And a RuntimeWarning must have explained the fallback.
+        assert any(issubclass(x.category, RuntimeWarning) for x in w)
+
+    def test_per_call_llm_strict_true_reraises(self):
+        """Med-3 (Codex Round 1): explicit llm_strict=True re-raises."""
+        from pyod.utils._llm import RoutingParseError
+        def bad_llm(prompt: str) -> str:
+            return "I cannot help with that."
+        with self.assertRaises(RoutingParseError):
+            self.engine.plan_detection(self.profile,
+                                        llm_client=bad_llm,
+                                        llm_strict=True)
+
+    def test_per_call_llm_strict_false_overrides_env(self):
+        """Med-3 (Codex Round 1): explicit llm_strict=False overrides env var."""
+        import os
+        import warnings
+        os.environ['PYOD3_LLM_STRICT'] = '1'  # restored by setUp's cleanup
+        def bad_llm(prompt: str) -> str:
+            return "no json"
+        with warnings.catch_warnings(record=True) as w:
+            warnings.simplefilter("always")
+            plan = self.engine.plan_detection(
+                self.profile, llm_client=bad_llm, llm_strict=False)
+        # Did not raise -- explicit kwarg won over env var.
+        assert plan['detector_name']
+        assert any(issubclass(x.category, RuntimeWarning) for x in w)
+
+    def test_per_call_llm_strict_none_defers_to_env(self):
+        """Med-3 (Codex Round 1): llm_strict=None reads PYOD3_LLM_STRICT."""
+        import os
+        from pyod.utils._llm import RoutingParseError
+        os.environ['PYOD3_LLM_STRICT'] = '1'  # restored by setUp's cleanup
+        def bad_llm(prompt: str) -> str:
+            return "no json"
+        with self.assertRaises(RoutingParseError):
+            self.engine.plan_detection(self.profile,
+                                       llm_client=bad_llm,
+                                       llm_strict=None)
+
+    def test_malformed_response_falls_back_to_rules(self):
+        import warnings
+        def bad_llm(prompt: str) -> str:
+            return "Sorry, I cannot help with that."
+        with warnings.catch_warnings(record=True) as w:
+            warnings.simplefilter("always")
+            plan = self.engine.plan_detection(self.profile,
+                                              llm_client=bad_llm)
+        assert plan['detector_name']  # rule-driven primary
+        assert plan.get('evidence') != ['llm_routing']
+        # A RuntimeWarning should have been emitted
+        runtime_warnings = [x for x in w
+                            if issubclass(x.category, RuntimeWarning)]
+        assert len(runtime_warnings) >= 1
+
+    def test_strict_mode_reraises(self):
+        import os
+        from pyod.utils._llm import RoutingParseError
+        def bad_llm(prompt: str) -> str:
+            return "no JSON here"
+        os.environ['PYOD3_LLM_STRICT'] = '1'  # restored by setUp's cleanup
+        with self.assertRaises(RoutingParseError):
+            self.engine.plan_detection(self.profile, llm_client=bad_llm)
+
+    def test_llm_client_none_preserves_rule_routing(self):
+        # llm_client=None must produce the exact v3.5.2 rule plan.
+        plan_default = self.engine.plan_detection(self.profile)
+        plan_none = self.engine.plan_detection(self.profile, llm_client=None)
+        assert plan_default == plan_none
+
+
+class TestPromptBuilderModalityRank(unittest.TestCase):
+    """The routing prompt and KB snapshot must expose the rank resolved
+    for the profile's modality (``resolved_rank`` / ``resolved_rank_key``).
+    """
+
+    def setUp(self):
+        self.engine = ADEngine(random_state=42)
+        seeded = np.random.RandomState(0)
+        self.X = seeded.normal(size=(200, 8))
+
+    def _profile_for(self, data_type: str) -> dict:
+        # Profile the shared matrix while declaring the requested modality;
+        # per the original note, data_type is simply carried into the profile.
+        return self.engine.profile_data(self.X, data_type=data_type)
+
+    def test_text_modality_prompt_falls_back_when_no_rank_data(self):
+        # Text detectors currently have no benchmark_rank entries in the
+        # KB, so the prompt has to render without a rank annotation (and
+        # without crashing). It must also never contain the legacy
+        # `Text_overall` key name, which was never a real KB field.
+        from pyod.utils._llm import build_routing_prompt
+        profile = self._profile_for('text')
+        snapshot = self.engine.get_kb_for_routing(profile)
+        rendered = build_routing_prompt(snapshot, top_k=3)
+        assert 'TASK PROFILE' in rendered
+        assert 'Text_overall' not in rendered, (
+            "build_routing_prompt should not emit the legacy "
+            "`Text_overall` key form -- it was never a real KB field")
+
+    def test_time_series_modality_prompt_shows_tsb_ad_rank(self):
+        from pyod.utils._llm import build_routing_prompt
+        profile = self._profile_for('time_series')
+        snapshot = self.engine.get_kb_for_routing(profile)
+        # time_series ranks are expected to be labelled TSB_AD_overall.
+        rendered = build_routing_prompt(snapshot, top_k=3)
+        assert 'TSB_AD_overall' in rendered, (
+            "time_series prompt should annotate the rank with TSB_AD_overall")
+
+    def test_kb_entries_stamp_resolved_rank(self):
+        # Contract under test: get_kb_for_routing stamps `resolved_rank`
+        # together with `resolved_rank_key` on detector entries, sparing
+        # downstream tools a second lookup.
+        profile = self._profile_for('time_series')
+        entries = self.engine.get_kb_for_routing(profile)['available_detectors']
+        ranked = [e for e in entries if e.get('resolved_rank') is not None]
+        assert ranked, (
+            "time_series KB should expose resolved_rank on at least "
+            "one entry")
+        for entry in ranked:
+            has_key = entry.get('resolved_rank_key') is not None
+            assert has_key, (
+                f"detector {entry['name']} has resolved_rank but no "
+                "resolved_rank_key -- contract violation")
+
+
+class TestRoutingResponseParser(unittest.TestCase):
+    def setUp(self):
+        self.engine = ADEngine(random_state=42)
+        self.kb = self.engine.kb
+
+    def test_parse_plain_json_array(self):
+        from pyod.utils._llm import parse_routing_response
+        names, reasons = parse_routing_response(
+            '[{"detector":"IForest","justification":"x"}]', self.kb, top_k=3)
+        assert names == ['IForest']
+        assert reasons == ['x']
+
+    def test_parse_tolerates_prose(self):
+        from pyod.utils._llm import parse_routing_response
+        array = ('[{"detector":"IForest","justification":"x"},'
+                 '{"detector":"ECOD","justification":"y"}]')
+        chatty = ('I recommend the following three detectors:\n'
+                  + array + '\nHope this helps.')
+        names, _ = parse_routing_response(chatty, self.kb, top_k=3)
+        assert names == ['IForest', 'ECOD']
+
+    def test_parse_tolerates_markdown_fences(self):
+        from pyod.utils._llm import parse_routing_response
+        # Wrap a valid array in a Markdown code fence.
+        body = '[{"detector":"IForest","justification":"x"}]'
+        fenced = '```json\n' + body + '\n```'
+        names, _ = parse_routing_response(fenced, self.kb, top_k=3)
+        assert names == ['IForest']
+
+    def test_parse_skips_unknown_detectors(self):
+        from pyod.utils._llm import parse_routing_response
+        mixed = ('[{"detector":"BogusDetector","justification":"x"},'
+                 '{"detector":"IForest","justification":"y"}]')
+        names, _ = parse_routing_response(mixed, self.kb, top_k=3)
+        assert names == ['IForest']
+
+    def test_parse_dedupes(self):
+        from pyod.utils._llm import parse_routing_response
+        repeated = ('[{"detector":"IForest","justification":"a"},'
+                    '{"detector":"IForest","justification":"b"}]')
+        names, _ = parse_routing_response(repeated, self.kb, top_k=3)
+        assert names == ['IForest']
+
+    def test_parse_truncates_to_top_k(self):
+        from pyod.utils._llm import parse_routing_response
+        # Four names are offered, but only the first two may be kept.
+        offered = ['IForest', 'ECOD', 'KNN', 'LOF']
+        resp = '[' + ','.join(f'{{"detector":"{n}"}}' for n in offered) + ']'
+        names, _ = parse_routing_response(resp, self.kb, top_k=2)
+        # The response order is preserved in the truncated result.
+        assert names == ['IForest', 'ECOD']
+
+    def test_parse_raises_on_no_array(self):
+        from pyod.utils._llm import RoutingParseError, parse_routing_response
+        self.assertRaises(
+            RoutingParseError, parse_routing_response, "no json here", self.kb)
+
+    def test_parse_raises_on_all_invalid(self):
+        from pyod.utils._llm import RoutingParseError, parse_routing_response
+        bogus = '[{"detector":"BogusOne"},{"detector":"BogusTwo"}]'
+        with self.assertRaises(RoutingParseError):
+            parse_routing_response(bogus, self.kb)
+
+    def test_parse_accepts_bare_string_entries(self):
+        from pyod.utils._llm import parse_routing_response
+        parsed = parse_routing_response('["IForest", "ECOD"]', self.kb, top_k=3)
+        names, reasons = parsed
+        assert names == ['IForest', 'ECOD']
+        assert reasons == ['', '']
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/pyod/utils/_llm.py b/pyod/utils/_llm.py
new file mode 100644
index 00000000..530c011f
--- /dev/null
+++ b/pyod/utils/_llm.py
@@ -0,0 +1,267 @@
+"""LLM-client Protocol, prompt builder, and routing-response parser.
+
+This module powers pyod 3.5.3's :meth:`ADEngine.plan_detection` Surface 2
+extension. When a user passes ``llm_client=callable``, the engine
+invokes :func:`build_routing_prompt` and :func:`parse_routing_response`
+through this module; when ``llm_client=None``, the rules path is
+unchanged.
+
+Public:
+    LLMCallable -- typing.Protocol; any (prompt: str) -> str
+    RoutingParseError -- raised when the parser cannot extract a plan
+    build_routing_prompt(kb_context, top_k) -> str
+    parse_routing_response(response, kb, top_k) -> (list[str], list[str])
+
+No optional dependencies are imported at module load; PyOD does not ship
+any provider-specific adapters.
+"""
+
+from __future__ import annotations
+
+import json
+import logging
+import re
+from typing import Any, Protocol
+
+
+logger = logging.getLogger(__name__)
+
+
+class LLMCallable(Protocol):
+    """Any ``(prompt: str) -> str`` callable.
+
+    Users wrap their preferred LLM SDK; example with the Anthropic SDK:
+
+    .. code-block:: python
+
+        import os
+        from anthropic import Anthropic
+        client = Anthropic(api_key=os.environ['ANTHROPIC_API_KEY'])
+
+        def my_llm(prompt: str) -> str:
+            return client.messages.create(
+                model='claude-opus-4-7',
+                max_tokens=4096,
+                messages=[{'role': 'user', 'content': prompt}],
+            ).content[0].text
+
+        plan = engine.plan_detection(profile, llm_client=my_llm)
+
+    PyOD ships no provider-specific adapter classes; users wrap their
+    own SDK in a ``(prompt) -> str`` callable.
+    """
+
+    def __call__(self, prompt: str) -> str: ...
+
+
+class RoutingParseError(ValueError):
+    """Raised when :func:`parse_routing_response` cannot extract a plan.
+
+    The engine falls back to rule routing unless ``llm_strict=True`` is
+    passed or, when ``llm_strict`` is None, ``PYOD3_LLM_STRICT=1`` is set.
+    """
+
+
+def build_routing_prompt(kb_context: dict, top_k: int = 3) -> str:
+    """Build the LLM routing prompt for a knowledge-base snapshot.
+
+    Parameters
+    ----------
+    kb_context : dict
+        Snapshot as returned by :meth:`ADEngine.get_kb_for_routing`;
+        the ``task_profile`` and ``available_detectors`` keys are read.
+    top_k : int
+        How many detectors the prompt asks the LLM to return. Default 3.
+
+    Returns
+    -------
+    str
+        Self-contained prompt text that requests a JSON array of
+        ``{"detector": ..., "justification": ...}`` objects.
+
+    Notes
+    -----
+    No chain-of-thought scaffolding is included, so one template can be
+    reused across different LLMs (Claude, GPT, Gemini, open-weight).
+    """
+    profile = kb_context.get("task_profile", {})
+    detectors = kb_context.get("available_detectors", [])
+    data_type = profile.get('data_type', 'tabular')
+    legacy_key = f"{str(data_type).title()}_overall"
+
+    def describe(entry: dict) -> str:
+        # Render one detector as a single prompt line.
+        rank = entry.get("resolved_rank")
+        rank_key = entry.get("resolved_rank_key")
+        if rank is None:
+            # No rank stamped by get_kb_for_routing: fall back to the raw
+            # benchmark keys and leave the rank unlabelled.
+            bench = entry.get("benchmark_rank") or {}
+            rank = bench.get(legacy_key) or bench.get("ADBench_overall")
+            rank_key = None
+        if rank is None:
+            rank_str = ""
+        elif rank_key:
+            rank_str = f" rank={rank} ({rank_key})"
+        else:
+            rank_str = f" rank={rank}"
+        strengths = "; ".join((entry.get("strengths") or [])[:2])
+        weaknesses = "; ".join((entry.get("weaknesses") or [])[:2])
+        best_for = entry.get("best_for") or ""
+        avoid_when = entry.get("avoid_when") or ""
+        return (
+            f"- {entry['name']} ({entry.get('category', 'unknown')}{rank_str}): "
+            f"best_for={best_for!r}; avoid_when={avoid_when!r}; "
+            f"strengths=[{strengths}]; weaknesses=[{weaknesses}]")
+
+    listing = "\n".join(describe(d) for d in detectors)
+    profile_str = ", ".join([
+        f"data_type={data_type}",
+        f"n_samples={profile.get('n_samples', '?')}",
+        f"n_features={profile.get('n_features', '?')}",
+        f"contamination_estimate={profile.get('contamination_estimate', '?')}",
+    ])
+    return (
+        "You are an anomaly-detection routing expert. Given the task "
+        "profile and a list of available detectors annotated with "
+        "strengths, weaknesses, best_for, avoid_when, and benchmark "
+        "rank, choose the ordered top-K detectors most likely to "
+        "succeed on this task.\n\n"
+        f"TASK PROFILE: {profile_str}\n\n"
+        f"AVAILABLE DETECTORS ({len(detectors)}):\n{listing}\n\n"
+        f"Return exactly {top_k} detectors as a JSON array of objects, "
+        'each shaped {"detector": "<name>", "justification": "<one '
+        'sentence>"}. Detector names are case-sensitive and must come '
+        "from the list above. Return ONLY the JSON array (no prose, "
+        "no markdown fences).\n"
+    )
+
+
+# ``raw_decode`` parses one JSON value starting at a given offset and
+# ignores whatever text follows it, so one decoder serves every probe.
+_JSON_DECODER = json.JSONDecoder()
+
+
+def _extract_first_array(response: str) -> str | None:
+    """Return the first ``[...]`` substring that parses as a JSON list.
+
+    Tolerates surrounding prose, markdown fences, or repeated arrays.
+    Bracket characters in the prose or inside JSON strings (for example
+    a justification such as ``"fast [robust]"`` followed by a ``[1]``
+    footnote) do not hide the array: each ``[`` is probed in turn and
+    the first one that starts a complete JSON value wins.
+
+    Returns
+    -------
+    str or None
+        The exact JSON text of the first decodable array (after fence
+        markers are removed), or ``None`` if no ``[`` starts one.
+    """
+    # Strip ``` / ```json fence markers (and the whitespace after them).
+    text = re.sub(r"```(?:json)?\s*", "", response)
+    text = re.sub(r"```", "", text)
+
+    # Try every '[' from left to right; the first offset at which a
+    # complete JSON value decodes is the array we want. A value starting
+    # with '[' is always a list, so no further type check is needed.
+    start = text.find("[")
+    while start != -1:
+        try:
+            _, end = _JSON_DECODER.raw_decode(text, start)
+        except json.JSONDecodeError:
+            start = text.find("[", start + 1)
+        else:
+            return text[start:end]
+    return None
+
+
+def parse_routing_response(response: str, kb: Any,
+                           top_k: int = 3) -> tuple[list[str], list[str]]:
+    """Turn raw LLM routing output into ``(detector_choices, justifications)``.
+
+    Parameters
+    ----------
+    response : str
+        Raw LLM text. Ideally a JSON array of
+        ``{"detector": str, "justification": str}`` objects; prose and
+        markdown fences around the array are tolerated. Bare string
+        entries and the ``name`` / ``reason`` key aliases are accepted.
+    kb : pyod.utils.knowledge.KnowledgeBase
+        Validates detector names via ``get_algorithm``. Unknown or
+        non-shipped names are skipped with a logged warning.
+    top_k : int
+        Keep at most this many detectors. Default 3.
+
+    Returns
+    -------
+    detector_choices : list[str]
+        Validated names in response order, de-duplicated (length >= 1).
+    justifications : list[str]
+        Parallel list; ``""`` where no string justification was given.
+
+    Raises
+    ------
+    RoutingParseError
+        If ``response`` is not a string, no JSON array can be extracted,
+        or no entry survives validation against ``kb``.
+    """
+    if not isinstance(response, str):
+        raise RoutingParseError(
+            f"response must be a string; got {type(response).__name__}")
+
+    candidate = _extract_first_array(response)
+    if candidate is None:
+        head = response[:120]
+        raise RoutingParseError(
+            f"no JSON array found in LLM response (response head: {head!r})")
+
+    try:
+        data = json.loads(candidate)
+    except json.JSONDecodeError as ex:
+        raise RoutingParseError(f"JSON parse error: {ex}") from ex
+    if not isinstance(data, list):
+        raise RoutingParseError(
+            f"expected JSON array; got {type(data).__name__}")
+
+    def split_entry(entry):
+        # Normalise one element to (name, justification); name None = skip.
+        if isinstance(entry, str):
+            return entry, ""
+        if isinstance(entry, dict):
+            return (entry.get("detector") or entry.get("name") or "",
+                    entry.get("justification") or entry.get("reason") or "")
+        return None, ""
+
+    detector_choices: list[str] = []
+    justifications: list[str] = []
+    accepted: set[str] = set()
+    for raw in data:
+        name, just = split_entry(raw)
+        if not isinstance(name, str) or not name:
+            continue
+        # Repeats are ignored so duplicates cannot pad the top_k slots.
+        if name in accepted:
+            continue
+        algo = kb.get_algorithm(name)
+        if algo is None:
+            logger.warning(
+                "parse_routing_response: skipping unknown detector %r "
+                "(not in KB)", name)
+            continue
+        status = algo.get("status")
+        if status != "shipped":
+            logger.warning(
+                "parse_routing_response: skipping non-shipped detector "
+                "%r (status=%r)", name, status)
+            continue
+        accepted.add(name)
+        detector_choices.append(name)
+        justifications.append(just if isinstance(just, str) else "")
+        if len(detector_choices) >= top_k:
+            break
+
+    if not detector_choices:
+        raise RoutingParseError(
+            "no valid detector names in LLM response after KB "
+            f"validation (raw array: {candidate[:200]!r})")
+    return detector_choices, justifications
diff --git a/pyod/utils/ad_engine.py b/pyod/utils/ad_engine.py
index 5aeb4e00..b4492b4e 100644
--- a/pyod/utils/ad_engine.py
+++ b/pyod/utils/ad_engine.py
@@ -188,7 +188,10 @@ def _with_contamination(self, detector_name: str,
         return out
 
     def plan_detection(self, profile: dict, priority: str = 'balanced',
-                       constraints: dict | None = None) -> dict:
+                       constraints: dict | None = None, *,
+                       top_k: int = 3,
+                       llm_client=None,
+                       llm_strict: bool | None = None) -> dict:
         """Plan a detection pipeline.
 
         Parameters
@@ -199,12 +202,56 @@ def plan_detection(self, profile: dict, priority: str = 'balanced',
             'speed', 'accuracy', or 'balanced'.
         constraints : dict or None
             Optional: {'exclude_detectors': [...]}
+        top_k : int, default 3
+            Number of detectors in the returned plan (primary + ``top_k - 1``
+            alternatives). Default ``3`` preserves the v3.5.2 behaviour
+            (``valid[1:3]`` produced two alternatives plus the primary).
+            Values < 1 are clamped to 1.
+        llm_client : callable or None, default None
+            Optional ``(prompt: str) -> str`` callable (see
+            :class:`pyod.utils._llm.LLMCallable`). When provided, routing
+            consults the LLM with the KB context and parses its response
+            into a plan via :func:`pyod.utils._llm.parse_routing_response`.
+            If the LLM call or parser raises, falls back to rule routing
+            with a :class:`RuntimeWarning` (see ``llm_strict``). When
+            ``None`` (default), v3.5.2 rule routing is unchanged.
+        llm_strict : bool or None, default None
+            Per-call control for LLM-routing failure mode. ``True``
+            re-raises any exception from ``llm_client`` or the response
+            parser; ``False`` falls back to rule routing with a
+            :class:`RuntimeWarning`; ``None`` defers to the
+            ``PYOD3_LLM_STRICT`` environment variable
+            (``"1"`` re-raises, anything else falls back). The explicit
+            kwarg takes precedence so concurrent callers in the same
+            process can choose independently.
 
         Returns
         -------
         plan : dict (DetectionPlan, closed schema)
         """
         constraints = constraints or {}
+        top_k = max(1, int(top_k))
+
+        if llm_client is not None:
+            try:
+                return self._plan_via_llm(profile, top_k, llm_client,
+                                          constraints)
+            except Exception as ex:  # noqa: BLE001
+                if llm_strict is None:
+                    import os
+                    strict = os.environ.get('PYOD3_LLM_STRICT') == '1'
+                else:
+                    strict = bool(llm_strict)
+                if strict:
+                    raise
+                import warnings
+                warnings.warn(
+                    f"plan_detection: llm_client routing failed "
+                    f"({type(ex).__name__}: {ex}); falling back to "
+                    "rule routing. Pass llm_strict=True (or set "
+                    "PYOD3_LLM_STRICT=1) to re-raise.",
+                    RuntimeWarning, stacklevel=2)
+
         exclude = set(constraints.get('exclude_detectors', []))
 
         matched = evaluate_rules(profile, priority, self.kb)
@@ -260,7 +307,7 @@ def plan_detection(self, profile: dict, priority: str = 'balanced',
             reason=r.get('_reason', ''),
             evidence=r.get('_evidence', []),
             confidence=r.get('confidence', 0.5),
-            alternatives=[]) for r in valid[1:3]]
+            alternatives=[]) for r in valid[1:top_k]]
 
         return make_plan(
             detector_name=best['detector'],
@@ -272,6 +319,285 @@ def plan_detection(self, profile: dict, priority: str = 'balanced',
             confidence=best.get('confidence', 0.7),
             alternatives=alternatives)
 
+    # ------------------------------------------------------------------
+    # Surface 1: KB exposure for caller-driven (agent / LLM) routing
+    # ------------------------------------------------------------------
+
+    def get_kb_for_routing(self, profile: dict, top_k: int = 3,
+                           constraints: dict | None = None) -> dict:
+        """Return a structured KB snapshot for caller-driven detector
+        selection.
+
+        This is the agent-facing companion to :meth:`plan_detection`.
+        ``plan_detection`` consumes the KB through hand-coded rules and
+        returns a single plan; ``get_kb_for_routing`` exposes the KB
+        directly so a caller (LLM agent, MCP tool client, ...) can
+        reason over each detector's strengths, weaknesses, complexity,
+        and benchmark rank, then call :meth:`make_plan` to commit a
+        plan.
+
+        Parameters
+        ----------
+        profile : dict
+            Output of :meth:`profile_data`. ``data_type`` sets the modality
+            (``'tabular'`` if absent); non-dict input raises ``ValueError``.
+        top_k : int, default 3
+            The number of detectors the caller intends to select. The KB
+            snapshot itself is returned in full (filtered + sorted); the
+            field is included in the returned dict so the response-format
+            hint can reference it.
+        constraints : dict or None, optional
+            ``{'exclude_detectors': list[str], 'data_type_strict': bool}``.
+            ``exclude_detectors`` is a hard filter. ``data_type_strict``
+            (default ``True``) drops detectors whose KB ``data_types``
+            field does not include ``profile['data_type']``.
+
+        Returns
+        -------
+        dict
+            ``{'task_profile': {...}, 'available_detectors': [...],
+            'top_k_requested': int, 'response_format_hint': str,
+            'n_available': int}``.
+
+        Notes
+        -----
+        Pure function; no LLM calls, no state mutation.
+        """
+        if not isinstance(profile, dict):
+            raise ValueError("profile must be a dict from profile_data()")
+        top_k = max(1, int(top_k))
+        constraints = constraints or {}
+        exclude = set(constraints.get('exclude_detectors') or [])
+        data_type_strict = constraints.get('data_type_strict', True)
+        target_modality = profile.get('data_type', 'tabular')
+
+        catalog = self.list_detectors(data_type=None, status='shipped')
+        available: list[dict] = []
+        for entry in catalog:
+            if not isinstance(entry, dict):
+                # Bare-name entry: wrap it so the field lookups below
+                # do not raise AttributeError on a plain string.
+                entry = {'name': str(entry)}
+            name = entry.get('name')
+            if name in exclude:
+                continue
+            dts = entry.get('data_types') or []
+            modality_match = (target_modality in dts) if dts else True
+            if data_type_strict and not modality_match:
+                continue
+            complexity = entry.get('complexity') or {}
+            available.append({
+                'name': name,
+                'category': entry.get('category', 'unknown'),
+                'complexity_time': complexity.get('time'),
+                'complexity_space': complexity.get('space'),
+                'strengths': entry.get('strengths') or [],
+                'weaknesses': entry.get('weaknesses') or [],
+                'best_for': entry.get('best_for'),
+                'avoid_when': entry.get('avoid_when'),
+                'benchmark_rank': entry.get('benchmark_rank') or {},
+                'modality_match': modality_match,
+            })
+
+        # Modality-aware benchmark-rank keys. Each modality lists its
+        # preferred KB rank fields in priority order; the first non-None
+        # value sets the sort key. `ADBench_overall` is the universal
+        # fallback because the KB ships rank for nearly every tabular
+        # detector there. Detectors missing every key sort last (999).
+        _MODALITY_RANK_KEYS = {
+            'tabular': ['ADBench_overall'],
+            'time_series': ['TSB_AD_overall', 'TSB_AD_overall_iforest',
+                            'ADBench_overall'],
+            'timeseries': ['TSB_AD_overall', 'TSB_AD_overall_iforest',
+                           'ADBench_overall'],
+            'graph': ['BOND_deep', 'BOND_overall', 'ADBench_overall'],
+            'text': ['NLP_ADBench_overall', 'ADBench_overall'],
+            'image': ['MVTec_overall', 'ADBench_overall'],
+            'synthetic': ['ADBench_overall'],
+        }
+        rank_key_candidates = _MODALITY_RANK_KEYS.get(
+            str(target_modality).lower(),
+            [f"{str(target_modality).title()}_overall", 'ADBench_overall'])
+
+        # Stamp the resolved (rank, rank_key) on each entry so downstream
+        # consumers (e.g., build_routing_prompt) can render the modality-
+        # specific rank without re-doing the lookup. None when no rank
+        # field is present in the KB for this detector under this modality.
+        for d in available:
+            br = d.get('benchmark_rank') or {}
+            resolved = None
+            resolved_key = None
+            for k in rank_key_candidates:
+                v = br.get(k)
+                if v is not None:
+                    resolved = v
+                    resolved_key = k
+                    break
+            d['resolved_rank'] = resolved
+            d['resolved_rank_key'] = resolved_key
+
+        # Sort by the stamped rank (so ranking and reporting cannot drift
+        # apart); detectors without any rank go last (999), ties by name.
+        available.sort(key=lambda d: (
+            999 if d['resolved_rank'] is None else d['resolved_rank'],
+            d['name']))
+
+        # Strip non-JSON-safe fields from the profile copy
+        profile_safe = {k: v for k, v in profile.items() if k != 'data'}
+
+        return {
+            'task_profile': profile_safe,
+            'available_detectors': available,
+            'top_k_requested': top_k,
+            'response_format_hint': (
+                "To commit your selection, call ADEngine.make_plan with "
+                "detector_choices=['detName1', ...] (ordered list of "
+                f"top-{top_k} names from available_detectors[*].name; "
+                "case-sensitive) and justifications=['why1', ...] "
+                "(parallel list, one short sentence each)."
+            ),
+            'n_available': len(available),
+        }
+
+    def make_plan(self, detector_choices: list,
+                  justifications: list | None = None,
+                  params: list | None = None) -> dict:
+        """Commit a caller-driven detector plan and return a DetectionPlan.
+
+        Companion to :meth:`get_kb_for_routing`. The caller (LLM agent,
+        rule engine, human script) selects ``len(detector_choices)``
+        detectors and this method validates names against the KB, fills
+        per-detector defaults, and packages the result as a
+        :func:`pyod.utils._kb_router.make_plan`-shaped dict so existing
+        consumers (``build_detector``, ``run``, downstream MCP clients)
+        keep working unchanged.
+
+        Parameters
+        ----------
+        detector_choices : list of str
+            Ordered list of detector class names. ``detector_choices[0]``
+            is the primary; the rest become ``alternatives`` in plan
+            order. Length must be >= 1. Names must match KB entries
+            (case-sensitive) with ``status='shipped'``; otherwise
+            ``ValueError`` is raised.
+        justifications : list of str, optional
+            Parallel to ``detector_choices``. One short sentence per
+            choice. ``None`` is accepted and yields autogenerated
+            reasons.
+        params : list of dict, optional
+            Parallel to ``detector_choices``. Per-detector constructor
+            kwargs. ``None`` -> KB defaults overlaid with the
+            engine's contamination resolution.
+
+        Returns
+        -------
+        dict
+            Closed-schema DetectionPlan: ``{'detector_name',
+            'params', 'reason', 'evidence', 'confidence',
+            'alternatives', 'note'}``.
+
+        Raises
+        ------
+        ValueError
+            If ``detector_choices`` is not a non-empty list, or any name
+            is unknown / not ``status='shipped'`` in the KB.
+        """
+        if not isinstance(detector_choices, list):
+            raise ValueError(
+                "detector_choices must be a list of strings; "
+                f"got {type(detector_choices).__name__}")
+        if not detector_choices:
+            raise ValueError(
+                "detector_choices must be non-empty; got an empty list")
+
+        justifications = list(justifications or [])
+        params_list = list(params or [])
+        # Pad the parallel lists so indexing by choice position is safe.
+        n_choices = len(detector_choices)
+        justifications += [''] * (n_choices - len(justifications))
+        params_list += [{} for _ in range(n_choices - len(params_list))]
+
+        unknown = []
+        not_shipped = []
+        for name in detector_choices:
+            algo = self.kb.get_algorithm(name)
+            if algo is None:
+                unknown.append(name)
+                continue
+            if algo.get('status') != 'shipped':
+                not_shipped.append(name)
+        if unknown:
+            raise ValueError(
+                "Unknown detector name(s) (case-sensitive). Names must "
+                "match KB entries from ADEngine.list_detectors(): "
+                f"{unknown!r}")
+        if not_shipped:
+            raise ValueError(
+                f"Detector(s) not shipped (cannot be built): {not_shipped!r}")
+
+        primary = detector_choices[0]
+        primary_params = self._with_contamination(
+            primary, params_list[0] or {})
+        alternatives = []
+        for i, det in enumerate(detector_choices[1:], start=1):
+            alt_params = self._with_contamination(det, params_list[i] or {})
+            alt_reason = (justifications[i] or
+                          'caller-selected via make_plan')
+            alternatives.append(make_plan(
+                detector_name=det,
+                params=alt_params,
+                reason=alt_reason,
+                evidence=['caller_selection'],
+                confidence=0.5,
+                alternatives=[]))
+
+        primary_reason = (justifications[0] or
+                          'caller-selected via make_plan')
+        return make_plan(
+            detector_name=primary,
+            params=primary_params,
+            reason=primary_reason,
+            evidence=['caller_selection'],
+            confidence=0.7,
+            alternatives=alternatives,
+            note='caller-driven via make_plan')
+
+    def _plan_via_llm(self, profile: dict, top_k: int, llm_client,
+                      constraints: dict | None = None) -> dict:
+        """Route via an LLM client (internal; see plan_detection).
+
+        Raises ``RoutingParseError`` for picks outside the allowed set.
+        """
+        from ._llm import (
+            RoutingParseError,
+            build_routing_prompt,
+            parse_routing_response,
+        )
+        constraints = constraints or {}
+        kb_context = self.get_kb_for_routing(
+            profile, top_k=top_k, constraints=constraints)
+        prompt = build_routing_prompt(kb_context, top_k=top_k)
+        response = llm_client(prompt)
+        detector_choices, justifications = parse_routing_response(
+            response, self.kb, top_k=top_k)
+        # LLM output is untrusted and the parser only checks the global KB:
+        # re-apply exclude_detectors / data_type_strict before planning.
+        allowed = {d['name'] for d in kb_context.get(
+            'available_detectors', [])}
+        blocked = [name for name in detector_choices if name not in allowed]
+        if blocked:
+            raise RoutingParseError(
+                "LLM selected detector(s) outside the constrained KB "
+                f"context: {blocked!r}. The constrained context "
+                f"excluded {sorted(constraints.get('exclude_detectors') or [])!r}.")
+        plan = self.make_plan(
+            detector_choices=detector_choices,
+            justifications=justifications)
+        # Tag as LLM-sourced (vs. caller-driven / rule-driven plans).
+        plan['note'] = 'llm-driven via plan_detection(llm_client=...)'
+        plan['evidence'] = ['llm_routing']
+        return plan
+
     # ------------------------------------------------------------------
     # Detector construction
     # ------------------------------------------------------------------
diff --git a/pyod/version.py b/pyod/version.py
index 3a0dbdae..b40277e9 100644
--- a/pyod/version.py
+++ b/pyod/version.py
@@ -1,23 +1,23 @@
-"""
-``pyod`` is a python toolbox for scalable outlier detection
-"""
-# Based on NiLearn package
-# License: simplified BSD
-
-# PEP0440 compatible formatted version, see:
-# https://www.python.org/dev/peps/pep-0440/
-#
-# Generic release markers:
-# X.Y
-# X.Y.Z # For bug fix releases
-#
-# Admissible pre-release markers:
-# X.YaN # Alpha release
-# X.YbN # Beta release
-# X.YrcN # Release Candidate
-# X.Y # Final release
-#
-# Dev branch marker is: 'X.Y.dev' or 'X.Y.devN' where N is an integer.
-# 'X.Y.dev0' is the canonical version of 'X.Y.dev'
-#
-__version__ = '3.5.2'  # pragma: no cover
+"""
+``pyod`` is a python toolbox for scalable outlier detection
+"""
+# Based on NiLearn package
+# License: simplified BSD
+
+# PEP 440-compatible version string, see:
+# https://peps.python.org/pep-0440/
+#
+# Generic release markers:
+# X.Y
+# X.Y.Z # For bug fix releases
+#
+# Admissible pre-release markers:
+# X.YaN # Alpha release
+# X.YbN # Beta release
+# X.YrcN # Release Candidate
+# X.Y # Final release
+#
+# Dev branch marker is: 'X.Y.dev' or 'X.Y.devN' where N is an integer.
+# 'X.Y.dev0' is the canonical version of 'X.Y.dev'
+#
+__version__ = '3.5.3'  # pragma: no cover