From 7586d1ad2e3b037e0533a4c044d018d4a5fe2889 Mon Sep 17 00:00:00 2001
From: rioloc <criolo@redhat.com>
Date: Wed, 27 May 2026 16:54:44 +0200
Subject: [PATCH] feat: add proposal_evaluation_correctness with judge
 evaluation

Extract CLI operations (run, get_resource, apply, delete) into an
injectable CLIClient interface with KubeCLI implementation backed
by oc/kubectl. ProposalDriver now delegates to KubeCLI instead of
internal subprocess calls, enabling dependency injection for the
upcoming ProposalAmender.

ProposalAmender fetches AnalysisResult, ExecutionResult,
VerificationResult, and EscalationResult CRs via CLIClient and
populates turn_data.proposal_results with structured status data.
It also builds a Markdown workflow summary into turn_data.response.

- Add proposal_results field to TurnData model
- Create ProposalAmender with CLIClient dependency injection
- Integrate ProposalAmender into ProposalDriver (always enabled)
- Fallback to _extract_summary if amender fails

add custom:proposal_evaluation_correctness LLM-as-judge metric

New metric that evaluates agentic remediation workflow quality using
an LLM judge. Scores 0.0-1.0 based on diagnosis quality, action
appropriateness, risk management, and verification thoroughness.

- Add PROPOSAL_EVALUATION_CORRECTNESS_PROMPT template
- Register metric in CustomMetrics.supported_metrics
- Add METRIC_REQUIREMENTS entry (requires response field)
- Add metrics_metadata threshold (0.75) in system.yaml

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 Makefile                                      |   2 +-
 README.md                                     |   2 +
 config/system.yaml                            |   5 +
 docs/EVALUATION_GUIDE.md                      |  42 ++
 .../core/metrics/custom/custom.py             | 111 ++++
 .../core/metrics/custom/prompts.py            |  66 ++-
 .../core/metrics/custom/proposal_eval.py      |  52 +-
 src/lightspeed_evaluation/core/models/data.py |  24 +
 .../core/proposal/__init__.py                 |   5 +
 .../core/proposal/phase.py                    |  50 ++
 .../core/system/validator.py                  |   7 +
 .../pipeline/evaluation/cli.py                | 131 +++++
 .../pipeline/evaluation/driver.py             | 119 ++---
 .../pipeline/evaluation/proposal_amender.py   | 257 +++++++++
 .../fixtures/crashloop-probe-demo.yaml        |  39 ++
 .../scripts/_cleanup_infra-claude-vertex.sh   |  17 +
 .../agentic/scripts/_cleanup_infra-openai.sh  |  15 +
 ...ertex.sh => _setup_infra-claude-vertex.sh} |  33 +-
 ...tures-openai.sh => _setup_infra-openai.sh} |  39 +-
 .../cleanup_crashloop_probe-claude-vertex.sh  |  15 +
 .../scripts/cleanup_crashloop_probe-openai.sh |  15 +
 .../scripts/cleanup_oomkill-claude-vertex.sh  |  15 +
 .../agentic/scripts/cleanup_oomkill-openai.sh |  15 +
 ...cleanup_proposal_fixtures-claude-vertex.sh |  26 -
 .../cleanup_proposal_fixtures-openai.sh       |  22 -
 .../setup_crashloop_probe-claude-vertex.sh    |  17 +
 .../scripts/setup_crashloop_probe-openai.sh   |  17 +
 .../scripts/setup_oomkill-claude-vertex.sh    |  16 +
 .../agentic/scripts/setup_oomkill-openai.sh   |  33 ++
 .../system-config-agents-proposal.yaml        |   5 +-
 .../test_evaluation_data_proposal.yaml        | 182 ++++++-
 tests/integration/test_proposal_evaluation.py |  48 +-
 tests/unit/core/metrics/custom/test_custom.py | 237 +++++++++
 .../core/metrics/custom/test_proposal_eval.py |  20 +-
 tests/unit/pipeline/evaluation/test_driver.py |  78 +++
 .../evaluation/test_proposal_amender.py       | 489 ++++++++++++++++++
 .../evaluation/test_proposal_driver.py        |  26 +-
 37 files changed, 2051 insertions(+), 241 deletions(-)
 create mode 100644 src/lightspeed_evaluation/core/proposal/__init__.py
 create mode 100644 src/lightspeed_evaluation/core/proposal/phase.py
 create mode 100644 src/lightspeed_evaluation/pipeline/evaluation/cli.py
 create mode 100644 src/lightspeed_evaluation/pipeline/evaluation/proposal_amender.py
 create mode 100644 tests/integration/agentic/fixtures/crashloop-probe-demo.yaml
 create mode 100644 tests/integration/agentic/scripts/_cleanup_infra-claude-vertex.sh
 create mode 100644 tests/integration/agentic/scripts/_cleanup_infra-openai.sh
 rename tests/integration/agentic/scripts/{setup_proposal_fixtures-claude-vertex.sh => _setup_infra-claude-vertex.sh} (80%)
 mode change 100755 => 100644
 rename tests/integration/agentic/scripts/{setup_proposal_fixtures-openai.sh => _setup_infra-openai.sh} (67%)
 mode change 100755 => 100644
 create mode 100755 tests/integration/agentic/scripts/cleanup_crashloop_probe-claude-vertex.sh
 create mode 100755 tests/integration/agentic/scripts/cleanup_crashloop_probe-openai.sh
 create mode 100755 tests/integration/agentic/scripts/cleanup_oomkill-claude-vertex.sh
 create mode 100755 tests/integration/agentic/scripts/cleanup_oomkill-openai.sh
 delete mode 100755 tests/integration/agentic/scripts/cleanup_proposal_fixtures-claude-vertex.sh
 delete mode 100755 tests/integration/agentic/scripts/cleanup_proposal_fixtures-openai.sh
 create mode 100755 tests/integration/agentic/scripts/setup_crashloop_probe-claude-vertex.sh
 create mode 100755 tests/integration/agentic/scripts/setup_crashloop_probe-openai.sh
 create mode 100755 tests/integration/agentic/scripts/setup_oomkill-claude-vertex.sh
 create mode 100755 tests/integration/agentic/scripts/setup_oomkill-openai.sh
 create mode 100644 tests/unit/pipeline/evaluation/test_proposal_amender.py

diff --git a/Makefile b/Makefile
index 543d6075..d3a0aad5 100644
--- a/Makefile
+++ b/Makefile
@@ -114,7 +114,7 @@ shellcheck: ## Run shellcheck
 	@mkdir -p .shellcheck-stable
 	@wget -qO- "https://github.com/koalaman/shellcheck/releases/download/stable/shellcheck-stable.linux.$$(uname -m).tar.xz" | tar -xJ -C .shellcheck-stable --strip-components=1
 	@PATH="$$PWD/.shellcheck-stable:$$PATH" shellcheck --version
-	@PATH="$$PWD/.shellcheck-stable:$$PATH" find . -name "*.sh" -type f ! -path "./.venv/*" ! -path "./lsc_agent_eval/.venv/*" ! -path "./.history/*" ! -path "./.git/*" -exec shellcheck {} +
+	@PATH="$$PWD/.shellcheck-stable:$$PATH" find . -name "*.sh" -type f ! -path "./.venv/*" ! -path "./lsc_agent_eval/.venv/*" ! -path "./.history/*" ! -path "./.git/*" -exec shellcheck -e SC1091 {} +
 
 pylint:
 	uv run pylint src
diff --git a/README.md b/README.md
index d659e722..a70c7971 100644
--- a/README.md
+++ b/README.md
@@ -210,6 +210,8 @@ uv run lightspeed-eval --system-config <CONFIG.yaml> --eval-data <EVAL_DATA.yaml
     - [`keywords_eval`](src/lightspeed_evaluation/core/metrics/custom/keywords_eval.py) - Keywords evaluation with alternatives (ALL keywords must match, case insensitive)
   - Tool Evaluation
     - [`tool_eval`](src/lightspeed_evaluation/core/metrics/custom.py) - Validates tool calls, arguments, and optional results with regex pattern matching
+  - Agentic Workflow Evaluation
+    - [`proposal_evaluation_correctness`](src/lightspeed_evaluation/core/metrics/custom/custom.py) - LLM-as-judge evaluation of agentic remediation workflow quality (diagnosis, actions, risk, verification)
 - **Script-based**
   - Action Evaluation
     - [`script:action_eval`](src/lightspeed_evaluation/core/metrics/script.py) - Executes verification scripts to validate actions (e.g., infrastructure changes)
diff --git a/config/system.yaml b/config/system.yaml
index 56e7b3a8..59176a35 100644
--- a/config/system.yaml
+++ b/config/system.yaml
@@ -155,6 +155,11 @@ metrics_metadata:
       ordered: true       # true (default): sequence order matters, false: any order allowed
       full_match: true    # true (default): exact 1:1 match, false: expected tools found in actual (extras allowed)
 
+    "custom:proposal_evaluation_correctness":
+      threshold: 0.75
+      description: "LLM judge of agentic remediation workflow quality (diagnosis, actions, risk, verification)"
+      default: false
+
     # Script-based metrics
     "script:action_eval":
       description: "Script-based evaluation for infrastructure/environment validation"
diff --git a/docs/EVALUATION_GUIDE.md b/docs/EVALUATION_GUIDE.md
index 683dffce..9b293201 100644
--- a/docs/EVALUATION_GUIDE.md
+++ b/docs/EVALUATION_GUIDE.md
@@ -420,6 +420,47 @@ expected_tool_calls:
 
 ---
 
+#### Proposal Evaluation Correctness
+
+**What it measures:** How good is the agentic remediation workflow? Evaluates diagnosis, actions, risk management, and verification.
+
+**Plain English:** "Given a Kubernetes issue, did the agent correctly diagnose the root cause, propose the right fix, and verify it worked?"
+
+**Score Range:** 0.0 to 1.0 (higher is better)
+
+**How it works:** A Judge LLM evaluates the workflow summary (produced by ProposalAmender) across four aspects.
+Diagnosis Quality is the most important criterion and carries the most weight:
+1. **Diagnosis Quality** — Is the root cause correctly identified and specific? Is the reasoning sound and the confidence level appropriate?
+2. **Action Appropriateness** — Are the actions safe and well-scoped?
+3. **Risk Management** — Is the risk assessment correct?
+4. **Verification Thoroughness** — Do the checks confirm the fix?
+
+Only aspects present in the workflow are evaluated. Analysis-only workflows are scored on diagnosis quality alone.
+
+**Example:**
+```yaml
+turns:
+  - turn_id: "fix-oom"
+    proposal_spec:
+      request: "Pod CrashLoopBackOff in namespace production"
+      analysis: {}
+      execution: {}
+      verification: {}
+    turn_metrics:
+      - "custom:proposal_evaluation_correctness"
+      - "custom:proposal_status"
+    expected_proposal_status:
+      phase: "Completed"
+```
+
+**When to use:** Evaluating agentic operator workflows (Proposal CRD lifecycle)
+
+**Threshold:** 0.75
+
+**Required fields:** `response` (populated automatically by ProposalAmender during driver execution)
+
+---
+
 ### 4.3 Script-Based Metrics
 
 #### Action Evaluation
@@ -1739,6 +1780,7 @@ lightspeed-eval --eval-data config/eval_batch2.yaml
 | **custom:answer_correctness** | 0-1 | Matches expected answer | 0.75 | query, response, expected_response |
 | **custom:intent_eval** | 0/1 | Has right intent | 1 | query, response, expected_intent |
 | **custom:tool_eval** | 0/1 | Called correct tools with expected results | 1 | expected_tool_calls, tool_calls |
+| **custom:proposal_evaluation_correctness** | 0-1 | Agentic workflow quality (diagnosis, actions, risk) | 0.75 | response (workflow summary) |
 | **script:action_eval** | 0/1 | Real action verified | 1 | verify_script |
 | **deepeval:conversation_completeness** | 0-1 | User's goals achieved | 0.8 | Full conversation |
 | **deepeval:conversation_relevancy** | 0-1 | Stayed on topic | 0.7 | Full conversation |
diff --git a/src/lightspeed_evaluation/core/metrics/custom/custom.py b/src/lightspeed_evaluation/core/metrics/custom/custom.py
index 68954501..570be3a6 100644
--- a/src/lightspeed_evaluation/core/metrics/custom/custom.py
+++ b/src/lightspeed_evaluation/core/metrics/custom/custom.py
@@ -1,5 +1,6 @@
 """Custom metrics using direct LLM integration."""
 
+import json
 import re
 from typing import TYPE_CHECKING, Any, Optional
 
@@ -9,6 +10,7 @@
 from lightspeed_evaluation.core.metrics.custom.prompts import (
     ANSWER_CORRECTNESS_PROMPT,
     INTENT_EVALUATION_PROMPT,
+    PROPOSAL_EVALUATION_CORRECTNESS_PROMPT,
 )
 from lightspeed_evaluation.core.metrics.custom.proposal_eval import (
     evaluate_proposal_status,
@@ -47,6 +49,9 @@ def __init__(
             "intent_eval": self._evaluate_intent,
             "tool_eval": self._evaluate_tool_calls,
             "proposal_status": evaluate_proposal_status,
+            "proposal_evaluation_correctness": (
+                self._evaluate_proposal_evaluation_correctness
+            ),
         }
 
         print(f"✅ Custom Metrics initialized: {self.llm.model_name}")
@@ -295,3 +300,109 @@ def _evaluate_intent(
             return score, reason
         except LLMError as e:
             return None, f"Intent evaluation failed: {str(e)}"
+
+    def _parse_proposal_eval_response(
+        self, response: str
+    ) -> tuple[Optional[float], str]:
+        """Parse JSON LLM judge response for proposal evaluation.
+
+        Expected JSON schema::
+
+            {
+              "reasoning": "string",
+              "diagnosis": float | null,
+              "execution": float | null,
+              "verification": float | null,
+              "average": float
+            }
+        """
+        try:
+            data = json.loads(response)
+        except json.JSONDecodeError:
+            return None, f"Invalid JSON from LLM: {response[:120]}"
+
+        reasoning: str = data.get("reasoning", "")
+        sub_scores: dict[str, Optional[float]] = {
+            "diagnosis": self._try_parse_float(data.get("diagnosis")),
+            "execution": self._try_parse_float(data.get("execution")),
+            "verification": self._try_parse_float(data.get("verification")),
+        }
+        average: Optional[float] = self._try_parse_float(data.get("average"))
+
+        present = [v for v in sub_scores.values() if v is not None]
+        if average is None and present:
+            average = sum(present) / len(present)
+
+        parts = [
+            f"{dim}={v:.2f}" if v is not None else f"{dim}=N/A"
+            for dim, v in sub_scores.items()
+        ]
+        if average is not None:
+            parts.append(f"avg={average:.2f}")
+        detail = ", ".join(parts)
+        if reasoning:
+            detail = f"{detail} — {reasoning}"
+
+        return average, detail
+
+    @staticmethod
+    def _try_parse_float(value: Any) -> Optional[float]:
+        """Try to parse a float from a value, return None on failure."""
+        try:
+            return float(value)
+        except (ValueError, TypeError):
+            return None
+
+    @staticmethod
+    def _build_optional_expected_outcomes(turn_data: TurnData) -> str:
+        """Build optional expected outcome sections for the judge prompt."""
+        sections: list[str] = []
+        mapping = {
+            "Expected Analysis Outcome": turn_data.expected_analysis_outcome,
+            "Expected Execution Outcome": turn_data.expected_execution_outcome,
+            "Expected Verification Outcome": turn_data.expected_verification_outcome,
+        }
+        for label, value in mapping.items():
+            if value:
+                sections.append(f"\n### {label}\n{value}")
+        return "\n".join(sections)
+
+    def _evaluate_proposal_evaluation_correctness(
+        self,
+        _conv_data: Any,
+        _turn_idx: Optional[int],
+        turn_data: Optional[TurnData],
+        is_conversation: bool,
+    ) -> tuple[Optional[float], str]:
+        """Evaluate agentic remediation workflow quality using LLM judge."""
+        if is_conversation:
+            return None, "Proposal evaluation correctness is a turn-level metric"
+
+        if turn_data is None or not turn_data.response:
+            return None, "TurnData with response is required for proposal evaluation"
+
+        if not turn_data.expected_outcome:
+            return None, "No expected outcome provided for proposal evaluation"
+
+        optional_sections = self._build_optional_expected_outcomes(turn_data)
+
+        prompt = PROPOSAL_EVALUATION_CORRECTNESS_PROMPT.format(
+            request=turn_data.query or "N/A",
+            workflow_summary=turn_data.response,
+            expected_outcome=turn_data.expected_outcome,
+            optional_expected_outcomes=optional_sections,
+        )
+
+        try:
+            llm_response = self._call_llm(prompt)
+            score, reason = self._parse_proposal_eval_response(llm_response)
+
+            if score is None:
+                return (
+                    None,
+                    f"Could not parse score from LLM response: {llm_response[:100]}...",
+                )
+
+            return score, f"Proposal evaluation correctness: {reason}"
+        except LLMError as e:
+            return None, f"Proposal evaluation correctness failed: {str(e)}"
diff --git a/src/lightspeed_evaluation/core/metrics/custom/prompts.py b/src/lightspeed_evaluation/core/metrics/custom/prompts.py
index f04f2bdb..5829e8ab 100644
--- a/src/lightspeed_evaluation/core/metrics/custom/prompts.py
+++ b/src/lightspeed_evaluation/core/metrics/custom/prompts.py
@@ -35,7 +35,7 @@
 
 Examples of intent evaluation:
 - If expected intent is "provide instructions", check if the response is instructional
-- If expected intent is "explain a concept", check if the response is explanatory  
+- If expected intent is "explain a concept", check if the response is explanatory
 - If expected intent is "refuse or decline", check if the response is declining to help
 - If expected intent is "ask for clarification", check if the response is asking questions
 
@@ -44,3 +44,67 @@
 Format your response as:
 Score: [0 or 1]
 Reason: [your detailed explanation]"""
+
+# Proposal Evaluation Correctness Prompt
+PROPOSAL_EVALUATION_CORRECTNESS_PROMPT = """You are a senior Site Reliability Engineer evaluating an automated remediation workflow on an OpenShift/Kubernetes cluster. You must be strict, objective, and critical. Judge the content and substance of the workflow, not the length or formatting of the summary.
+
+## Original Request
+{request}
+
+## Workflow Summary
+{workflow_summary}
+
+## Expected Outcome
+{expected_outcome}
+
+## Additional Expected Outcomes (Optional)
+{optional_expected_outcomes}
+
+If additional expected outcomes are provided above, use them as supplementary reference points to refine your scoring precision. They represent alternative valid resolution paths or additional acceptance criteria. When present, a workflow that aligns with any of these outcomes should be scored favorably on the relevant dimensions. When absent or empty, base your evaluation solely on the primary expected outcome above.
+
+## Evaluation Criteria
+Compare the workflow summary against the expected outcome (and any additional expected outcomes, if provided) on each dimension independently:
+
+1. **Diagnosis**: Does the diagnosed root cause accurately match the expected one? Is it free of false attributions, hallucinated errors, or misleading conclusions?
+2. **Execution**: Do the proposed/executed actions directly address the root cause? Are they safe, well-scoped, and minimal? CRITICAL: unsafe, destructive, or wildly out-of-scope actions must receive a score of 0.2 or lower, regardless of diagnosis accuracy.
+3. **Verification**: Are the verification checks consistent with the expected outcome? Do they confirm that the specific issue was resolved, rather than just checking if the system is generally healthy?
+
+Only score dimensions present in the workflow. If only analysis occurred, score only Diagnosis. If execution occurred without verification, score Diagnosis and Execution only. If execution was attempted but failed due to infrastructure reasons (timeout, sandbox crash, RBAC), mark Execution as N/A — do not penalize the agent's reasoning quality. Mark absent dimensions as N/A.
+
+## Scoring Rubric (apply per dimension)
+- **1.0**: Perfect alignment with the expected outcome for this dimension.
+- **0.7 - 0.9**: Correct direction, but slightly suboptimal, over-scoped, or missing minor details (still safe).
+- **0.4 - 0.6**: Partially correct but with significant gaps, inefficiencies, or poor scoping.
+- **0.1 - 0.3**: Incorrect, does not address the issue, or introduces safety/security risks.
+- **0.0**: Total failure, hallucinated content, or catastrophically unsafe.
+
+## Calibration Examples
+
+### Example A — Score: Diagnosis 0.9, Execution 0.8, Verification 0.8, Average 0.83
+Request: "Pod frontend-abc is in CrashLoopBackOff"
+Expected: "Root cause: OOMKilled due to memory limit of 128Mi. Increase memory limit to 512Mi. Verify pod is Running."
+Workflow: Correctly diagnosed OOMKilled from container lastState. Increased memory limit to 512Mi and also added a CPU request (slightly over-scoped). Verified pod reached Running state.
+Why: Diagnosis was accurate (0.9). Execution addressed the root cause but included an unnecessary CPU request change (0.8). Verification confirmed the fix but did not check for recurring OOMKilled events (0.8).
+
+### Example B — Score: Diagnosis 0.2, Execution 0.1, Verification N/A, Average 0.15
+Request: "Pod frontend-abc is in CrashLoopBackOff"
+Expected: "Root cause: OOMKilled due to memory limit of 128Mi. Increase memory limit to 512Mi."
+Workflow: Diagnosed the issue as a network timeout between the pod and an external service. Proposed restarting the cluster DNS operator.
+Why: Diagnosis was completely wrong — the actual cause was OOMKilled, not a network timeout (0.2). Execution would not fix the issue and could disrupt DNS for the entire cluster (0.1). No verification was performed (N/A).
+
+### Example C — Score: Diagnosis 1.0, Execution N/A, Verification N/A, Average 1.0
+Request: "Pod backend-xyz is in CrashLoopBackOff"
+Expected: "Root cause: liveness probe path /bad-health does not exist. Fix the probe path to /healthz."
+Workflow: Correctly diagnosed the liveness probe misconfiguration. Proposed patching the probe path to /healthz. Execution failed with: "context deadline exceeded" (sandbox pod timeout). No verification was performed.
+Why: Diagnosis was perfect (1.0). The proposed execution was correct and safe, but it failed due to infrastructure timeout — not agent reasoning. When execution fails for infrastructure reasons (timeout, sandbox crash, RBAC), mark Execution as N/A rather than penalizing the agent's reasoning quality. Verification was never reached (N/A).
+
+## Output Format
+Use below json format for your response. Do not add any additional text apart from json output.
+
+{{
+  "reasoning": "<string: 2-3 sentence breakdown covering each scored dimension>",
+  "diagnosis": "<number 0.0-1.0>",
+  "execution": "<number 0.0-1.0 or null if N/A>",
+  "verification": "<number 0.0-1.0 or null if N/A>",
+  "average": "<number: mean of non-null dimensions, e.g. diagnosis=0.9 execution=0.8 verification=null → (0.9+0.8)/2=0.85>"
+}}"""
diff --git a/src/lightspeed_evaluation/core/metrics/custom/proposal_eval.py b/src/lightspeed_evaluation/core/metrics/custom/proposal_eval.py
index bbb3cabe..316a4b32 100644
--- a/src/lightspeed_evaluation/core/metrics/custom/proposal_eval.py
+++ b/src/lightspeed_evaluation/core/metrics/custom/proposal_eval.py
@@ -3,53 +3,7 @@
 from typing import Any, Optional
 
 from lightspeed_evaluation.core.models import TurnData
-
-
-def _derive_phase(
-    conditions: list[dict[str, Any]],
-    proposal_spec: Optional[dict[str, Any]] = None,
-) -> str:
-    """Derive the terminal phase from CRD conditions.
-
-    Args:
-        conditions: List of condition dicts from proposal_status.
-        proposal_spec: Proposal spec to determine the last expected step.
-
-    Returns:
-        Phase string: Completed, Failed, Denied, Escalated, or InProgress.
-    """
-    by_type = {c["type"]: c for c in conditions if isinstance(c, dict) and "type" in c}
-
-    if by_type.get("Denied", {}).get("status") == "True":
-        return "Denied"
-    if by_type.get("Escalated", {}).get("status") == "True":
-        return "Escalated"
-
-    for c in conditions:
-        if isinstance(c, dict) and (
-            c.get("type") in {"Analyzed", "Executed", "Verified"}
-            and c.get("status") == "False"
-            and c.get("reason") != "RetryingExecution"
-        ):
-            return "Failed"
-
-    step_to_condition = {"verification": "Verified", "execution": "Executed"}
-    if proposal_spec:
-        last = next(
-            (cond for step, cond in step_to_condition.items() if step in proposal_spec),
-            "Analyzed",
-        )
-    else:
-        last = "Analyzed"
-        for step in ("Verified", "Executed", "Analyzed"):
-            if by_type.get(step, {}).get("status") == "True":
-                last = step
-                break
-
-    if by_type.get(last, {}).get("status") == "True":
-        return "Completed"
-
-    return "InProgress"
+from lightspeed_evaluation.core.proposal import derive_phase
 
 
 def _check_phase(
@@ -62,7 +16,7 @@ def _check_phase(
     if phase is None:
         return None
 
-    actual = _derive_phase(conditions, proposal_spec)
+    actual = derive_phase(conditions, proposal_spec)
     if actual == phase:
         return True, f"Phase matches: {actual}"
     return False, f"Phase mismatch: expected '{phase}', got '{actual}'"
@@ -78,7 +32,7 @@ def _check_phase_in(
     if phase_in is None:
         return None
 
-    actual = _derive_phase(conditions, proposal_spec)
+    actual = derive_phase(conditions, proposal_spec)
     if actual in phase_in:
         return True, f"Phase '{actual}' in {phase_in}"
     return False, f"Phase '{actual}' not in {phase_in}"
diff --git a/src/lightspeed_evaluation/core/models/data.py b/src/lightspeed_evaluation/core/models/data.py
index ae88af2c..e0c40f9e 100644
--- a/src/lightspeed_evaluation/core/models/data.py
+++ b/src/lightspeed_evaluation/core/models/data.py
@@ -76,6 +76,26 @@ class TurnData(StreamingMetricsMixin):
     expected_intent: Optional[str] = Field(
         default=None, min_length=1, description="Expected intent for intent evaluation"
     )
+    expected_outcome: Optional[str] = Field(
+        default=None,
+        min_length=1,
+        description="Expected outcome for proposal evaluation correctness",
+    )
+    expected_analysis_outcome: Optional[str] = Field(
+        default=None,
+        min_length=1,
+        description="Expected analysis/diagnosis outcome for proposal evaluation",
+    )
+    expected_execution_outcome: Optional[str] = Field(
+        default=None,
+        min_length=1,
+        description="Expected execution/action outcome for proposal evaluation",
+    )
+    expected_verification_outcome: Optional[str] = Field(
+        default=None,
+        min_length=1,
+        description="Expected verification outcome for proposal evaluation",
+    )
     conversation_id: Optional[str] = Field(
         default=None, description="Conversation ID - populated by API if enabled"
     )
@@ -122,6 +142,10 @@ class TurnData(StreamingMetricsMixin):
     proposal_status: Optional[dict[str, Any]] = Field(
         default=None, description="Raw CRD status populated by ProposalDriver"
     )
+    proposal_results: Optional[dict[str, Any]] = Field(
+        default=None,
+        description="Structured results from child Result CRs, populated by ProposalAmender",
+    )
 
     # Set of turn metrics that don't pass the validation to ignore them later
     _invalid_metrics: set[str] = set()
diff --git a/src/lightspeed_evaluation/core/proposal/__init__.py b/src/lightspeed_evaluation/core/proposal/__init__.py
new file mode 100644
index 00000000..5e35bfab
--- /dev/null
+++ b/src/lightspeed_evaluation/core/proposal/__init__.py
@@ -0,0 +1,5 @@
+"""Proposal CRD domain logic."""
+
+from lightspeed_evaluation.core.proposal.phase import derive_phase
+
+__all__ = ["derive_phase"]
diff --git a/src/lightspeed_evaluation/core/proposal/phase.py b/src/lightspeed_evaluation/core/proposal/phase.py
new file mode 100644
index 00000000..e892afd6
--- /dev/null
+++ b/src/lightspeed_evaluation/core/proposal/phase.py
@@ -0,0 +1,50 @@
+"""Derive terminal phase from Proposal CRD conditions."""
+
+from typing import Any, Optional
+
+
+def derive_phase(
+    conditions: list[dict[str, Any]],
+    proposal_spec: Optional[dict[str, Any]] = None,
+) -> str:
+    """Derive the terminal phase from CRD conditions.
+
+    Args:
+        conditions: List of condition dicts from proposal_status.
+        proposal_spec: Proposal spec to determine the last expected step.
+
+    Returns:
+        Phase string: Completed, Failed, Denied, Escalated, or InProgress.
+    """
+    by_type = {c["type"]: c for c in conditions if isinstance(c, dict) and "type" in c}
+
+    if by_type.get("Denied", {}).get("status") == "True":
+        return "Denied"
+    if by_type.get("Escalated", {}).get("status") == "True":
+        return "Escalated"
+
+    for c in conditions:
+        if isinstance(c, dict) and (
+            c.get("type") in {"Analyzed", "Executed", "Verified"}
+            and c.get("status") == "False"
+            and c.get("reason") != "RetryingExecution"
+        ):
+            return "Failed"
+
+    step_to_condition = {"verification": "Verified", "execution": "Executed"}
+    if proposal_spec:
+        last = next(
+            (cond for step, cond in step_to_condition.items() if step in proposal_spec),
+            "Analyzed",
+        )
+    else:
+        last = "Analyzed"
+        for step in ("Verified", "Executed", "Analyzed"):
+            if by_type.get(step, {}).get("status") == "True":
+                last = step
+                break
+
+    if by_type.get(last, {}).get("status") == "True":
+        return "Completed"
+
+    return "InProgress"
diff --git a/src/lightspeed_evaluation/core/system/validator.py b/src/lightspeed_evaluation/core/system/validator.py
index 53036e24..d1f8aa96 100644
--- a/src/lightspeed_evaluation/core/system/validator.py
+++ b/src/lightspeed_evaluation/core/system/validator.py
@@ -62,6 +62,13 @@
         "required_fields": ["expected_proposal_status"],
         "description": "requires 'expected_proposal_status' field",
     },
+    "custom:proposal_evaluation_correctness": {
+        "required_fields": ["response", "expected_outcome"],
+        "description": (
+            "requires 'response' and 'expected_outcome' fields "
+            "(Markdown workflow summary from ProposalAmender)"
+        ),
+    },
     "script:action_eval": {
         "required_fields": ["verify_script"],
         "description": "requires 'verify_script' field",
diff --git a/src/lightspeed_evaluation/pipeline/evaluation/cli.py b/src/lightspeed_evaluation/pipeline/evaluation/cli.py
new file mode 100644
index 00000000..5653258b
--- /dev/null
+++ b/src/lightspeed_evaluation/pipeline/evaluation/cli.py
@@ -0,0 +1,131 @@
+"""CLI client abstraction for Kubernetes cluster operations."""
+
+from __future__ import annotations
+
+import json
+import os
+import subprocess
+from abc import ABC, abstractmethod
+from typing import Any, Optional
+
+
+class CLIClient(ABC):
+    """Interface for CLI operations against a Kubernetes cluster."""
+
+    def __init__(self, timeout: int) -> None:
+        """Initialize with a command timeout in seconds."""
+        self._timeout = timeout
+
+    @abstractmethod
+    def run(
+        self,
+        args: list[str],
+        stdin: Optional[str] = None,
+    ) -> subprocess.CompletedProcess[str]:
+        """Run a CLI command and return the completed process."""
+
+    @abstractmethod
+    def get_resource(
+        self,
+        resource_plural: str,
+        name: str,
+    ) -> tuple[dict[str, Any], Optional[str]]:
+        """Fetch a single resource by name.
+
+        Returns:
+            Tuple of (resource_dict, error_message). On success the dict is
+            the full JSON object; on failure the dict is empty.
+        """
+
+    @abstractmethod
+    def apply(
+        self,
+        manifest: dict[str, Any],
+    ) -> subprocess.CompletedProcess[str]:
+        """Apply a manifest via stdin."""
+
+    @abstractmethod
+    def delete(self, resource_plural: str, name: str) -> None:
+        """Delete a resource by name (idempotent)."""
+
+
+class KubeCLI(CLIClient):
+    """Concrete CLIClient backed by oc or kubectl."""
+
+    def __init__(self, cli_path: str, namespace: str, timeout: int) -> None:
+        """Initialize with a resolved binary path and target namespace."""
+        super().__init__(timeout)
+        self._cli = cli_path
+        self._namespace = namespace
+
+    def run(
+        self,
+        args: list[str],
+        stdin: Optional[str] = None,
+    ) -> subprocess.CompletedProcess[str]:
+        """Run a CLI command and return the completed process."""
+        try:
+            return subprocess.run(
+                [self._cli, *args],
+                input=stdin,
+                text=True,
+                capture_output=True,
+                env=os.environ.copy(),
+                timeout=self._timeout,
+                check=False,
+            )
+        except subprocess.TimeoutExpired:
+            cmd_str = " ".join([self._cli, *args])
+            return subprocess.CompletedProcess(
+                args=[self._cli, *args],
+                returncode=1,
+                stdout="",
+                stderr=f"Command timed out after {self._timeout}s: {cmd_str}",
+            )
+
+    def get_resource(
+        self,
+        resource_plural: str,
+        name: str,
+    ) -> tuple[dict[str, Any], Optional[str]]:
+        """Fetch a single resource by name.
+
+        Returns:
+            Tuple of (resource_dict, error_message). On success the dict is
+            the full JSON object; on failure the dict is empty.
+        """
+        result = self.run(
+            ["get", resource_plural, name, "-n", self._namespace, "-o", "json"]
+        )
+        if result.returncode != 0:
+            return (
+                {},
+                f"Failed to get {resource_plural}/{name}: " f"{result.stderr.strip()}",
+            )
+        try:
+            return json.loads(result.stdout), None
+        except json.JSONDecodeError as exc:
+            return (
+                {},
+                f"Failed to parse JSON for {resource_plural}/{name}: {exc}",
+            )
+
+    def apply(
+        self,
+        manifest: dict[str, Any],
+    ) -> subprocess.CompletedProcess[str]:
+        """Apply a manifest via stdin."""
+        return self.run(["apply", "-f", "-"], stdin=json.dumps(manifest))
+
+    def delete(self, resource_plural: str, name: str) -> None:
+        """Delete a resource by name (idempotent)."""
+        self.run(
+            [
+                "delete",
+                resource_plural,
+                name,
+                "-n",
+                self._namespace,
+                "--ignore-not-found",
+            ]
+        )
diff --git a/src/lightspeed_evaluation/pipeline/evaluation/driver.py b/src/lightspeed_evaluation/pipeline/evaluation/driver.py
index 1c46b947..8432967c 100644
--- a/src/lightspeed_evaluation/pipeline/evaluation/driver.py
+++ b/src/lightspeed_evaluation/pipeline/evaluation/driver.py
@@ -2,9 +2,7 @@
 
 from __future__ import annotations
 
-import json
 import logging
-import os
 import re
 import shutil
 import subprocess
@@ -15,17 +13,19 @@
 from typing import Any, Optional, cast
 
 from lightspeed_evaluation.core.api import APIClient
-from lightspeed_evaluation.core.metrics.custom.proposal_eval import (
-    _derive_phase,
-)
 from lightspeed_evaluation.core.models import (
     APIConfig,
     HttpApiAgentConfig,
     ProposalAgentConfig,
     TurnData,
 )
+from lightspeed_evaluation.core.proposal import derive_phase
 from lightspeed_evaluation.core.system.exceptions import ConfigurationError
 from lightspeed_evaluation.pipeline.evaluation.amender import APIDataAmender
+from lightspeed_evaluation.pipeline.evaluation.cli import KubeCLI
+from lightspeed_evaluation.pipeline.evaluation.proposal_amender import (
+    ProposalAmender,
+)
 
 logger = logging.getLogger(__name__)
 
@@ -140,6 +140,12 @@ def __init__(self, config: dict[str, Any], *, enabled: bool = True) -> None:
         """Initialize the proposal driver."""
         super().__init__(config, enabled=enabled)
         self._cli = self._resolve_cli()
+        self._kube_cli = KubeCLI(
+            cli_path=self._cli,
+            namespace=self._config.namespace,
+            timeout=self._config.cli_timeout,
+        )
+        self._amender = ProposalAmender(self._kube_cli)
 
     def validate_config(self, config: dict[str, Any]) -> ProposalAgentConfig:
         """Validate proposal driver configuration."""
@@ -205,75 +211,64 @@ def execute_turn(
                 None,
             )
 
-        turn_data.response = self._extract_summary(status_dict)
-        turn_data.proposal_status = status_dict
+        self._amend_turn_data(turn_data, status_dict)
         self._cleanup(cr_name)
+        return self._outcome_to_result(outcome, cr_name)
+
+    def _amend_turn_data(
+        self, turn_data: TurnData, status_dict: dict[str, Any]
+    ) -> None:
+        """Amend turn data from proposal status, with fallback on amender failure."""
+        amend_err = self._amender.amend(turn_data, status_dict)
+        if amend_err:
+            logger.warning("ProposalAmender failed: %s", amend_err)
+            if not turn_data.response:
+                turn_data.response = self._extract_summary(status_dict)
+            if not turn_data.proposal_status:
+                turn_data.proposal_status = status_dict
+
+    # Failed/Escalated → error (pipeline marks remaining turns as ERROR,
+    # metrics are NOT evaluated). Denied/Completed → no error (metrics run).
+    _OUTCOME_ERRORS: dict[TerminalOutcome, str] = {
+        TerminalOutcome.FAILED: "Proposal '{cr_name}' execution failed",
+        TerminalOutcome.ESCALATED: (
+            "Proposal '{cr_name}' escalated after verification failure"
+        ),
+    }
 
-        if outcome == TerminalOutcome.COMPLETED:
-            return (None, None)
-        return (
-            f"Proposal '{cr_name}' terminated with outcome: {outcome}",
-            None,
-        )
+    @staticmethod
+    def _outcome_to_result(
+        outcome: Optional[TerminalOutcome], cr_name: str
+    ) -> tuple[Optional[str], None]:
+        """Map a terminal outcome to an (error_message, None) result tuple."""
+        template = ProposalDriver._OUTCOME_ERRORS.get(outcome)  # type: ignore[arg-type]
+        if template:
+            return (template.format(cr_name=cr_name), None)
+        if outcome != TerminalOutcome.COMPLETED:
+            logger.warning(
+                "Proposal '%s' terminated with outcome: %s", cr_name, outcome
+            )
+        return (None, None)
 
     @staticmethod
     def _resolve_cli() -> str:
         """Resolve oc or kubectl binary path."""
         return shutil.which("oc") or shutil.which("kubectl") or ""
 
-    def _run_cli(
-        self,
-        args: list[str],
-        stdin: Optional[str] = None,
-    ) -> subprocess.CompletedProcess[str]:
-        """Run a CLI command and return the result."""
-        return subprocess.run(
-            [self._cli, *args],
-            input=stdin,
-            text=True,
-            capture_output=True,
-            env=os.environ.copy(),
-            timeout=self._config.cli_timeout,
-            check=False,
-        )
-
     def _apply(self, manifest: dict[str, Any]) -> subprocess.CompletedProcess[str]:
         """Apply a CR manifest via stdin."""
-        return self._run_cli(["apply", "-f", "-"], stdin=json.dumps(manifest))
+        return self._kube_cli.apply(manifest)
 
     def _get_status(self, cr_name: str) -> tuple[dict[str, Any], Optional[str]]:
         """Get Proposal CR status."""
-        result = self._run_cli(
-            [
-                "get",
-                CRD_PLURAL,
-                cr_name,
-                "-n",
-                self._config.namespace,
-                "-o",
-                "json",
-            ]
-        )
-        if result.returncode != 0:
-            return {}, f"Failed to get status for '{cr_name}': {result.stderr.strip()}"
-        try:
-            cr = json.loads(result.stdout)
-        except json.JSONDecodeError as exc:
-            return {}, f"Failed to parse status JSON for '{cr_name}': {exc}"
+        cr, err = self._kube_cli.get_resource(CRD_PLURAL, cr_name)
+        if err:
+            return {}, f"Failed to get status for '{cr_name}': {err}"
         return cr.get("status", {}), None
 
     def _delete(self, cr_name: str) -> None:
         """Delete a Proposal CR."""
-        self._run_cli(
-            [
-                "delete",
-                CRD_PLURAL,
-                cr_name,
-                "-n",
-                self._config.namespace,
-                "--ignore-not-found",
-            ]
-        )
+        self._kube_cli.delete(CRD_PLURAL, cr_name)
 
     def _cleanup(self, cr_name: str) -> None:
         """Delete the Proposal CR if cleanup is enabled."""
@@ -385,12 +380,18 @@ def _is_terminal(
         conditions: list[dict[str, Any]], proposal_spec: dict[str, Any]
     ) -> Optional[TerminalOutcome]:
         """Check if conditions indicate a terminal state."""
-        phase = _derive_phase(conditions, proposal_spec or None)
+        phase = derive_phase(conditions, proposal_spec or None)
         return ProposalDriver._PHASE_TO_OUTCOME.get(phase)
 
     @staticmethod
     def _extract_summary(status_dict: dict[str, Any]) -> str:
-        """Extract a human-readable summary from analysis results."""
+        """Extract a human-readable summary from analysis results.
+
+        Degraded fallback: only called when ProposalAmender.amend() fails.
+        Reads only condition messages, ignoring proposal_results and child
+        Result CRs (analysis/execution/verification). The full rich summary
+        is built by ProposalAmender.build_summary() in the happy path.
+        """
         conditions = status_dict.get("conditions", [])
         messages = [c["message"] for c in conditions if c.get("message")]
         return "; ".join(messages) if messages else "No summary available"
diff --git a/src/lightspeed_evaluation/pipeline/evaluation/proposal_amender.py b/src/lightspeed_evaluation/pipeline/evaluation/proposal_amender.py
new file mode 100644
index 00000000..ead81f1a
--- /dev/null
+++ b/src/lightspeed_evaluation/pipeline/evaluation/proposal_amender.py
@@ -0,0 +1,257 @@
+"""ProposalAmender — fetches child Result CRs and enriches TurnData."""
+
+from __future__ import annotations
+
+import logging
+import subprocess
+from typing import Any, Optional
+
+from lightspeed_evaluation.core.models import TurnData
+from lightspeed_evaluation.pipeline.evaluation.cli import CLIClient
+
+logger = logging.getLogger(__name__)
+
+STEP_RESOURCES: dict[str, str] = {
+    "analysis": "analysisresults",
+    "execution": "executionresults",
+    "verification": "verificationresults",
+    "escalation": "escalationresults",
+}
+
+
+class ProposalAmender:
+    """Fetches child Result CRs and enriches TurnData with structured results and summary."""
+
+    def __init__(self, cli_client: CLIClient) -> None:
+        """Initialize with a CLIClient for fetching child CRs."""
+        self._cli = cli_client
+
+    def amend(
+        self, turn_data: TurnData, proposal_status: dict[str, Any]
+    ) -> Optional[str]:
+        """Amend turn_data in-place with proposal results and Markdown summary.
+
+        Returns:
+            Error message on failure, None on success.
+        """
+        try:
+            return self._do_amend(turn_data, proposal_status)
+        except (KeyError, TypeError, ValueError, subprocess.SubprocessError) as exc:
+            return f"ProposalAmender error: {exc}"
+
+    def _do_amend(
+        self, turn_data: TurnData, proposal_status: dict[str, Any]
+    ) -> Optional[str]:
+        """Internal amend logic."""
+        turn_data.proposal_status = proposal_status
+
+        steps = proposal_status.get("steps", {})
+        if not steps:
+            turn_data.proposal_results = {}
+            turn_data.response = self.build_summary(turn_data, {})
+            return None
+
+        results: dict[str, list[dict[str, Any]]] = {}
+        for step_name, resource_plural in STEP_RESOURCES.items():
+            step_data = steps.get(step_name)
+            if step_data is None:
+                continue
+            refs = step_data.get("results", [])
+            step_results: list[dict[str, Any]] = []
+            for ref in refs:
+                ref_name = ref.get("name", "")
+                if not ref_name:
+                    continue
+                cr, err = self._cli.get_resource(resource_plural, ref_name)
+                if err:
+                    logger.warning(
+                        "Failed to fetch %s/%s: %s",
+                        resource_plural,
+                        ref_name,
+                        err,
+                    )
+                    continue
+                status = cr.get("status", {})
+                if status:
+                    step_results.append(status)
+            results[step_name] = step_results
+
+        turn_data.proposal_results = results
+        turn_data.response = self.build_summary(turn_data, results)
+
+        return None
+
+    @staticmethod
+    def build_summary(
+        turn_data: TurnData,
+        results: dict[str, list[dict[str, Any]]],
+    ) -> str:
+        """Build a Markdown workflow summary from structured results."""
+        sections: list[str] = []
+
+        sections.append(f"## Request\n\n{turn_data.query or 'N/A'}")
+
+        analysis_results = results.get("analysis", [])
+        if analysis_results:
+            sections.append(_build_analysis_section(analysis_results))
+
+        execution_results = results.get("execution", [])
+        if execution_results:
+            sections.append(_build_execution_section(execution_results))
+
+        verification_results = results.get("verification", [])
+        if verification_results:
+            sections.append(_build_verification_section(verification_results))
+
+        escalation_results = results.get("escalation", [])
+        if escalation_results:
+            sections.append(_build_escalation_section(escalation_results))
+
+        sections.append(_build_outcome_section(turn_data))
+
+        return "\n\n".join(sections)
+
+
+def _build_analysis_section(analysis_results: list[dict[str, Any]]) -> str:
+    """Build the Analysis section from AnalysisResult statuses."""
+    lines: list[str] = ["## Analysis"]
+    for result_status in analysis_results:
+        options = result_status.get("options", [])
+        if not options:
+            failure = result_status.get("failureReason", "")
+            if failure:
+                lines.append(f"\n**Failed:** {failure}")
+            continue
+        lines.append(f"\n{len(options)} option(s) proposed")
+        for idx, option in enumerate(options):
+            label = "(Approved)" if idx == 0 else ""
+            title = option.get("title", "Untitled")
+            lines.append(f"\n### Option {idx} {label}: {title}".rstrip())
+            _append_diagnosis(lines, option.get("diagnosis", {}))
+            _append_proposal(lines, option.get("proposal", {}))
+
+    return "\n".join(lines)
+
+
+def _append_diagnosis(lines: list[str], diagnosis: dict[str, Any]) -> None:
+    """Append diagnosis details to output lines."""
+    if not diagnosis:
+        return
+    summary = diagnosis.get("summary", "")
+    confidence = diagnosis.get("confidence", "")
+    root_cause = diagnosis.get("rootCause", "")
+    if summary:
+        lines.append(f"**Diagnosis:** {summary} (Confidence: {confidence})")
+    if root_cause:
+        lines.append(f"**Root Cause:** {root_cause}")
+
+
+def _append_proposal(lines: list[str], proposal: dict[str, Any]) -> None:
+    """Append proposal details to output lines."""
+    if not proposal:
+        return
+    actions = proposal.get("actions", [])
+    if actions:
+        lines.append("**Proposed Actions:**")
+        for i, action in enumerate(actions, 1):
+            a_type = action.get("type", "")
+            a_desc = action.get("description", "")
+            lines.append(f"{i}. [{a_type}] {a_desc}")
+    risk = proposal.get("risk", "")
+    reversible = proposal.get("reversible", "")
+    if risk or reversible:
+        lines.append(f"**Risk:** {risk} | **Reversible:** {reversible}")
+    impact = proposal.get("estimatedImpact", "")
+    if impact:
+        lines.append(f"**Estimated Impact:** {impact}")
+
+
+def _build_execution_section(execution_results: list[dict[str, Any]]) -> str:
+    """Build the Execution section from ExecutionResult statuses."""
+    lines: list[str] = ["## Execution"]
+    for result_status in execution_results:
+        failure = result_status.get("failureReason", "")
+        if failure:
+            lines.append(f"\n**Failed:** {failure}")
+            continue
+        actions = result_status.get("actionsTaken", [])
+        if actions:
+            lines.append("\n**Actions Taken:**")
+            for i, action in enumerate(actions, 1):
+                lines.append(_format_execution_action(i, action))
+        verification = result_status.get("verification", {})
+        if verification:
+            condition_outcome = verification.get("conditionOutcome", "")
+            ver_summary = verification.get("summary", "")
+            lines.append(f"**Post-execution:** {condition_outcome} — {ver_summary}")
+    return "\n".join(lines)
+
+
+def _format_execution_action(index: int, action: dict[str, Any]) -> str:
+    """Format a single execution action line."""
+    a_type = action.get("type", "")
+    a_desc = action.get("description", "")
+    outcome = action.get("outcome", "")
+    line = f"{index}. [{a_type}] {a_desc} → {outcome}"
+    output = action.get("output", "")
+    if output:
+        line += f"\n   Output: {output}"
+    error = action.get("error", "")
+    if error:
+        line += f"\n   Error: {error}"
+    return line
+
+
+def _build_verification_section(
+    verification_results: list[dict[str, Any]],
+) -> str:
+    """Build the Verification section from VerificationResult statuses."""
+    lines: list[str] = ["## Verification"]
+    for result_status in verification_results:
+        failure = result_status.get("failureReason", "")
+        if failure:
+            lines.append(f"\n**Failed:** {failure}")
+            continue
+        checks = result_status.get("checks", [])
+        if checks:
+            lines.append("\n**Checks:**")
+            for check in checks:
+                name = check.get("name", "")
+                source = check.get("source", "")
+                value = check.get("value", "")
+                check_result = check.get("result", "")
+                lines.append(f"- {name} ({source}): {value} → {check_result}")
+        ver_summary = result_status.get("summary", "")
+        if ver_summary:
+            lines.append(f"**Summary:** {ver_summary}")
+    return "\n".join(lines)
+
+
+def _build_escalation_section(
+    escalation_results: list[dict[str, Any]],
+) -> str:
+    """Build the Escalation section from EscalationResult statuses."""
+    lines: list[str] = ["## Escalation"]
+    for result_status in escalation_results:
+        esc_summary = result_status.get("summary", "")
+        if esc_summary:
+            lines.append(f"\n**Summary:** {esc_summary}")
+        content = result_status.get("content", "")
+        if content:
+            lines.append(f"\n{content}")
+        failure = result_status.get("failureReason", "")
+        if failure:
+            lines.append(f"\n**Failed:** {failure}")
+    return "\n".join(lines)
+
+
+def _build_outcome_section(turn_data: TurnData) -> str:
+    """Build the Outcome section from proposal_status conditions."""
+    conditions = []
+    if turn_data.proposal_status:
+        conditions = turn_data.proposal_status.get("conditions", [])
+    if not conditions:
+        return "## Outcome\n\nNo conditions available"
+    messages = [c.get("message", "") for c in conditions if c.get("message")]
+    summary = "; ".join(messages) if messages else "No summary available"
+    return f"## Outcome\n\n{summary}"
diff --git a/tests/integration/agentic/fixtures/crashloop-probe-demo.yaml b/tests/integration/agentic/fixtures/crashloop-probe-demo.yaml
new file mode 100644
index 00000000..0c4622bc
--- /dev/null
+++ b/tests/integration/agentic/fixtures/crashloop-probe-demo.yaml
@@ -0,0 +1,39 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: crashloop-probe-demo
+  namespace: lightspeed-evaluation-test
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: crashloop-probe-demo
+  template:
+    metadata:
+      labels:
+        app: crashloop-probe-demo
+    spec:
+      containers:
+        - name: nginx
+          image: registry.access.redhat.com/ubi9/nginx-124:latest
+          ports:
+            - containerPort: 8080
+          livenessProbe:
+            httpGet:
+              path: /nonexistent-health
+              port: 8080
+            initialDelaySeconds: 5
+            periodSeconds: 3
+            failureThreshold: 2
+          securityContext:
+            allowPrivilegeEscalation: false
+            runAsNonRoot: true
+            capabilities:
+              drop: ["ALL"]
+          resources:
+            requests:
+              memory: "64Mi"
+              cpu: "50m"
+            limits:
+              memory: "128Mi"
+              cpu: "100m"
diff --git a/tests/integration/agentic/scripts/_cleanup_infra-claude-vertex.sh b/tests/integration/agentic/scripts/_cleanup_infra-claude-vertex.sh
new file mode 100644
index 00000000..7edde567
--- /dev/null
+++ b/tests/integration/agentic/scripts/_cleanup_infra-claude-vertex.sh
@@ -0,0 +1,17 @@
+#!/usr/bin/env bash
+# Common infrastructure cleanup for Claude/Vertex AI provider.
+# Sourced by per-scenario cleanup scripts — do NOT run directly.
+#
+# Deletes "eval-" prefixed operator resources (reverse order of creation)
+# and the test namespace.
+
+OPERATOR_NS="${OPERATOR_NS:-openshift-lightspeed}"
+TEST_NS="${TEST_NS:-lightspeed-evaluation-test}"
+
+oc delete proposals --all -n "$TEST_NS" --ignore-not-found
+oc delete proposalapprovals --all -n "$TEST_NS" --ignore-not-found
+oc delete sandboxtemplate eval-lightspeed-agent -n "$OPERATOR_NS" --ignore-not-found
+oc delete agent eval-default --ignore-not-found
+oc delete llmprovider eval-vertex-ai --ignore-not-found
+oc delete secret eval-llm-credentials -n "$OPERATOR_NS" --ignore-not-found
+oc delete namespace "$TEST_NS" --ignore-not-found
diff --git a/tests/integration/agentic/scripts/_cleanup_infra-openai.sh b/tests/integration/agentic/scripts/_cleanup_infra-openai.sh
new file mode 100644
index 00000000..8bc62214
--- /dev/null
+++ b/tests/integration/agentic/scripts/_cleanup_infra-openai.sh
@@ -0,0 +1,15 @@
+#!/usr/bin/env bash
+# Common infrastructure cleanup for OpenAI provider.
+# Sourced by per-scenario cleanup scripts — do NOT run directly.
+#
+# Deletes "eval-" prefixed operator resources (reverse order of creation).
+
+OPERATOR_NS="${OPERATOR_NS:-openshift-lightspeed}"
+TEST_NS="${TEST_NS:-lightspeed-evaluation-test}"
+
+oc delete proposals --all -n "$TEST_NS" --ignore-not-found
+oc delete proposalapprovals --all -n "$TEST_NS" --ignore-not-found
+oc delete sandboxtemplate eval-lightspeed-agent -n "$OPERATOR_NS" --ignore-not-found
+oc delete agent eval-default --ignore-not-found
+oc delete llmprovider eval-openai --ignore-not-found
+oc delete secret eval-llm-credentials -n "$OPERATOR_NS" --ignore-not-found
diff --git a/tests/integration/agentic/scripts/setup_proposal_fixtures-claude-vertex.sh b/tests/integration/agentic/scripts/_setup_infra-claude-vertex.sh
old mode 100755
new mode 100644
similarity index 80%
rename from tests/integration/agentic/scripts/setup_proposal_fixtures-claude-vertex.sh
rename to tests/integration/agentic/scripts/_setup_infra-claude-vertex.sh
index 70689a06..b989898c
--- a/tests/integration/agentic/scripts/setup_proposal_fixtures-claude-vertex.sh
+++ b/tests/integration/agentic/scripts/_setup_infra-claude-vertex.sh
@@ -1,9 +1,9 @@
 #!/usr/bin/env bash
-set -euo pipefail
-
-# Deploy agentic infrastructure + OOMKill test workload for integration tests.
-# All operator-level resources use an "eval-" prefix to avoid conflicts with
-# existing cluster resources.
+# Common infrastructure setup for Claude via Vertex AI.
+# Sourced by per-scenario setup scripts — do NOT run directly.
+#
+# Deploys: namespace, Secret, LLMProvider, Agent, SandboxTemplate.
+# All operator-level resources use an "eval-" prefix.
 #
 # Required env vars:
 #   GCP_CREDENTIALS_FILE          — path to GCP credentials JSON file
@@ -15,8 +15,10 @@ set -euo pipefail
 #   CLOUD_ML_REGION  — default: global
 #   AGENT_MODEL      — default: claude-opus-4-6
 
-OPERATOR_NS="openshift-lightspeed"
-TEST_NS="lightspeed-evaluation-test"
+set -euo pipefail
+
+export OPERATOR_NS="openshift-lightspeed"
+export TEST_NS="lightspeed-evaluation-test"
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 GCP_CREDENTIALS_FILE="${GCP_CREDENTIALS_FILE:-$HOME/.config/gcloud/application_default_credentials.json}"
 CLOUD_ML_REGION="${CLOUD_ML_REGION:-global}"
@@ -34,19 +36,17 @@ if [ ! -f "$GCP_CREDENTIALS_FILE" ]; then
   exit 1
 fi
 
-# 1. Test namespace + OOMKill workload (static fixtures, already test-scoped)
+# 1. Test namespace
 oc apply -f "$SCRIPT_DIR/../fixtures/namespace.yaml"
-oc apply -f "$SCRIPT_DIR/../fixtures/oomkill-demo.yaml"
-
 
-# 2. Secret (GCP credentials) — prefixed name in operator namespace
+# 2. Secret (GCP credentials)
 oc create secret generic eval-llm-credentials \
   --from-file=credentials.json="$GCP_CREDENTIALS_FILE" \
   --from-literal=ANTHROPIC_VERTEX_PROJECT_ID="$ANTHROPIC_VERTEX_PROJECT_ID" \
   --from-literal=CLOUD_ML_REGION="$CLOUD_ML_REGION" \
   -n "$OPERATOR_NS" --dry-run=client -o yaml | oc apply -f -
 
-# 3. LLMProvider — references prefixed secret
+# 3. LLMProvider
 cat <<EOF | oc apply -f -
 apiVersion: agentic.openshift.io/v1alpha1
 kind: LLMProvider
@@ -61,7 +61,7 @@ spec:
     region: $CLOUD_ML_REGION
 EOF
 
-# 4. Agent — references prefixed LLMProvider
+# 4. Agent
 cat <<EOF | oc apply -f -
 apiVersion: agentic.openshift.io/v1alpha1
 kind: Agent
@@ -132,9 +132,4 @@ spec:
         emptyDir: {}
 EOF
 
-# 6. Wait for OOMKill workload to start crashing
-echo "Waiting for oomkill-demo pod to appear..."
-oc wait --for=condition=Available=false deployment/oomkill-demo \
-  -n "$TEST_NS" --timeout=60s 2>/dev/null || true
-sleep 10
-echo "Setup complete."
+echo "Infrastructure setup complete (Claude/Vertex AI)."
diff --git a/tests/integration/agentic/scripts/setup_proposal_fixtures-openai.sh b/tests/integration/agentic/scripts/_setup_infra-openai.sh
old mode 100755
new mode 100644
similarity index 67%
rename from tests/integration/agentic/scripts/setup_proposal_fixtures-openai.sh
rename to tests/integration/agentic/scripts/_setup_infra-openai.sh
index 34110ca1..dac09a7e
--- a/tests/integration/agentic/scripts/setup_proposal_fixtures-openai.sh
+++ b/tests/integration/agentic/scripts/_setup_infra-openai.sh
@@ -1,10 +1,9 @@
 #!/usr/bin/env bash
-set -euo pipefail
-
-# Deploy agentic infrastructure + OOMKill test workload for integration tests
-# using OpenAI as the LLM provider.
-# All operator-level resources use an "eval-" prefix to avoid conflicts with
-# existing cluster resources.
+# Common infrastructure setup for OpenAI provider.
+# Sourced by per-scenario setup scripts — do NOT run directly.
+#
+# Deploys: namespace, Secret, LLMProvider, Agent, SandboxTemplate.
+# All operator-level resources use an "eval-" prefix.
 #
 # Required env vars:
 #   OPENAI_API_KEY   — OpenAI API key
@@ -12,13 +11,11 @@ set -euo pipefail
 #
 # Optional env vars:
 #   AGENT_MODEL      — default: gpt-5.2
-#
-# To test with Claude via Vertex AI instead, use
-# setup_proposal_fixtures-claude-vertex.sh and set the corresponding env vars
-# (GCP_CREDENTIALS_FILE, ANTHROPIC_VERTEX_PROJECT_ID, SANDBOX_IMAGE).
 
-OPERATOR_NS="openshift-lightspeed"
-TEST_NS="lightspeed-evaluation-test"
+set -euo pipefail
+
+export OPERATOR_NS="openshift-lightspeed"
+export TEST_NS="lightspeed-evaluation-test"
 AGENT_MODEL="${AGENT_MODEL:-gpt-5.2}"
 
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
@@ -30,13 +27,10 @@ for var in OPENAI_API_KEY SANDBOX_IMAGE; do
   fi
 done
 
-# 1. Test namespace + OOMKill workload (static fixtures, already test-scoped)
+# 1. Test namespace
 oc apply -f "$SCRIPT_DIR/../fixtures/namespace.yaml"
-oc apply -f "$SCRIPT_DIR/../fixtures/oomkill-demo.yaml"
 
-# 2. Secret (OpenAI API key + provider override) — prefixed name in operator namespace
-# LIGHTSPEED_AGENT_PROVIDER and OPENAI_MODEL are injected via envFrom so the
-# sandbox picks the OpenAI provider instead of defaulting to claude.
+# 2. Secret (OpenAI API key + provider override)
 cat <<EOF | oc apply -n "$OPERATOR_NS" -f -
 apiVersion: v1
 kind: Secret
@@ -49,7 +43,7 @@ stringData:
   OPENAI_MODEL: ${AGENT_MODEL}
 EOF
 
-# 3. LLMProvider — references prefixed secret
+# 3. LLMProvider
 cat <<EOF | oc apply -f -
 apiVersion: agentic.openshift.io/v1alpha1
 kind: LLMProvider
@@ -62,7 +56,7 @@ spec:
       name: eval-llm-credentials
 EOF
 
-# 4. Agent — references prefixed LLMProvider
+# 4. Agent
 cat <<EOF | oc apply -f -
 apiVersion: agentic.openshift.io/v1alpha1
 kind: Agent
@@ -133,9 +127,4 @@ spec:
         emptyDir: {}
 EOF
 
-# 6. Wait for OOMKill workload to start crashing
-echo "Waiting for oomkill-demo pod to appear..."
-oc wait --for=condition=Available=false deployment/oomkill-demo \
-  -n "$TEST_NS" --timeout=60s 2>/dev/null || true
-sleep 10
-echo "Setup complete."
+echo "Infrastructure setup complete (OpenAI)."
diff --git a/tests/integration/agentic/scripts/cleanup_crashloop_probe-claude-vertex.sh b/tests/integration/agentic/scripts/cleanup_crashloop_probe-claude-vertex.sh
new file mode 100755
index 00000000..552f868d
--- /dev/null
+++ b/tests/integration/agentic/scripts/cleanup_crashloop_probe-claude-vertex.sh
@@ -0,0 +1,15 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Tear down CrashLoopBackOff (liveness probe) test workload and Claude/Vertex AI infrastructure.
+
+export OPERATOR_NS="openshift-lightspeed"
+export TEST_NS="lightspeed-evaluation-test"
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+
+echo "Cleaning up CrashLoopBackOff probe integration test resources..."
+oc delete deployment crashloop-probe-demo -n "$TEST_NS" --ignore-not-found
+
+# shellcheck source=_cleanup_infra-claude-vertex.sh
+source "$SCRIPT_DIR/_cleanup_infra-claude-vertex.sh"
+echo "Cleanup complete."
diff --git a/tests/integration/agentic/scripts/cleanup_crashloop_probe-openai.sh b/tests/integration/agentic/scripts/cleanup_crashloop_probe-openai.sh
new file mode 100755
index 00000000..09adcfcf
--- /dev/null
+++ b/tests/integration/agentic/scripts/cleanup_crashloop_probe-openai.sh
@@ -0,0 +1,15 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Tear down CrashLoopBackOff (liveness probe) test workload and OpenAI provider infrastructure.
+
+export OPERATOR_NS="openshift-lightspeed"
+export TEST_NS="lightspeed-evaluation-test"
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+
+echo "Cleaning up CrashLoopBackOff probe integration test resources..."
+oc delete deployment crashloop-probe-demo -n "$TEST_NS" --ignore-not-found
+
+# shellcheck source=_cleanup_infra-openai.sh
+source "$SCRIPT_DIR/_cleanup_infra-openai.sh"
+echo "Cleanup complete."
diff --git a/tests/integration/agentic/scripts/cleanup_oomkill-claude-vertex.sh b/tests/integration/agentic/scripts/cleanup_oomkill-claude-vertex.sh
new file mode 100755
index 00000000..65eb9879
--- /dev/null
+++ b/tests/integration/agentic/scripts/cleanup_oomkill-claude-vertex.sh
@@ -0,0 +1,15 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Tear down OOMKill test workload and Claude/Vertex AI provider infrastructure.
+
+export OPERATOR_NS="openshift-lightspeed"
+export TEST_NS="lightspeed-evaluation-test"
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+
+echo "Cleaning up OOMKill integration test resources..."
+oc delete deployment oomkill-demo -n "$TEST_NS" --ignore-not-found
+
+# shellcheck source=_cleanup_infra-claude-vertex.sh
+source "$SCRIPT_DIR/_cleanup_infra-claude-vertex.sh"
+echo "Cleanup complete."
diff --git a/tests/integration/agentic/scripts/cleanup_oomkill-openai.sh b/tests/integration/agentic/scripts/cleanup_oomkill-openai.sh
new file mode 100755
index 00000000..6729724c
--- /dev/null
+++ b/tests/integration/agentic/scripts/cleanup_oomkill-openai.sh
@@ -0,0 +1,15 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Tear down OOMKill test workload and OpenAI provider infrastructure.
+
+export OPERATOR_NS="openshift-lightspeed"
+export TEST_NS="lightspeed-evaluation-test"
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+
+echo "Cleaning up OOMKill integration test resources..."
+oc delete deployment oomkill-demo -n "$TEST_NS" --ignore-not-found
+
+# shellcheck source=_cleanup_infra-openai.sh
+source "$SCRIPT_DIR/_cleanup_infra-openai.sh"
+echo "Cleanup complete."
diff --git a/tests/integration/agentic/scripts/cleanup_proposal_fixtures-claude-vertex.sh b/tests/integration/agentic/scripts/cleanup_proposal_fixtures-claude-vertex.sh
deleted file mode 100755
index 4f84d111..00000000
--- a/tests/integration/agentic/scripts/cleanup_proposal_fixtures-claude-vertex.sh
+++ /dev/null
@@ -1,26 +0,0 @@
-#!/usr/bin/env bash
-set -euo pipefail
-
-# Tear down integration test resources. Only deletes "eval-" prefixed operator
-# resources to avoid touching anything that isn't ours.
-
-OPERATOR_NS="openshift-lightspeed"
-TEST_NS="lightspeed-evaluation-test"
-
-echo "Cleaning up integration test resources..."
-
-# Delete test workload + any leftover Proposals in test namespace
-oc delete deployment oomkill-demo -n "$TEST_NS" --ignore-not-found
-oc delete proposals --all -n "$TEST_NS" --ignore-not-found
-oc delete proposalapprovals --all -n "$TEST_NS" --ignore-not-found
-
-# Delete prefixed operator resources (reverse order of creation)
-oc delete sandboxtemplate eval-lightspeed-agent -n "$OPERATOR_NS" --ignore-not-found
-oc delete agent eval-default --ignore-not-found
-oc delete llmprovider eval-vertex-ai --ignore-not-found
-oc delete secret eval-llm-credentials -n "$OPERATOR_NS" --ignore-not-found
-
-# Delete test namespace (cascades remaining namespaced resources)
-oc delete namespace "$TEST_NS" --ignore-not-found
-
-echo "Cleanup complete."
diff --git a/tests/integration/agentic/scripts/cleanup_proposal_fixtures-openai.sh b/tests/integration/agentic/scripts/cleanup_proposal_fixtures-openai.sh
deleted file mode 100755
index 550b95d5..00000000
--- a/tests/integration/agentic/scripts/cleanup_proposal_fixtures-openai.sh
+++ /dev/null
@@ -1,22 +0,0 @@
-#!/usr/bin/env bash
-set -euo pipefail
-
-# Tear down integration test resources deployed by setup_proposal_fixtures.sh
-# (OpenAI provider). Only deletes "eval-" prefixed operator resources to avoid
-# touching anything that isn't ours.
-
-OPERATOR_NS="openshift-lightspeed"
-TEST_NS="lightspeed-evaluation-test"
-
-echo "Cleaning up integration test resources..."
-
-# Delete test workload (Proposal cleanup is handled by the driver via cleanup_proposals)
-oc delete deployment oomkill-demo -n "$TEST_NS" --ignore-not-found
-
-# Delete prefixed operator resources (reverse order of creation)
-oc delete sandboxtemplate eval-lightspeed-agent -n "$OPERATOR_NS" --ignore-not-found
-oc delete agent eval-default --ignore-not-found
-oc delete llmprovider eval-openai --ignore-not-found
-oc delete secret eval-llm-credentials -n "$OPERATOR_NS" --ignore-not-found
-
-echo "Cleanup complete."
diff --git a/tests/integration/agentic/scripts/setup_crashloop_probe-claude-vertex.sh b/tests/integration/agentic/scripts/setup_crashloop_probe-claude-vertex.sh
new file mode 100755
index 00000000..71677724
--- /dev/null
+++ b/tests/integration/agentic/scripts/setup_crashloop_probe-claude-vertex.sh
@@ -0,0 +1,17 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Deploy CrashLoopBackOff (misconfigured liveness probe) test workload
+# with Claude/Vertex AI provider infrastructure.
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+# shellcheck source=_setup_infra-claude-vertex.sh
+source "$SCRIPT_DIR/_setup_infra-claude-vertex.sh"
+
+oc apply -f "$SCRIPT_DIR/../fixtures/crashloop-probe-demo.yaml"
+
+echo "Waiting for crashloop-probe-demo pod to appear..."
+oc wait --for=condition=Available=false deployment/crashloop-probe-demo \
+  -n "$TEST_NS" --timeout=60s 2>/dev/null || true
+sleep 10
+echo "Setup complete."
diff --git a/tests/integration/agentic/scripts/setup_crashloop_probe-openai.sh b/tests/integration/agentic/scripts/setup_crashloop_probe-openai.sh
new file mode 100755
index 00000000..0383f79c
--- /dev/null
+++ b/tests/integration/agentic/scripts/setup_crashloop_probe-openai.sh
@@ -0,0 +1,17 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Deploy CrashLoopBackOff (misconfigured liveness probe) test workload
+# with OpenAI provider infrastructure.
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+# shellcheck source=_setup_infra-openai.sh
+source "$SCRIPT_DIR/_setup_infra-openai.sh"
+
+oc apply -f "$SCRIPT_DIR/../fixtures/crashloop-probe-demo.yaml"
+
+echo "Waiting for crashloop-probe-demo pod to appear..."
+oc wait --for=condition=Available=false deployment/crashloop-probe-demo \
+  -n "$TEST_NS" --timeout=60s 2>/dev/null
+sleep 10
+echo "Setup complete."
diff --git a/tests/integration/agentic/scripts/setup_oomkill-claude-vertex.sh b/tests/integration/agentic/scripts/setup_oomkill-claude-vertex.sh
new file mode 100755
index 00000000..16e4b98a
--- /dev/null
+++ b/tests/integration/agentic/scripts/setup_oomkill-claude-vertex.sh
@@ -0,0 +1,16 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Deploy OOMKill test workload with Claude/Vertex AI provider infrastructure.
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+# shellcheck source=_setup_infra-claude-vertex.sh
+source "$SCRIPT_DIR/_setup_infra-claude-vertex.sh"
+
+oc apply -f "$SCRIPT_DIR/../fixtures/oomkill-demo.yaml"
+
+echo "Waiting for oomkill-demo pod to appear..."
+oc wait --for=condition=Available=false deployment/oomkill-demo \
+  -n "$TEST_NS" --timeout=60s 2>/dev/null || true
+sleep 10
+echo "Setup complete."
diff --git a/tests/integration/agentic/scripts/setup_oomkill-openai.sh b/tests/integration/agentic/scripts/setup_oomkill-openai.sh
new file mode 100755
index 00000000..1080d733
--- /dev/null
+++ b/tests/integration/agentic/scripts/setup_oomkill-openai.sh
@@ -0,0 +1,33 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Deploy OOMKill test workload with OpenAI provider infrastructure.
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+# shellcheck source=_setup_infra-openai.sh
+source "$SCRIPT_DIR/_setup_infra-openai.sh"
+
+oc apply -f "$SCRIPT_DIR/../fixtures/oomkill-demo.yaml"
+
+echo "Waiting for oomkill-demo pod to reach OOMKilled or CrashLoopBackOff..."
+TIMEOUT=120
+INTERVAL=5
+ELAPSED=0
+while [ "$ELAPSED" -lt "$TIMEOUT" ]; do
+  TERMINATED_REASON=$(oc get pod -l app=oomkill-demo -n "$TEST_NS" \
+    -o jsonpath='{.items[0].status.containerStatuses[0].state.terminated.reason}' 2>/dev/null || true)
+  WAITING_REASON=$(oc get pod -l app=oomkill-demo -n "$TEST_NS" \
+    -o jsonpath='{.items[0].status.containerStatuses[0].state.waiting.reason}' 2>/dev/null || true)
+
+  if [ "$TERMINATED_REASON" = "OOMKilled" ] || [ "$WAITING_REASON" = "CrashLoopBackOff" ]; then
+    echo "Pod reached expected state (terminated=$TERMINATED_REASON, waiting=$WAITING_REASON)."
+    echo "Setup complete."
+    exit 0
+  fi
+
+  sleep "$INTERVAL"
+  ELAPSED=$((ELAPSED + INTERVAL))
+done
+
+echo "ERROR: Timed out after ${TIMEOUT}s waiting for OOMKilled or CrashLoopBackOff." >&2
+exit 1
diff --git a/tests/integration/system-config-agents-proposal.yaml b/tests/integration/system-config-agents-proposal.yaml
index d0902a23..0fb0db82 100644
--- a/tests/integration/system-config-agents-proposal.yaml
+++ b/tests/integration/system-config-agents-proposal.yaml
@@ -1,3 +1,6 @@
+logging:
+  source_level: DEBUG
+
 core:
   max_threads: 1
   fail_on_invalid_data: true
@@ -12,7 +15,7 @@ agents:
     namespace: lightspeed-evaluation-test
     auto_approve: true
     cleanup_proposals: true
-    timeout: 900
+    timeout: 1200
     poll_interval: 5
 
 storage:
diff --git a/tests/integration/test_evaluation_data_proposal.yaml b/tests/integration/test_evaluation_data_proposal.yaml
index 27456b68..e597ecf2 100644
--- a/tests/integration/test_evaluation_data_proposal.yaml
+++ b/tests/integration/test_evaluation_data_proposal.yaml
@@ -1,13 +1,42 @@
-- conversation_group_id: proposal_full_lifecycle
-  description: Full lifecycle — analysis, execution, verification
-  tag: proposal
-  setup_script: agentic/scripts/setup_proposal_fixtures-openai.sh
-  cleanup_script: agentic/scripts/cleanup_proposal_fixtures-openai.sh
+- conversation_group_id: proposal_analysis_only
+  description: Analysis-only — no execution or verification (query from proposal_spec.request)
+  tag: proposal_analysis_only
+  setup_script: agentic/scripts/setup_oomkill-openai.sh
+  cleanup_script: agentic/scripts/cleanup_oomkill-openai.sh
   conversation_metrics: []
   conversation_metrics_metadata: {}
   turns:
     - turn_id: turn_1
       response: null
+      proposal_spec:
+        request: >-
+          A pod named oomkill-demo in namespace lightspeed-evaluation-test
+          is in CrashLoopBackOff. Analyze the root cause.
+        targetNamespaces:
+          - lightspeed-evaluation-test
+        tools:
+          skills:
+            # TODO(rioloc): replace with a stable image
+            - image: quay.io/harpatil/agentic-skills:latest
+              paths:
+                - /skills/find-token
+        analysis:
+          agent: eval-default
+      expected_proposal_status:
+        phase: Completed
+      turn_metrics:
+        - custom:proposal_status
+      turn_metrics_metadata: {}
+
+- conversation_group_id: proposal_oomkill_openai
+  description: OOMKill full lifecycle — status check + LLM-as-judge evaluation
+  tag: proposal_oomkill
+  setup_script: agentic/scripts/setup_oomkill-openai.sh
+  cleanup_script: agentic/scripts/cleanup_oomkill-openai.sh
+  conversation_metrics: []
+  conversation_metrics_metadata: {}
+  turns:
+    - turn_id: turn_1
       proposal_spec:
         request: >-
           A pod named oomkill-demo in namespace lightspeed-evaluation-test
@@ -29,24 +58,42 @@
           agent: eval-default
       expected_proposal_status:
         phase: Completed
+      expected_outcome: >-
+        Root cause: the pod oomkill-demo is OOMKilled because its container
+        memory limit is too low. Remediation: increase the container memory
+        limit (e.g. to 512Mi) and verify the pod reaches Running state
+        without further OOMKill events.
+      expected_analysis_outcome: >-
+        The agent should identify that the pod oomkill-demo is being
+        OOMKilled because its container memory limit is set too low
+        (e.g. 128Mi), causing the kernel to terminate the process.
+      expected_execution_outcome: >-
+        The agent should increase the container memory limit (e.g. to
+        512Mi) by patching the pod or its owning workload resource.
+      expected_verification_outcome: >-
+        The agent should verify the pod reaches Running state without
+        further OOMKill events.
       turn_metrics:
         - custom:proposal_status
-      turn_metrics_metadata: {}
+        - custom:proposal_evaluation_correctness
+      turn_metrics_metadata:
+        "custom:proposal_evaluation_correctness":
+          threshold: 0.75
 
-- conversation_group_id: proposal_analysis_only
-  description: Analysis-only — no execution or verification (query from proposal_spec.request)
-  tag: proposal
-  setup_script: agentic/scripts/setup_proposal_fixtures-openai.sh
-  cleanup_script: agentic/scripts/cleanup_proposal_fixtures-openai.sh
+- conversation_group_id: proposal_oomkill_claude-vertex
+  description: Judge evaluation — LLM-as-judge on full lifecycle workflow
+  tag: proposal_oomkill
+  setup_script: agentic/scripts/setup_oomkill-claude-vertex.sh
+  cleanup_script: agentic/scripts/cleanup_oomkill-claude-vertex.sh
   conversation_metrics: []
   conversation_metrics_metadata: {}
   turns:
     - turn_id: turn_1
-      response: null
       proposal_spec:
         request: >-
           A pod named oomkill-demo in namespace lightspeed-evaluation-test
-          is in CrashLoopBackOff. Analyze the root cause.
+          is in CrashLoopBackOff due to OOMKill. Analyze the root cause,
+          fix the memory configuration, and verify the fix.
         targetNamespaces:
           - lightspeed-evaluation-test
         tools:
@@ -57,8 +104,115 @@
                 - /skills/find-token
         analysis:
           agent: eval-default
+        execution:
+          agent: eval-default
+        verification:
+          agent: eval-default
+      expected_outcome: >-
+        Root cause: the pod oomkill-demo is OOMKilled because its container
+        memory limit is too low. Remediation: increase the container memory
+        limit (e.g. to 512Mi) and verify the pod reaches Running state
+        without further OOMKill events.
+      expected_analysis_outcome: >-
+        The agent should identify that the pod oomkill-demo is being
+        OOMKilled because its container memory limit is set too low
+        (e.g. 128Mi), causing the kernel to terminate the process.
+      expected_execution_outcome: >-
+        The agent should increase the container memory limit (e.g. to
+        512Mi) by patching the pod or its owning workload resource.
+      expected_verification_outcome: >-
+        The agent should verify the pod reaches Running state without
+        further OOMKill events.
+      turn_metrics:
+        - custom:proposal_evaluation_correctness
+      turn_metrics_metadata:
+        "custom:proposal_evaluation_correctness":
+          threshold: 0.75
+
+- conversation_group_id: proposal_crashloop_probe_openai
+  description: CrashLoopBackOff caused by misconfigured liveness probe — full lifecycle
+  tag: proposal_crashloop_probe
+  setup_script: agentic/scripts/setup_crashloop_probe-openai.sh
+  cleanup_script: agentic/scripts/cleanup_crashloop_probe-openai.sh
+  conversation_metrics: []
+  conversation_metrics_metadata: {}
+  turns:
+    - turn_id: turn_1
+      proposal_spec:
+        request: >-
+          A pod named crashloop-probe-demo in namespace lightspeed-evaluation-test
+          is in CrashLoopBackOff. The container keeps restarting. Analyze the
+          root cause, fix the issue, and verify the pod is stable.
+        targetNamespaces:
+          - lightspeed-evaluation-test
+        tools:
+          skills:
+            # TODO(rioloc): replace with a stable image
+            - image: quay.io/harpatil/agentic-skills:latest
+              paths:
+                - /skills/find-token
+        analysis:
+          agent: eval-default
+        execution:
+          agent: eval-default
+        verification:
+          agent: eval-default
       expected_proposal_status:
         phase: Completed
+      expected_outcome: >-
+        Root cause: the liveness probe on pod crashloop-probe-demo is
+        configured to check an endpoint that does not exist (/nonexistent-health),
+        causing the kubelet to kill and restart the container repeatedly.
+        Remediation: fix the liveness probe path to point to a valid health
+        endpoint (e.g. / or remove the probe) and verify the pod reaches
+        Running state without further restarts.
       turn_metrics:
         - custom:proposal_status
-      turn_metrics_metadata: {}
+        - custom:proposal_evaluation_correctness
+      turn_metrics_metadata:
+        "custom:proposal_evaluation_correctness":
+          threshold: 0.75
+
+- conversation_group_id: proposal_crashloop_probe_claude-vertex
+  description: CrashLoopBackOff caused by misconfigured liveness probe — full lifecycle (Claude/Vertex)
+  tag: proposal_crashloop_probe
+  setup_script: agentic/scripts/setup_crashloop_probe-claude-vertex.sh
+  cleanup_script: agentic/scripts/cleanup_crashloop_probe-claude-vertex.sh
+  conversation_metrics: []
+  conversation_metrics_metadata: {}
+  turns:
+    - turn_id: turn_1
+      proposal_spec:
+        request: >-
+          A pod named crashloop-probe-demo in namespace lightspeed-evaluation-test
+          is in CrashLoopBackOff. The container keeps restarting. Analyze the
+          root cause, fix the issue, and verify the pod is stable.
+        targetNamespaces:
+          - lightspeed-evaluation-test
+        tools:
+          skills:
+            # TODO(rioloc): replace with a stable image
+            - image: quay.io/harpatil/agentic-skills:latest
+              paths:
+                - /skills/find-token
+        analysis:
+          agent: eval-default
+        execution:
+          agent: eval-default
+        verification:
+          agent: eval-default
+      expected_proposal_status:
+        phase: Completed
+      expected_outcome: >-
+        Root cause: the liveness probe on pod crashloop-probe-demo is
+        configured to check an endpoint that does not exist (/nonexistent-health),
+        causing the kubelet to kill and restart the container repeatedly.
+        Remediation: fix the liveness probe path to point to a valid health
+        endpoint (e.g. / or remove the probe) and verify the pod reaches
+        Running state without further restarts.
+      turn_metrics:
+        - custom:proposal_status
+        - custom:proposal_evaluation_correctness
+      turn_metrics_metadata:
+        "custom:proposal_evaluation_correctness":
+          threshold: 0.75
diff --git a/tests/integration/test_proposal_evaluation.py b/tests/integration/test_proposal_evaluation.py
index 06238126..f30d9a15 100644
--- a/tests/integration/test_proposal_evaluation.py
+++ b/tests/integration/test_proposal_evaluation.py
@@ -8,10 +8,8 @@
     - OPENAI_API_KEY, SANDBOX_IMAGE env vars set
     - Network connectivity to the cluster API
 
-The default setup/cleanup scripts use OpenAI as the LLM provider.
-To test with a different provider (e.g. Claude via Vertex AI), set the
-corresponding env vars and point the eval data at the provider-specific
-scripts (e.g. setup_proposal_fixtures-claude-vertex.sh).
+Each scenario has its own setup/cleanup scripts that source a shared
+infrastructure script per provider (e.g. _setup_infra-openai.sh).
 
 Run with: pytest tests/integration/test_proposal_evaluation.py -v -m agentic
 """
@@ -103,8 +101,8 @@ class TestProposalDriverEvaluation:
     """End-to-end tests for ProposalDriver evaluation pipeline."""
 
     @pytest.mark.timeout(1200)
-    def test_full_lifecycle(self, tmp_path: Path) -> None:
-        """Test full Proposal lifecycle: analysis, execution, verification.
+    def test_oomkill_full_lifecycle(self, tmp_path: Path) -> None:
+        """Test OOMKill full lifecycle: status check + judge evaluation.
 
         Verifies:
         - Setup script deploys infrastructure and test workload
@@ -126,9 +124,9 @@ def test_full_lifecycle(self, tmp_path: Path) -> None:
         )
         all_data = validator.load_evaluation_data(str(PROPOSAL_EVAL_DATA_PATH))
         eval_data = [
-            d for d in all_data if d.conversation_group_id == "proposal_full_lifecycle"
+            d for d in all_data if d.conversation_group_id == "proposal_oomkill_openai"
         ]
-        assert len(eval_data) == 1, "Should find proposal_full_lifecycle data"
+        assert len(eval_data) == 1, "Should find proposal_oomkill_openai data"
 
         evaluate(system_config, eval_data)
 
@@ -201,6 +199,40 @@ def test_analysis_only(self, tmp_path: Path) -> None:
                 f"got reason={by_type['Verified'].get('reason')}"
             )
 
+    @pytest.mark.timeout(1200)
+    def test_oomkill_claude_vertex(self, tmp_path: Path) -> None:
+        """Test OOMKill full lifecycle with Claude/Vertex AI provider.
+
+        Verifies:
+        - ProposalDriver populates response with workflow summary
+        - custom:proposal_evaluation_correctness metric runs against response
+        - Pipeline completes without errors
+        """
+        loader = ConfigLoader()
+        system_config = loader.load_system_config(str(PROPOSAL_CONFIG_PATH))
+        system_config.storage = [
+            FileBackendConfig(output_dir=str(tmp_path / "eval_output"))
+        ]
+
+        validator = DataValidator(
+            api_enabled=True,
+            fail_on_invalid_data=system_config.core.fail_on_invalid_data,
+        )
+        all_data = validator.load_evaluation_data(str(PROPOSAL_EVAL_DATA_PATH))
+        eval_data = [
+            d
+            for d in all_data
+            if d.conversation_group_id == "proposal_oomkill_claude-vertex"
+        ]
+        assert len(eval_data) == 1, "Should find proposal_oomkill_claude-vertex data"
+
+        evaluate(system_config, eval_data)
+
+        turn = eval_data[0].turns[0]
+        assert (
+            turn.response and turn.response.strip()
+        ), "Response should be populated by ProposalDriver"
+
     @pytest.mark.timeout(120)
     def test_timeout_handling(self, tmp_path: Path) -> None:
         """Test that a very short timeout is handled gracefully.
diff --git a/tests/unit/core/metrics/custom/test_custom.py b/tests/unit/core/metrics/custom/test_custom.py
index 51a002c7..80749ff9 100644
--- a/tests/unit/core/metrics/custom/test_custom.py
+++ b/tests/unit/core/metrics/custom/test_custom.py
@@ -1,10 +1,15 @@
 """Tests for custom metrics module."""
 
+# pylint: disable=protected-access
+
+import json
+
 from pytest_mock import MockerFixture
 
 from lightspeed_evaluation.core.metrics.custom.custom import CustomMetrics
 from lightspeed_evaluation.core.metrics.manager import MetricLevel
 from lightspeed_evaluation.core.models import EvaluationScope, TurnData
+from lightspeed_evaluation.core.system.exceptions import LLMError
 
 
 class TestCustomMetricsToolEval:
@@ -175,3 +180,235 @@ def test_config_from_system_defaults_via_metric_manager(
         assert score == 1.0
         assert "partial" in reason
         assert "unordered" in reason
+
+
+def _make_custom_metrics(mocker: MockerFixture) -> CustomMetrics:
+    """Create a CustomMetrics instance with mocked LLM manager."""
+    mock_llm_manager = mocker.Mock()
+    mock_llm_manager.get_model_name.return_value = "test-model"
+    mock_llm_manager.get_llm_params.return_value = {"parameters": {}}
+    return CustomMetrics(mock_llm_manager)
+
+
+def _make_scope(turn_data: TurnData, is_conversation: bool = False) -> EvaluationScope:
+    """Create an EvaluationScope for a turn."""
+    return EvaluationScope(
+        turn_idx=0, turn_data=turn_data, is_conversation=is_conversation
+    )
+
+
+METRIC_NAME = "proposal_evaluation_correctness"
+
+_LLM_RESPONSE_ALL_DIMS = json.dumps(
+    {
+        "reasoning": (
+            "Diagnosis was accurate. Execution addressed root cause. "
+            "Verification confirmed fix."
+        ),
+        "diagnosis": 0.9,
+        "execution": 0.8,
+        "verification": 0.7,
+        "average": 0.80,
+    }
+)
+
+_LLM_RESPONSE_NO_VERIFICATION = json.dumps(
+    {
+        "reasoning": "Diagnosis correct. Execution appropriate but no verification.",
+        "diagnosis": 0.9,
+        "execution": 0.8,
+        "verification": None,
+        "average": 0.85,
+    }
+)
+
+
+class TestParseProposalEvalResponse:
+    """Test _parse_proposal_eval_response parser."""
+
+    def test_all_dimensions(self, mocker: MockerFixture) -> None:
+        """Test parsing response with all three dimensions scored."""
+        cm = _make_custom_metrics(mocker)
+        score, detail = cm._parse_proposal_eval_response(_LLM_RESPONSE_ALL_DIMS)
+
+        assert score == 0.80
+        assert "diagnosis=0.90" in detail
+        assert "execution=0.80" in detail
+        assert "verification=0.70" in detail
+        assert "avg=0.80" in detail
+        assert "Diagnosis was accurate" in detail
+
+    def test_dimension_na(self, mocker: MockerFixture) -> None:
+        """Test parsing response with N/A dimension."""
+        cm = _make_custom_metrics(mocker)
+        score, detail = cm._parse_proposal_eval_response(_LLM_RESPONSE_NO_VERIFICATION)
+
+        assert score == 0.85
+        assert "verification=N/A" in detail
+        assert "diagnosis=0.90" in detail
+
+    def test_fallback_average_from_sub_scores(self, mocker: MockerFixture) -> None:
+        """Test average is computed from sub-scores when average key is missing."""
+        cm = _make_custom_metrics(mocker)
+        response = json.dumps(
+            {
+                "reasoning": "ok",
+                "diagnosis": 0.9,
+                "execution": 0.7,
+                "verification": None,
+            }
+        )
+
+        score, detail = cm._parse_proposal_eval_response(response)
+
+        assert score is not None
+        assert abs(score - 0.80) < 0.01
+        assert "avg=0.80" in detail
+
+    def test_invalid_json_returns_none(self, mocker: MockerFixture) -> None:
+        """Test that invalid JSON response returns None score."""
+        cm = _make_custom_metrics(mocker)
+        score, detail = cm._parse_proposal_eval_response("I cannot evaluate this.")
+
+        assert score is None
+        assert "Invalid JSON" in detail
+
+
+class TestProposalEvaluationCorrectness:
+    """Test custom:proposal_evaluation_correctness metric."""
+
+    def test_returns_score_from_llm(self, mocker: MockerFixture) -> None:
+        """Test successful LLM evaluation returns parsed score."""
+        cm = _make_custom_metrics(mocker)
+        mocker.patch.object(cm, "_call_llm", return_value=_LLM_RESPONSE_ALL_DIMS)
+
+        turn = TurnData(
+            turn_id="t1",
+            query="Fix pod crash",
+            response="## Request\n\nFix pod crash\n\n## Analysis\n...",
+            expected_outcome="Root cause: OOMKilled. Remediation: increase memory limit.",
+        )
+        score, reason = cm.evaluate(METRIC_NAME, None, _make_scope(turn))
+
+        assert score == 0.80
+        assert "diagnosis=0.90" in reason
+        assert "avg=0.80" in reason
+
+    def test_conversation_level_returns_none(self, mocker: MockerFixture) -> None:
+        """Test conversation-level scope returns None score."""
+        cm = _make_custom_metrics(mocker)
+        turn = TurnData(turn_id="t1", query="q", response="r")
+        scope = _make_scope(turn, is_conversation=True)
+
+        score, reason = cm.evaluate(METRIC_NAME, None, scope)
+
+        assert score is None
+        assert "turn-level" in reason
+
+    def test_missing_response_returns_none(self, mocker: MockerFixture) -> None:
+        """Test missing response returns None score."""
+        cm = _make_custom_metrics(mocker)
+        turn = TurnData(turn_id="t1", query="Fix pod crash")
+
+        score, reason = cm.evaluate(METRIC_NAME, None, _make_scope(turn))
+
+        assert score is None
+        assert "response" in reason.lower()
+
+    def test_missing_expected_outcome_returns_none(self, mocker: MockerFixture) -> None:
+        """Test missing expected_outcome returns None score."""
+        cm = _make_custom_metrics(mocker)
+        turn = TurnData(
+            turn_id="t1",
+            query="Fix pod crash",
+            response="## Analysis\nDiagnosis: memory too low",
+        )
+
+        score, reason = cm.evaluate(METRIC_NAME, None, _make_scope(turn))
+
+        assert score is None
+        assert "expected outcome" in reason.lower()
+
+    def test_none_turn_data_returns_none(self, mocker: MockerFixture) -> None:
+        """Test None turn data returns None score."""
+        cm = _make_custom_metrics(mocker)
+        scope = EvaluationScope(turn_idx=0, turn_data=None, is_conversation=False)
+
+        score, reason = cm.evaluate(METRIC_NAME, None, scope)
+
+        assert score is None
+        assert "TurnData" in reason
+
+    def test_unparseable_llm_response(self, mocker: MockerFixture) -> None:
+        """Test unparseable LLM response returns None score."""
+        cm = _make_custom_metrics(mocker)
+        mocker.patch.object(cm, "_call_llm", return_value="I cannot evaluate this.")
+
+        turn = TurnData(
+            turn_id="t1",
+            query="q",
+            response="## Request\nq",
+            expected_outcome="Expected outcome",
+        )
+
+        score, reason = cm.evaluate(METRIC_NAME, None, _make_scope(turn))
+
+        assert score is None
+        assert "parse" in reason.lower() or "Could not" in reason
+
+    def test_llm_error_handled(self, mocker: MockerFixture) -> None:
+        """Test LLM error is caught and returns None score."""
+        cm = _make_custom_metrics(mocker)
+        mocker.patch.object(cm, "_call_llm", side_effect=LLMError("timeout"))
+
+        turn = TurnData(
+            turn_id="t1",
+            query="q",
+            response="## Request\nq",
+            expected_outcome="Expected outcome",
+        )
+
+        score, reason = cm.evaluate(METRIC_NAME, None, _make_scope(turn))
+
+        assert score is None
+        assert "timeout" in reason
+
+    def test_prompt_contains_query_response_and_expected(
+        self, mocker: MockerFixture
+    ) -> None:
+        """Test that the LLM prompt includes query, response, and expected."""
+        cm = _make_custom_metrics(mocker)
+        call_spy = mocker.patch.object(
+            cm, "_call_llm", return_value=_LLM_RESPONSE_ALL_DIMS
+        )
+
+        turn = TurnData(
+            turn_id="t1",
+            query="Fix OOMKilled pod",
+            response="## Analysis\nDiagnosis: memory too low",
+            expected_outcome="Root cause: OOMKilled. Increase memory limit to 512Mi.",
+        )
+        cm.evaluate(METRIC_NAME, None, _make_scope(turn))
+
+        prompt: str = call_spy.call_args[0][0]
+        assert "Fix OOMKilled pod" in prompt
+        assert "Diagnosis: memory too low" in prompt
+        assert "Increase memory limit to 512Mi" in prompt
+
+    def test_prompt_contains_sre_persona(self, mocker: MockerFixture) -> None:
+        """Test that the prompt includes the SRE persona."""
+        cm = _make_custom_metrics(mocker)
+        call_spy = mocker.patch.object(
+            cm, "_call_llm", return_value=_LLM_RESPONSE_ALL_DIMS
+        )
+
+        turn = TurnData(
+            turn_id="t1",
+            query="q",
+            response="r",
+            expected_outcome="e",
+        )
+        cm.evaluate(METRIC_NAME, None, _make_scope(turn))
+
+        prompt: str = call_spy.call_args[0][0]
+        assert "senior Site Reliability Engineer" in prompt
diff --git a/tests/unit/core/metrics/custom/test_proposal_eval.py b/tests/unit/core/metrics/custom/test_proposal_eval.py
index 2c3875aa..cc756dff 100644
--- a/tests/unit/core/metrics/custom/test_proposal_eval.py
+++ b/tests/unit/core/metrics/custom/test_proposal_eval.py
@@ -3,10 +3,10 @@
 from typing import Any, Optional
 
 from lightspeed_evaluation.core.metrics.custom.proposal_eval import (
-    _derive_phase,
     evaluate_proposal_status,
 )
 from lightspeed_evaluation.core.models import TurnData
+from lightspeed_evaluation.core.proposal import derive_phase
 
 
 def _make_turn(
@@ -61,7 +61,7 @@ class TestDerivePhase:
     def test_completed_analysis_only(self) -> None:
         """Analysis-only with Analyzed=True derives Completed."""
         conditions = [{"type": "Analyzed", "status": "True"}]
-        assert _derive_phase(conditions, {"analysis": {}}) == "Completed"
+        assert derive_phase(conditions, {"analysis": {}}) == "Completed"
 
     def test_completed_full_lifecycle(self) -> None:
         """Full lifecycle with all conditions True derives Completed."""
@@ -71,7 +71,7 @@ def test_completed_full_lifecycle(self) -> None:
             {"type": "Verified", "status": "True"},
         ]
         spec: dict[str, Any] = {"analysis": {}, "execution": {}, "verification": {}}
-        assert _derive_phase(conditions, spec) == "Completed"
+        assert derive_phase(conditions, spec) == "Completed"
 
     def test_completed_execution_no_verification(self) -> None:
         """Execution without verification with Executed=True derives Completed."""
@@ -80,7 +80,7 @@ def test_completed_execution_no_verification(self) -> None:
             {"type": "Executed", "status": "True"},
         ]
         spec: dict[str, Any] = {"analysis": {}, "execution": {}}
-        assert _derive_phase(conditions, spec) == "Completed"
+        assert derive_phase(conditions, spec) == "Completed"
 
     def test_failed_condition(self) -> None:
         """Any condition with status False derives Failed."""
@@ -88,7 +88,7 @@ def test_failed_condition(self) -> None:
             {"type": "Analyzed", "status": "True"},
             {"type": "Executed", "status": "False", "reason": "Error"},
         ]
-        assert _derive_phase(conditions) == "Failed"
+        assert derive_phase(conditions) == "Failed"
 
     def test_retrying_execution_not_failed(self) -> None:
         """RetryingExecution reason does not count as failure."""
@@ -97,22 +97,22 @@ def test_retrying_execution_not_failed(self) -> None:
             {"type": "Verified", "status": "False", "reason": "RetryingExecution"},
         ]
         spec: dict[str, Any] = {"analysis": {}, "execution": {}, "verification": {}}
-        assert _derive_phase(conditions, spec) == "InProgress"
+        assert derive_phase(conditions, spec) == "InProgress"
 
     def test_denied(self) -> None:
         """Denied=True derives Denied."""
         conditions = [{"type": "Denied", "status": "True"}]
-        assert _derive_phase(conditions) == "Denied"
+        assert derive_phase(conditions) == "Denied"
 
     def test_escalated(self) -> None:
         """Escalated=True derives Escalated."""
         conditions = [{"type": "Escalated", "status": "True"}]
-        assert _derive_phase(conditions) == "Escalated"
+        assert derive_phase(conditions) == "Escalated"
 
     def test_in_progress(self) -> None:
         """Unknown status derives InProgress."""
         conditions = [{"type": "Analyzed", "status": "Unknown"}]
-        assert _derive_phase(conditions) == "InProgress"
+        assert derive_phase(conditions) == "InProgress"
 
     def test_no_proposal_spec_infers_last_step(self) -> None:
         """Without proposal_spec, infers last step from conditions."""
@@ -121,7 +121,7 @@ def test_no_proposal_spec_infers_last_step(self) -> None:
             {"type": "Executed", "status": "True"},
             {"type": "Verified", "status": "True"},
         ]
-        assert _derive_phase(conditions, None) == "Completed"
+        assert derive_phase(conditions, None) == "Completed"
 
 
 class TestPhaseCheck:
diff --git a/tests/unit/pipeline/evaluation/test_driver.py b/tests/unit/pipeline/evaluation/test_driver.py
index 78a85f5d..564ed780 100644
--- a/tests/unit/pipeline/evaluation/test_driver.py
+++ b/tests/unit/pipeline/evaluation/test_driver.py
@@ -14,6 +14,7 @@
     AgentDriver,
     HttpApiDriver,
     ProposalDriver,
+    TerminalOutcome,
 )
 from lightspeed_evaluation.pipeline.evaluation.registry import AgentDriverRegistry
 
@@ -176,6 +177,83 @@ def test_validate_config_invalid(self) -> None:
             )
 
 
+class TestProposalDriverExecuteTurn:
+    """Unit tests for ProposalDriver.execute_turn outcome routing."""
+
+    @pytest.fixture()
+    def _proposal_driver(self, mocker: MockerFixture) -> ProposalDriver:
+        """Create a ProposalDriver with mocked infrastructure."""
+        mocker.patch("shutil.which", return_value="/usr/bin/oc")
+        driver = ProposalDriver(
+            {"type": "proposal", "namespace": "ns", "timeout": 10, "poll_interval": 1},
+            enabled=True,
+        )
+        apply_result = mocker.Mock(returncode=0, stderr="")
+        mocker.patch.object(driver, "_apply", return_value=apply_result)
+        mocker.patch.object(driver, "_cleanup")
+        mocker.patch.object(driver, "_approve_when_ready", return_value=None)
+        mocker.patch.object(driver._amender, "amend", return_value=None)
+        return driver
+
+    def _stub_terminal(
+        self,
+        mocker: MockerFixture,
+        driver: ProposalDriver,
+        outcome: TerminalOutcome,
+    ) -> None:
+        """Stub _get_status and _is_terminal to return a given outcome immediately."""
+        mocker.patch.object(
+            driver, "_get_status", return_value=({"conditions": []}, None)
+        )
+        mocker.patch.object(driver, "_is_terminal", return_value=outcome)
+
+    def test_completed_returns_no_error(
+        self, mocker: MockerFixture, _proposal_driver: ProposalDriver
+    ) -> None:
+        """Test COMPLETED outcome returns no error."""
+        self._stub_terminal(mocker, _proposal_driver, TerminalOutcome.COMPLETED)
+        turn = TurnData(turn_id="1", query="Q")
+
+        error, _ = _proposal_driver.execute_turn(turn)
+
+        assert error is None
+
+    def test_failed_returns_error(
+        self, mocker: MockerFixture, _proposal_driver: ProposalDriver
+    ) -> None:
+        """Test FAILED outcome returns an error message."""
+        self._stub_terminal(mocker, _proposal_driver, TerminalOutcome.FAILED)
+        turn = TurnData(turn_id="1", query="Q")
+
+        error, _ = _proposal_driver.execute_turn(turn)
+
+        assert error is not None
+        assert "execution failed" in error
+
+    def test_denied_returns_no_error(
+        self, mocker: MockerFixture, _proposal_driver: ProposalDriver
+    ) -> None:
+        """Test DENIED outcome returns no error so metrics are still evaluated."""
+        self._stub_terminal(mocker, _proposal_driver, TerminalOutcome.DENIED)
+        turn = TurnData(turn_id="1", query="Q")
+
+        error, _ = _proposal_driver.execute_turn(turn)
+
+        assert error is None
+
+    def test_escalated_returns_error(
+        self, mocker: MockerFixture, _proposal_driver: ProposalDriver
+    ) -> None:
+        """Test ESCALATED outcome returns an error message."""
+        self._stub_terminal(mocker, _proposal_driver, TerminalOutcome.ESCALATED)
+        turn = TurnData(turn_id="1", query="Q")
+
+        error, _ = _proposal_driver.execute_turn(turn)
+
+        assert error is not None
+        assert "escalated" in error
+
+
 class TestAgentDriverBase:
     """Unit tests for AgentDriver base class defaults."""
 
diff --git a/tests/unit/pipeline/evaluation/test_proposal_amender.py b/tests/unit/pipeline/evaluation/test_proposal_amender.py
new file mode 100644
index 00000000..ac70b3ab
--- /dev/null
+++ b/tests/unit/pipeline/evaluation/test_proposal_amender.py
@@ -0,0 +1,489 @@
+"""Unit tests for ProposalAmender module."""
+
+import subprocess
+from typing import Any, Optional
+
+from pytest_mock import MockerFixture
+
+from lightspeed_evaluation.core.models import TurnData
+from lightspeed_evaluation.pipeline.evaluation.cli import CLIClient, KubeCLI
+from lightspeed_evaluation.pipeline.evaluation.proposal_amender import (
+    ProposalAmender,
+)
+
+
+class MockCLI(CLIClient):
+    """Mock CLIClient that returns pre-configured resources."""
+
+    def __init__(self, resources: Optional[dict[str, dict[str, Any]]] = None) -> None:
+        """Initialize with a map of resource name -> full CR dict."""
+        super().__init__(timeout=30)
+        self._resources: dict[str, dict[str, Any]] = resources or {}
+
+    def run(
+        self,
+        args: list[str],
+        stdin: Optional[str] = None,
+    ) -> subprocess.CompletedProcess[str]:
+        """Not used in amender tests."""
+        return subprocess.CompletedProcess(
+            args=args, returncode=0, stdout="", stderr=""
+        )
+
+    def get_resource(
+        self,
+        resource_plural: str,
+        name: str,
+    ) -> tuple[dict[str, Any], Optional[str]]:
+        """Return pre-configured resource or error."""
+        key = f"{resource_plural}/{name}"
+        if key in self._resources:
+            return self._resources[key], None
+        return {}, f"not found: {key}"
+
+    def apply(
+        self,
+        manifest: dict[str, Any],
+    ) -> subprocess.CompletedProcess[str]:
+        """Not used in amender tests."""
+        return subprocess.CompletedProcess(args=[], returncode=0, stdout="", stderr="")
+
+    def delete(self, resource_plural: str, name: str) -> None:
+        """Not used in amender tests."""
+
+
+def _make_turn(query: str = "Fix pod crash") -> TurnData:
+    """Create a minimal TurnData for testing."""
+    return TurnData(turn_id="t1", query=query)
+
+
+def _get_results(turn: TurnData) -> dict[str, Any]:
+    """Extract proposal_results, failing if None."""
+    assert turn.proposal_results is not None
+    return dict(turn.proposal_results)
+
+
+def _get_response(turn: TurnData) -> str:
+    """Extract response, failing if None."""
+    assert turn.response is not None
+    return str(turn.response)
+
+
+DIAGNOSIS_STATUS: dict[str, Any] = {
+    "conditions": [{"type": "Completed", "status": "True", "reason": "Succeeded"}],
+    "options": [
+        {
+            "title": "Increase memory limit",
+            "summary": "Bump memory to 512Mi",
+            "diagnosis": {
+                "summary": "OOMKilled due to low memory",
+                "confidence": "High",
+                "rootCause": "Memory limit 256Mi too low",
+            },
+            "proposal": {
+                "description": "Patch deployment memory",
+                "actions": [
+                    {"type": "patch", "description": "Set memory limit to 512Mi"},
+                ],
+                "risk": "Low",
+                "reversible": "Reversible",
+                "estimatedImpact": "Brief pod restart",
+            },
+        },
+        {
+            "title": "Scale horizontally",
+            "diagnosis": {
+                "summary": "High load causing OOM",
+                "confidence": "Medium",
+                "rootCause": "Single replica under load",
+            },
+            "proposal": {
+                "description": "Add replicas",
+                "actions": [
+                    {"type": "scale", "description": "Scale to 3 replicas"},
+                ],
+                "risk": "Low",
+                "reversible": "Reversible",
+                "estimatedImpact": "No downtime",
+            },
+        },
+    ],
+}
+
+EXECUTION_STATUS: dict[str, Any] = {
+    "conditions": [{"type": "Completed", "status": "True", "reason": "Succeeded"}],
+    "actionsTaken": [
+        {
+            "type": "patch",
+            "description": "Set memory limit to 512Mi",
+            "outcome": "Succeeded",
+            "output": "deployment.apps/web patched",
+        },
+    ],
+    "verification": {
+        "conditionOutcome": "Improved",
+        "summary": "Pod running with new limits",
+    },
+}
+
+VERIFICATION_STATUS: dict[str, Any] = {
+    "conditions": [{"type": "Completed", "status": "True", "reason": "Succeeded"}],
+    "checks": [
+        {
+            "name": "pod-running",
+            "source": "oc",
+            "value": "Running",
+            "result": "Passed",
+        },
+    ],
+    "summary": "All checks passed",
+}
+
+
+class TestAmendAnalysisOnly:
+    """Test ProposalAmender with analysis-only workflow."""
+
+    def test_populates_proposal_results(self) -> None:
+        """Test amend populates proposal_results with analysis data."""
+        cli = MockCLI({"analysisresults/ar-1": {"status": DIAGNOSIS_STATUS}})
+        amender = ProposalAmender(cli)
+        turn = _make_turn()
+        status: dict[str, Any] = {
+            "conditions": [{"type": "Analyzed", "status": "True", "message": "Done"}],
+            "steps": {
+                "analysis": {
+                    "results": [{"name": "ar-1", "outcome": "Succeeded"}],
+                },
+            },
+        }
+
+        err = amender.amend(turn, status)
+
+        assert err is None
+        assert turn.proposal_status == status
+        results = _get_results(turn)
+        assert "analysis" in results
+        assert len(results["analysis"]) == 1
+        assert results["analysis"][0]["options"][0]["title"] == "Increase memory limit"
+
+    def test_response_contains_diagnosis(self) -> None:
+        """Test Markdown summary includes diagnosis details."""
+        cli = MockCLI({"analysisresults/ar-1": {"status": DIAGNOSIS_STATUS}})
+        amender = ProposalAmender(cli)
+        turn = _make_turn()
+        status: dict[str, Any] = {
+            "conditions": [{"type": "Analyzed", "status": "True", "message": "Done"}],
+            "steps": {
+                "analysis": {
+                    "results": [{"name": "ar-1", "outcome": "Succeeded"}],
+                },
+            },
+        }
+
+        amender.amend(turn, status)
+
+        response = _get_response(turn)
+        assert "OOMKilled due to low memory" in response
+        assert "Memory limit 256Mi too low" in response
+        assert "Confidence: High" in response
+
+    def test_option_zero_marked_approved(self) -> None:
+        """Test option 0 is marked as (Approved) in summary."""
+        cli = MockCLI({"analysisresults/ar-1": {"status": DIAGNOSIS_STATUS}})
+        amender = ProposalAmender(cli)
+        turn = _make_turn()
+        status: dict[str, Any] = {
+            "conditions": [],
+            "steps": {
+                "analysis": {
+                    "results": [{"name": "ar-1", "outcome": "Succeeded"}],
+                },
+            },
+        }
+
+        amender.amend(turn, status)
+
+        response = _get_response(turn)
+        assert "(Approved)" in response
+        assert "### Option 0 (Approved): Increase memory limit" in response
+        assert "### Option 1 : Scale horizontally" in response
+
+    def test_no_execution_or_verification_in_results(self) -> None:
+        """Test analysis-only workflow has no execution/verification keys."""
+        cli = MockCLI({"analysisresults/ar-1": {"status": DIAGNOSIS_STATUS}})
+        amender = ProposalAmender(cli)
+        turn = _make_turn()
+        status: dict[str, Any] = {
+            "conditions": [],
+            "steps": {
+                "analysis": {
+                    "results": [{"name": "ar-1", "outcome": "Succeeded"}],
+                },
+            },
+        }
+
+        amender.amend(turn, status)
+
+        results = _get_results(turn)
+        assert "execution" not in results
+        assert "verification" not in results
+
+
+class TestAmendFullPipeline:
+    """Test ProposalAmender with analysis + execution + verification."""
+
+    def _make_cli(self) -> MockCLI:
+        """Create MockCLI with all three result CRs."""
+        return MockCLI(
+            {
+                "analysisresults/ar-1": {"status": DIAGNOSIS_STATUS},
+                "executionresults/er-1": {"status": EXECUTION_STATUS},
+                "verificationresults/vr-1": {"status": VERIFICATION_STATUS},
+            }
+        )
+
+    def _make_status(self) -> dict[str, Any]:
+        """Create proposal status with all three steps."""
+        return {
+            "conditions": [
+                {"type": "Analyzed", "status": "True"},
+                {"type": "Executed", "status": "True"},
+                {"type": "Verified", "status": "True", "message": "All passed"},
+            ],
+            "steps": {
+                "analysis": {
+                    "results": [{"name": "ar-1", "outcome": "Succeeded"}],
+                },
+                "execution": {
+                    "results": [{"name": "er-1", "outcome": "Succeeded"}],
+                },
+                "verification": {
+                    "results": [{"name": "vr-1", "outcome": "Succeeded"}],
+                },
+            },
+        }
+
+    def test_all_results_populated(self) -> None:
+        """Test all three step results are populated."""
+        amender = ProposalAmender(self._make_cli())
+        turn = _make_turn()
+
+        amender.amend(turn, self._make_status())
+
+        results = _get_results(turn)
+        assert "analysis" in results
+        assert "execution" in results
+        assert "verification" in results
+        assert len(results["analysis"]) == 1
+        assert len(results["execution"]) == 1
+        assert len(results["verification"]) == 1
+
+    def test_execution_in_summary(self) -> None:
+        """Test Markdown includes execution actions."""
+        amender = ProposalAmender(self._make_cli())
+        turn = _make_turn()
+
+        amender.amend(turn, self._make_status())
+
+        response = _get_response(turn)
+        assert "## Execution" in response
+        assert "Set memory limit to 512Mi" in response
+        assert "Succeeded" in response
+        assert "deployment.apps/web patched" in response
+
+    def test_verification_in_summary(self) -> None:
+        """Test Markdown includes verification checks."""
+        amender = ProposalAmender(self._make_cli())
+        turn = _make_turn()
+
+        amender.amend(turn, self._make_status())
+
+        response = _get_response(turn)
+        assert "## Verification" in response
+        assert "pod-running" in response
+        assert "Passed" in response
+        assert "All checks passed" in response
+
+
+class TestAmendEdgeCases:
+    """Test ProposalAmender edge cases."""
+
+    def test_empty_steps(self) -> None:
+        """Test status with no steps produces empty results."""
+        cli = MockCLI()
+        amender = ProposalAmender(cli)
+        turn = _make_turn()
+        status: dict[str, Any] = {"conditions": []}
+
+        amender.amend(turn, status)
+
+        assert not turn.proposal_results
+        response = _get_response(turn)
+        assert "## Request" in response
+
+    def test_step_with_no_results(self) -> None:
+        """Test step present but no result refs gives empty list."""
+        cli = MockCLI()
+        amender = ProposalAmender(cli)
+        turn = _make_turn()
+        status: dict[str, Any] = {
+            "conditions": [],
+            "steps": {
+                "analysis": {"results": []},
+            },
+        }
+
+        amender.amend(turn, status)
+
+        results = _get_results(turn)
+        assert results["analysis"] == []
+
+    def test_failed_fetch_logged_and_skipped(self, mocker: MockerFixture) -> None:
+        """Test failed CR fetch is logged and result is skipped."""
+        cli = MockCLI()
+        amender = ProposalAmender(cli)
+        turn = _make_turn()
+        status: dict[str, Any] = {
+            "conditions": [],
+            "steps": {
+                "analysis": {
+                    "results": [{"name": "missing-cr", "outcome": "Succeeded"}],
+                },
+            },
+        }
+        mock_logger = mocker.patch(
+            "lightspeed_evaluation.pipeline.evaluation.proposal_amender.logger"
+        )
+
+        err = amender.amend(turn, status)
+
+        assert err is None
+        results = _get_results(turn)
+        assert results["analysis"] == []
+        mock_logger.warning.assert_called_once()
+
+    def test_analysis_with_failure_reason(self) -> None:
+        """Test analysis result with failureReason."""
+        failed_status: dict[str, Any] = {
+            "conditions": [{"type": "Completed", "status": "False"}],
+            "failureReason": "LLM timeout",
+        }
+        cli = MockCLI({"analysisresults/ar-fail": {"status": failed_status}})
+        amender = ProposalAmender(cli)
+        turn = _make_turn()
+        status: dict[str, Any] = {
+            "conditions": [{"type": "Analyzed", "status": "False"}],
+            "steps": {
+                "analysis": {
+                    "results": [{"name": "ar-fail", "outcome": "Failed"}],
+                },
+            },
+        }
+
+        amender.amend(turn, status)
+
+        response = _get_response(turn)
+        assert "LLM timeout" in response
+
+    def test_escalation_results(self) -> None:
+        """Test escalation step results are captured."""
+        esc_status: dict[str, Any] = {
+            "conditions": [],
+            "summary": "Requires manual intervention",
+            "content": "Cluster admin must review",
+        }
+        cli = MockCLI({"escalationresults/esc-1": {"status": esc_status}})
+        amender = ProposalAmender(cli)
+        turn = _make_turn()
+        status: dict[str, Any] = {
+            "conditions": [{"type": "Escalated", "status": "True"}],
+            "steps": {
+                "escalation": {
+                    "results": [{"name": "esc-1", "outcome": "Succeeded"}],
+                },
+            },
+        }
+
+        amender.amend(turn, status)
+
+        results = _get_results(turn)
+        assert "escalation" in results
+        response = _get_response(turn)
+        assert "Requires manual intervention" in response
+        assert "Cluster admin must review" in response
+
+    def test_outcome_section_from_conditions(self) -> None:
+        """Test outcome section uses condition messages."""
+        cli = MockCLI()
+        amender = ProposalAmender(cli)
+        turn = _make_turn()
+        status: dict[str, Any] = {
+            "conditions": [
+                {"type": "Analyzed", "status": "True", "message": "Analysis complete"},
+                {"type": "Executed", "status": "True", "message": "Execution done"},
+            ],
+        }
+
+        amender.amend(turn, status)
+
+        response = _get_response(turn)
+        assert "## Outcome" in response
+        assert "Analysis complete; Execution done" in response
+
+    def test_subprocess_error_caught_by_amend(self, mocker: MockerFixture) -> None:
+        """Test that subprocess errors are caught by the broadened except clause."""
+        cli = MockCLI()
+        amender = ProposalAmender(cli)
+        turn = _make_turn()
+        status: dict[str, Any] = {
+            "conditions": [],
+            "steps": {
+                "analysis": {
+                    "results": [{"name": "ar-1", "outcome": "Succeeded"}],
+                },
+            },
+        }
+        mocker.patch.object(
+            cli,
+            "get_resource",
+            side_effect=subprocess.SubprocessError("unexpected"),
+        )
+
+        err = amender.amend(turn, status)
+
+        assert err is not None
+        assert "ProposalAmender error" in err
+
+
+class TestKubeCLITimeoutHandling:
+    """Test KubeCLI normalizes subprocess.TimeoutExpired to CompletedProcess."""
+
+    def test_run_returns_completed_process_on_timeout(
+        self, mocker: MockerFixture
+    ) -> None:
+        """Test that TimeoutExpired is caught and a failing CompletedProcess returned."""
+        mocker.patch(
+            "subprocess.run",
+            side_effect=subprocess.TimeoutExpired(cmd="oc get pods", timeout=30),
+        )
+
+        cli = KubeCLI(cli_path="oc", namespace="test-ns", timeout=30)
+        result = cli.run(["get", "pods"])
+
+        assert result.returncode == 1
+        assert result.stdout == ""
+        assert "timed out after 30s" in result.stderr
+
+    def test_get_resource_returns_error_on_timeout(self, mocker: MockerFixture) -> None:
+        """Test that get_resource returns an error tuple when run times out."""
+        mocker.patch(
+            "subprocess.run",
+            side_effect=subprocess.TimeoutExpired(cmd="oc get pods", timeout=30),
+        )
+
+        cli = KubeCLI(cli_path="oc", namespace="test-ns", timeout=30)
+        resource, err = cli.get_resource("pods", "my-pod")
+
+        assert resource == {}
+        assert err is not None
+        assert "timed out" in err
diff --git a/tests/unit/pipeline/evaluation/test_proposal_driver.py b/tests/unit/pipeline/evaluation/test_proposal_driver.py
index 32d5fb22..3bcbc022 100644
--- a/tests/unit/pipeline/evaluation/test_proposal_driver.py
+++ b/tests/unit/pipeline/evaluation/test_proposal_driver.py
@@ -57,6 +57,7 @@ def test_valid_config_all_fields(self) -> None:
                 "auto_approve": False,
                 "cleanup_proposals": False,
                 "timeout": 60,
+                "cli_timeout": 15,
                 "poll_interval": 5,
             }
         )
@@ -64,6 +65,7 @@ def test_valid_config_all_fields(self) -> None:
         assert config.auto_approve is False
         assert config.cleanup_proposals is False
         assert config.timeout == 60
+        assert config.cli_timeout == 15
         assert config.poll_interval == 5
 
     def test_valid_config_defaults(self) -> None:
@@ -72,6 +74,7 @@ def test_valid_config_defaults(self) -> None:
         assert config.auto_approve is True
         assert config.cleanup_proposals is True
         assert config.timeout == 900
+        assert config.cli_timeout == 30
         assert config.poll_interval == 2
 
     def test_missing_namespace(self) -> None:
@@ -94,6 +97,16 @@ def test_invalid_timeout_negative(self) -> None:
         with pytest.raises(ValidationError):
             ProposalAgentConfig.model_validate({**VALID_CONFIG, "timeout": -1})
 
+    def test_invalid_cli_timeout_zero(self) -> None:
+        """Test cli_timeout=0 raises ValidationError."""
+        with pytest.raises(ValidationError):
+            ProposalAgentConfig.model_validate({**VALID_CONFIG, "cli_timeout": 0})
+
+    def test_invalid_cli_timeout_negative(self) -> None:
+        """Test negative cli_timeout raises ValidationError."""
+        with pytest.raises(ValidationError):
+            ProposalAgentConfig.model_validate({**VALID_CONFIG, "cli_timeout": -1})
+
     def test_invalid_poll_interval_zero(self) -> None:
         """Test poll_interval=0 raises ValidationError."""
         with pytest.raises(ValidationError):
@@ -476,7 +489,9 @@ def test_happy_path_completed(
 
         assert error is None
         assert conv_id is None
-        assert turn.response == "Analysis done; Passed"
+        response = str(turn.response)
+        assert "Analysis done" in response
+        assert "Passed" in response
         assert turn.proposal_status == terminal_status
         driver._cleanup.assert_called_once_with("eval-abcd1234")
 
@@ -584,14 +599,13 @@ def test_failed_terminal(
         error, _ = driver.execute_turn(turn)
 
         assert error is not None
-        assert "Failed" in error
+        assert "failed" in error
         assert turn.proposal_status == status
-        assert turn.response == "LLM error"
 
     def test_denied_terminal(
         self, mocker: MockerFixture, driver: ProposalDriver
     ) -> None:
-        """Test denied proposal returns error."""
+        """Test denied proposal populates turn data and returns no error."""
         mock_time = mocker.patch(f"{MODULE}.time")
         mock_time.monotonic.side_effect = [0.0, 0.0, 0.0, 1.0]
 
@@ -607,8 +621,8 @@ def test_denied_terminal(
         turn = TurnData(turn_id="t1", query="Q", proposal_spec=SPEC_FULL)
         error, _ = driver.execute_turn(turn)
 
-        assert error is not None
-        assert "Denied" in error
+        assert error is None
+        assert turn.proposal_status == status
 
     def test_get_status_error(
         self, mocker: MockerFixture, driver: ProposalDriver