diff --git a/Makefile b/Makefile
index 543d6075..d3a0aad5 100644
--- a/Makefile
+++ b/Makefile
@@ -114,7 +114,7 @@ shellcheck: ## Run shellcheck
 	@mkdir -p .shellcheck-stable
 	@wget -qO- "https://github.com/koalaman/shellcheck/releases/download/stable/shellcheck-stable.linux.$$(uname -m).tar.xz" | tar -xJ -C .shellcheck-stable --strip-components=1
 	@PATH="$$PWD/.shellcheck-stable:$$PATH" shellcheck --version
-	@PATH="$$PWD/.shellcheck-stable:$$PATH" find . -name "*.sh" -type f ! -path "./.venv/*" ! -path "./lsc_agent_eval/.venv/*" ! -path "./.history/*" ! -path "./.git/*" -exec shellcheck {} +
+	@PATH="$$PWD/.shellcheck-stable:$$PATH" find . -name "*.sh" -type f ! -path "./.venv/*" ! -path "./lsc_agent_eval/.venv/*" ! -path "./.history/*" ! -path "./.git/*" -exec shellcheck -e SC1091 {} +
 
 pylint:
 	uv run pylint src
diff --git a/README.md b/README.md
index d659e722..a70c7971 100644
--- a/README.md
+++ b/README.md
@@ -210,6 +210,8 @@ uv run lightspeed-eval --system-config <CONFIG.yaml> --eval-data <EVAL_DATA.yaml
     - [`keywords_eval`](src/lightspeed_evaluation/core/metrics/custom/keywords_eval.py) - Keywords evaluation with alternatives (ALL keywords must match, case insensitive)
   - Tool Evaluation
     - [`tool_eval`](src/lightspeed_evaluation/core/metrics/custom.py) - Validates tool calls, arguments, and optional results with regex pattern matching
+  - Agentic Workflow Evaluation
+    - [`proposal_evaluation_correctness`](src/lightspeed_evaluation/core/metrics/custom/custom.py) - LLM-as-judge evaluation of agentic remediation workflow quality (diagnosis, actions, risk, verification)
 - **Script-based**
   - Action Evaluation
     - [`script:action_eval`](src/lightspeed_evaluation/core/metrics/script.py) - Executes verification scripts to validate actions (e.g., infrastructure changes)
diff --git a/config/system.yaml b/config/system.yaml
index 56e7b3a8..59176a35 100644
--- a/config/system.yaml
+++ b/config/system.yaml
@@ -155,6 +155,11 @@ metrics_metadata:
       ordered: true       # true (default): sequence order matters, false: any order allowed
       full_match: true    # true (default): exact 1:1 match, false: expected tools found in actual (extras allowed)
 
+    "custom:proposal_evaluation_correctness":
+      threshold: 0.75
+      description: "LLM judge of agentic remediation workflow quality (diagnosis, actions, risk, verification)"
+      default: false
+
     # Script-based metrics
     "script:action_eval":
       description: "Script-based evaluation for infrastructure/environment validation"
diff --git a/docs/EVALUATION_GUIDE.md b/docs/EVALUATION_GUIDE.md
index 683dffce..9b293201 100644
--- a/docs/EVALUATION_GUIDE.md
+++ b/docs/EVALUATION_GUIDE.md
@@ -420,6 +420,47 @@ expected_tool_calls:
 
 ---
 
+#### Proposal Evaluation Correctness
+
+**What it measures:** How good is the agentic remediation workflow? Evaluates diagnosis, actions, risk management, and verification.
+
+**Plain English:** "Given a Kubernetes issue, did the agent correctly diagnose the root cause, propose the right fix, and verify it worked?"
+
+**Score Range:** 0.0 to 1.0 (higher is better)
+
+**How it works:** A Judge LLM evaluates the workflow summary (produced by ProposalAmender) across four aspects.
+Diagnosis Quality is the most important criterion and carries the most weight:
+1. **Diagnosis Quality** — Is the root cause correctly identified and specific? Is the reasoning sound and the confidence level appropriate?
+2. **Action Appropriateness** — Are the actions safe and well-scoped?
+3. **Risk Management** — Is the risk assessment correct?
+4. **Verification Thoroughness** — Do the checks confirm the fix?
+
+Only aspects present in the workflow are evaluated. Analysis-only workflows are scored on diagnosis quality alone.
+
+**Example:**
+```yaml
+turns:
+  - turn_id: "fix-oom"
+    proposal_spec:
+      request: "Pod CrashLoopBackOff in namespace production"
+      analysis: {}
+      execution: {}
+      verification: {}
+    turn_metrics:
+      - "custom:proposal_evaluation_correctness"
+      - "custom:proposal_status"
+    expected_proposal_status:
+      phase: "Completed"
+```
+
+**When to use:** Evaluating agentic operator workflows (Proposal CRD lifecycle)
+
+**Threshold:** 0.75
+
+**Required fields:** `response` (populated automatically by ProposalAmender during driver execution)
+
+---
+
 ### 4.3 Script-Based Metrics
 
 #### Action Evaluation
@@ -1739,6 +1780,7 @@ lightspeed-eval --eval-data config/eval_batch2.yaml
 | **custom:answer_correctness** | 0-1 | Matches expected answer | 0.75 | query, response, expected_response |
 | **custom:intent_eval** | 0/1 | Has right intent | 1 | query, response, expected_intent |
 | **custom:tool_eval** | 0/1 | Called correct tools with expected results | 1 | expected_tool_calls, tool_calls |
+| **custom:proposal_evaluation_correctness** | 0-1 | Agentic workflow quality (diagnosis, actions, risk) | 0.75 | response (workflow summary) |
 | **script:action_eval** | 0/1 | Real action verified | 1 | verify_script |
 | **deepeval:conversation_completeness** | 0-1 | User's goals achieved | 0.8 | Full conversation |
 | **deepeval:conversation_relevancy** | 0-1 | Stayed on topic | 0.7 | Full conversation |
diff --git a/src/lightspeed_evaluation/core/metrics/custom/custom.py b/src/lightspeed_evaluation/core/metrics/custom/custom.py
index 68954501..234c9fd8 100644
--- a/src/lightspeed_evaluation/core/metrics/custom/custom.py
+++ b/src/lightspeed_evaluation/core/metrics/custom/custom.py
@@ -1,5 +1,6 @@
 """Custom metrics using direct LLM integration."""
 
+import json
 import re
 from typing import TYPE_CHECKING, Any, Optional
 
@@ -9,6 +10,7 @@
 from lightspeed_evaluation.core.metrics.custom.prompts import (
     ANSWER_CORRECTNESS_PROMPT,
     INTENT_EVALUATION_PROMPT,
+    PROPOSAL_EVALUATION_CORRECTNESS_PROMPT,
 )
 from lightspeed_evaluation.core.metrics.custom.proposal_eval import (
     evaluate_proposal_status,
@@ -47,6 +49,9 @@ def __init__(
             "intent_eval": self._evaluate_intent,
             "tool_eval": self._evaluate_tool_calls,
             "proposal_status": evaluate_proposal_status,
+            "proposal_evaluation_correctness": (
+                self._evaluate_proposal_evaluation_correctness
+            ),
         }
 
         print(f"✅ Custom Metrics initialized: {self.llm.model_name}")
@@ -295,3 +300,119 @@ def _evaluate_intent(
             return score, reason
         except LLMError as e:
             return None, f"Intent evaluation failed: {str(e)}"
+
+    def _parse_proposal_eval_response(
+        self, response: str
+    ) -> tuple[Optional[float], str]:
+        """Parse JSON LLM judge response for proposal evaluation.
+
+        Expected JSON schema::
+
+            {
+              "reasoning": "string",
+              "diagnosis": float | null,
+              "execution": float | null,
+              "verification": float | null,
+              "average": float
+            }
+        """
+        try:
+            data = json.loads(response)
+        except json.JSONDecodeError:
+            return None, f"Invalid JSON from LLM: {response[:120]}"
+
+        reasoning: str = data.get("reasoning", "")
+        sub_scores: dict[str, Optional[float]] = {
+            "diagnosis": self._try_parse_float(data.get("diagnosis")),
+            "execution": self._try_parse_float(data.get("execution")),
+            "verification": self._try_parse_float(data.get("verification")),
+        }
+        average: Optional[float] = self._try_parse_float(data.get("average"))
+
+        present = [v for v in sub_scores.values() if v is not None]
+        if average is None and present:
+            average = sum(present) / len(present)
+
+        parts = [
+            f"{dim}={v:.2f}" if v is not None else f"{dim}=N/A"
+            for dim, v in sub_scores.items()
+        ]
+        if average is not None:
+            parts.append(f"avg={average:.2f}")
+        detail = ", ".join(parts)
+        if reasoning:
+            detail = f"{detail} — {reasoning}"
+
+        return average, detail
+
+    @staticmethod
+    def _try_parse_float(value: Any) -> Optional[float]:
+        """Try to parse a float from a value, return None on failure."""
+        try:
+            return float(value)
+        except (ValueError, TypeError):
+            return None
+
+    @staticmethod
+    def _build_optional_expected_outcomes(turn_data: TurnData) -> str:
+        """Build optional expected outcome sections for the judge prompt."""
+        sections: list[str] = []
+        mapping = {
+            "Expected Analysis Outcome": turn_data.expected_analysis_outcome,
+            "Expected Execution Outcome": turn_data.expected_execution_outcome,
+            "Expected Verification Outcome": turn_data.expected_verification_outcome,
+        }
+        for label, value in mapping.items():
+            if value:
+                sections.append(f"\n### {label}\n{value}")
+        return "\n".join(sections)
+
+    @staticmethod
+    def _build_workflow_phases(turn_data: TurnData) -> str:
+        """Build the workflow phases string for the judge prompt."""
+        phases = turn_data.proposal_phases
+        if phases:
+            return "Phases executed: " + ", ".join(phases)
+        return "Phases executed: unknown (score only dimensions visible in the workflow summary)"
+
+    def _evaluate_proposal_evaluation_correctness(
+        self,
+        _conv_data: Any,
+        _turn_idx: Optional[int],
+        turn_data: Optional[TurnData],
+        is_conversation: bool,
+    ) -> tuple[Optional[float], str]:
+        """Evaluate agentic remediation workflow quality using LLM judge."""
+        if is_conversation:
+            return None, "Proposal evaluation correctness is a turn-level metric"
+
+        if turn_data is None or not turn_data.response:
+            return None, "TurnData with response is required for proposal evaluation"
+
+        if not turn_data.expected_outcome:
+            return None, "No expected outcome provided for proposal evaluation"
+
+        optional_sections = self._build_optional_expected_outcomes(turn_data)
+        workflow_phases = self._build_workflow_phases(turn_data)
+
+        prompt = PROPOSAL_EVALUATION_CORRECTNESS_PROMPT.format(
+            request=turn_data.query or "N/A",
+            workflow_phases=workflow_phases,
+            workflow_summary=turn_data.response,
+            expected_outcome=turn_data.expected_outcome,
+            optional_expected_outcomes=optional_sections,
+        )
+
+        try:
+            llm_response = self._call_llm(prompt)
+            score, reason = self._parse_proposal_eval_response(llm_response)
+
+            if score is None:
+                return (
+                    None,
+                    f"Could not parse score from LLM response: {llm_response[:100]}...",
+                )
+
+            return score, f"Proposal evaluation correctness: {reason}"
+        except LLMError as e:
+            return None, f"Proposal evaluation correctness failed: {str(e)}"
diff --git a/src/lightspeed_evaluation/core/metrics/custom/prompts.py b/src/lightspeed_evaluation/core/metrics/custom/prompts.py
index f04f2bdb..d31a1859 100644
--- a/src/lightspeed_evaluation/core/metrics/custom/prompts.py
+++ b/src/lightspeed_evaluation/core/metrics/custom/prompts.py
@@ -35,7 +35,7 @@
 
 Examples of intent evaluation:
 - If expected intent is "provide instructions", check if the response is instructional
-- If expected intent is "explain a concept", check if the response is explanatory  
+- If expected intent is "explain a concept", check if the response is explanatory
 - If expected intent is "refuse or decline", check if the response is declining to help
 - If expected intent is "ask for clarification", check if the response is asking questions
 
@@ -44,3 +44,70 @@
 Format your response as:
 Score: [0 or 1]
 Reason: [your detailed explanation]"""
+
+# Proposal Evaluation Correctness Prompt
+PROPOSAL_EVALUATION_CORRECTNESS_PROMPT = """You are a senior Site Reliability Engineer evaluating an automated remediation workflow on an OpenShift/Kubernetes cluster. You must be strict, objective, and critical. Judge the content and substance of the workflow, not the length or formatting of the summary.
+
+## Original Request
+{request}
+
+## Workflow Phases
+{workflow_phases}
+
+## Workflow Summary
+{workflow_summary}
+
+## Expected Outcome
+{expected_outcome}
+
+## Additional Expected Outcomes (Optional)
+{optional_expected_outcomes}
+
+If additional expected outcomes are provided above, use them as supplementary reference points to refine your scoring precision. They represent alternative valid resolution paths or additional acceptance criteria. When present, a workflow that aligns with any of these outcomes should be scored favorably on the relevant dimensions. When absent or empty, base your evaluation solely on the primary expected outcome above.
+
+## Evaluation Criteria
+Compare the workflow summary against the expected outcome (and any additional expected outcomes, if provided) on each dimension independently:
+
+1. **Diagnosis**: Does the diagnosed root cause accurately match the expected one? Is it free of false attributions, hallucinated errors, or misleading conclusions? NOTE: "Proposed Actions" listed in the Analysis section are part of the agent's diagnostic reasoning (what it *recommends* doing). Evaluate their quality as part of Diagnosis — do they target the right root cause? Are the recommendations sound and safe?
+2. **Execution**: Were the remediation actions actually carried out? Did they produce the intended effect? Are they safe, well-scoped, and minimal? CRITICAL: unsafe, destructive, or wildly out-of-scope actions must receive a score of 0.2 or lower, regardless of diagnosis accuracy. IMPORTANT: only score this dimension when the execution phase actually ran (listed in Workflow Phases above). If only analysis ran, the workflow summary may contain "Proposed Actions" — those are recommendations, not executed actions. Do NOT score them under Execution; they belong to Diagnosis.
+3. **Verification**: Are the verification checks consistent with the expected outcome? Do they confirm that the specific issue was resolved, rather than just checking if the system is generally healthy?
+
+**Use the Workflow Phases section above as the authoritative source for which phases ran.** Only score dimensions whose corresponding phase is listed. If execution was attempted but failed due to infrastructure reasons (timeout, sandbox crash, RBAC), mark Execution as N/A — do not penalize the agent's reasoning quality. Mark absent dimensions as null.
+
+## Scoring Rubric (apply per dimension)
+- **1.0**: Perfect alignment with the expected outcome for this dimension.
+- **0.7 - 0.9**: Correct direction, but slightly suboptimal, over-scoped, or missing minor details (still safe).
+- **0.4 - 0.6**: Partially correct but with significant gaps, inefficiencies, or poor scoping.
+- **0.1 - 0.3**: Incorrect, does not address the issue, or introduces safety/security risks.
+- **0.0**: Total failure, hallucinated content, or catastrophically unsafe.
+
+## Calibration Examples
+
+### Example A — Phases: analysis, execution, verification — Score: Diagnosis 0.9, Execution 0.8, Verification 0.8, Average 0.83
+Request: "Pod frontend-abc is in CrashLoopBackOff"
+Expected: "Root cause: OOMKilled due to memory limit of 128Mi. Increase memory limit to 512Mi. Verify pod is Running."
+Workflow: Correctly diagnosed OOMKilled from container lastState. Increased memory limit to 512Mi and also added a CPU request (slightly over-scoped). Verified pod reached Running state.
+Why: Diagnosis was accurate (0.9). Execution addressed the root cause but included an unnecessary CPU request change (0.8). Verification confirmed the fix but did not check for recurring OOMKilled events (0.8).
+
+### Example B — Phases: analysis, execution — Score: Diagnosis 0.2, Execution 0.1, Verification N/A, Average 0.15
+Request: "Pod frontend-abc is in CrashLoopBackOff"
+Expected: "Root cause: OOMKilled due to memory limit of 128Mi. Increase memory limit to 512Mi."
+Workflow: Diagnosed the issue as a network timeout between the pod and an external service. Executed a restart of the cluster DNS operator.
+Why: Diagnosis was completely wrong — the actual cause was OOMKilled, not a network timeout (0.2). Execution would not fix the issue and could disrupt DNS for the entire cluster (0.1). Verification was not configured (N/A).
+
+### Example C — Phases: analysis — Score: Diagnosis 1.0, Execution N/A, Verification N/A, Average 1.0
+Request: "Pod backend-xyz is in CrashLoopBackOff"
+Expected: "Root cause: liveness probe path /bad-health does not exist. Fix the probe path to /healthz."
+Workflow: Correctly diagnosed the liveness probe misconfiguration. Proposed patching the probe path to /healthz. Execution failed with: "context deadline exceeded" (sandbox pod timeout). No verification was performed.
+Why: Diagnosis was perfect (1.0). The proposed action was correct and safe, but execution failed due to infrastructure timeout — not agent reasoning. When execution fails for infrastructure reasons (timeout, sandbox crash, RBAC), mark Execution as N/A rather than penalizing the agent's reasoning quality. Verification was never reached (N/A).
+
+## Output Format
+Use below json format for your response. Do not add any additional text apart from json output.
+
+{{
+  "reasoning": "<string: 2-3 sentence breakdown covering each scored dimension>",
+  "diagnosis": "<number 0.0-1.0>",
+  "execution": "<number 0.0-1.0 or null if N/A>",
+  "verification": "<number 0.0-1.0 or null if N/A>",
+  "average": "<number: mean of non-null dimensions, e.g. diagnosis=0.9 execution=0.8 verification=null → (0.9+0.8)/2=0.85>"
+}}"""
diff --git a/src/lightspeed_evaluation/core/metrics/custom/proposal_eval.py b/src/lightspeed_evaluation/core/metrics/custom/proposal_eval.py
index bbb3cabe..316a4b32 100644
--- a/src/lightspeed_evaluation/core/metrics/custom/proposal_eval.py
+++ b/src/lightspeed_evaluation/core/metrics/custom/proposal_eval.py
@@ -3,53 +3,7 @@
 from typing import Any, Optional
 
 from lightspeed_evaluation.core.models import TurnData
-
-
-def _derive_phase(
-    conditions: list[dict[str, Any]],
-    proposal_spec: Optional[dict[str, Any]] = None,
-) -> str:
-    """Derive the terminal phase from CRD conditions.
-
-    Args:
-        conditions: List of condition dicts from proposal_status.
-        proposal_spec: Proposal spec to determine the last expected step.
-
-    Returns:
-        Phase string: Completed, Failed, Denied, Escalated, or InProgress.
-    """
-    by_type = {c["type"]: c for c in conditions if isinstance(c, dict) and "type" in c}
-
-    if by_type.get("Denied", {}).get("status") == "True":
-        return "Denied"
-    if by_type.get("Escalated", {}).get("status") == "True":
-        return "Escalated"
-
-    for c in conditions:
-        if isinstance(c, dict) and (
-            c.get("type") in {"Analyzed", "Executed", "Verified"}
-            and c.get("status") == "False"
-            and c.get("reason") != "RetryingExecution"
-        ):
-            return "Failed"
-
-    step_to_condition = {"verification": "Verified", "execution": "Executed"}
-    if proposal_spec:
-        last = next(
-            (cond for step, cond in step_to_condition.items() if step in proposal_spec),
-            "Analyzed",
-        )
-    else:
-        last = "Analyzed"
-        for step in ("Verified", "Executed", "Analyzed"):
-            if by_type.get(step, {}).get("status") == "True":
-                last = step
-                break
-
-    if by_type.get(last, {}).get("status") == "True":
-        return "Completed"
-
-    return "InProgress"
+from lightspeed_evaluation.core.proposal import derive_phase
 
 
 def _check_phase(
@@ -62,7 +16,7 @@ def _check_phase(
     if phase is None:
         return None
 
-    actual = _derive_phase(conditions, proposal_spec)
+    actual = derive_phase(conditions, proposal_spec)
     if actual == phase:
         return True, f"Phase matches: {actual}"
     return False, f"Phase mismatch: expected '{phase}', got '{actual}'"
@@ -78,7 +32,7 @@ def _check_phase_in(
     if phase_in is None:
         return None
 
-    actual = _derive_phase(conditions, proposal_spec)
+    actual = derive_phase(conditions, proposal_spec)
     if actual in phase_in:
         return True, f"Phase '{actual}' in {phase_in}"
     return False, f"Phase '{actual}' not in {phase_in}"
diff --git a/src/lightspeed_evaluation/core/models/data.py b/src/lightspeed_evaluation/core/models/data.py
index ae88af2c..90c9153d 100644
--- a/src/lightspeed_evaluation/core/models/data.py
+++ b/src/lightspeed_evaluation/core/models/data.py
@@ -76,6 +76,26 @@ class TurnData(StreamingMetricsMixin):
     expected_intent: Optional[str] = Field(
         default=None, min_length=1, description="Expected intent for intent evaluation"
     )
+    expected_outcome: Optional[str] = Field(
+        default=None,
+        min_length=1,
+        description="Expected outcome for proposal evaluation correctness",
+    )
+    expected_analysis_outcome: Optional[str] = Field(
+        default=None,
+        min_length=1,
+        description="Expected analysis/diagnosis outcome for proposal evaluation",
+    )
+    expected_execution_outcome: Optional[str] = Field(
+        default=None,
+        min_length=1,
+        description="Expected execution/action outcome for proposal evaluation",
+    )
+    expected_verification_outcome: Optional[str] = Field(
+        default=None,
+        min_length=1,
+        description="Expected verification outcome for proposal evaluation",
+    )
     conversation_id: Optional[str] = Field(
         default=None, description="Conversation ID - populated by API if enabled"
     )
@@ -122,6 +142,14 @@ class TurnData(StreamingMetricsMixin):
     proposal_status: Optional[dict[str, Any]] = Field(
         default=None, description="Raw CRD status populated by ProposalDriver"
     )
+    proposal_results: Optional[dict[str, Any]] = Field(
+        default=None,
+        description="Structured results from child Result CRs, populated by ProposalAmender",
+    )
+    proposal_phases: Optional[list[str]] = Field(
+        default=None,
+        description="Workflow phases that actually executed (e.g. ['analysis', 'execution'])",
+    )
 
     # Set of turn metrics that don't pass the validation to ignore them later
     _invalid_metrics: set[str] = set()
diff --git a/src/lightspeed_evaluation/core/proposal/__init__.py b/src/lightspeed_evaluation/core/proposal/__init__.py
new file mode 100644
index 00000000..5e35bfab
--- /dev/null
+++ b/src/lightspeed_evaluation/core/proposal/__init__.py
@@ -0,0 +1,5 @@
+"""Proposal CRD domain logic."""
+
+from lightspeed_evaluation.core.proposal.phase import derive_phase
+
+__all__ = ["derive_phase"]
diff --git a/src/lightspeed_evaluation/core/proposal/phase.py b/src/lightspeed_evaluation/core/proposal/phase.py
new file mode 100644
index 00000000..e892afd6
--- /dev/null
+++ b/src/lightspeed_evaluation/core/proposal/phase.py
@@ -0,0 +1,50 @@
+"""Derive terminal phase from Proposal CRD conditions."""
+
+from typing import Any, Optional
+
+
+def derive_phase(
+    conditions: list[dict[str, Any]],
+    proposal_spec: Optional[dict[str, Any]] = None,
+) -> str:
+    """Derive the terminal phase from CRD conditions.
+
+    Args:
+        conditions: List of condition dicts from proposal_status.
+        proposal_spec: Proposal spec to determine the last expected step.
+
+    Returns:
+        Phase string: Completed, Failed, Denied, Escalated, or InProgress.
+    """
+    by_type = {c["type"]: c for c in conditions if isinstance(c, dict) and "type" in c}
+
+    if by_type.get("Denied", {}).get("status") == "True":
+        return "Denied"
+    if by_type.get("Escalated", {}).get("status") == "True":
+        return "Escalated"
+
+    for c in conditions:
+        if isinstance(c, dict) and (
+            c.get("type") in {"Analyzed", "Executed", "Verified"}
+            and c.get("status") == "False"
+            and c.get("reason") != "RetryingExecution"
+        ):
+            return "Failed"
+
+    step_to_condition = {"verification": "Verified", "execution": "Executed"}
+    if proposal_spec:
+        last = next(
+            (cond for step, cond in step_to_condition.items() if step in proposal_spec),
+            "Analyzed",
+        )
+    else:
+        last = "Analyzed"
+        for step in ("Verified", "Executed", "Analyzed"):
+            if by_type.get(step, {}).get("status") == "True":
+                last = step
+                break
+
+    if by_type.get(last, {}).get("status") == "True":
+        return "Completed"
+
+    return "InProgress"
diff --git a/src/lightspeed_evaluation/core/system/validator.py b/src/lightspeed_evaluation/core/system/validator.py
index 53036e24..d1f8aa96 100644
--- a/src/lightspeed_evaluation/core/system/validator.py
+++ b/src/lightspeed_evaluation/core/system/validator.py
@@ -62,6 +62,13 @@
         "required_fields": ["expected_proposal_status"],
         "description": "requires 'expected_proposal_status' field",
     },
+    "custom:proposal_evaluation_correctness": {
+        "required_fields": ["response", "expected_outcome"],
+        "description": (
+            "requires 'response' and 'expected_outcome' fields "
+            "(Markdown workflow summary from ProposalAmender)"
+        ),
+    },
     "script:action_eval": {
         "required_fields": ["verify_script"],
         "description": "requires 'verify_script' field",
diff --git a/src/lightspeed_evaluation/pipeline/evaluation/cli.py b/src/lightspeed_evaluation/pipeline/evaluation/cli.py
new file mode 100644
index 00000000..5653258b
--- /dev/null
+++ b/src/lightspeed_evaluation/pipeline/evaluation/cli.py
@@ -0,0 +1,131 @@
+"""CLI client abstraction for Kubernetes cluster operations."""
+
+from __future__ import annotations
+
+import json
+import os
+import subprocess
+from abc import ABC, abstractmethod
+from typing import Any, Optional
+
+
+class CLIClient(ABC):
+    """Interface for CLI operations against a Kubernetes cluster."""
+
+    def __init__(self, timeout: int) -> None:
+        """Initialize with a command timeout in seconds."""
+        self._timeout = timeout
+
+    @abstractmethod
+    def run(
+        self,
+        args: list[str],
+        stdin: Optional[str] = None,
+    ) -> subprocess.CompletedProcess[str]:
+        """Run a CLI command and return the completed process."""
+
+    @abstractmethod
+    def get_resource(
+        self,
+        resource_plural: str,
+        name: str,
+    ) -> tuple[dict[str, Any], Optional[str]]:
+        """Fetch a single resource by name.
+
+        Returns:
+            Tuple of (resource_dict, error_message). On success the dict is
+            the full JSON object; on failure the dict is empty.
+        """
+
+    @abstractmethod
+    def apply(
+        self,
+        manifest: dict[str, Any],
+    ) -> subprocess.CompletedProcess[str]:
+        """Apply a manifest via stdin."""
+
+    @abstractmethod
+    def delete(self, resource_plural: str, name: str) -> None:
+        """Delete a resource by name (idempotent)."""
+
+
+class KubeCLI(CLIClient):
+    """Concrete CLIClient backed by oc or kubectl."""
+
+    def __init__(self, cli_path: str, namespace: str, timeout: int) -> None:
+        """Initialize with a resolved binary path and target namespace."""
+        super().__init__(timeout)
+        self._cli = cli_path
+        self._namespace = namespace
+
+    def run(
+        self,
+        args: list[str],
+        stdin: Optional[str] = None,
+    ) -> subprocess.CompletedProcess[str]:
+        """Run a CLI command and return the completed process."""
+        try:
+            return subprocess.run(
+                [self._cli, *args],
+                input=stdin,
+                text=True,
+                capture_output=True,
+                env=os.environ.copy(),
+                timeout=self._timeout,
+                check=False,
+            )
+        except subprocess.TimeoutExpired:
+            cmd_str = " ".join([self._cli, *args])
+            return subprocess.CompletedProcess(
+                args=[self._cli, *args],
+                returncode=1,
+                stdout="",
+                stderr=f"Command timed out after {self._timeout}s: {cmd_str}",
+            )
+
+    def get_resource(
+        self,
+        resource_plural: str,
+        name: str,
+    ) -> tuple[dict[str, Any], Optional[str]]:
+        """Fetch a single resource by name.
+
+        Returns:
+            Tuple of (resource_dict, error_message). On success the dict is
+            the full JSON object; on failure the dict is empty.
+        """
+        result = self.run(
+            ["get", resource_plural, name, "-n", self._namespace, "-o", "json"]
+        )
+        if result.returncode != 0:
+            return (
+                {},
+                f"Failed to get {resource_plural}/{name}: " f"{result.stderr.strip()}",
+            )
+        try:
+            return json.loads(result.stdout), None
+        except json.JSONDecodeError as exc:
+            return (
+                {},
+                f"Failed to parse JSON for {resource_plural}/{name}: {exc}",
+            )
+
+    def apply(
+        self,
+        manifest: dict[str, Any],
+    ) -> subprocess.CompletedProcess[str]:
+        """Apply a manifest via stdin."""
+        return self.run(["apply", "-f", "-"], stdin=json.dumps(manifest))
+
+    def delete(self, resource_plural: str, name: str) -> None:
+        """Delete a resource by name (idempotent)."""
+        self.run(
+            [
+                "delete",
+                resource_plural,
+                name,
+                "-n",
+                self._namespace,
+                "--ignore-not-found",
+            ]
+        )
diff --git a/src/lightspeed_evaluation/pipeline/evaluation/driver.py b/src/lightspeed_evaluation/pipeline/evaluation/driver.py
index 1c46b947..8432967c 100644
--- a/src/lightspeed_evaluation/pipeline/evaluation/driver.py
+++ b/src/lightspeed_evaluation/pipeline/evaluation/driver.py
@@ -2,9 +2,7 @@
 
 from __future__ import annotations
 
-import json
 import logging
-import os
 import re
 import shutil
 import subprocess
@@ -15,17 +13,19 @@
 from typing import Any, Optional, cast
 
 from lightspeed_evaluation.core.api import APIClient
-from lightspeed_evaluation.core.metrics.custom.proposal_eval import (
-    _derive_phase,
-)
 from lightspeed_evaluation.core.models import (
     APIConfig,
     HttpApiAgentConfig,
     ProposalAgentConfig,
     TurnData,
 )
+from lightspeed_evaluation.core.proposal import derive_phase
 from lightspeed_evaluation.core.system.exceptions import ConfigurationError
 from lightspeed_evaluation.pipeline.evaluation.amender import APIDataAmender
+from lightspeed_evaluation.pipeline.evaluation.cli import KubeCLI
+from lightspeed_evaluation.pipeline.evaluation.proposal_amender import (
+    ProposalAmender,
+)
 
 logger = logging.getLogger(__name__)
 
@@ -140,6 +140,12 @@ def __init__(self, config: dict[str, Any], *, enabled: bool = True) -> None:
         """Initialize the proposal driver."""
         super().__init__(config, enabled=enabled)
         self._cli = self._resolve_cli()
+        self._kube_cli = KubeCLI(
+            cli_path=self._cli,
+            namespace=self._config.namespace,
+            timeout=self._config.cli_timeout,
+        )
+        self._amender = ProposalAmender(self._kube_cli)
 
     def validate_config(self, config: dict[str, Any]) -> ProposalAgentConfig:
         """Validate proposal driver configuration."""
@@ -205,75 +211,64 @@ def execute_turn(
                 None,
             )
 
-        turn_data.response = self._extract_summary(status_dict)
-        turn_data.proposal_status = status_dict
+        self._amend_turn_data(turn_data, status_dict)
         self._cleanup(cr_name)
+        return self._outcome_to_result(outcome, cr_name)
+
+    def _amend_turn_data(
+        self, turn_data: TurnData, status_dict: dict[str, Any]
+    ) -> None:
+        """Amend turn data from proposal status, with fallback on amender failure."""
+        amend_err = self._amender.amend(turn_data, status_dict)
+        if amend_err:
+            logger.warning("ProposalAmender failed: %s", amend_err)
+            if not turn_data.response:
+                turn_data.response = self._extract_summary(status_dict)
+            if not turn_data.proposal_status:
+                turn_data.proposal_status = status_dict
+
+    # Failed/Escalated → error (pipeline marks remaining turns as ERROR,
+    # metrics are NOT evaluated). Denied/Completed → no error (metrics run).
+    _OUTCOME_ERRORS: dict[TerminalOutcome, str] = {
+        TerminalOutcome.FAILED: "Proposal '{cr_name}' execution failed",
+        TerminalOutcome.ESCALATED: (
+            "Proposal '{cr_name}' escalated after verification failure"
+        ),
+    }
 
-        if outcome == TerminalOutcome.COMPLETED:
-            return (None, None)
-        return (
-            f"Proposal '{cr_name}' terminated with outcome: {outcome}",
-            None,
-        )
+    @staticmethod
+    def _outcome_to_result(
+        outcome: Optional[TerminalOutcome], cr_name: str
+    ) -> tuple[Optional[str], None]:
+        """Map a terminal outcome to an (error_message, None) result tuple."""
+        template = ProposalDriver._OUTCOME_ERRORS.get(outcome)  # type: ignore[arg-type]
+        if template:
+            return (template.format(cr_name=cr_name), None)
+        if outcome != TerminalOutcome.COMPLETED:
+            logger.warning(
+                "Proposal '%s' terminated with outcome: %s", cr_name, outcome
+            )
+        return (None, None)
 
     @staticmethod
     def _resolve_cli() -> str:
         """Resolve oc or kubectl binary path."""
         return shutil.which("oc") or shutil.which("kubectl") or ""
 
-    def _run_cli(
-        self,
-        args: list[str],
-        stdin: Optional[str] = None,
-    ) -> subprocess.CompletedProcess[str]:
-        """Run a CLI command and return the result."""
-        return subprocess.run(
-            [self._cli, *args],
-            input=stdin,
-            text=True,
-            capture_output=True,
-            env=os.environ.copy(),
-            timeout=self._config.cli_timeout,
-            check=False,
-        )
-
     def _apply(self, manifest: dict[str, Any]) -> subprocess.CompletedProcess[str]:
         """Apply a CR manifest via stdin."""
-        return self._run_cli(["apply", "-f", "-"], stdin=json.dumps(manifest))
+        return self._kube_cli.apply(manifest)
 
     def _get_status(self, cr_name: str) -> tuple[dict[str, Any], Optional[str]]:
         """Get Proposal CR status."""
-        result = self._run_cli(
-            [
-                "get",
-                CRD_PLURAL,
-                cr_name,
-                "-n",
-                self._config.namespace,
-                "-o",
-                "json",
-            ]
-        )
-        if result.returncode != 0:
-            return {}, f"Failed to get status for '{cr_name}': {result.stderr.strip()}"
-        try:
-            cr = json.loads(result.stdout)
-        except json.JSONDecodeError as exc:
-            return {}, f"Failed to parse status JSON for '{cr_name}': {exc}"
+        cr, err = self._kube_cli.get_resource(CRD_PLURAL, cr_name)
+        if err:
+            return {}, f"Failed to get status for '{cr_name}': {err}"
         return cr.get("status", {}), None
 
     def _delete(self, cr_name: str) -> None:
         """Delete a Proposal CR."""
-        self._run_cli(
-            [
-                "delete",
-                CRD_PLURAL,
-                cr_name,
-                "-n",
-                self._config.namespace,
-                "--ignore-not-found",
-            ]
-        )
+        self._kube_cli.delete(CRD_PLURAL, cr_name)
 
     def _cleanup(self, cr_name: str) -> None:
         """Delete the Proposal CR if cleanup is enabled."""
@@ -385,12 +380,18 @@ def _is_terminal(
         conditions: list[dict[str, Any]], proposal_spec: dict[str, Any]
     ) -> Optional[TerminalOutcome]:
         """Check if conditions indicate a terminal state."""
-        phase = _derive_phase(conditions, proposal_spec or None)
+        phase = derive_phase(conditions, proposal_spec or None)
         return ProposalDriver._PHASE_TO_OUTCOME.get(phase)
 
     @staticmethod
     def _extract_summary(status_dict: dict[str, Any]) -> str:
-        """Extract a human-readable summary from analysis results."""
+        """Extract a human-readable summary from analysis results.
+
+        Degraded fallback: only called when ProposalAmender.amend() fails.
+        Reads only condition messages, ignoring proposal_results and child
+        Result CRs (analysis/execution/verification). The full rich summary
+        is built by ProposalAmender.build_summary() in the happy path.
+        """
         conditions = status_dict.get("conditions", [])
         messages = [c["message"] for c in conditions if c.get("message")]
         return "; ".join(messages) if messages else "No summary available"
diff --git a/src/lightspeed_evaluation/pipeline/evaluation/proposal_amender.py b/src/lightspeed_evaluation/pipeline/evaluation/proposal_amender.py
new file mode 100644
index 00000000..45b29ce6
--- /dev/null
+++ b/src/lightspeed_evaluation/pipeline/evaluation/proposal_amender.py
@@ -0,0 +1,261 @@
+"""ProposalAmender — fetches child Result CRs and enriches TurnData."""
+
+from __future__ import annotations
+
+import logging
+import subprocess
+from typing import Any, Optional
+
+from lightspeed_evaluation.core.models import TurnData
+from lightspeed_evaluation.pipeline.evaluation.cli import CLIClient
+
+logger = logging.getLogger(__name__)
+
+STEP_RESOURCES: dict[str, str] = {
+    "analysis": "analysisresults",
+    "execution": "executionresults",
+    "verification": "verificationresults",
+    "escalation": "escalationresults",
+}
+
+
+class ProposalAmender:
+    """Fetches child Result CRs and enriches TurnData with structured results and summary."""
+
+    def __init__(self, cli_client: CLIClient) -> None:
+        """Initialize with a CLIClient for fetching child CRs."""
+        self._cli = cli_client
+
+    def amend(
+        self, turn_data: TurnData, proposal_status: dict[str, Any]
+    ) -> Optional[str]:
+        """Amend turn_data in-place with proposal results and Markdown summary.
+
+        Returns:
+            Error message on failure, None on success.
+        """
+        try:
+            return self._do_amend(turn_data, proposal_status)
+        except (KeyError, TypeError, ValueError, subprocess.SubprocessError) as exc:
+            return f"ProposalAmender error: {exc}"
+
+    def _do_amend(
+        self, turn_data: TurnData, proposal_status: dict[str, Any]
+    ) -> Optional[str]:
+        """Internal amend logic."""
+        turn_data.proposal_status = proposal_status
+
+        steps = proposal_status.get("steps", {})
+        if not steps:
+            turn_data.proposal_results = {}
+            turn_data.proposal_phases = []
+            turn_data.response = self.build_summary(turn_data, {})
+            return None
+
+        results: dict[str, list[dict[str, Any]]] = {}
+        for step_name, resource_plural in STEP_RESOURCES.items():
+            step_data = steps.get(step_name)
+            if step_data is None:
+                continue
+            refs = step_data.get("results", [])
+            step_results: list[dict[str, Any]] = []
+            for ref in refs:
+                ref_name = ref.get("name", "")
+                if not ref_name:
+                    continue
+                cr, err = self._cli.get_resource(resource_plural, ref_name)
+                if err:
+                    logger.warning(
+                        "Failed to fetch %s/%s: %s",
+                        resource_plural,
+                        ref_name,
+                        err,
+                    )
+                    continue
+                status = cr.get("status", {})
+                if status:
+                    step_results.append(status)
+            results[step_name] = step_results
+
+        turn_data.proposal_results = results
+        turn_data.proposal_phases = [
+            step for step in STEP_RESOURCES if results.get(step)
+        ]
+        turn_data.response = self.build_summary(turn_data, results)
+
+        return None
+
+    @staticmethod
+    def build_summary(
+        turn_data: TurnData,
+        results: dict[str, list[dict[str, Any]]],
+    ) -> str:
+        """Build a Markdown workflow summary from structured results."""
+        sections: list[str] = []
+
+        sections.append(f"## Request\n\n{turn_data.query or 'N/A'}")
+
+        analysis_results = results.get("analysis", [])
+        if analysis_results:
+            sections.append(_build_analysis_section(analysis_results))
+
+        execution_results = results.get("execution", [])
+        if execution_results:
+            sections.append(_build_execution_section(execution_results))
+
+        verification_results = results.get("verification", [])
+        if verification_results:
+            sections.append(_build_verification_section(verification_results))
+
+        escalation_results = results.get("escalation", [])
+        if escalation_results:
+            sections.append(_build_escalation_section(escalation_results))
+
+        sections.append(_build_outcome_section(turn_data))
+
+        return "\n\n".join(sections)
+
+
+def _build_analysis_section(analysis_results: list[dict[str, Any]]) -> str:
+    """Build the Analysis section from AnalysisResult statuses."""
+    lines: list[str] = ["## Analysis"]
+    for result_status in analysis_results:
+        options = result_status.get("options", [])
+        if not options:
+            failure = result_status.get("failureReason", "")
+            if failure:
+                lines.append(f"\n**Failed:** {failure}")
+            continue
+        lines.append(f"\n{len(options)} option(s) proposed")
+        for idx, option in enumerate(options):
+            label = "(Approved)" if idx == 0 else ""
+            title = option.get("title", "Untitled")
+            lines.append(f"\n### Option {idx} {label}: {title}".rstrip())
+            _append_diagnosis(lines, option.get("diagnosis", {}))
+            _append_proposal(lines, option.get("proposal", {}))
+
+    return "\n".join(lines)
+
+
+def _append_diagnosis(lines: list[str], diagnosis: dict[str, Any]) -> None:
+    """Append diagnosis details to output lines."""
+    if not diagnosis:
+        return
+    summary = diagnosis.get("summary", "")
+    confidence = diagnosis.get("confidence", "")
+    root_cause = diagnosis.get("rootCause", "")
+    if summary:
+        lines.append(f"**Diagnosis:** {summary} (Confidence: {confidence})")
+    if root_cause:
+        lines.append(f"**Root Cause:** {root_cause}")
+
+
+def _append_proposal(lines: list[str], proposal: dict[str, Any]) -> None:
+    """Append proposal details to output lines."""
+    if not proposal:
+        return
+    actions = proposal.get("actions", [])
+    if actions:
+        lines.append("**Proposed Actions:**")
+        for i, action in enumerate(actions, 1):
+            a_type = action.get("type", "")
+            a_desc = action.get("description", "")
+            lines.append(f"{i}. [{a_type}] {a_desc}")
+    risk = proposal.get("risk", "")
+    reversible = proposal.get("reversible", "")
+    if risk or reversible:
+        lines.append(f"**Risk:** {risk} | **Reversible:** {reversible}")
+    impact = proposal.get("estimatedImpact", "")
+    if impact:
+        lines.append(f"**Estimated Impact:** {impact}")
+
+
+def _build_execution_section(execution_results: list[dict[str, Any]]) -> str:
+    """Build the Execution section from ExecutionResult statuses."""
+    lines: list[str] = ["## Execution"]
+    for result_status in execution_results:
+        failure = result_status.get("failureReason", "")
+        if failure:
+            lines.append(f"\n**Failed:** {failure}")
+            continue
+        actions = result_status.get("actionsTaken", [])
+        if actions:
+            lines.append("\n**Actions Taken:**")
+            for i, action in enumerate(actions, 1):
+                lines.append(_format_execution_action(i, action))
+        verification = result_status.get("verification", {})
+        if verification:
+            condition_outcome = verification.get("conditionOutcome", "")
+            ver_summary = verification.get("summary", "")
+            lines.append(f"**Post-execution:** {condition_outcome} — {ver_summary}")
+    return "\n".join(lines)
+
+
+def _format_execution_action(index: int, action: dict[str, Any]) -> str:
+    """Format a single execution action line."""
+    a_type = action.get("type", "")
+    a_desc = action.get("description", "")
+    outcome = action.get("outcome", "")
+    line = f"{index}. [{a_type}] {a_desc} → {outcome}"
+    output = action.get("output", "")
+    if output:
+        line += f"\n   Output: {output}"
+    error = action.get("error", "")
+    if error:
+        line += f"\n   Error: {error}"
+    return line
+
+
+def _build_verification_section(
+    verification_results: list[dict[str, Any]],
+) -> str:
+    """Build the Verification section from VerificationResult statuses."""
+    lines: list[str] = ["## Verification"]
+    for result_status in verification_results:
+        failure = result_status.get("failureReason", "")
+        if failure:
+            lines.append(f"\n**Failed:** {failure}")
+            continue
+        checks = result_status.get("checks", [])
+        if checks:
+            lines.append("\n**Checks:**")
+            for check in checks:
+                name = check.get("name", "")
+                source = check.get("source", "")
+                value = check.get("value", "")
+                check_result = check.get("result", "")
+                lines.append(f"- {name} ({source}): {value} → {check_result}")
+        ver_summary = result_status.get("summary", "")
+        if ver_summary:
+            lines.append(f"**Summary:** {ver_summary}")
+    return "\n".join(lines)
+
+
+def _build_escalation_section(
+    escalation_results: list[dict[str, Any]],
+) -> str:
+    """Build the Escalation section from EscalationResult statuses."""
+    lines: list[str] = ["## Escalation"]
+    for result_status in escalation_results:
+        esc_summary = result_status.get("summary", "")
+        if esc_summary:
+            lines.append(f"\n**Summary:** {esc_summary}")
+        content = result_status.get("content", "")
+        if content:
+            lines.append(f"\n{content}")
+        failure = result_status.get("failureReason", "")
+        if failure:
+            lines.append(f"\n**Failed:** {failure}")
+    return "\n".join(lines)
+
+
+def _build_outcome_section(turn_data: TurnData) -> str:
+    """Build the Outcome section from proposal_status conditions."""
+    conditions = []
+    if turn_data.proposal_status:
+        conditions = turn_data.proposal_status.get("conditions", [])
+    if not conditions:
+        return "## Outcome\n\nNo conditions available"
+    messages = [c.get("message", "") for c in conditions if c.get("message")]
+    summary = "; ".join(messages) if messages else "No summary available"
+    return f"## Outcome\n\n{summary}"
diff --git a/tests/integration/agentic/fixtures/crashloop-probe-demo.yaml b/tests/integration/agentic/fixtures/crashloop-probe-demo.yaml
new file mode 100644
index 00000000..0c4622bc
--- /dev/null
+++ b/tests/integration/agentic/fixtures/crashloop-probe-demo.yaml
@@ -0,0 +1,39 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: crashloop-probe-demo
+  namespace: lightspeed-evaluation-test
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: crashloop-probe-demo
+  template:
+    metadata:
+      labels:
+        app: crashloop-probe-demo
+    spec:
+      containers:
+        - name: nginx
+          image: registry.access.redhat.com/ubi9/nginx-124:latest
+          ports:
+            - containerPort: 8080
+          livenessProbe:
+            httpGet:
+              path: /nonexistent-health
+              port: 8080
+            initialDelaySeconds: 5
+            periodSeconds: 3
+            failureThreshold: 2
+          securityContext:
+            allowPrivilegeEscalation: false
+            runAsNonRoot: true
+            capabilities:
+              drop: ["ALL"]
+          resources:
+            requests:
+              memory: "64Mi"
+              cpu: "50m"
+            limits:
+              memory: "128Mi"
+              cpu: "100m"
diff --git a/tests/integration/agentic/scripts/_cleanup_infra-claude-vertex.sh b/tests/integration/agentic/scripts/_cleanup_infra-claude-vertex.sh
new file mode 100644
index 00000000..7edde567
--- /dev/null
+++ b/tests/integration/agentic/scripts/_cleanup_infra-claude-vertex.sh
@@ -0,0 +1,17 @@
+#!/usr/bin/env bash
+# Common infrastructure cleanup for Claude/Vertex AI provider.
+# Sourced by per-scenario cleanup scripts — do NOT run directly.
+#
+# Deletes "eval-" prefixed operator resources (reverse order of creation)
+# and the test namespace.
+
+OPERATOR_NS="${OPERATOR_NS:-openshift-lightspeed}"
+TEST_NS="${TEST_NS:-lightspeed-evaluation-test}"
+
+oc delete proposals --all -n "$TEST_NS" --ignore-not-found
+oc delete proposalapprovals --all -n "$TEST_NS" --ignore-not-found
+oc delete sandboxtemplate eval-lightspeed-agent -n "$OPERATOR_NS" --ignore-not-found
+oc delete agent eval-default --ignore-not-found
+oc delete llmprovider eval-vertex-ai --ignore-not-found
+oc delete secret eval-llm-credentials -n "$OPERATOR_NS" --ignore-not-found
+oc delete namespace "$TEST_NS" --ignore-not-found
diff --git a/tests/integration/agentic/scripts/_cleanup_infra-openai.sh b/tests/integration/agentic/scripts/_cleanup_infra-openai.sh
new file mode 100644
index 00000000..8bc62214
--- /dev/null
+++ b/tests/integration/agentic/scripts/_cleanup_infra-openai.sh
@@ -0,0 +1,15 @@
+#!/usr/bin/env bash
+# Common infrastructure cleanup for OpenAI provider.
+# Sourced by per-scenario cleanup scripts — do NOT run directly.
+#
+# Deletes "eval-" prefixed operator resources (reverse order of creation).
+
+OPERATOR_NS="${OPERATOR_NS:-openshift-lightspeed}"
+TEST_NS="${TEST_NS:-lightspeed-evaluation-test}"
+
+oc delete proposals --all -n "$TEST_NS" --ignore-not-found
+oc delete proposalapprovals --all -n "$TEST_NS" --ignore-not-found
+oc delete sandboxtemplate eval-lightspeed-agent -n "$OPERATOR_NS" --ignore-not-found
+oc delete agent eval-default --ignore-not-found
+oc delete llmprovider eval-openai --ignore-not-found
+oc delete secret eval-llm-credentials -n "$OPERATOR_NS" --ignore-not-found
diff --git a/tests/integration/agentic/scripts/setup_proposal_fixtures-claude-vertex.sh b/tests/integration/agentic/scripts/_setup_infra-claude-vertex.sh
old mode 100755
new mode 100644
similarity index 80%
rename from tests/integration/agentic/scripts/setup_proposal_fixtures-claude-vertex.sh
rename to tests/integration/agentic/scripts/_setup_infra-claude-vertex.sh
index 70689a06..971839aa
--- a/tests/integration/agentic/scripts/setup_proposal_fixtures-claude-vertex.sh
+++ b/tests/integration/agentic/scripts/_setup_infra-claude-vertex.sh
@@ -1,9 +1,9 @@
 #!/usr/bin/env bash
-set -euo pipefail
-
-# Deploy agentic infrastructure + OOMKill test workload for integration tests.
-# All operator-level resources use an "eval-" prefix to avoid conflicts with
-# existing cluster resources.
+# Common infrastructure setup for Claude via Vertex AI.
+# Sourced by per-scenario setup scripts — do NOT run directly.
+#
+# Deploys: namespace, Secret, LLMProvider, Agent, SandboxTemplate.
+# All operator-level resources use an "eval-" prefix.
 #
 # Required env vars:
 #   GCP_CREDENTIALS_FILE          — path to GCP credentials JSON file
@@ -15,8 +15,10 @@ set -euo pipefail
 #   CLOUD_ML_REGION  — default: global
 #   AGENT_MODEL      — default: claude-opus-4-6
 
-OPERATOR_NS="openshift-lightspeed"
-TEST_NS="lightspeed-evaluation-test"
+set -euo pipefail
+
+export OPERATOR_NS="openshift-lightspeed"
+export TEST_NS="lightspeed-evaluation-test"
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 GCP_CREDENTIALS_FILE="${GCP_CREDENTIALS_FILE:-$HOME/.config/gcloud/application_default_credentials.json}"
 CLOUD_ML_REGION="${CLOUD_ML_REGION:-global}"
@@ -34,19 +36,17 @@ if [ ! -f "$GCP_CREDENTIALS_FILE" ]; then
   exit 1
 fi
 
-# 1. Test namespace + OOMKill workload (static fixtures, already test-scoped)
+# 1. Test namespace
 oc apply -f "$SCRIPT_DIR/../fixtures/namespace.yaml"
-oc apply -f "$SCRIPT_DIR/../fixtures/oomkill-demo.yaml"
-
 
-# 2. Secret (GCP credentials) — prefixed name in operator namespace
+# 2. Secret (GCP credentials)
 oc create secret generic eval-llm-credentials \
   --from-file=credentials.json="$GCP_CREDENTIALS_FILE" \
   --from-literal=ANTHROPIC_VERTEX_PROJECT_ID="$ANTHROPIC_VERTEX_PROJECT_ID" \
   --from-literal=CLOUD_ML_REGION="$CLOUD_ML_REGION" \
   -n "$OPERATOR_NS" --dry-run=client -o yaml | oc apply -f -
 
-# 3. LLMProvider — references prefixed secret
+# 3. LLMProvider
 cat <<EOF | oc apply -f -
 apiVersion: agentic.openshift.io/v1alpha1
 kind: LLMProvider
@@ -59,9 +59,10 @@ spec:
       name: eval-llm-credentials
     projectID: $ANTHROPIC_VERTEX_PROJECT_ID
     region: $CLOUD_ML_REGION
+    modelProvider: Anthropic
 EOF
 
-# 4. Agent — references prefixed LLMProvider
+# 4. Agent
 cat <<EOF | oc apply -f -
 apiVersion: agentic.openshift.io/v1alpha1
 kind: Agent
@@ -132,9 +133,4 @@ spec:
         emptyDir: {}
 EOF
 
-# 6. Wait for OOMKill workload to start crashing
-echo "Waiting for oomkill-demo pod to appear..."
-oc wait --for=condition=Available=false deployment/oomkill-demo \
-  -n "$TEST_NS" --timeout=60s 2>/dev/null || true
-sleep 10
-echo "Setup complete."
+echo "Infrastructure setup complete (Claude/Vertex AI)."
diff --git a/tests/integration/agentic/scripts/setup_proposal_fixtures-openai.sh b/tests/integration/agentic/scripts/_setup_infra-openai.sh
old mode 100755
new mode 100644
similarity index 67%
rename from tests/integration/agentic/scripts/setup_proposal_fixtures-openai.sh
rename to tests/integration/agentic/scripts/_setup_infra-openai.sh
index 34110ca1..dac09a7e
--- a/tests/integration/agentic/scripts/setup_proposal_fixtures-openai.sh
+++ b/tests/integration/agentic/scripts/_setup_infra-openai.sh
@@ -1,10 +1,9 @@
 #!/usr/bin/env bash
-set -euo pipefail
-
-# Deploy agentic infrastructure + OOMKill test workload for integration tests
-# using OpenAI as the LLM provider.
-# All operator-level resources use an "eval-" prefix to avoid conflicts with
-# existing cluster resources.
+# Common infrastructure setup for OpenAI provider.
+# Sourced by per-scenario setup scripts — do NOT run directly.
+#
+# Deploys: namespace, Secret, LLMProvider, Agent, SandboxTemplate.
+# All operator-level resources use an "eval-" prefix.
 #
 # Required env vars:
 #   OPENAI_API_KEY   — OpenAI API key
@@ -12,13 +11,11 @@ set -euo pipefail
 #
 # Optional env vars:
 #   AGENT_MODEL      — default: gpt-5.2
-#
-# To test with Claude via Vertex AI instead, use
-# setup_proposal_fixtures-claude-vertex.sh and set the corresponding env vars
-# (GCP_CREDENTIALS_FILE, ANTHROPIC_VERTEX_PROJECT_ID, SANDBOX_IMAGE).
 
-OPERATOR_NS="openshift-lightspeed"
-TEST_NS="lightspeed-evaluation-test"
+set -euo pipefail
+
+export OPERATOR_NS="openshift-lightspeed"
+export TEST_NS="lightspeed-evaluation-test"
 AGENT_MODEL="${AGENT_MODEL:-gpt-5.2}"
 
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
@@ -30,13 +27,10 @@ for var in OPENAI_API_KEY SANDBOX_IMAGE; do
   fi
 done
 
-# 1. Test namespace + OOMKill workload (static fixtures, already test-scoped)
+# 1. Test namespace
 oc apply -f "$SCRIPT_DIR/../fixtures/namespace.yaml"
-oc apply -f "$SCRIPT_DIR/../fixtures/oomkill-demo.yaml"
 
-# 2. Secret (OpenAI API key + provider override) — prefixed name in operator namespace
-# LIGHTSPEED_AGENT_PROVIDER and OPENAI_MODEL are injected via envFrom so the
-# sandbox picks the OpenAI provider instead of defaulting to claude.
+# 2. Secret (OpenAI API key + provider override)
 cat <<EOF | oc apply -n "$OPERATOR_NS" -f -
 apiVersion: v1
 kind: Secret
@@ -49,7 +43,7 @@ stringData:
   OPENAI_MODEL: ${AGENT_MODEL}
 EOF
 
-# 3. LLMProvider — references prefixed secret
+# 3. LLMProvider
 cat <<EOF | oc apply -f -
 apiVersion: agentic.openshift.io/v1alpha1
 kind: LLMProvider
@@ -62,7 +56,7 @@ spec:
       name: eval-llm-credentials
 EOF
 
-# 4. Agent — references prefixed LLMProvider
+# 4. Agent
 cat <<EOF | oc apply -f -
 apiVersion: agentic.openshift.io/v1alpha1
 kind: Agent
@@ -133,9 +127,4 @@ spec:
         emptyDir: {}
 EOF
 
-# 6. Wait for OOMKill workload to start crashing
-echo "Waiting for oomkill-demo pod to appear..."
-oc wait --for=condition=Available=false deployment/oomkill-demo \
-  -n "$TEST_NS" --timeout=60s 2>/dev/null || true
-sleep 10
-echo "Setup complete."
+echo "Infrastructure setup complete (OpenAI)."
diff --git a/tests/integration/agentic/scripts/cleanup_crashloop_probe-claude-vertex.sh b/tests/integration/agentic/scripts/cleanup_crashloop_probe-claude-vertex.sh
new file mode 100755
index 00000000..552f868d
--- /dev/null
+++ b/tests/integration/agentic/scripts/cleanup_crashloop_probe-claude-vertex.sh
@@ -0,0 +1,15 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Tear down CrashLoopBackOff (liveness probe) test workload and Claude/Vertex AI infrastructure.
+
+export OPERATOR_NS="openshift-lightspeed"
+export TEST_NS="lightspeed-evaluation-test"
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+
+echo "Cleaning up CrashLoopBackOff probe integration test resources..."
+oc delete deployment crashloop-probe-demo -n "$TEST_NS" --ignore-not-found
+
+# shellcheck source=_cleanup_infra-claude-vertex.sh
+source "$SCRIPT_DIR/_cleanup_infra-claude-vertex.sh"
+echo "Cleanup complete."
diff --git a/tests/integration/agentic/scripts/cleanup_crashloop_probe-openai.sh b/tests/integration/agentic/scripts/cleanup_crashloop_probe-openai.sh
new file mode 100755
index 00000000..09adcfcf
--- /dev/null
+++ b/tests/integration/agentic/scripts/cleanup_crashloop_probe-openai.sh
@@ -0,0 +1,15 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Tear down CrashLoopBackOff (liveness probe) test workload and OpenAI provider infrastructure.
+
+export OPERATOR_NS="openshift-lightspeed"
+export TEST_NS="lightspeed-evaluation-test"
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+
+echo "Cleaning up CrashLoopBackOff probe integration test resources..."
+oc delete deployment crashloop-probe-demo -n "$TEST_NS" --ignore-not-found
+
+# shellcheck source=_cleanup_infra-openai.sh
+source "$SCRIPT_DIR/_cleanup_infra-openai.sh"
+echo "Cleanup complete."
diff --git a/tests/integration/agentic/scripts/cleanup_oomkill-claude-vertex.sh b/tests/integration/agentic/scripts/cleanup_oomkill-claude-vertex.sh
new file mode 100755
index 00000000..65eb9879
--- /dev/null
+++ b/tests/integration/agentic/scripts/cleanup_oomkill-claude-vertex.sh
@@ -0,0 +1,15 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Tear down OOMKill test workload and Claude/Vertex AI provider infrastructure.
+
+export OPERATOR_NS="openshift-lightspeed"
+export TEST_NS="lightspeed-evaluation-test"
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+
+echo "Cleaning up OOMKill integration test resources..."
+oc delete deployment oomkill-demo -n "$TEST_NS" --ignore-not-found
+
+# shellcheck source=_cleanup_infra-claude-vertex.sh
+source "$SCRIPT_DIR/_cleanup_infra-claude-vertex.sh"
+echo "Cleanup complete."
diff --git a/tests/integration/agentic/scripts/cleanup_oomkill-openai.sh b/tests/integration/agentic/scripts/cleanup_oomkill-openai.sh
new file mode 100755
index 00000000..6729724c
--- /dev/null
+++ b/tests/integration/agentic/scripts/cleanup_oomkill-openai.sh
@@ -0,0 +1,15 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Tear down OOMKill test workload and OpenAI provider infrastructure.
+
+export OPERATOR_NS="openshift-lightspeed"
+export TEST_NS="lightspeed-evaluation-test"
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+
+echo "Cleaning up OOMKill integration test resources..."
+oc delete deployment oomkill-demo -n "$TEST_NS" --ignore-not-found
+
+# shellcheck source=_cleanup_infra-openai.sh
+source "$SCRIPT_DIR/_cleanup_infra-openai.sh"
+echo "Cleanup complete."
diff --git a/tests/integration/agentic/scripts/cleanup_proposal_fixtures-claude-vertex.sh b/tests/integration/agentic/scripts/cleanup_proposal_fixtures-claude-vertex.sh
deleted file mode 100755
index 4f84d111..00000000
--- a/tests/integration/agentic/scripts/cleanup_proposal_fixtures-claude-vertex.sh
+++ /dev/null
@@ -1,26 +0,0 @@
-#!/usr/bin/env bash
-set -euo pipefail
-
-# Tear down integration test resources. Only deletes "eval-" prefixed operator
-# resources to avoid touching anything that isn't ours.
-
-OPERATOR_NS="openshift-lightspeed"
-TEST_NS="lightspeed-evaluation-test"
-
-echo "Cleaning up integration test resources..."
-
-# Delete test workload + any leftover Proposals in test namespace
-oc delete deployment oomkill-demo -n "$TEST_NS" --ignore-not-found
-oc delete proposals --all -n "$TEST_NS" --ignore-not-found
-oc delete proposalapprovals --all -n "$TEST_NS" --ignore-not-found
-
-# Delete prefixed operator resources (reverse order of creation)
-oc delete sandboxtemplate eval-lightspeed-agent -n "$OPERATOR_NS" --ignore-not-found
-oc delete agent eval-default --ignore-not-found
-oc delete llmprovider eval-vertex-ai --ignore-not-found
-oc delete secret eval-llm-credentials -n "$OPERATOR_NS" --ignore-not-found
-
-# Delete test namespace (cascades remaining namespaced resources)
-oc delete namespace "$TEST_NS" --ignore-not-found
-
-echo "Cleanup complete."
diff --git a/tests/integration/agentic/scripts/cleanup_proposal_fixtures-openai.sh b/tests/integration/agentic/scripts/cleanup_proposal_fixtures-openai.sh
deleted file mode 100755
index 550b95d5..00000000
--- a/tests/integration/agentic/scripts/cleanup_proposal_fixtures-openai.sh
+++ /dev/null
@@ -1,22 +0,0 @@
-#!/usr/bin/env bash
-set -euo pipefail
-
-# Tear down integration test resources deployed by setup_proposal_fixtures.sh
-# (OpenAI provider). Only deletes "eval-" prefixed operator resources to avoid
-# touching anything that isn't ours.
-
-OPERATOR_NS="openshift-lightspeed"
-TEST_NS="lightspeed-evaluation-test"
-
-echo "Cleaning up integration test resources..."
-
-# Delete test workload (Proposal cleanup is handled by the driver via cleanup_proposals)
-oc delete deployment oomkill-demo -n "$TEST_NS" --ignore-not-found
-
-# Delete prefixed operator resources (reverse order of creation)
-oc delete sandboxtemplate eval-lightspeed-agent -n "$OPERATOR_NS" --ignore-not-found
-oc delete agent eval-default --ignore-not-found
-oc delete llmprovider eval-openai --ignore-not-found
-oc delete secret eval-llm-credentials -n "$OPERATOR_NS" --ignore-not-found
-
-echo "Cleanup complete."
diff --git a/tests/integration/agentic/scripts/setup_crashloop_probe-claude-vertex.sh b/tests/integration/agentic/scripts/setup_crashloop_probe-claude-vertex.sh
new file mode 100755
index 00000000..71677724
--- /dev/null
+++ b/tests/integration/agentic/scripts/setup_crashloop_probe-claude-vertex.sh
@@ -0,0 +1,17 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Deploy CrashLoopBackOff (misconfigured liveness probe) test workload
+# with Claude/Vertex AI provider infrastructure.
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+# shellcheck source=_setup_infra-claude-vertex.sh
+source "$SCRIPT_DIR/_setup_infra-claude-vertex.sh"
+
+oc apply -f "$SCRIPT_DIR/../fixtures/crashloop-probe-demo.yaml"
+
+echo "Waiting for crashloop-probe-demo pod to appear..."
+oc wait --for=condition=Available=false deployment/crashloop-probe-demo \
+  -n "$TEST_NS" --timeout=60s 2>/dev/null || true
+sleep 10
+echo "Setup complete."
diff --git a/tests/integration/agentic/scripts/setup_crashloop_probe-openai.sh b/tests/integration/agentic/scripts/setup_crashloop_probe-openai.sh
new file mode 100755
index 00000000..0383f79c
--- /dev/null
+++ b/tests/integration/agentic/scripts/setup_crashloop_probe-openai.sh
@@ -0,0 +1,17 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Deploy CrashLoopBackOff (misconfigured liveness probe) test workload
+# with OpenAI provider infrastructure.
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+# shellcheck source=_setup_infra-openai.sh
+source "$SCRIPT_DIR/_setup_infra-openai.sh"
+
+oc apply -f "$SCRIPT_DIR/../fixtures/crashloop-probe-demo.yaml"
+
+echo "Waiting for crashloop-probe-demo pod to appear..."
+oc wait --for=condition=Available=false deployment/crashloop-probe-demo \
+  -n "$TEST_NS" --timeout=60s 2>/dev/null
+sleep 10
+echo "Setup complete."
diff --git a/tests/integration/agentic/scripts/setup_oomkill-claude-vertex.sh b/tests/integration/agentic/scripts/setup_oomkill-claude-vertex.sh
new file mode 100755
index 00000000..16e4b98a
--- /dev/null
+++ b/tests/integration/agentic/scripts/setup_oomkill-claude-vertex.sh
@@ -0,0 +1,16 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Deploy OOMKill test workload with Claude/Vertex AI provider infrastructure.
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+# shellcheck source=_setup_infra-claude-vertex.sh
+source "$SCRIPT_DIR/_setup_infra-claude-vertex.sh"
+
+oc apply -f "$SCRIPT_DIR/../fixtures/oomkill-demo.yaml"
+
+echo "Waiting for oomkill-demo pod to appear..."
+oc wait --for=condition=Available=false deployment/oomkill-demo \
+  -n "$TEST_NS" --timeout=60s 2>/dev/null || true
+sleep 10
+echo "Setup complete."
diff --git a/tests/integration/agentic/scripts/setup_oomkill-openai.sh b/tests/integration/agentic/scripts/setup_oomkill-openai.sh
new file mode 100755
index 00000000..1080d733
--- /dev/null
+++ b/tests/integration/agentic/scripts/setup_oomkill-openai.sh
@@ -0,0 +1,33 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Deploy OOMKill test workload with OpenAI provider infrastructure.
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+# shellcheck source=_setup_infra-openai.sh
+source "$SCRIPT_DIR/_setup_infra-openai.sh"
+
+oc apply -f "$SCRIPT_DIR/../fixtures/oomkill-demo.yaml"
+
+echo "Waiting for oomkill-demo pod to reach OOMKilled or CrashLoopBackOff..."
+TIMEOUT=120
+INTERVAL=5
+ELAPSED=0
+while [ "$ELAPSED" -lt "$TIMEOUT" ]; do
+  TERMINATED_REASON=$(oc get pod -l app=oomkill-demo -n "$TEST_NS" \
+    -o jsonpath='{.items[0].status.containerStatuses[0].state.terminated.reason}' 2>/dev/null || true)
+  WAITING_REASON=$(oc get pod -l app=oomkill-demo -n "$TEST_NS" \
+    -o jsonpath='{.items[0].status.containerStatuses[0].state.waiting.reason}' 2>/dev/null || true)
+
+  if [ "$TERMINATED_REASON" = "OOMKilled" ] || [ "$WAITING_REASON" = "CrashLoopBackOff" ]; then
+    echo "Pod reached expected state (terminated=$TERMINATED_REASON, waiting=$WAITING_REASON)."
+    echo "Setup complete."
+    exit 0
+  fi
+
+  sleep "$INTERVAL"
+  ELAPSED=$((ELAPSED + INTERVAL))
+done
+
+echo "ERROR: Timed out after ${TIMEOUT}s waiting for OOMKilled or CrashLoopBackOff." >&2
+exit 1
diff --git a/tests/integration/system-config-agents-proposal.yaml b/tests/integration/system-config-agents-proposal.yaml
index d0902a23..0fb0db82 100644
--- a/tests/integration/system-config-agents-proposal.yaml
+++ b/tests/integration/system-config-agents-proposal.yaml
@@ -1,3 +1,6 @@
+logging:
+  source_level: DEBUG
+
 core:
   max_threads: 1
   fail_on_invalid_data: true
@@ -12,7 +15,7 @@ agents:
     namespace: lightspeed-evaluation-test
     auto_approve: true
     cleanup_proposals: true
-    timeout: 900
+    timeout: 1200
     poll_interval: 5
 
 storage:
diff --git a/tests/integration/test_evaluation_data_proposal.yaml b/tests/integration/test_evaluation_data_proposal.yaml
index 27456b68..e597ecf2 100644
--- a/tests/integration/test_evaluation_data_proposal.yaml
+++ b/tests/integration/test_evaluation_data_proposal.yaml
@@ -1,13 +1,42 @@
-- conversation_group_id: proposal_full_lifecycle
-  description: Full lifecycle — analysis, execution, verification
-  tag: proposal
-  setup_script: agentic/scripts/setup_proposal_fixtures-openai.sh
-  cleanup_script: agentic/scripts/cleanup_proposal_fixtures-openai.sh
+- conversation_group_id: proposal_analysis_only
+  description: Analysis-only — no execution or verification (query from proposal_spec.request)
+  tag: proposal_analysis_only
+  setup_script: agentic/scripts/setup_oomkill-openai.sh
+  cleanup_script: agentic/scripts/cleanup_oomkill-openai.sh
   conversation_metrics: []
   conversation_metrics_metadata: {}
   turns:
     - turn_id: turn_1
       response: null
+      proposal_spec:
+        request: >-
+          A pod named oomkill-demo in namespace lightspeed-evaluation-test
+          is in CrashLoopBackOff. Analyze the root cause.
+        targetNamespaces:
+          - lightspeed-evaluation-test
+        tools:
+          skills:
+            # TODO(rioloc): replace with a stable image
+            - image: quay.io/harpatil/agentic-skills:latest
+              paths:
+                - /skills/find-token
+        analysis:
+          agent: eval-default
+      expected_proposal_status:
+        phase: Completed
+      turn_metrics:
+        - custom:proposal_status
+      turn_metrics_metadata: {}
+
+- conversation_group_id: proposal_oomkill_openai
+  description: OOMKill full lifecycle — status check + LLM-as-judge evaluation
+  tag: proposal_oomkill
+  setup_script: agentic/scripts/setup_oomkill-openai.sh
+  cleanup_script: agentic/scripts/cleanup_oomkill-openai.sh
+  conversation_metrics: []
+  conversation_metrics_metadata: {}
+  turns:
+    - turn_id: turn_1
       proposal_spec:
         request: >-
           A pod named oomkill-demo in namespace lightspeed-evaluation-test
@@ -29,24 +58,42 @@
           agent: eval-default
       expected_proposal_status:
         phase: Completed
+      expected_outcome: >-
+        Root cause: the pod oomkill-demo is OOMKilled because its container
+        memory limit is too low. Remediation: increase the container memory
+        limit (e.g. to 512Mi) and verify the pod reaches Running state
+        without further OOMKill events.
+      expected_analysis_outcome: >-
+        The agent should identify that the pod oomkill-demo is being
+        OOMKilled because its container memory limit is set too low
+        (e.g. 128Mi), causing the kernel to terminate the process.
+      expected_execution_outcome: >-
+        The agent should increase the container memory limit (e.g. to
+        512Mi) by patching the pod or its owning workload resource.
+      expected_verification_outcome: >-
+        The agent should verify the pod reaches Running state without
+        further OOMKill events.
       turn_metrics:
         - custom:proposal_status
-      turn_metrics_metadata: {}
+        - custom:proposal_evaluation_correctness
+      turn_metrics_metadata:
+        "custom:proposal_evaluation_correctness":
+          threshold: 0.75
 
-- conversation_group_id: proposal_analysis_only
-  description: Analysis-only — no execution or verification (query from proposal_spec.request)
-  tag: proposal
-  setup_script: agentic/scripts/setup_proposal_fixtures-openai.sh
-  cleanup_script: agentic/scripts/cleanup_proposal_fixtures-openai.sh
+- conversation_group_id: proposal_oomkill_claude-vertex
+  description: Judge evaluation — LLM-as-judge on full lifecycle workflow
+  tag: proposal_oomkill
+  setup_script: agentic/scripts/setup_oomkill-claude-vertex.sh
+  cleanup_script: agentic/scripts/cleanup_oomkill-claude-vertex.sh
   conversation_metrics: []
   conversation_metrics_metadata: {}
   turns:
     - turn_id: turn_1
-      response: null
       proposal_spec:
         request: >-
           A pod named oomkill-demo in namespace lightspeed-evaluation-test
-          is in CrashLoopBackOff. Analyze the root cause.
+          is in CrashLoopBackOff due to OOMKill. Analyze the root cause,
+          fix the memory configuration, and verify the fix.
         targetNamespaces:
           - lightspeed-evaluation-test
         tools:
@@ -57,8 +104,115 @@
                 - /skills/find-token
         analysis:
           agent: eval-default
+        execution:
+          agent: eval-default
+        verification:
+          agent: eval-default
+      expected_outcome: >-
+        Root cause: the pod oomkill-demo is OOMKilled because its container
+        memory limit is too low. Remediation: increase the container memory
+        limit (e.g. to 512Mi) and verify the pod reaches Running state
+        without further OOMKill events.
+      expected_analysis_outcome: >-
+        The agent should identify that the pod oomkill-demo is being
+        OOMKilled because its container memory limit is set too low
+        (e.g. 128Mi), causing the kernel to terminate the process.
+      expected_execution_outcome: >-
+        The agent should increase the container memory limit (e.g. to
+        512Mi) by patching the pod or its owning workload resource.
+      expected_verification_outcome: >-
+        The agent should verify the pod reaches Running state without
+        further OOMKill events.
+      turn_metrics:
+        - custom:proposal_evaluation_correctness
+      turn_metrics_metadata:
+        "custom:proposal_evaluation_correctness":
+          threshold: 0.75
+
+- conversation_group_id: proposal_crashloop_probe_openai
+  description: CrashLoopBackOff caused by misconfigured liveness probe — full lifecycle
+  tag: proposal_crashloop_probe
+  setup_script: agentic/scripts/setup_crashloop_probe-openai.sh
+  cleanup_script: agentic/scripts/cleanup_crashloop_probe-openai.sh
+  conversation_metrics: []
+  conversation_metrics_metadata: {}
+  turns:
+    - turn_id: turn_1
+      proposal_spec:
+        request: >-
+          A pod named crashloop-probe-demo in namespace lightspeed-evaluation-test
+          is in CrashLoopBackOff. The container keeps restarting. Analyze the
+          root cause, fix the issue, and verify the pod is stable.
+        targetNamespaces:
+          - lightspeed-evaluation-test
+        tools:
+          skills:
+            # TODO(rioloc): replace with a stable image
+            - image: quay.io/harpatil/agentic-skills:latest
+              paths:
+                - /skills/find-token
+        analysis:
+          agent: eval-default
+        execution:
+          agent: eval-default
+        verification:
+          agent: eval-default
       expected_proposal_status:
         phase: Completed
+      expected_outcome: >-
+        Root cause: the liveness probe on pod crashloop-probe-demo is
+        configured to check an endpoint that does not exist (/nonexistent-health),
+        causing the kubelet to kill and restart the container repeatedly.
+        Remediation: fix the liveness probe path to point to a valid health
+        endpoint (e.g. / or remove the probe) and verify the pod reaches
+        Running state without further restarts.
       turn_metrics:
         - custom:proposal_status
-      turn_metrics_metadata: {}
+        - custom:proposal_evaluation_correctness
+      turn_metrics_metadata:
+        "custom:proposal_evaluation_correctness":
+          threshold: 0.75
+
+- conversation_group_id: proposal_crashloop_probe_claude-vertex
+  description: CrashLoopBackOff caused by misconfigured liveness probe — full lifecycle (Claude/Vertex)
+  tag: proposal_crashloop_probe
+  setup_script: agentic/scripts/setup_crashloop_probe-claude-vertex.sh
+  cleanup_script: agentic/scripts/cleanup_crashloop_probe-claude-vertex.sh
+  conversation_metrics: []
+  conversation_metrics_metadata: {}
+  turns:
+    - turn_id: turn_1
+      proposal_spec:
+        request: >-
+          A pod named crashloop-probe-demo in namespace lightspeed-evaluation-test
+          is in CrashLoopBackOff. The container keeps restarting. Analyze the
+          root cause, fix the issue, and verify the pod is stable.
+        targetNamespaces:
+          - lightspeed-evaluation-test
+        tools:
+          skills:
+            # TODO(rioloc): replace with a stable image
+            - image: quay.io/harpatil/agentic-skills:latest
+              paths:
+                - /skills/find-token
+        analysis:
+          agent: eval-default
+        execution:
+          agent: eval-default
+        verification:
+          agent: eval-default
+      expected_proposal_status:
+        phase: Completed
+      expected_outcome: >-
+        Root cause: the liveness probe on pod crashloop-probe-demo is
+        configured to check an endpoint that does not exist (/nonexistent-health),
+        causing the kubelet to kill and restart the container repeatedly.
+        Remediation: fix the liveness probe path to point to a valid health
+        endpoint (e.g. / or remove the probe) and verify the pod reaches
+        Running state without further restarts.
+      turn_metrics:
+        - custom:proposal_status
+        - custom:proposal_evaluation_correctness
+      turn_metrics_metadata:
+        "custom:proposal_evaluation_correctness":
+          threshold: 0.75
diff --git a/tests/integration/test_proposal_evaluation.py b/tests/integration/test_proposal_evaluation.py
index 06238126..f30d9a15 100644
--- a/tests/integration/test_proposal_evaluation.py
+++ b/tests/integration/test_proposal_evaluation.py
@@ -8,10 +8,8 @@
     - OPENAI_API_KEY, SANDBOX_IMAGE env vars set
     - Network connectivity to the cluster API
 
-The default setup/cleanup scripts use OpenAI as the LLM provider.
-To test with a different provider (e.g. Claude via Vertex AI), set the
-corresponding env vars and point the eval data at the provider-specific
-scripts (e.g. setup_proposal_fixtures-claude-vertex.sh).
+Each scenario has its own setup/cleanup scripts that source a shared
+infrastructure script per provider (e.g. _setup_infra-openai.sh).
 
 Run with: pytest tests/integration/test_proposal_evaluation.py -v -m agentic
 """
@@ -103,8 +101,8 @@ class TestProposalDriverEvaluation:
     """End-to-end tests for ProposalDriver evaluation pipeline."""
 
     @pytest.mark.timeout(1200)
-    def test_full_lifecycle(self, tmp_path: Path) -> None:
-        """Test full Proposal lifecycle: analysis, execution, verification.
+    def test_oomkill_full_lifecycle(self, tmp_path: Path) -> None:
+        """Test OOMKill full lifecycle: status check + judge evaluation.
 
         Verifies:
         - Setup script deploys infrastructure and test workload
@@ -126,9 +124,9 @@ def test_full_lifecycle(self, tmp_path: Path) -> None:
         )
         all_data = validator.load_evaluation_data(str(PROPOSAL_EVAL_DATA_PATH))
         eval_data = [
-            d for d in all_data if d.conversation_group_id == "proposal_full_lifecycle"
+            d for d in all_data if d.conversation_group_id == "proposal_oomkill_openai"
         ]
-        assert len(eval_data) == 1, "Should find proposal_full_lifecycle data"
+        assert len(eval_data) == 1, "Should find proposal_oomkill_openai data"
 
         evaluate(system_config, eval_data)
 
@@ -201,6 +199,40 @@ def test_analysis_only(self, tmp_path: Path) -> None:
                 f"got reason={by_type['Verified'].get('reason')}"
             )
 
+    @pytest.mark.timeout(1200)
+    def test_oomkill_claude_vertex(self, tmp_path: Path) -> None:
+        """Test OOMKill full lifecycle with Claude/Vertex AI provider.
+
+        Verifies:
+        - ProposalDriver populates response with workflow summary
+        - custom:proposal_evaluation_correctness metric runs against response
+        - Pipeline completes without errors
+        """
+        loader = ConfigLoader()
+        system_config = loader.load_system_config(str(PROPOSAL_CONFIG_PATH))
+        system_config.storage = [
+            FileBackendConfig(output_dir=str(tmp_path / "eval_output"))
+        ]
+
+        validator = DataValidator(
+            api_enabled=True,
+            fail_on_invalid_data=system_config.core.fail_on_invalid_data,
+        )
+        all_data = validator.load_evaluation_data(str(PROPOSAL_EVAL_DATA_PATH))
+        eval_data = [
+            d
+            for d in all_data
+            if d.conversation_group_id == "proposal_oomkill_claude-vertex"
+        ]
+        assert len(eval_data) == 1, "Should find proposal_oomkill_claude-vertex data"
+
+        evaluate(system_config, eval_data)
+
+        turn = eval_data[0].turns[0]
+        assert (
+            turn.response and turn.response.strip()
+        ), "Response should be populated by ProposalDriver"
+
     @pytest.mark.timeout(120)
     def test_timeout_handling(self, tmp_path: Path) -> None:
         """Test that a very short timeout is handled gracefully.
diff --git a/tests/unit/core/metrics/custom/test_custom.py b/tests/unit/core/metrics/custom/test_custom.py
index 51a002c7..9125c6d2 100644
--- a/tests/unit/core/metrics/custom/test_custom.py
+++ b/tests/unit/core/metrics/custom/test_custom.py
@@ -1,10 +1,15 @@
 """Tests for custom metrics module."""
 
+# pylint: disable=protected-access
+
+import json
+
 from pytest_mock import MockerFixture
 
 from lightspeed_evaluation.core.metrics.custom.custom import CustomMetrics
 from lightspeed_evaluation.core.metrics.manager import MetricLevel
 from lightspeed_evaluation.core.models import EvaluationScope, TurnData
+from lightspeed_evaluation.core.system.exceptions import LLMError
 
 
 class TestCustomMetricsToolEval:
@@ -175,3 +180,322 @@ def test_config_from_system_defaults_via_metric_manager(
         assert score == 1.0
         assert "partial" in reason
         assert "unordered" in reason
+
+
+def _make_custom_metrics(mocker: MockerFixture) -> CustomMetrics:
+    """Create a CustomMetrics instance with mocked LLM manager."""
+    mock_llm_manager = mocker.Mock()
+    mock_llm_manager.get_model_name.return_value = "test-model"
+    mock_llm_manager.get_llm_params.return_value = {"parameters": {}}
+    return CustomMetrics(mock_llm_manager)
+
+
+def _make_scope(turn_data: TurnData, is_conversation: bool = False) -> EvaluationScope:
+    """Create an EvaluationScope for a turn."""
+    return EvaluationScope(
+        turn_idx=0, turn_data=turn_data, is_conversation=is_conversation
+    )
+
+
+METRIC_NAME = "proposal_evaluation_correctness"
+
+_LLM_RESPONSE_ALL_DIMS = json.dumps(
+    {
+        "reasoning": (
+            "Diagnosis was accurate. Execution addressed root cause. "
+            "Verification confirmed fix."
+        ),
+        "diagnosis": 0.9,
+        "execution": 0.8,
+        "verification": 0.7,
+        "average": 0.80,
+    }
+)
+
+_LLM_RESPONSE_NO_VERIFICATION = json.dumps(
+    {
+        "reasoning": "Diagnosis correct. Execution appropriate but no verification.",
+        "diagnosis": 0.9,
+        "execution": 0.8,
+        "verification": None,
+        "average": 0.85,
+    }
+)
+
+
+class TestParseProposalEvalResponse:
+    """Test _parse_proposal_eval_response parser."""
+
+    def test_all_dimensions(self, mocker: MockerFixture) -> None:
+        """Test parsing response with all three dimensions scored."""
+        cm = _make_custom_metrics(mocker)
+        score, detail = cm._parse_proposal_eval_response(_LLM_RESPONSE_ALL_DIMS)
+
+        assert score == 0.80
+        assert "diagnosis=0.90" in detail
+        assert "execution=0.80" in detail
+        assert "verification=0.70" in detail
+        assert "avg=0.80" in detail
+        assert "Diagnosis was accurate" in detail
+
+    def test_dimension_na(self, mocker: MockerFixture) -> None:
+        """Test parsing response with N/A dimension."""
+        cm = _make_custom_metrics(mocker)
+        score, detail = cm._parse_proposal_eval_response(_LLM_RESPONSE_NO_VERIFICATION)
+
+        assert score == 0.85
+        assert "verification=N/A" in detail
+        assert "diagnosis=0.90" in detail
+
+    def test_fallback_average_from_sub_scores(self, mocker: MockerFixture) -> None:
+        """Test average is computed from sub-scores when average key is missing."""
+        cm = _make_custom_metrics(mocker)
+        response = json.dumps(
+            {
+                "reasoning": "ok",
+                "diagnosis": 0.9,
+                "execution": 0.7,
+                "verification": None,
+            }
+        )
+
+        score, detail = cm._parse_proposal_eval_response(response)
+
+        assert score is not None
+        assert abs(score - 0.80) < 0.01
+        assert "avg=0.80" in detail
+
+    def test_invalid_json_returns_none(self, mocker: MockerFixture) -> None:
+        """Test that invalid JSON response returns None score."""
+        cm = _make_custom_metrics(mocker)
+        score, detail = cm._parse_proposal_eval_response("I cannot evaluate this.")
+
+        assert score is None
+        assert "Invalid JSON" in detail
+
+
+class TestProposalEvaluationCorrectness:
+    """Test custom:proposal_evaluation_correctness metric."""
+
+    def test_returns_score_from_llm(self, mocker: MockerFixture) -> None:
+        """Test successful LLM evaluation returns parsed score."""
+        cm = _make_custom_metrics(mocker)
+        mocker.patch.object(cm, "_call_llm", return_value=_LLM_RESPONSE_ALL_DIMS)
+
+        turn = TurnData(
+            turn_id="t1",
+            query="Fix pod crash",
+            response="## Request\n\nFix pod crash\n\n## Analysis\n...",
+            expected_outcome="Root cause: OOMKilled. Remediation: increase memory limit.",
+        )
+        score, reason = cm.evaluate(METRIC_NAME, None, _make_scope(turn))
+
+        assert score == 0.80
+        assert "diagnosis=0.90" in reason
+        assert "avg=0.80" in reason
+
+    def test_conversation_level_returns_none(self, mocker: MockerFixture) -> None:
+        """Test conversation-level scope returns None score."""
+        cm = _make_custom_metrics(mocker)
+        turn = TurnData(turn_id="t1", query="q", response="r")
+        scope = _make_scope(turn, is_conversation=True)
+
+        score, reason = cm.evaluate(METRIC_NAME, None, scope)
+
+        assert score is None
+        assert "turn-level" in reason
+
+    def test_missing_response_returns_none(self, mocker: MockerFixture) -> None:
+        """Test missing response returns None score."""
+        cm = _make_custom_metrics(mocker)
+        turn = TurnData(turn_id="t1", query="Fix pod crash")
+
+        score, reason = cm.evaluate(METRIC_NAME, None, _make_scope(turn))
+
+        assert score is None
+        assert "response" in reason.lower()
+
+    def test_missing_expected_outcome_returns_none(self, mocker: MockerFixture) -> None:
+        """Test missing expected_outcome returns None score."""
+        cm = _make_custom_metrics(mocker)
+        turn = TurnData(
+            turn_id="t1",
+            query="Fix pod crash",
+            response="## Analysis\nDiagnosis: memory too low",
+        )
+
+        score, reason = cm.evaluate(METRIC_NAME, None, _make_scope(turn))
+
+        assert score is None
+        assert "expected outcome" in reason.lower()
+
+    def test_none_turn_data_returns_none(self, mocker: MockerFixture) -> None:
+        """Test None turn data returns None score."""
+        cm = _make_custom_metrics(mocker)
+        scope = EvaluationScope(turn_idx=0, turn_data=None, is_conversation=False)
+
+        score, reason = cm.evaluate(METRIC_NAME, None, scope)
+
+        assert score is None
+        assert "TurnData" in reason
+
+    def test_unparseable_llm_response(self, mocker: MockerFixture) -> None:
+        """Test unparseable LLM response returns None score."""
+        cm = _make_custom_metrics(mocker)
+        mocker.patch.object(cm, "_call_llm", return_value="I cannot evaluate this.")
+
+        turn = TurnData(
+            turn_id="t1",
+            query="q",
+            response="## Request\nq",
+            expected_outcome="Expected outcome",
+        )
+
+        score, reason = cm.evaluate(METRIC_NAME, None, _make_scope(turn))
+
+        assert score is None
+        assert "parse" in reason.lower() or "Could not" in reason
+
+    def test_llm_error_handled(self, mocker: MockerFixture) -> None:
+        """Test LLM error is caught and returns None score."""
+        cm = _make_custom_metrics(mocker)
+        mocker.patch.object(cm, "_call_llm", side_effect=LLMError("timeout"))
+
+        turn = TurnData(
+            turn_id="t1",
+            query="q",
+            response="## Request\nq",
+            expected_outcome="Expected outcome",
+        )
+
+        score, reason = cm.evaluate(METRIC_NAME, None, _make_scope(turn))
+
+        assert score is None
+        assert "timeout" in reason
+
+    def test_prompt_contains_query_response_and_expected(
+        self, mocker: MockerFixture
+    ) -> None:
+        """Test that the LLM prompt includes query, response, and expected."""
+        cm = _make_custom_metrics(mocker)
+        call_spy = mocker.patch.object(
+            cm, "_call_llm", return_value=_LLM_RESPONSE_ALL_DIMS
+        )
+
+        turn = TurnData(
+            turn_id="t1",
+            query="Fix OOMKilled pod",
+            response="## Analysis\nDiagnosis: memory too low",
+            expected_outcome="Root cause: OOMKilled. Increase memory limit to 512Mi.",
+        )
+        cm.evaluate(METRIC_NAME, None, _make_scope(turn))
+
+        prompt: str = call_spy.call_args[0][0]
+        assert "Fix OOMKilled pod" in prompt
+        assert "Diagnosis: memory too low" in prompt
+        assert "Increase memory limit to 512Mi" in prompt
+
+    def test_prompt_contains_sre_persona(self, mocker: MockerFixture) -> None:
+        """Test that the prompt includes the SRE persona."""
+        cm = _make_custom_metrics(mocker)
+        call_spy = mocker.patch.object(
+            cm, "_call_llm", return_value=_LLM_RESPONSE_ALL_DIMS
+        )
+
+        turn = TurnData(
+            turn_id="t1",
+            query="q",
+            response="r",
+            expected_outcome="e",
+        )
+        cm.evaluate(METRIC_NAME, None, _make_scope(turn))
+
+        prompt: str = call_spy.call_args[0][0]
+        assert "senior Site Reliability Engineer" in prompt
+
+    def test_prompt_contains_workflow_phases(self, mocker: MockerFixture) -> None:
+        """Test that the prompt includes workflow phases when set."""
+        cm = _make_custom_metrics(mocker)
+        call_spy = mocker.patch.object(
+            cm, "_call_llm", return_value=_LLM_RESPONSE_ALL_DIMS
+        )
+
+        turn = TurnData(
+            turn_id="t1",
+            query="q",
+            response="r",
+            expected_outcome="e",
+            proposal_phases=["analysis", "execution"],
+        )
+        cm.evaluate(METRIC_NAME, None, _make_scope(turn))
+
+        prompt: str = call_spy.call_args[0][0]
+        assert "Phases executed: analysis, execution" in prompt
+
+    def test_prompt_workflow_phases_unknown_when_none(
+        self, mocker: MockerFixture
+    ) -> None:
+        """Test that missing proposal_phases produces fallback text."""
+        cm = _make_custom_metrics(mocker)
+        call_spy = mocker.patch.object(
+            cm, "_call_llm", return_value=_LLM_RESPONSE_ALL_DIMS
+        )
+
+        turn = TurnData(
+            turn_id="t1",
+            query="q",
+            response="r",
+            expected_outcome="e",
+        )
+        cm.evaluate(METRIC_NAME, None, _make_scope(turn))
+
+        prompt: str = call_spy.call_args[0][0]
+        assert "Phases executed: unknown" in prompt
+
+
+class TestBuildWorkflowPhases:
+    """Test _build_workflow_phases helper."""
+
+    def test_with_phases(self, mocker: MockerFixture) -> None:
+        """Test phases list produces comma-separated string."""
+        cm = _make_custom_metrics(mocker)
+        turn = TurnData(
+            turn_id="t1",
+            query="q",
+            proposal_phases=["analysis", "execution", "verification"],
+        )
+
+        result = cm._build_workflow_phases(turn)
+
+        assert result == "Phases executed: analysis, execution, verification"
+
+    def test_analysis_only(self, mocker: MockerFixture) -> None:
+        """Test single-phase list."""
+        cm = _make_custom_metrics(mocker)
+        turn = TurnData(
+            turn_id="t1",
+            query="q",
+            proposal_phases=["analysis"],
+        )
+
+        result = cm._build_workflow_phases(turn)
+
+        assert result == "Phases executed: analysis"
+
+    def test_none_phases(self, mocker: MockerFixture) -> None:
+        """Test None proposal_phases produces fallback."""
+        cm = _make_custom_metrics(mocker)
+        turn = TurnData(turn_id="t1", query="q")
+
+        result = cm._build_workflow_phases(turn)
+
+        assert "unknown" in result
+
+    def test_empty_phases(self, mocker: MockerFixture) -> None:
+        """Test empty list produces fallback."""
+        cm = _make_custom_metrics(mocker)
+        turn = TurnData(turn_id="t1", query="q", proposal_phases=[])
+
+        result = cm._build_workflow_phases(turn)
+
+        assert "unknown" in result
diff --git a/tests/unit/core/metrics/custom/test_proposal_eval.py b/tests/unit/core/metrics/custom/test_proposal_eval.py
index 2c3875aa..cc756dff 100644
--- a/tests/unit/core/metrics/custom/test_proposal_eval.py
+++ b/tests/unit/core/metrics/custom/test_proposal_eval.py
@@ -3,10 +3,10 @@
 from typing import Any, Optional
 
 from lightspeed_evaluation.core.metrics.custom.proposal_eval import (
-    _derive_phase,
     evaluate_proposal_status,
 )
 from lightspeed_evaluation.core.models import TurnData
+from lightspeed_evaluation.core.proposal import derive_phase
 
 
 def _make_turn(
@@ -61,7 +61,7 @@ class TestDerivePhase:
     def test_completed_analysis_only(self) -> None:
         """Analysis-only with Analyzed=True derives Completed."""
         conditions = [{"type": "Analyzed", "status": "True"}]
-        assert _derive_phase(conditions, {"analysis": {}}) == "Completed"
+        assert derive_phase(conditions, {"analysis": {}}) == "Completed"
 
     def test_completed_full_lifecycle(self) -> None:
         """Full lifecycle with all conditions True derives Completed."""
@@ -71,7 +71,7 @@ def test_completed_full_lifecycle(self) -> None:
             {"type": "Verified", "status": "True"},
         ]
         spec: dict[str, Any] = {"analysis": {}, "execution": {}, "verification": {}}
-        assert _derive_phase(conditions, spec) == "Completed"
+        assert derive_phase(conditions, spec) == "Completed"
 
     def test_completed_execution_no_verification(self) -> None:
         """Execution without verification with Executed=True derives Completed."""
@@ -80,7 +80,7 @@ def test_completed_execution_no_verification(self) -> None:
             {"type": "Executed", "status": "True"},
         ]
         spec: dict[str, Any] = {"analysis": {}, "execution": {}}
-        assert _derive_phase(conditions, spec) == "Completed"
+        assert derive_phase(conditions, spec) == "Completed"
 
     def test_failed_condition(self) -> None:
         """Any condition with status False derives Failed."""
@@ -88,7 +88,7 @@ def test_failed_condition(self) -> None:
             {"type": "Analyzed", "status": "True"},
             {"type": "Executed", "status": "False", "reason": "Error"},
         ]
-        assert _derive_phase(conditions) == "Failed"
+        assert derive_phase(conditions) == "Failed"
 
     def test_retrying_execution_not_failed(self) -> None:
         """RetryingExecution reason does not count as failure."""
@@ -97,22 +97,22 @@ def test_retrying_execution_not_failed(self) -> None:
             {"type": "Verified", "status": "False", "reason": "RetryingExecution"},
         ]
         spec: dict[str, Any] = {"analysis": {}, "execution": {}, "verification": {}}
-        assert _derive_phase(conditions, spec) == "InProgress"
+        assert derive_phase(conditions, spec) == "InProgress"
 
     def test_denied(self) -> None:
         """Denied=True derives Denied."""
         conditions = [{"type": "Denied", "status": "True"}]
-        assert _derive_phase(conditions) == "Denied"
+        assert derive_phase(conditions) == "Denied"
 
     def test_escalated(self) -> None:
         """Escalated=True derives Escalated."""
         conditions = [{"type": "Escalated", "status": "True"}]
-        assert _derive_phase(conditions) == "Escalated"
+        assert derive_phase(conditions) == "Escalated"
 
     def test_in_progress(self) -> None:
         """Unknown status derives InProgress."""
         conditions = [{"type": "Analyzed", "status": "Unknown"}]
-        assert _derive_phase(conditions) == "InProgress"
+        assert derive_phase(conditions) == "InProgress"
 
     def test_no_proposal_spec_infers_last_step(self) -> None:
         """Without proposal_spec, infers last step from conditions."""
@@ -121,7 +121,7 @@ def test_no_proposal_spec_infers_last_step(self) -> None:
             {"type": "Executed", "status": "True"},
             {"type": "Verified", "status": "True"},
         ]
-        assert _derive_phase(conditions, None) == "Completed"
+        assert derive_phase(conditions, None) == "Completed"
 
 
 class TestPhaseCheck:
diff --git a/tests/unit/pipeline/evaluation/test_driver.py b/tests/unit/pipeline/evaluation/test_driver.py
index 78a85f5d..564ed780 100644
--- a/tests/unit/pipeline/evaluation/test_driver.py
+++ b/tests/unit/pipeline/evaluation/test_driver.py
@@ -14,6 +14,7 @@
     AgentDriver,
     HttpApiDriver,
     ProposalDriver,
+    TerminalOutcome,
 )
 from lightspeed_evaluation.pipeline.evaluation.registry import AgentDriverRegistry
 
@@ -176,6 +177,83 @@ def test_validate_config_invalid(self) -> None:
             )
 
 
+class TestProposalDriverExecuteTurn:
+    """Unit tests for ProposalDriver.execute_turn outcome routing."""
+
+    @pytest.fixture()
+    def _proposal_driver(self, mocker: MockerFixture) -> ProposalDriver:
+        """Create a ProposalDriver with mocked infrastructure."""
+        mocker.patch("shutil.which", return_value="/usr/bin/oc")
+        driver = ProposalDriver(
+            {"type": "proposal", "namespace": "ns", "timeout": 10, "poll_interval": 1},
+            enabled=True,
+        )
+        apply_result = mocker.Mock(returncode=0, stderr="")
+        mocker.patch.object(driver, "_apply", return_value=apply_result)
+        mocker.patch.object(driver, "_cleanup")
+        mocker.patch.object(driver, "_approve_when_ready", return_value=None)
+        mocker.patch.object(driver._amender, "amend", return_value=None)
+        return driver
+
+    def _stub_terminal(
+        self,
+        mocker: MockerFixture,
+        driver: ProposalDriver,
+        outcome: TerminalOutcome,
+    ) -> None:
+        """Stub _get_status and _is_terminal to return a given outcome immediately."""
+        mocker.patch.object(
+            driver, "_get_status", return_value=({"conditions": []}, None)
+        )
+        mocker.patch.object(driver, "_is_terminal", return_value=outcome)
+
+    def test_completed_returns_no_error(
+        self, mocker: MockerFixture, _proposal_driver: ProposalDriver
+    ) -> None:
+        """Test COMPLETED outcome returns no error."""
+        self._stub_terminal(mocker, _proposal_driver, TerminalOutcome.COMPLETED)
+        turn = TurnData(turn_id="1", query="Q")
+
+        error, _ = _proposal_driver.execute_turn(turn)
+
+        assert error is None
+
+    def test_failed_returns_error(
+        self, mocker: MockerFixture, _proposal_driver: ProposalDriver
+    ) -> None:
+        """Test FAILED outcome returns an error message."""
+        self._stub_terminal(mocker, _proposal_driver, TerminalOutcome.FAILED)
+        turn = TurnData(turn_id="1", query="Q")
+
+        error, _ = _proposal_driver.execute_turn(turn)
+
+        assert error is not None
+        assert "execution failed" in error
+
+    def test_denied_returns_no_error(
+        self, mocker: MockerFixture, _proposal_driver: ProposalDriver
+    ) -> None:
+        """Test DENIED outcome returns no error so metrics are still evaluated."""
+        self._stub_terminal(mocker, _proposal_driver, TerminalOutcome.DENIED)
+        turn = TurnData(turn_id="1", query="Q")
+
+        error, _ = _proposal_driver.execute_turn(turn)
+
+        assert error is None
+
+    def test_escalated_returns_error(
+        self, mocker: MockerFixture, _proposal_driver: ProposalDriver
+    ) -> None:
+        """Test ESCALATED outcome returns an error message."""
+        self._stub_terminal(mocker, _proposal_driver, TerminalOutcome.ESCALATED)
+        turn = TurnData(turn_id="1", query="Q")
+
+        error, _ = _proposal_driver.execute_turn(turn)
+
+        assert error is not None
+        assert "escalated" in error
+
+
 class TestAgentDriverBase:
     """Unit tests for AgentDriver base class defaults."""
 
diff --git a/tests/unit/pipeline/evaluation/test_proposal_amender.py b/tests/unit/pipeline/evaluation/test_proposal_amender.py
new file mode 100644
index 00000000..ffab7b9f
--- /dev/null
+++ b/tests/unit/pipeline/evaluation/test_proposal_amender.py
@@ -0,0 +1,517 @@
+"""Unit tests for ProposalAmender module."""
+
+import subprocess
+from typing import Any, Optional
+
+from pytest_mock import MockerFixture
+
+from lightspeed_evaluation.core.models import TurnData
+from lightspeed_evaluation.pipeline.evaluation.cli import CLIClient, KubeCLI
+from lightspeed_evaluation.pipeline.evaluation.proposal_amender import (
+    ProposalAmender,
+)
+
+
+class MockCLI(CLIClient):
+    """Mock CLIClient that returns pre-configured resources."""
+
+    def __init__(self, resources: Optional[dict[str, dict[str, Any]]] = None) -> None:
+        """Initialize with a map of resource name -> full CR dict."""
+        super().__init__(timeout=30)
+        self._resources: dict[str, dict[str, Any]] = resources or {}
+
+    def run(
+        self,
+        args: list[str],
+        stdin: Optional[str] = None,
+    ) -> subprocess.CompletedProcess[str]:
+        """Not used in amender tests."""
+        return subprocess.CompletedProcess(
+            args=args, returncode=0, stdout="", stderr=""
+        )
+
+    def get_resource(
+        self,
+        resource_plural: str,
+        name: str,
+    ) -> tuple[dict[str, Any], Optional[str]]:
+        """Return pre-configured resource or error."""
+        key = f"{resource_plural}/{name}"
+        if key in self._resources:
+            return self._resources[key], None
+        return {}, f"not found: {key}"
+
+    def apply(
+        self,
+        manifest: dict[str, Any],
+    ) -> subprocess.CompletedProcess[str]:
+        """Not used in amender tests."""
+        return subprocess.CompletedProcess(args=[], returncode=0, stdout="", stderr="")
+
+    def delete(self, resource_plural: str, name: str) -> None:
+        """Not used in amender tests."""
+
+
+def _make_turn(query: str = "Fix pod crash") -> TurnData:
+    """Create a minimal TurnData for testing."""
+    return TurnData(turn_id="t1", query=query)
+
+
+def _get_results(turn: TurnData) -> dict[str, Any]:
+    """Extract proposal_results, failing if None."""
+    assert turn.proposal_results is not None
+    return dict(turn.proposal_results)
+
+
+def _get_response(turn: TurnData) -> str:
+    """Extract response, failing if None."""
+    assert turn.response is not None
+    return str(turn.response)
+
+
+DIAGNOSIS_STATUS: dict[str, Any] = {
+    "conditions": [{"type": "Completed", "status": "True", "reason": "Succeeded"}],
+    "options": [
+        {
+            "title": "Increase memory limit",
+            "summary": "Bump memory to 512Mi",
+            "diagnosis": {
+                "summary": "OOMKilled due to low memory",
+                "confidence": "High",
+                "rootCause": "Memory limit 256Mi too low",
+            },
+            "proposal": {
+                "description": "Patch deployment memory",
+                "actions": [
+                    {"type": "patch", "description": "Set memory limit to 512Mi"},
+                ],
+                "risk": "Low",
+                "reversible": "Reversible",
+                "estimatedImpact": "Brief pod restart",
+            },
+        },
+        {
+            "title": "Scale horizontally",
+            "diagnosis": {
+                "summary": "High load causing OOM",
+                "confidence": "Medium",
+                "rootCause": "Single replica under load",
+            },
+            "proposal": {
+                "description": "Add replicas",
+                "actions": [
+                    {"type": "scale", "description": "Scale to 3 replicas"},
+                ],
+                "risk": "Low",
+                "reversible": "Reversible",
+                "estimatedImpact": "No downtime",
+            },
+        },
+    ],
+}
+
+EXECUTION_STATUS: dict[str, Any] = {
+    "conditions": [{"type": "Completed", "status": "True", "reason": "Succeeded"}],
+    "actionsTaken": [
+        {
+            "type": "patch",
+            "description": "Set memory limit to 512Mi",
+            "outcome": "Succeeded",
+            "output": "deployment.apps/web patched",
+        },
+    ],
+    "verification": {
+        "conditionOutcome": "Improved",
+        "summary": "Pod running with new limits",
+    },
+}
+
+VERIFICATION_STATUS: dict[str, Any] = {
+    "conditions": [{"type": "Completed", "status": "True", "reason": "Succeeded"}],
+    "checks": [
+        {
+            "name": "pod-running",
+            "source": "oc",
+            "value": "Running",
+            "result": "Passed",
+        },
+    ],
+    "summary": "All checks passed",
+}
+
+
+class TestAmendAnalysisOnly:
+    """Test ProposalAmender with analysis-only workflow."""
+
+    def test_populates_proposal_results(self) -> None:
+        """Test amend populates proposal_results with analysis data."""
+        cli = MockCLI({"analysisresults/ar-1": {"status": DIAGNOSIS_STATUS}})
+        amender = ProposalAmender(cli)
+        turn = _make_turn()
+        status: dict[str, Any] = {
+            "conditions": [{"type": "Analyzed", "status": "True", "message": "Done"}],
+            "steps": {
+                "analysis": {
+                    "results": [{"name": "ar-1", "outcome": "Succeeded"}],
+                },
+            },
+        }
+
+        err = amender.amend(turn, status)
+
+        assert err is None
+        assert turn.proposal_status == status
+        results = _get_results(turn)
+        assert "analysis" in results
+        assert len(results["analysis"]) == 1
+        assert results["analysis"][0]["options"][0]["title"] == "Increase memory limit"
+
+    def test_response_contains_diagnosis(self) -> None:
+        """Test Markdown summary includes diagnosis details."""
+        cli = MockCLI({"analysisresults/ar-1": {"status": DIAGNOSIS_STATUS}})
+        amender = ProposalAmender(cli)
+        turn = _make_turn()
+        status: dict[str, Any] = {
+            "conditions": [{"type": "Analyzed", "status": "True", "message": "Done"}],
+            "steps": {
+                "analysis": {
+                    "results": [{"name": "ar-1", "outcome": "Succeeded"}],
+                },
+            },
+        }
+
+        amender.amend(turn, status)
+
+        response = _get_response(turn)
+        assert "OOMKilled due to low memory" in response
+        assert "Memory limit 256Mi too low" in response
+        assert "Confidence: High" in response
+
+    def test_option_zero_marked_approved(self) -> None:
+        """Test option 0 is marked as (Approved) in summary."""
+        cli = MockCLI({"analysisresults/ar-1": {"status": DIAGNOSIS_STATUS}})
+        amender = ProposalAmender(cli)
+        turn = _make_turn()
+        status: dict[str, Any] = {
+            "conditions": [],
+            "steps": {
+                "analysis": {
+                    "results": [{"name": "ar-1", "outcome": "Succeeded"}],
+                },
+            },
+        }
+
+        amender.amend(turn, status)
+
+        response = _get_response(turn)
+        assert "(Approved)" in response
+        assert "### Option 0 (Approved): Increase memory limit" in response
+        assert "### Option 1 : Scale horizontally" in response
+
+    def test_no_execution_or_verification_in_results(self) -> None:
+        """Test analysis-only workflow has no execution/verification keys."""
+        cli = MockCLI({"analysisresults/ar-1": {"status": DIAGNOSIS_STATUS}})
+        amender = ProposalAmender(cli)
+        turn = _make_turn()
+        status: dict[str, Any] = {
+            "conditions": [],
+            "steps": {
+                "analysis": {
+                    "results": [{"name": "ar-1", "outcome": "Succeeded"}],
+                },
+            },
+        }
+
+        amender.amend(turn, status)
+
+        results = _get_results(turn)
+        assert "execution" not in results
+        assert "verification" not in results
+
+    def test_proposal_phases_analysis_only(self) -> None:
+        """Test proposal_phases contains only 'analysis' for analysis-only workflow."""
+        cli = MockCLI({"analysisresults/ar-1": {"status": DIAGNOSIS_STATUS}})
+        amender = ProposalAmender(cli)
+        turn = _make_turn()
+        status: dict[str, Any] = {
+            "conditions": [],
+            "steps": {
+                "analysis": {
+                    "results": [{"name": "ar-1", "outcome": "Succeeded"}],
+                },
+            },
+        }
+
+        amender.amend(turn, status)
+
+        assert turn.proposal_phases == ["analysis"]
+
+
+class TestAmendFullPipeline:
+    """Test ProposalAmender with analysis + execution + verification."""
+
+    def _make_cli(self) -> MockCLI:
+        """Create MockCLI with all three result CRs."""
+        return MockCLI(
+            {
+                "analysisresults/ar-1": {"status": DIAGNOSIS_STATUS},
+                "executionresults/er-1": {"status": EXECUTION_STATUS},
+                "verificationresults/vr-1": {"status": VERIFICATION_STATUS},
+            }
+        )
+
+    def _make_status(self) -> dict[str, Any]:
+        """Create proposal status with all three steps."""
+        return {
+            "conditions": [
+                {"type": "Analyzed", "status": "True"},
+                {"type": "Executed", "status": "True"},
+                {"type": "Verified", "status": "True", "message": "All passed"},
+            ],
+            "steps": {
+                "analysis": {
+                    "results": [{"name": "ar-1", "outcome": "Succeeded"}],
+                },
+                "execution": {
+                    "results": [{"name": "er-1", "outcome": "Succeeded"}],
+                },
+                "verification": {
+                    "results": [{"name": "vr-1", "outcome": "Succeeded"}],
+                },
+            },
+        }
+
+    def test_all_results_populated(self) -> None:
+        """Test all three step results are populated."""
+        amender = ProposalAmender(self._make_cli())
+        turn = _make_turn()
+
+        amender.amend(turn, self._make_status())
+
+        results = _get_results(turn)
+        assert "analysis" in results
+        assert "execution" in results
+        assert "verification" in results
+        assert len(results["analysis"]) == 1
+        assert len(results["execution"]) == 1
+        assert len(results["verification"]) == 1
+
+    def test_execution_in_summary(self) -> None:
+        """Test Markdown includes execution actions."""
+        amender = ProposalAmender(self._make_cli())
+        turn = _make_turn()
+
+        amender.amend(turn, self._make_status())
+
+        response = _get_response(turn)
+        assert "## Execution" in response
+        assert "Set memory limit to 512Mi" in response
+        assert "Succeeded" in response
+        assert "deployment.apps/web patched" in response
+
+    def test_verification_in_summary(self) -> None:
+        """Test Markdown includes verification checks."""
+        amender = ProposalAmender(self._make_cli())
+        turn = _make_turn()
+
+        amender.amend(turn, self._make_status())
+
+        response = _get_response(turn)
+        assert "## Verification" in response
+        assert "pod-running" in response
+        assert "Passed" in response
+        assert "All checks passed" in response
+
+    def test_proposal_phases_full_pipeline(self) -> None:
+        """Test proposal_phases lists all three phases."""
+        amender = ProposalAmender(self._make_cli())
+        turn = _make_turn()
+
+        amender.amend(turn, self._make_status())
+
+        assert turn.proposal_phases == ["analysis", "execution", "verification"]
+
+
+class TestAmendEdgeCases:
+    """Test ProposalAmender edge cases."""
+
+    def test_empty_steps(self) -> None:
+        """Test status with no steps produces empty results."""
+        cli = MockCLI()
+        amender = ProposalAmender(cli)
+        turn = _make_turn()
+        status: dict[str, Any] = {"conditions": []}
+
+        amender.amend(turn, status)
+
+        assert not turn.proposal_results
+        assert not turn.proposal_phases
+        response = _get_response(turn)
+        assert "## Request" in response
+
+    def test_step_with_no_results(self) -> None:
+        """Test step present but no result refs gives empty list."""
+        cli = MockCLI()
+        amender = ProposalAmender(cli)
+        turn = _make_turn()
+        status: dict[str, Any] = {
+            "conditions": [],
+            "steps": {
+                "analysis": {"results": []},
+            },
+        }
+
+        amender.amend(turn, status)
+
+        results = _get_results(turn)
+        assert results["analysis"] == []
+
+    def test_failed_fetch_logged_and_skipped(self, mocker: MockerFixture) -> None:
+        """Test failed CR fetch is logged and result is skipped."""
+        cli = MockCLI()
+        amender = ProposalAmender(cli)
+        turn = _make_turn()
+        status: dict[str, Any] = {
+            "conditions": [],
+            "steps": {
+                "analysis": {
+                    "results": [{"name": "missing-cr", "outcome": "Succeeded"}],
+                },
+            },
+        }
+        mock_logger = mocker.patch(
+            "lightspeed_evaluation.pipeline.evaluation.proposal_amender.logger"
+        )
+
+        err = amender.amend(turn, status)
+
+        assert err is None
+        results = _get_results(turn)
+        assert results["analysis"] == []
+        mock_logger.warning.assert_called_once()
+
+    def test_analysis_with_failure_reason(self) -> None:
+        """Test analysis result with failureReason."""
+        failed_status: dict[str, Any] = {
+            "conditions": [{"type": "Completed", "status": "False"}],
+            "failureReason": "LLM timeout",
+        }
+        cli = MockCLI({"analysisresults/ar-fail": {"status": failed_status}})
+        amender = ProposalAmender(cli)
+        turn = _make_turn()
+        status: dict[str, Any] = {
+            "conditions": [{"type": "Analyzed", "status": "False"}],
+            "steps": {
+                "analysis": {
+                    "results": [{"name": "ar-fail", "outcome": "Failed"}],
+                },
+            },
+        }
+
+        amender.amend(turn, status)
+
+        response = _get_response(turn)
+        assert "LLM timeout" in response
+
+    def test_escalation_results(self) -> None:
+        """Test escalation step results are captured."""
+        esc_status: dict[str, Any] = {
+            "conditions": [],
+            "summary": "Requires manual intervention",
+            "content": "Cluster admin must review",
+        }
+        cli = MockCLI({"escalationresults/esc-1": {"status": esc_status}})
+        amender = ProposalAmender(cli)
+        turn = _make_turn()
+        status: dict[str, Any] = {
+            "conditions": [{"type": "Escalated", "status": "True"}],
+            "steps": {
+                "escalation": {
+                    "results": [{"name": "esc-1", "outcome": "Succeeded"}],
+                },
+            },
+        }
+
+        amender.amend(turn, status)
+
+        results = _get_results(turn)
+        assert "escalation" in results
+        response = _get_response(turn)
+        assert "Requires manual intervention" in response
+        assert "Cluster admin must review" in response
+
+    def test_outcome_section_from_conditions(self) -> None:
+        """Test outcome section uses condition messages."""
+        cli = MockCLI()
+        amender = ProposalAmender(cli)
+        turn = _make_turn()
+        status: dict[str, Any] = {
+            "conditions": [
+                {"type": "Analyzed", "status": "True", "message": "Analysis complete"},
+                {"type": "Executed", "status": "True", "message": "Execution done"},
+            ],
+        }
+
+        amender.amend(turn, status)
+
+        response = _get_response(turn)
+        assert "## Outcome" in response
+        assert "Analysis complete; Execution done" in response
+
+    def test_subprocess_error_caught_by_amend(self, mocker: MockerFixture) -> None:
+        """Test that subprocess errors are caught by the broadened except clause."""
+        cli = MockCLI()
+        amender = ProposalAmender(cli)
+        turn = _make_turn()
+        status: dict[str, Any] = {
+            "conditions": [],
+            "steps": {
+                "analysis": {
+                    "results": [{"name": "ar-1", "outcome": "Succeeded"}],
+                },
+            },
+        }
+        mocker.patch.object(
+            cli,
+            "get_resource",
+            side_effect=subprocess.SubprocessError("unexpected"),
+        )
+
+        err = amender.amend(turn, status)
+
+        assert err is not None
+        assert "ProposalAmender error" in err
+
+
+class TestKubeCLITimeoutHandling:
+    """Test KubeCLI normalizes subprocess.TimeoutExpired to CompletedProcess."""
+
+    def test_run_returns_completed_process_on_timeout(
+        self, mocker: MockerFixture
+    ) -> None:
+        """Test that TimeoutExpired is caught and a failing CompletedProcess returned."""
+        mocker.patch(
+            "subprocess.run",
+            side_effect=subprocess.TimeoutExpired(cmd="oc get pods", timeout=30),
+        )
+
+        cli = KubeCLI(cli_path="oc", namespace="test-ns", timeout=30)
+        result = cli.run(["get", "pods"])
+
+        assert result.returncode == 1
+        assert result.stdout == ""
+        assert "timed out after 30s" in result.stderr
+
+    def test_get_resource_returns_error_on_timeout(self, mocker: MockerFixture) -> None:
+        """Test that get_resource returns an error tuple when run times out."""
+        mocker.patch(
+            "subprocess.run",
+            side_effect=subprocess.TimeoutExpired(cmd="oc get pods", timeout=30),
+        )
+
+        cli = KubeCLI(cli_path="oc", namespace="test-ns", timeout=30)
+        resource, err = cli.get_resource("pods", "my-pod")
+
+        assert resource == {}
+        assert err is not None
+        assert "timed out" in err
diff --git a/tests/unit/pipeline/evaluation/test_proposal_driver.py b/tests/unit/pipeline/evaluation/test_proposal_driver.py
index 32d5fb22..3bcbc022 100644
--- a/tests/unit/pipeline/evaluation/test_proposal_driver.py
+++ b/tests/unit/pipeline/evaluation/test_proposal_driver.py
@@ -57,6 +57,7 @@ def test_valid_config_all_fields(self) -> None:
                 "auto_approve": False,
                 "cleanup_proposals": False,
                 "timeout": 60,
+                "cli_timeout": 15,
                 "poll_interval": 5,
             }
         )
@@ -64,6 +65,7 @@ def test_valid_config_all_fields(self) -> None:
         assert config.auto_approve is False
         assert config.cleanup_proposals is False
         assert config.timeout == 60
+        assert config.cli_timeout == 15
         assert config.poll_interval == 5
 
     def test_valid_config_defaults(self) -> None:
@@ -72,6 +74,7 @@ def test_valid_config_defaults(self) -> None:
         assert config.auto_approve is True
         assert config.cleanup_proposals is True
         assert config.timeout == 900
+        assert config.cli_timeout == 30
         assert config.poll_interval == 2
 
     def test_missing_namespace(self) -> None:
@@ -94,6 +97,16 @@ def test_invalid_timeout_negative(self) -> None:
         with pytest.raises(ValidationError):
             ProposalAgentConfig.model_validate({**VALID_CONFIG, "timeout": -1})
 
+    def test_invalid_cli_timeout_zero(self) -> None:
+        """Test cli_timeout=0 raises ValidationError."""
+        with pytest.raises(ValidationError):
+            ProposalAgentConfig.model_validate({**VALID_CONFIG, "cli_timeout": 0})
+
+    def test_invalid_cli_timeout_negative(self) -> None:
+        """Test negative cli_timeout raises ValidationError."""
+        with pytest.raises(ValidationError):
+            ProposalAgentConfig.model_validate({**VALID_CONFIG, "cli_timeout": -1})
+
     def test_invalid_poll_interval_zero(self) -> None:
         """Test poll_interval=0 raises ValidationError."""
         with pytest.raises(ValidationError):
@@ -476,7 +489,9 @@ def test_happy_path_completed(
 
         assert error is None
         assert conv_id is None
-        assert turn.response == "Analysis done; Passed"
+        response = str(turn.response)
+        assert "Analysis done" in response
+        assert "Passed" in response
         assert turn.proposal_status == terminal_status
         driver._cleanup.assert_called_once_with("eval-abcd1234")
 
@@ -584,14 +599,13 @@ def test_failed_terminal(
         error, _ = driver.execute_turn(turn)
 
         assert error is not None
-        assert "Failed" in error
+        assert "failed" in error
         assert turn.proposal_status == status
-        assert turn.response == "LLM error"
 
     def test_denied_terminal(
         self, mocker: MockerFixture, driver: ProposalDriver
     ) -> None:
-        """Test denied proposal returns error."""
+        """Test denied proposal populates turn data and returns no error."""
         mock_time = mocker.patch(f"{MODULE}.time")
         mock_time.monotonic.side_effect = [0.0, 0.0, 0.0, 1.0]
 
@@ -607,8 +621,8 @@ def test_denied_terminal(
         turn = TurnData(turn_id="t1", query="Q", proposal_spec=SPEC_FULL)
         error, _ = driver.execute_turn(turn)
 
-        assert error is not None
-        assert "Denied" in error
+        assert error is None
+        assert turn.proposal_status == status
 
     def test_get_status_error(
         self, mocker: MockerFixture, driver: ProposalDriver