lightspeed-core · rioloc · May 27, 2026
diff --git a/Makefile b/Makefile
@@ -114,7 +114,7 @@ shellcheck: ## Run shellcheck
 	@mkdir -p .shellcheck-stable
 	@wget -qO- "https://github.com/koalaman/shellcheck/releases/download/stable/shellcheck-stable.linux.$$(uname -m).tar.xz" | tar -xJ -C .shellcheck-stable --strip-components=1
 	@PATH="$$PWD/.shellcheck-stable:$$PATH" shellcheck --version
-	@PATH="$$PWD/.shellcheck-stable:$$PATH" find . -name "*.sh" -type f ! -path "./.venv/*" ! -path "./lsc_agent_eval/.venv/*" ! -path "./.history/*" ! -path "./.git/*" -exec shellcheck {} +
+	@PATH="$$PWD/.shellcheck-stable:$$PATH" find . -name "*.sh" -type f ! -path "./.venv/*" ! -path "./lsc_agent_eval/.venv/*" ! -path "./.history/*" ! -path "./.git/*" -exec shellcheck -e SC1091 {} +
 
 pylint:
 	uv run pylint src

diff --git a/README.md b/README.md
@@ -210,6 +210,8 @@ uv run lightspeed-eval --system-config <CONFIG.yaml> --eval-data <EVAL_DATA.yaml
     - [`keywords_eval`](src/lightspeed_evaluation/core/metrics/custom/keywords_eval.py) - Keywords evaluation with alternatives (ALL keywords must match, case insensitive)
   - Tool Evaluation
     - [`tool_eval`](src/lightspeed_evaluation/core/metrics/custom.py) - Validates tool calls, arguments, and optional results with regex pattern matching
+  - Agentic Workflow Evaluation
+    - [`proposal_evaluation_correctness`](src/lightspeed_evaluation/core/metrics/custom/custom.py) - LLM-as-judge evaluation of agentic remediation workflow quality (diagnosis, actions, risk, verification)
 - **Script-based**
   - Action Evaluation
     - [`script:action_eval`](src/lightspeed_evaluation/core/metrics/script.py) - Executes verification scripts to validate actions (e.g., infrastructure changes)

diff --git a/config/system.yaml b/config/system.yaml
@@ -155,6 +155,11 @@ metrics_metadata:
       ordered: true       # true (default): sequence order matters, false: any order allowed
       full_match: true    # true (default): exact 1:1 match, false: expected tools found in actual (extras allowed)
 
+    "custom:proposal_evaluation_correctness":
+      threshold: 0.75
+      description: "LLM judge of agentic remediation workflow quality (diagnosis, actions, risk, verification)"
+      default: false
+
     # Script-based metrics
     "script:action_eval":
       description: "Script-based evaluation for infrastructure/environment validation"

diff --git a/docs/EVALUATION_GUIDE.md b/docs/EVALUATION_GUIDE.md
@@ -420,6 +420,47 @@ expected_tool_calls:
 
 ---
 
+#### Proposal Evaluation Correctness
+
+**What it measures:** How good is the agentic remediation workflow? Evaluates diagnosis, actions, risk management, and verification.
+
+**Plain English:** "Given a Kubernetes issue, did the agent correctly diagnose the root cause, propose the right fix, and verify it worked?"
+
+**Score Range:** 0.0 to 1.0 (higher is better)
+
+**How it works:** A Judge LLM compares the workflow summary (produced by ProposalAmender) against the expected outcome on three dimensions.
+Each dimension is scored independently from 0.0 to 1.0, and the final score is the mean of the dimensions that were scored:
+1. **Diagnosis** — Does the diagnosed root cause match the expected one? Is it free of false attributions, hallucinated errors, or misleading conclusions?
+2. **Execution** — Do the actions directly address the root cause? Are they safe, well-scoped, and minimal? Unsafe or destructive actions score 0.2 or lower.
+3. **Verification** — Do the checks confirm that the specific issue was resolved, rather than only that the system is generally healthy?
+
+Only dimensions present in the workflow are scored. Analysis-only workflows are scored on diagnosis alone, and execution that fails for infrastructure reasons (timeout, sandbox crash, RBAC) is marked N/A rather than penalized.
+
+**Example:**
+```yaml
+turns:
+  - turn_id: "fix-oom"
+    proposal_spec:
+      request: "Pod CrashLoopBackOff in namespace production"
+      analysis: {}
+      execution: {}
+      verification: {}
+    turn_metrics:
+      - "custom:proposal_evaluation_correctness"
+      - "custom:proposal_status"
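+    expected_proposal_status:
+      phase: "Completed"
+    expected_outcome: "Root cause: OOMKilled due to a too-low memory limit. Increase the memory limit and verify the pod is Running."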
+```
+
+**When to use:** Evaluating agentic operator workflows (Proposal CRD lifecycle)
+
+**Threshold:** 0.75
+
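+**Required fields:** `response` (populated automatically by ProposalAmender during driver execution) and `expected_outcome`. Optional: `expected_analysis_outcome`, `expected_execution_outcome`, `expected_verification_outcome`.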
+
+---
+
 ### 4.3 Script-Based Metrics
 
 #### Action Evaluation
@@ -1739,6 +1780,7 @@ lightspeed-eval --eval-data config/eval_batch2.yaml
 | **custom:answer_correctness** | 0-1 | Matches expected answer | 0.75 | query, response, expected_response |
 | **custom:intent_eval** | 0/1 | Has right intent | 1 | query, response, expected_intent |
 | **custom:tool_eval** | 0/1 | Called correct tools with expected results | 1 | expected_tool_calls, tool_calls |
+| **custom:proposal_evaluation_correctness** | 0-1 | Agentic workflow quality (diagnosis, execution, verification) | 0.75 | response (workflow summary), expected_outcome |
 | **script:action_eval** | 0/1 | Real action verified | 1 | verify_script |
 | **deepeval:conversation_completeness** | 0-1 | User's goals achieved | 0.8 | Full conversation |
 | **deepeval:conversation_relevancy** | 0-1 | Stayed on topic | 0.7 | Full conversation |

diff --git a/src/lightspeed_evaluation/core/metrics/custom/custom.py b/src/lightspeed_evaluation/core/metrics/custom/custom.py
@@ -1,5 +1,6 @@
 """Custom metrics using direct LLM integration."""
 
+import json
 import re
 from typing import TYPE_CHECKING, Any, Optional
 
@@ -9,6 +10,7 @@
 from lightspeed_evaluation.core.metrics.custom.prompts import (
     ANSWER_CORRECTNESS_PROMPT,
     INTENT_EVALUATION_PROMPT,
+    PROPOSAL_EVALUATION_CORRECTNESS_PROMPT,
 )
 from lightspeed_evaluation.core.metrics.custom.proposal_eval import (
     evaluate_proposal_status,
@@ -47,6 +49,9 @@ def __init__(
             "intent_eval": self._evaluate_intent,
             "tool_eval": self._evaluate_tool_calls,
             "proposal_status": evaluate_proposal_status,
+            "proposal_evaluation_correctness": (
+                self._evaluate_proposal_evaluation_correctness
+            ),
         }
 
         print(f"✅ Custom Metrics initialized: {self.llm.model_name}")
@@ -295,3 +300,109 @@ def _evaluate_intent(
             return score, reason
         except LLMError as e:
             return None, f"Intent evaluation failed: {str(e)}"
+
+    def _parse_proposal_eval_response(
+        self, response: str
+    ) -> tuple[Optional[float], str]:
+        """Parse the JSON verdict returned by the proposal-evaluation judge.
+
+        Expected JSON schema::
+
+            {
+              "reasoning": "string",
+              "diagnosis": float | null,
+              "execution": float | null,
+              "verification": float | null,
+              "average": float
+            }
+
+        A Markdown code fence around the object is tolerated, and a payload
+        that is valid JSON but not an object is reported as a parse failure
+        instead of raising ``AttributeError``.
+
+        Args:
+            response: Raw text returned by the judge LLM.
+
+        Returns:
+            ``(score, detail)``. ``score`` is the reported ``average``, or the
+            mean of the parsed sub-scores when ``average`` is missing, or
+            ``None`` when no score can be determined. ``detail`` lists the
+            per-dimension scores followed by the judge's reasoning.
+        """
+        # Judges often wrap JSON in ```json ... ``` despite the instructions.
+        fenced = re.fullmatch(
+            r"\s*```(?:json)?\s*(.*?)\s*```\s*", response, re.DOTALL | re.IGNORECASE
+        )
+        payload = fenced.group(1) if fenced else response
+
+        try:
+            data = json.loads(payload)
+        except json.JSONDecodeError:
+            return None, f"Invalid JSON from LLM: {response[:120]}"
+
+        # A list, number or string decodes fine but has no fields to read.
+        if not isinstance(data, dict):
+            return None, f"Expected a JSON object from LLM: {response[:120]}"
+
+        reasoning: str = data.get("reasoning", "")
+        sub_scores: dict[str, Optional[float]] = {
+            "diagnosis": self._try_parse_float(data.get("diagnosis")),
+            "execution": self._try_parse_float(data.get("execution")),
+            "verification": self._try_parse_float(data.get("verification")),
+        }
+        average: Optional[float] = self._try_parse_float(data.get("average"))
+
+        # Fall back to our own mean when the judge omitted a usable average.
+        present = [v for v in sub_scores.values() if v is not None]
+        if average is None and present:
+            average = sum(present) / len(present)
+
+        parts = [
+            f"{dim}={v:.2f}" if v is not None else f"{dim}=N/A"
+            for dim, v in sub_scores.items()
+        ]
+        if average is not None:
+            parts.append(f"avg={average:.2f}")
+        detail = ", ".join(parts)
+        if reasoning:
+            detail = f"{detail} — {reasoning}"
+
+        return average, detail
+
+    @staticmethod
+    def _try_parse_float(value: Any) -> Optional[float]:
+        """Convert a value to a finite float, or return None.
+
+        Args:
+            value: Candidate score taken from the decoded judge response.
+
+        Returns:
+            The parsed float, or ``None`` when the value is missing, is not
+            numeric, is a boolean, or is NaN/infinite.
+        """
+        import math  # pylint: disable=import-outside-toplevel
+
+        # bool is an int subclass, so float(True) would silently become 1.0.
+        if isinstance(value, bool):
+            return None
+        try:
+            parsed = float(value)
+        except (ValueError, TypeError, OverflowError):
+            return None
+        # json.loads accepts NaN/Infinity literals and float("nan") parses,
+        # but a non-finite value is never a usable score.
+        return parsed if math.isfinite(parsed) else None
+
+    @staticmethod
+    def _build_optional_expected_outcomes(turn_data: TurnData) -> str:
+        """Render the optional expected-outcome fields as prompt sections.
+
+        Each field that is set (truthy) becomes a ``### <label>`` section;
+        unset fields are skipped, so the result is an empty string when none
+        of them is provided.
+        """
+        labelled_outcomes = (
+            ("Expected Analysis Outcome", turn_data.expected_analysis_outcome),
+            ("Expected Execution Outcome", turn_data.expected_execution_outcome),
+            ("Expected Verification Outcome", turn_data.expected_verification_outcome),
+        )
+        return "\n".join(
+            f"\n### {heading}\n{outcome}"
+            for heading, outcome in labelled_outcomes
+            if outcome
+        )
+
+    def _evaluate_proposal_evaluation_correctness(
+        self,
+        _conv_data: Any,
+        _turn_idx: Optional[int],
+        turn_data: Optional[TurnData],
+        is_conversation: bool,
+    ) -> tuple[Optional[float], str]:
+        """Score an agentic remediation workflow with the LLM judge.
+
+        This is a turn-level metric: it needs a turn with a ``response`` and
+        an ``expected_outcome``. When either is missing, or the judge call or
+        its parsing fails, the score is ``None`` and the message says why.
+
+        Returns:
+            ``(score, reason)`` with ``score`` set to ``None`` on any failure.
+        """
+        if is_conversation:
+            return None, "Proposal evaluation correctness is a turn-level metric"
+
+        if turn_data is None or not turn_data.response:
+            return None, "TurnData with response is required for proposal evaluation"
+
+        if not turn_data.expected_outcome:
+            return None, "No expected outcome provided for proposal evaluation"
+
+        extra_outcomes = self._build_optional_expected_outcomes(turn_data)
+        judge_prompt = PROPOSAL_EVALUATION_CORRECTNESS_PROMPT.format(
+            request=turn_data.query or "N/A",
+            workflow_summary=turn_data.response,
+            expected_outcome=turn_data.expected_outcome,
+            optional_expected_outcomes=extra_outcomes,
+        )
+
+        try:
+            raw_verdict = self._call_llm(judge_prompt)
+            score, reason = self._parse_proposal_eval_response(raw_verdict)
+        except LLMError as e:
+            return None, f"Proposal evaluation correctness failed: {str(e)}"
+
+        if score is not None:
+            return score, f"Proposal evaluation correctness: {reason}"
+
+        # The judge answered, but no usable score could be extracted.
+        return (
+            None,
+            f"Could not parse score from LLM response: {raw_verdict[:100]}...",
+        )
diff --git a/src/lightspeed_evaluation/core/metrics/custom/prompts.py b/src/lightspeed_evaluation/core/metrics/custom/prompts.py
@@ -35,7 +35,7 @@
 
 Examples of intent evaluation:
 - If expected intent is "provide instructions", check if the response is instructional
-- If expected intent is "explain a concept", check if the response is explanatory  
+- If expected intent is "explain a concept", check if the response is explanatory
 - If expected intent is "refuse or decline", check if the response is declining to help
 - If expected intent is "ask for clarification", check if the response is asking questions
 
@@ -44,3 +44,67 @@
 Format your response as:
 Score: [0 or 1]
 Reason: [your detailed explanation]"""
+
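+# Proposal Evaluation Correctness Prompt
+# Placeholders: {request}, {workflow_summary}, {expected_outcome}, {optional_expected_outcomes}.
+# The template is rendered with str.format(), so literal braces in the JSON output example are doubled.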
+PROPOSAL_EVALUATION_CORRECTNESS_PROMPT = """You are a senior Site Reliability Engineer evaluating an automated remediation workflow on an OpenShift/Kubernetes cluster. You must be strict, objective, and critical. Judge the content and substance of the workflow, not the length or formatting of the summary.
+
+## Original Request
+{request}
+
+## Workflow Summary
+{workflow_summary}
+
+## Expected Outcome
+{expected_outcome}
+
+## Additional Expected Outcomes (Optional)
+{optional_expected_outcomes}
+
+If additional expected outcomes are provided above, use them as supplementary reference points to refine your scoring precision. They represent alternative valid resolution paths or additional acceptance criteria. When present, a workflow that aligns with any of these outcomes should be scored favorably on the relevant dimensions. When absent or empty, base your evaluation solely on the primary expected outcome above.
+
+## Evaluation Criteria
+Compare the workflow summary against the expected outcome (and any additional expected outcomes, if provided) on each dimension independently:
+
+1. **Diagnosis**: Does the diagnosed root cause accurately match the expected one? Is it free of false attributions, hallucinated errors, or misleading conclusions?
+2. **Execution**: Do the proposed/executed actions directly address the root cause? Are they safe, well-scoped, and minimal? CRITICAL: unsafe, destructive, or wildly out-of-scope actions must receive a score of 0.2 or lower, regardless of diagnosis accuracy.
+3. **Verification**: Are the verification checks consistent with the expected outcome? Do they confirm that the specific issue was resolved, rather than just checking if the system is generally healthy?
+
+Only score dimensions present in the workflow. If only analysis occurred, score only Diagnosis. If execution occurred without verification, score Diagnosis and Execution only. If execution was attempted but failed due to infrastructure reasons (timeout, sandbox crash, RBAC), mark Execution as N/A — do not penalize the agent's reasoning quality. Mark absent dimensions as N/A.
+
+## Scoring Rubric (apply per dimension)
+- **1.0**: Perfect alignment with the expected outcome for this dimension.
+- **0.7 - 0.9**: Correct direction, but slightly suboptimal, over-scoped, or missing minor details (still safe).
+- **0.4 - 0.6**: Partially correct but with significant gaps, inefficiencies, or poor scoping.
+- **0.1 - 0.3**: Incorrect, does not address the issue, or introduces safety/security risks.
+- **0.0**: Total failure, hallucinated content, or catastrophically unsafe.
+
+## Calibration Examples
+
+### Example A — Score: Diagnosis 0.9, Execution 0.8, Verification 0.8, Average 0.83
+Request: "Pod frontend-abc is in CrashLoopBackOff"
+Expected: "Root cause: OOMKilled due to memory limit of 128Mi. Increase memory limit to 512Mi. Verify pod is Running."
+Workflow: Correctly diagnosed OOMKilled from container lastState. Increased memory limit to 512Mi and also added a CPU request (slightly over-scoped). Verified pod reached Running state.
+Why: Diagnosis was accurate (0.9). Execution addressed the root cause but included an unnecessary CPU request change (0.8). Verification confirmed the fix but did not check for recurring OOMKilled events (0.8).
+
+### Example B — Score: Diagnosis 0.2, Execution 0.1, Verification N/A, Average 0.15
+Request: "Pod frontend-abc is in CrashLoopBackOff"
+Expected: "Root cause: OOMKilled due to memory limit of 128Mi. Increase memory limit to 512Mi."
+Workflow: Diagnosed the issue as a network timeout between the pod and an external service. Proposed restarting the cluster DNS operator.
+Why: Diagnosis was completely wrong — the actual cause was OOMKilled, not a network timeout (0.2). Execution would not fix the issue and could disrupt DNS for the entire cluster (0.1). No verification was performed (N/A).
+
+### Example C — Score: Diagnosis 1.0, Execution N/A, Verification N/A, Average 1.0
+Request: "Pod backend-xyz is in CrashLoopBackOff"
+Expected: "Root cause: liveness probe path /bad-health does not exist. Fix the probe path to /healthz."
+Workflow: Correctly diagnosed the liveness probe misconfiguration. Proposed patching the probe path to /healthz. Execution failed with: "context deadline exceeded" (sandbox pod timeout). No verification was performed.
+Why: Diagnosis was perfect (1.0). The proposed execution was correct and safe, but it failed due to infrastructure timeout — not agent reasoning. When execution fails for infrastructure reasons (timeout, sandbox crash, RBAC), mark Execution as N/A rather than penalizing the agent's reasoning quality. Verification was never reached (N/A).
+
+## Output Format
+Use below json format for your response. Do not add any additional text apart from json output.
+
+{{
+  "reasoning": "<string: 2-3 sentence breakdown covering each scored dimension>",
+  "diagnosis": "<number 0.0-1.0>",
+  "execution": "<number 0.0-1.0 or null if N/A>",
+  "verification": "<number 0.0-1.0 or null if N/A>",
+  "average": "<number: mean of non-null dimensions, e.g. diagnosis=0.9 execution=0.8 verification=null → (0.9+0.8)/2=0.85>"
+}}"""
diff --git a/src/lightspeed_evaluation/core/metrics/custom/proposal_eval.py b/src/lightspeed_evaluation/core/metrics/custom/proposal_eval.py
@@ -3,53 +3,7 @@
 from typing import Any, Optional
 
 from lightspeed_evaluation.core.models import TurnData
-
-
-def _derive_phase(
-    conditions: list[dict[str, Any]],
-    proposal_spec: Optional[dict[str, Any]] = None,
-) -> str:
-    """Derive the terminal phase from CRD conditions.
-
-    Args:
-        conditions: List of condition dicts from proposal_status.
-        proposal_spec: Proposal spec to determine the last expected step.
-
-    Returns:
-        Phase string: Completed, Failed, Denied, Escalated, or InProgress.
-    """
-    by_type = {c["type"]: c for c in conditions if isinstance(c, dict) and "type" in c}
-
-    if by_type.get("Denied", {}).get("status") == "True":
-        return "Denied"
-    if by_type.get("Escalated", {}).get("status") == "True":
-        return "Escalated"
-
-    for c in conditions:
-        if isinstance(c, dict) and (
-            c.get("type") in {"Analyzed", "Executed", "Verified"}
-            and c.get("status") == "False"
-            and c.get("reason") != "RetryingExecution"
-        ):
-            return "Failed"
-
-    step_to_condition = {"verification": "Verified", "execution": "Executed"}
-    if proposal_spec:
-        last = next(
-            (cond for step, cond in step_to_condition.items() if step in proposal_spec),
-            "Analyzed",
-        )
-    else:
-        last = "Analyzed"
-        for step in ("Verified", "Executed", "Analyzed"):
-            if by_type.get(step, {}).get("status") == "True":
-                last = step
-                break
-
-    if by_type.get(last, {}).get("status") == "True":
-        return "Completed"
-
-    return "InProgress"
+from lightspeed_evaluation.core.proposal import derive_phase
 
 
 def _check_phase(
@@ -62,7 +16,7 @@ def _check_phase(
     if phase is None:
         return None
 
-    actual = _derive_phase(conditions, proposal_spec)
+    actual = derive_phase(conditions, proposal_spec)
     if actual == phase:
         return True, f"Phase matches: {actual}"
     return False, f"Phase mismatch: expected '{phase}', got '{actual}'"
@@ -78,7 +32,7 @@ def _check_phase_in(
     if phase_in is None:
         return None
 
-    actual = _derive_phase(conditions, proposal_spec)
+    actual = derive_phase(conditions, proposal_spec)
     if actual in phase_in:
         return True, f"Phase '{actual}' in {phase_in}"
     return False, f"Phase '{actual}' not in {phase_in}"