diff --git a/Makefile b/Makefile index 543d6075..d3a0aad5 100644 --- a/Makefile +++ b/Makefile @@ -114,7 +114,7 @@ shellcheck: ## Run shellcheck @mkdir -p .shellcheck-stable @wget -qO- "https://github.com/koalaman/shellcheck/releases/download/stable/shellcheck-stable.linux.$$(uname -m).tar.xz" | tar -xJ -C .shellcheck-stable --strip-components=1 @PATH="$$PWD/.shellcheck-stable:$$PATH" shellcheck --version - @PATH="$$PWD/.shellcheck-stable:$$PATH" find . -name "*.sh" -type f ! -path "./.venv/*" ! -path "./lsc_agent_eval/.venv/*" ! -path "./.history/*" ! -path "./.git/*" -exec shellcheck {} + + @PATH="$$PWD/.shellcheck-stable:$$PATH" find . -name "*.sh" -type f ! -path "./.venv/*" ! -path "./lsc_agent_eval/.venv/*" ! -path "./.history/*" ! -path "./.git/*" -exec shellcheck -e SC1091 {} + pylint: uv run pylint src diff --git a/README.md b/README.md index d659e722..a70c7971 100644 --- a/README.md +++ b/README.md @@ -210,6 +210,8 @@ uv run lightspeed-eval --system-config --eval-data tuple[Optional[float], str]: + """Parse JSON LLM judge response for proposal evaluation. + + Expected JSON schema:: + + { + "reasoning": "string", + "diagnosis": float | null, + "execution": float | null, + "verification": float | null, + "average": float + } + """ + try: + data = json.loads(response) + except json.JSONDecodeError: + return None, f"Invalid JSON from LLM: {response[:120]}" + + reasoning: str = data.get("reasoning", "") + sub_scores: dict[str, Optional[float]] = { + "diagnosis": self._try_parse_float(data.get("diagnosis")), + "execution": self._try_parse_float(data.get("execution")), + "verification": self._try_parse_float(data.get("verification")), + } + average: Optional[float] = self._try_parse_float(data.get("average")) + + present = [v for v in sub_scores.values() if v is not None] + if average is None and present: + average = sum(present) / len(present) + + parts = [ + f"{dim}={v:.2f}" if v is not None else f"{dim}=N/A" + for dim, v in sub_scores.items() + ] + if average is not None: + parts.append(f"avg={average:.2f}") + detail = ", ".join(parts) + if reasoning: + detail = f"{detail} — {reasoning}" + + return average, detail + + @staticmethod + def _try_parse_float(value: Any) -> Optional[float]: + """Try to parse a float from a value, return None on failure.""" + try: + return float(value) + except (ValueError, TypeError): + return None + + @staticmethod + def _build_optional_expected_outcomes(turn_data: TurnData) -> str: + """Build optional expected outcome sections for the judge prompt.""" + sections: list[str] = [] + mapping = { + "Expected Analysis Outcome": turn_data.expected_analysis_outcome, + "Expected Execution Outcome": turn_data.expected_execution_outcome, + "Expected Verification Outcome": turn_data.expected_verification_outcome, + } + for label, value in mapping.items(): + if value: + sections.append(f"\n### {label}\n{value}") + return "\n".join(sections) + + @staticmethod + def _build_workflow_phases(turn_data: TurnData) -> str: + """Build the workflow phases string for the judge prompt.""" + phases = turn_data.proposal_phases + if phases: + return "Phases executed: " + ", ".join(phases) + return "Phases executed: unknown (score only dimensions visible in the workflow summary)" + + def _evaluate_proposal_evaluation_correctness( + self, + _conv_data: Any, + _turn_idx: Optional[int], + turn_data: Optional[TurnData], + is_conversation: bool, + ) -> tuple[Optional[float], str]: + """Evaluate agentic remediation workflow quality using LLM judge.""" + if is_conversation: + return None, "Proposal evaluation correctness is a turn-level metric" + + if turn_data is None or not turn_data.response: + return None, "TurnData with response is required for proposal evaluation" + + if not turn_data.expected_outcome: + return None, "No expected outcome provided for proposal evaluation" + + optional_sections = self._build_optional_expected_outcomes(turn_data) + workflow_phases = self._build_workflow_phases(turn_data) + + prompt = PROPOSAL_EVALUATION_CORRECTNESS_PROMPT.format( + request=turn_data.query or "N/A", + workflow_phases=workflow_phases, + workflow_summary=turn_data.response, + expected_outcome=turn_data.expected_outcome, + optional_expected_outcomes=optional_sections, + ) + + try: + llm_response = self._call_llm(prompt) + score, reason = self._parse_proposal_eval_response(llm_response) + + if score is None: + return ( + None, + f"Could not parse score from LLM response: {llm_response[:100]}...", + ) + + return score, f"Proposal evaluation correctness: {reason}" + except LLMError as e: + return None, f"Proposal evaluation correctness failed: {str(e)}" diff --git a/src/lightspeed_evaluation/core/metrics/custom/prompts.py b/src/lightspeed_evaluation/core/metrics/custom/prompts.py index f04f2bdb..d31a1859 100644 --- a/src/lightspeed_evaluation/core/metrics/custom/prompts.py +++ b/src/lightspeed_evaluation/core/metrics/custom/prompts.py @@ -35,7 +35,7 @@ Examples of intent evaluation: - If expected intent is "provide instructions", check if the response is instructional -- If expected intent is "explain a concept", check if the response is explanatory +- If expected intent is "explain a concept", check if the response is explanatory - If expected intent is "refuse or decline", check if the response is declining to help - If expected intent is "ask for clarification", check if the response is asking questions @@ -44,3 +44,70 @@ Format your response as: Score: [0 or 1] Reason: [your detailed explanation]""" + +# Proposal Evaluation Correctness Prompt +PROPOSAL_EVALUATION_CORRECTNESS_PROMPT = """You are a senior Site Reliability Engineer evaluating an automated remediation workflow on an OpenShift/Kubernetes cluster. You must be strict, objective, and critical. Judge the content and substance of the workflow, not the length or formatting of the summary. + +## Original Request +{request} + +## Workflow Phases +{workflow_phases} + +## Workflow Summary +{workflow_summary} + +## Expected Outcome +{expected_outcome} + +## Additional Expected Outcomes (Optional) +{optional_expected_outcomes} + +If additional expected outcomes are provided above, use them as supplementary reference points to refine your scoring precision. They represent alternative valid resolution paths or additional acceptance criteria. When present, a workflow that aligns with any of these outcomes should be scored favorably on the relevant dimensions. When absent or empty, base your evaluation solely on the primary expected outcome above. + +## Evaluation Criteria +Compare the workflow summary against the expected outcome (and any additional expected outcomes, if provided) on each dimension independently: + +1. **Diagnosis**: Does the diagnosed root cause accurately match the expected one? Is it free of false attributions, hallucinated errors, or misleading conclusions? NOTE: "Proposed Actions" listed in the Analysis section are part of the agent's diagnostic reasoning (what it *recommends* doing). Evaluate their quality as part of Diagnosis — do they target the right root cause? Are the recommendations sound and safe? +2. **Execution**: Were the remediation actions actually carried out? Did they produce the intended effect? Are they safe, well-scoped, and minimal? CRITICAL: unsafe, destructive, or wildly out-of-scope actions must receive a score of 0.2 or lower, regardless of diagnosis accuracy. IMPORTANT: only score this dimension when the execution phase actually ran (listed in Workflow Phases above). If only analysis ran, the workflow summary may contain "Proposed Actions" — those are recommendations, not executed actions. Do NOT score them under Execution; they belong to Diagnosis. +3. **Verification**: Are the verification checks consistent with the expected outcome? Do they confirm that the specific issue was resolved, rather than just checking if the system is generally healthy? + +**Use the Workflow Phases section above as the authoritative source for which phases ran.** Only score dimensions whose corresponding phase is listed. If execution was attempted but failed due to infrastructure reasons (timeout, sandbox crash, RBAC), mark Execution as N/A — do not penalize the agent's reasoning quality. Mark absent dimensions as null. + +## Scoring Rubric (apply per dimension) +- **1.0**: Perfect alignment with the expected outcome for this dimension. +- **0.7 - 0.9**: Correct direction, but slightly suboptimal, over-scoped, or missing minor details (still safe). +- **0.4 - 0.6**: Partially correct but with significant gaps, inefficiencies, or poor scoping. +- **0.1 - 0.3**: Incorrect, does not address the issue, or introduces safety/security risks. +- **0.0**: Total failure, hallucinated content, or catastrophically unsafe. + +## Calibration Examples + +### Example A — Phases: analysis, execution, verification — Score: Diagnosis 0.9, Execution 0.8, Verification 0.8, Average 0.83 +Request: "Pod frontend-abc is in CrashLoopBackOff" +Expected: "Root cause: OOMKilled due to memory limit of 128Mi. Increase memory limit to 512Mi. Verify pod is Running." +Workflow: Correctly diagnosed OOMKilled from container lastState. Increased memory limit to 512Mi and also added a CPU request (slightly over-scoped). Verified pod reached Running state. +Why: Diagnosis was accurate (0.9). Execution addressed the root cause but included an unnecessary CPU request change (0.8). Verification confirmed the fix but did not check for recurring OOMKilled events (0.8). + +### Example B — Phases: analysis, execution — Score: Diagnosis 0.2, Execution 0.1, Verification N/A, Average 0.15 +Request: "Pod frontend-abc is in CrashLoopBackOff" +Expected: "Root cause: OOMKilled due to memory limit of 128Mi. Increase memory limit to 512Mi." +Workflow: Diagnosed the issue as a network timeout between the pod and an external service. Executed a restart of the cluster DNS operator. +Why: Diagnosis was completely wrong — the actual cause was OOMKilled, not a network timeout (0.2). Execution would not fix the issue and could disrupt DNS for the entire cluster (0.1). Verification was not configured (N/A). + +### Example C — Phases: analysis — Score: Diagnosis 1.0, Execution N/A, Verification N/A, Average 1.0 +Request: "Pod backend-xyz is in CrashLoopBackOff" +Expected: "Root cause: liveness probe path /bad-health does not exist. Fix the probe path to /healthz." +Workflow: Correctly diagnosed the liveness probe misconfiguration. Proposed patching the probe path to /healthz. Execution failed with: "context deadline exceeded" (sandbox pod timeout). No verification was performed. +Why: Diagnosis was perfect (1.0). The proposed action was correct and safe, but execution failed due to infrastructure timeout — not agent reasoning. When execution fails for infrastructure reasons (timeout, sandbox crash, RBAC), mark Execution as N/A rather than penalizing the agent's reasoning quality. Verification was never reached (N/A). + +## Output Format +Use below json format for your response. Do not add any additional text apart from json output. + +{{ + "reasoning": "", + "diagnosis": "", + "execution": "", + "verification": "", + "average": "" +}}""" diff --git a/src/lightspeed_evaluation/core/metrics/custom/proposal_eval.py b/src/lightspeed_evaluation/core/metrics/custom/proposal_eval.py index bbb3cabe..316a4b32 100644 --- a/src/lightspeed_evaluation/core/metrics/custom/proposal_eval.py +++ b/src/lightspeed_evaluation/core/metrics/custom/proposal_eval.py @@ -3,53 +3,7 @@ from typing import Any, Optional from lightspeed_evaluation.core.models import TurnData - - -def _derive_phase( - conditions: list[dict[str, Any]], - proposal_spec: Optional[dict[str, Any]] = None, -) -> str: - """Derive the terminal phase from CRD conditions. - - Args: - conditions: List of condition dicts from proposal_status. - proposal_spec: Proposal spec to determine the last expected step. - - Returns: - Phase string: Completed, Failed, Denied, Escalated, or InProgress. - """ - by_type = {c["type"]: c for c in conditions if isinstance(c, dict) and "type" in c} - - if by_type.get("Denied", {}).get("status") == "True": - return "Denied" - if by_type.get("Escalated", {}).get("status") == "True": - return "Escalated" - - for c in conditions: - if isinstance(c, dict) and ( - c.get("type") in {"Analyzed", "Executed", "Verified"} - and c.get("status") == "False" - and c.get("reason") != "RetryingExecution" - ): - return "Failed" - - step_to_condition = {"verification": "Verified", "execution": "Executed"} - if proposal_spec: - last = next( - (cond for step, cond in step_to_condition.items() if step in proposal_spec), - "Analyzed", - ) - else: - last = "Analyzed" - for step in ("Verified", "Executed", "Analyzed"): - if by_type.get(step, {}).get("status") == "True": - last = step - break - - if by_type.get(last, {}).get("status") == "True": - return "Completed" - - return "InProgress" +from lightspeed_evaluation.core.proposal import derive_phase def _check_phase( @@ -62,7 +16,7 @@ def _check_phase( if phase is None: return None - actual = _derive_phase(conditions, proposal_spec) + actual = derive_phase(conditions, proposal_spec) if actual == phase: return True, f"Phase matches: {actual}" return False, f"Phase mismatch: expected '{phase}', got '{actual}'" @@ -78,7 +32,7 @@ def _check_phase_in( if phase_in is None: return None - actual = _derive_phase(conditions, proposal_spec) + actual = derive_phase(conditions, proposal_spec) if actual in phase_in: return True, f"Phase '{actual}' in {phase_in}" return False, f"Phase '{actual}' not in {phase_in}" diff --git a/src/lightspeed_evaluation/core/models/data.py b/src/lightspeed_evaluation/core/models/data.py index ae88af2c..90c9153d 100644 --- a/src/lightspeed_evaluation/core/models/data.py +++ b/src/lightspeed_evaluation/core/models/data.py @@ -76,6 +76,26 @@ class TurnData(StreamingMetricsMixin): expected_intent: Optional[str] = Field( default=None, min_length=1, description="Expected intent for intent evaluation" ) + expected_outcome: Optional[str] = Field( + default=None, + min_length=1, + description="Expected outcome for proposal evaluation correctness", + ) + expected_analysis_outcome: Optional[str] = Field( + default=None, + min_length=1, + description="Expected analysis/diagnosis outcome for proposal evaluation", + ) + expected_execution_outcome: Optional[str] = Field( + default=None, + min_length=1, + description="Expected execution/action outcome for proposal evaluation", + ) + expected_verification_outcome: Optional[str] = Field( + default=None, + min_length=1, + description="Expected verification outcome for proposal evaluation", + ) conversation_id: Optional[str] = Field( default=None, description="Conversation ID - populated by API if enabled" ) @@ -122,6 +142,14 @@ class TurnData(StreamingMetricsMixin): proposal_status: Optional[dict[str, Any]] = Field( default=None, description="Raw CRD status populated by ProposalDriver" ) + proposal_results: Optional[dict[str, Any]] = Field( + default=None, + description="Structured results from child Result CRs, populated by ProposalAmender", + ) + proposal_phases: Optional[list[str]] = Field( + default=None, + description="Workflow phases that actually executed (e.g. ['analysis', 'execution'])", + ) # Set of turn metrics that don't pass the validation to ignore them later _invalid_metrics: set[str] = set() diff --git a/src/lightspeed_evaluation/core/proposal/__init__.py b/src/lightspeed_evaluation/core/proposal/__init__.py new file mode 100644 index 00000000..5e35bfab --- /dev/null +++ b/src/lightspeed_evaluation/core/proposal/__init__.py @@ -0,0 +1,5 @@ +"""Proposal CRD domain logic.""" + +from lightspeed_evaluation.core.proposal.phase import derive_phase + +__all__ = ["derive_phase"] diff --git a/src/lightspeed_evaluation/core/proposal/phase.py b/src/lightspeed_evaluation/core/proposal/phase.py new file mode 100644 index 00000000..e892afd6 --- /dev/null +++ b/src/lightspeed_evaluation/core/proposal/phase.py @@ -0,0 +1,50 @@ +"""Derive terminal phase from Proposal CRD conditions.""" + +from typing import Any, Optional + + +def derive_phase( + conditions: list[dict[str, Any]], + proposal_spec: Optional[dict[str, Any]] = None, +) -> str: + """Derive the terminal phase from CRD conditions. + + Args: + conditions: List of condition dicts from proposal_status. + proposal_spec: Proposal spec to determine the last expected step. + + Returns: + Phase string: Completed, Failed, Denied, Escalated, or InProgress. + """ + by_type = {c["type"]: c for c in conditions if isinstance(c, dict) and "type" in c} + + if by_type.get("Denied", {}).get("status") == "True": + return "Denied" + if by_type.get("Escalated", {}).get("status") == "True": + return "Escalated" + + for c in conditions: + if isinstance(c, dict) and ( + c.get("type") in {"Analyzed", "Executed", "Verified"} + and c.get("status") == "False" + and c.get("reason") != "RetryingExecution" + ): + return "Failed" + + step_to_condition = {"verification": "Verified", "execution": "Executed"} + if proposal_spec: + last = next( + (cond for step, cond in step_to_condition.items() if step in proposal_spec), + "Analyzed", + ) + else: + last = "Analyzed" + for step in ("Verified", "Executed", "Analyzed"): + if by_type.get(step, {}).get("status") == "True": + last = step + break + + if by_type.get(last, {}).get("status") == "True": + return "Completed" + + return "InProgress" diff --git a/src/lightspeed_evaluation/core/system/validator.py b/src/lightspeed_evaluation/core/system/validator.py index 53036e24..d1f8aa96 100644 --- a/src/lightspeed_evaluation/core/system/validator.py +++ b/src/lightspeed_evaluation/core/system/validator.py @@ -62,6 +62,13 @@ "required_fields": ["expected_proposal_status"], "description": "requires 'expected_proposal_status' field", }, + "custom:proposal_evaluation_correctness": { + "required_fields": ["response", "expected_outcome"], + "description": ( + "requires 'response' and 'expected_outcome' fields " + "(Markdown workflow summary from ProposalAmender)" + ), + }, "script:action_eval": { "required_fields": ["verify_script"], "description": "requires 'verify_script' field", diff --git a/src/lightspeed_evaluation/pipeline/evaluation/cli.py b/src/lightspeed_evaluation/pipeline/evaluation/cli.py new file mode 100644 index 00000000..5653258b --- /dev/null +++ b/src/lightspeed_evaluation/pipeline/evaluation/cli.py @@ -0,0 +1,131 @@ +"""CLI client abstraction for Kubernetes cluster operations.""" + +from __future__ import annotations + +import json +import os +import subprocess +from abc import ABC, abstractmethod +from typing import Any, Optional + + +class CLIClient(ABC): + """Interface for CLI operations against a Kubernetes cluster.""" + + def __init__(self, timeout: int) -> None: + """Initialize with a command timeout in seconds.""" + self._timeout = timeout + + @abstractmethod + def run( + self, + args: list[str], + stdin: Optional[str] = None, + ) -> subprocess.CompletedProcess[str]: + """Run a CLI command and return the completed process.""" + + @abstractmethod + def get_resource( + self, + resource_plural: str, + name: str, + ) -> tuple[dict[str, Any], Optional[str]]: + """Fetch a single resource by name. + + Returns: + Tuple of (resource_dict, error_message). On success the dict is + the full JSON object; on failure the dict is empty. + """ + + @abstractmethod + def apply( + self, + manifest: dict[str, Any], + ) -> subprocess.CompletedProcess[str]: + """Apply a manifest via stdin.""" + + @abstractmethod + def delete(self, resource_plural: str, name: str) -> None: + """Delete a resource by name (idempotent).""" + + +class KubeCLI(CLIClient): + """Concrete CLIClient backed by oc or kubectl.""" + + def __init__(self, cli_path: str, namespace: str, timeout: int) -> None: + """Initialize with a resolved binary path and target namespace.""" + super().__init__(timeout) + self._cli = cli_path + self._namespace = namespace + + def run( + self, + args: list[str], + stdin: Optional[str] = None, + ) -> subprocess.CompletedProcess[str]: + """Run a CLI command and return the completed process.""" + try: + return subprocess.run( + [self._cli, *args], + input=stdin, + text=True, + capture_output=True, + env=os.environ.copy(), + timeout=self._timeout, + check=False, + ) + except subprocess.TimeoutExpired: + cmd_str = " ".join([self._cli, *args]) + return subprocess.CompletedProcess( + args=[self._cli, *args], + returncode=1, + stdout="", + stderr=f"Command timed out after {self._timeout}s: {cmd_str}", + ) + + def get_resource( + self, + resource_plural: str, + name: str, + ) -> tuple[dict[str, Any], Optional[str]]: + """Fetch a single resource by name. + + Returns: + Tuple of (resource_dict, error_message). On success the dict is + the full JSON object; on failure the dict is empty. + """ + result = self.run( + ["get", resource_plural, name, "-n", self._namespace, "-o", "json"] + ) + if result.returncode != 0: + return ( + {}, + f"Failed to get {resource_plural}/{name}: " f"{result.stderr.strip()}", + ) + try: + return json.loads(result.stdout), None + except json.JSONDecodeError as exc: + return ( + {}, + f"Failed to parse JSON for {resource_plural}/{name}: {exc}", + ) + + def apply( + self, + manifest: dict[str, Any], + ) -> subprocess.CompletedProcess[str]: + """Apply a manifest via stdin.""" + return self.run(["apply", "-f", "-"], stdin=json.dumps(manifest)) + + def delete(self, resource_plural: str, name: str) -> None: + """Delete a resource by name (idempotent).""" + self.run( + [ + "delete", + resource_plural, + name, + "-n", + self._namespace, + "--ignore-not-found", + ] + ) diff --git a/src/lightspeed_evaluation/pipeline/evaluation/driver.py b/src/lightspeed_evaluation/pipeline/evaluation/driver.py index 1c46b947..8432967c 100644 --- a/src/lightspeed_evaluation/pipeline/evaluation/driver.py +++ b/src/lightspeed_evaluation/pipeline/evaluation/driver.py @@ -2,9 +2,7 @@ from __future__ import annotations -import json import logging -import os import re import shutil import subprocess @@ -15,17 +13,19 @@ from typing import Any, Optional, cast from lightspeed_evaluation.core.api import APIClient -from lightspeed_evaluation.core.metrics.custom.proposal_eval import ( - _derive_phase, -) from lightspeed_evaluation.core.models import ( APIConfig, HttpApiAgentConfig, ProposalAgentConfig, TurnData, ) +from lightspeed_evaluation.core.proposal import derive_phase from lightspeed_evaluation.core.system.exceptions import ConfigurationError from lightspeed_evaluation.pipeline.evaluation.amender import APIDataAmender +from lightspeed_evaluation.pipeline.evaluation.cli import KubeCLI +from lightspeed_evaluation.pipeline.evaluation.proposal_amender import ( + ProposalAmender, +) logger = logging.getLogger(__name__) @@ -140,6 +140,12 @@ def __init__(self, config: dict[str, Any], *, enabled: bool = True) -> None: """Initialize the proposal driver.""" super().__init__(config, enabled=enabled) self._cli = self._resolve_cli() + self._kube_cli = KubeCLI( + cli_path=self._cli, + namespace=self._config.namespace, + timeout=self._config.cli_timeout, + ) + self._amender = ProposalAmender(self._kube_cli) def validate_config(self, config: dict[str, Any]) -> ProposalAgentConfig: """Validate proposal driver configuration.""" @@ -205,75 +211,64 @@ def execute_turn( None, ) - turn_data.response = self._extract_summary(status_dict) - turn_data.proposal_status = status_dict + self._amend_turn_data(turn_data, status_dict) self._cleanup(cr_name) + return self._outcome_to_result(outcome, cr_name) + + def _amend_turn_data( + self, turn_data: TurnData, status_dict: dict[str, Any] + ) -> None: + """Amend turn data from proposal status, with fallback on amender failure.""" + amend_err = self._amender.amend(turn_data, status_dict) + if amend_err: + logger.warning("ProposalAmender failed: %s", amend_err) + if not turn_data.response: + turn_data.response = self._extract_summary(status_dict) + if not turn_data.proposal_status: + turn_data.proposal_status = status_dict + + # Failed/Escalated → error (pipeline marks remaining turns as ERROR, + # metrics are NOT evaluated). Denied/Completed → no error (metrics run). + _OUTCOME_ERRORS: dict[TerminalOutcome, str] = { + TerminalOutcome.FAILED: "Proposal '{cr_name}' execution failed", + TerminalOutcome.ESCALATED: ( + "Proposal '{cr_name}' escalated after verification failure" + ), + } - if outcome == TerminalOutcome.COMPLETED: - return (None, None) - return ( - f"Proposal '{cr_name}' terminated with outcome: {outcome}", - None, - ) + @staticmethod + def _outcome_to_result( + outcome: Optional[TerminalOutcome], cr_name: str + ) -> tuple[Optional[str], None]: + """Map a terminal outcome to an (error_message, None) result tuple.""" + template = ProposalDriver._OUTCOME_ERRORS.get(outcome) # type: ignore[arg-type] + if template: + return (template.format(cr_name=cr_name), None) + if outcome != TerminalOutcome.COMPLETED: + logger.warning( + "Proposal '%s' terminated with outcome: %s", cr_name, outcome + ) + return (None, None) @staticmethod def _resolve_cli() -> str: """Resolve oc or kubectl binary path.""" return shutil.which("oc") or shutil.which("kubectl") or "" - def _run_cli( - self, - args: list[str], - stdin: Optional[str] = None, - ) -> subprocess.CompletedProcess[str]: - """Run a CLI command and return the result.""" - return subprocess.run( - [self._cli, *args], - input=stdin, - text=True, - capture_output=True, - env=os.environ.copy(), - timeout=self._config.cli_timeout, - check=False, - ) - def _apply(self, manifest: dict[str, Any]) -> subprocess.CompletedProcess[str]: """Apply a CR manifest via stdin.""" - return self._run_cli(["apply", "-f", "-"], stdin=json.dumps(manifest)) + return self._kube_cli.apply(manifest) def _get_status(self, cr_name: str) -> tuple[dict[str, Any], Optional[str]]: """Get Proposal CR status.""" - result = self._run_cli( - [ - "get", - CRD_PLURAL, - cr_name, - "-n", - self._config.namespace, - "-o", - "json", - ] - ) - if result.returncode != 0: - return {}, f"Failed to get status for '{cr_name}': {result.stderr.strip()}" - try: - cr = json.loads(result.stdout) - except json.JSONDecodeError as exc: - return {}, f"Failed to parse status JSON for '{cr_name}': {exc}" + cr, err = self._kube_cli.get_resource(CRD_PLURAL, cr_name) + if err: + return {}, f"Failed to get status for '{cr_name}': {err}" return cr.get("status", {}), None def _delete(self, cr_name: str) -> None: """Delete a Proposal CR.""" - self._run_cli( - [ - "delete", - CRD_PLURAL, - cr_name, - "-n", - self._config.namespace, - "--ignore-not-found", - ] - ) + self._kube_cli.delete(CRD_PLURAL, cr_name) def _cleanup(self, cr_name: str) -> None: """Delete the Proposal CR if cleanup is enabled.""" @@ -385,12 +380,18 @@ def _is_terminal( conditions: list[dict[str, Any]], proposal_spec: dict[str, Any] ) -> Optional[TerminalOutcome]: """Check if conditions indicate a terminal state.""" - phase = _derive_phase(conditions, proposal_spec or None) + phase = derive_phase(conditions, proposal_spec or None) return ProposalDriver._PHASE_TO_OUTCOME.get(phase) @staticmethod def _extract_summary(status_dict: dict[str, Any]) -> str: - """Extract a human-readable summary from analysis results.""" + """Extract a human-readable summary from analysis results. + + Degraded fallback: only called when ProposalAmender.amend() fails. + Reads only condition messages, ignoring proposal_results and child + Result CRs (analysis/execution/verification). The full rich summary + is built by ProposalAmender.build_summary() in the happy path. + """ conditions = status_dict.get("conditions", []) messages = [c["message"] for c in conditions if c.get("message")] return "; ".join(messages) if messages else "No summary available" diff --git a/src/lightspeed_evaluation/pipeline/evaluation/proposal_amender.py b/src/lightspeed_evaluation/pipeline/evaluation/proposal_amender.py new file mode 100644 index 00000000..45b29ce6 --- /dev/null +++ b/src/lightspeed_evaluation/pipeline/evaluation/proposal_amender.py @@ -0,0 +1,261 @@ +"""ProposalAmender — fetches child Result CRs and enriches TurnData.""" + +from __future__ import annotations + +import logging +import subprocess +from typing import Any, Optional + +from lightspeed_evaluation.core.models import TurnData +from lightspeed_evaluation.pipeline.evaluation.cli import CLIClient + +logger = logging.getLogger(__name__) + +STEP_RESOURCES: dict[str, str] = { + "analysis": "analysisresults", + "execution": "executionresults", + "verification": "verificationresults", + "escalation": "escalationresults", +} + + +class ProposalAmender: + """Fetches child Result CRs and enriches TurnData with structured results and summary.""" + + def __init__(self, cli_client: CLIClient) -> None: + """Initialize with a CLIClient for fetching child CRs.""" + self._cli = cli_client + + def amend( + self, turn_data: TurnData, proposal_status: dict[str, Any] + ) -> Optional[str]: + """Amend turn_data in-place with proposal results and Markdown summary. + + Returns: + Error message on failure, None on success. + """ + try: + return self._do_amend(turn_data, proposal_status) + except (KeyError, TypeError, ValueError, subprocess.SubprocessError) as exc: + return f"ProposalAmender error: {exc}" + + def _do_amend( + self, turn_data: TurnData, proposal_status: dict[str, Any] + ) -> Optional[str]: + """Internal amend logic.""" + turn_data.proposal_status = proposal_status + + steps = proposal_status.get("steps", {}) + if not steps: + turn_data.proposal_results = {} + turn_data.proposal_phases = [] + turn_data.response = self.build_summary(turn_data, {}) + return None + + results: dict[str, list[dict[str, Any]]] = {} + for step_name, resource_plural in STEP_RESOURCES.items(): + step_data = steps.get(step_name) + if step_data is None: + continue + refs = step_data.get("results", []) + step_results: list[dict[str, Any]] = [] + for ref in refs: + ref_name = ref.get("name", "") + if not ref_name: + continue + cr, err = self._cli.get_resource(resource_plural, ref_name) + if err: + logger.warning( + "Failed to fetch %s/%s: %s", + resource_plural, + ref_name, + err, + ) + continue + status = cr.get("status", {}) + if status: + step_results.append(status) + results[step_name] = step_results + + turn_data.proposal_results = results + turn_data.proposal_phases = [ + step for step in STEP_RESOURCES if results.get(step) + ] + turn_data.response = self.build_summary(turn_data, results) + + return None + + @staticmethod + def build_summary( + turn_data: TurnData, + results: dict[str, list[dict[str, Any]]], + ) -> str: + """Build a Markdown workflow summary from structured results.""" + sections: list[str] = [] + + sections.append(f"## Request\n\n{turn_data.query or 'N/A'}") + + analysis_results = results.get("analysis", []) + if analysis_results: + sections.append(_build_analysis_section(analysis_results)) + + execution_results = results.get("execution", []) + if execution_results: + sections.append(_build_execution_section(execution_results)) + + verification_results = results.get("verification", []) + if verification_results: + sections.append(_build_verification_section(verification_results)) + + escalation_results = results.get("escalation", []) + if escalation_results: + sections.append(_build_escalation_section(escalation_results)) + + sections.append(_build_outcome_section(turn_data)) + + return "\n\n".join(sections) + + +def _build_analysis_section(analysis_results: list[dict[str, Any]]) -> str: + """Build the Analysis section from AnalysisResult statuses.""" + lines: list[str] = ["## Analysis"] + for result_status in analysis_results: + options = result_status.get("options", []) + if not options: + failure = result_status.get("failureReason", "") + if failure: + lines.append(f"\n**Failed:** {failure}") + continue + lines.append(f"\n{len(options)} option(s) proposed") + for idx, option in enumerate(options): + label = "(Approved)" if idx == 0 else "" + title = option.get("title", "Untitled") + lines.append(f"\n### Option {idx} {label}: {title}".rstrip()) + _append_diagnosis(lines, option.get("diagnosis", {})) + _append_proposal(lines, option.get("proposal", {})) + + return "\n".join(lines) + + +def _append_diagnosis(lines: list[str], diagnosis: dict[str, Any]) -> None: + """Append diagnosis details to output lines.""" + if not diagnosis: + return + summary = diagnosis.get("summary", "") + confidence = diagnosis.get("confidence", "") + root_cause = diagnosis.get("rootCause", "") + if summary: + lines.append(f"**Diagnosis:** {summary} (Confidence: {confidence})") + if root_cause: + lines.append(f"**Root Cause:** {root_cause}") + + +def _append_proposal(lines: list[str], proposal: dict[str, Any]) -> None: + """Append proposal details to output lines.""" + if not proposal: + return + actions = proposal.get("actions", []) + if actions: + lines.append("**Proposed Actions:**") + for i, action in enumerate(actions, 1): + a_type = action.get("type", "") + a_desc = action.get("description", "") + lines.append(f"{i}. [{a_type}] {a_desc}") + risk = proposal.get("risk", "") + reversible = proposal.get("reversible", "") + if risk or reversible: + lines.append(f"**Risk:** {risk} | **Reversible:** {reversible}") + impact = proposal.get("estimatedImpact", "") + if impact: + lines.append(f"**Estimated Impact:** {impact}") + + +def _build_execution_section(execution_results: list[dict[str, Any]]) -> str: + """Build the Execution section from ExecutionResult statuses.""" + lines: list[str] = ["## Execution"] + for result_status in execution_results: + failure = result_status.get("failureReason", "") + if failure: + lines.append(f"\n**Failed:** {failure}") + continue + actions = result_status.get("actionsTaken", []) + if actions: + lines.append("\n**Actions Taken:**") + for i, action in enumerate(actions, 1): + lines.append(_format_execution_action(i, action)) + verification = result_status.get("verification", {}) + if verification: + condition_outcome = verification.get("conditionOutcome", "") + ver_summary = verification.get("summary", "") + lines.append(f"**Post-execution:** {condition_outcome} — {ver_summary}") + return "\n".join(lines) + + +def _format_execution_action(index: int, action: dict[str, Any]) -> str: + """Format a single execution action line.""" + a_type = action.get("type", "") + a_desc = action.get("description", "") + outcome = action.get("outcome", "") + line = f"{index}. [{a_type}] {a_desc} → {outcome}" + output = action.get("output", "") + if output: + line += f"\n Output: {output}" + error = action.get("error", "") + if error: + line += f"\n Error: {error}" + return line + + +def _build_verification_section( + verification_results: list[dict[str, Any]], +) -> str: + """Build the Verification section from VerificationResult statuses.""" + lines: list[str] = ["## Verification"] + for result_status in verification_results: + failure = result_status.get("failureReason", "") + if failure: + lines.append(f"\n**Failed:** {failure}") + continue + checks = result_status.get("checks", []) + if checks: + lines.append("\n**Checks:**") + for check in checks: + name = check.get("name", "") + source = check.get("source", "") + value = check.get("value", "") + check_result = check.get("result", "") + lines.append(f"- {name} ({source}): {value} → {check_result}") + ver_summary = result_status.get("summary", "") + if ver_summary: + lines.append(f"**Summary:** {ver_summary}") + return "\n".join(lines) + + +def _build_escalation_section( + escalation_results: list[dict[str, Any]], +) -> str: + """Build the Escalation section from EscalationResult statuses.""" + lines: list[str] = ["## Escalation"] + for result_status in escalation_results: + esc_summary = result_status.get("summary", "") + if esc_summary: + lines.append(f"\n**Summary:** {esc_summary}") + content = result_status.get("content", "") + if content: + lines.append(f"\n{content}") + failure = result_status.get("failureReason", "") + if failure: + lines.append(f"\n**Failed:** {failure}") + return "\n".join(lines) + + +def _build_outcome_section(turn_data: TurnData) -> str: + """Build the Outcome section from proposal_status conditions.""" + conditions = [] + if turn_data.proposal_status: + conditions = turn_data.proposal_status.get("conditions", []) + if not conditions: + return "## Outcome\n\nNo conditions available" + messages = [c.get("message", "") for c in conditions if c.get("message")] + summary = "; ".join(messages) if messages else "No summary available" + return f"## Outcome\n\n{summary}" diff --git a/tests/integration/agentic/fixtures/crashloop-probe-demo.yaml b/tests/integration/agentic/fixtures/crashloop-probe-demo.yaml new file mode 100644 index 00000000..0c4622bc --- /dev/null +++ b/tests/integration/agentic/fixtures/crashloop-probe-demo.yaml @@ -0,0 +1,39 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: crashloop-probe-demo + namespace: lightspeed-evaluation-test +spec: + replicas: 1 + selector: + matchLabels: + app: crashloop-probe-demo + template: + metadata: + labels: + app: crashloop-probe-demo + spec: + containers: + - name: nginx + image: registry.access.redhat.com/ubi9/nginx-124:latest + ports: + - containerPort: 8080 + livenessProbe: + httpGet: + path: /nonexistent-health + port: 8080 + initialDelaySeconds: 5 + periodSeconds: 3 + failureThreshold: 2 + securityContext: + allowPrivilegeEscalation: false + runAsNonRoot: true + capabilities: + drop: ["ALL"] + resources: + requests: + memory: "64Mi" + cpu: "50m" + limits: + memory: "128Mi" + cpu: "100m" diff --git a/tests/integration/agentic/scripts/_cleanup_infra-claude-vertex.sh b/tests/integration/agentic/scripts/_cleanup_infra-claude-vertex.sh new file mode 100644 index 00000000..7edde567 --- /dev/null +++ b/tests/integration/agentic/scripts/_cleanup_infra-claude-vertex.sh @@ -0,0 +1,17 @@ +#!/usr/bin/env bash +# Common infrastructure cleanup for Claude/Vertex AI provider. +# Sourced by per-scenario cleanup scripts — do NOT run directly. +# +# Deletes "eval-" prefixed operator resources (reverse order of creation) +# and the test namespace. + +OPERATOR_NS="${OPERATOR_NS:-openshift-lightspeed}" +TEST_NS="${TEST_NS:-lightspeed-evaluation-test}" + +oc delete proposals --all -n "$TEST_NS" --ignore-not-found +oc delete proposalapprovals --all -n "$TEST_NS" --ignore-not-found +oc delete sandboxtemplate eval-lightspeed-agent -n "$OPERATOR_NS" --ignore-not-found +oc delete agent eval-default --ignore-not-found +oc delete llmprovider eval-vertex-ai --ignore-not-found +oc delete secret eval-llm-credentials -n "$OPERATOR_NS" --ignore-not-found +oc delete namespace "$TEST_NS" --ignore-not-found diff --git a/tests/integration/agentic/scripts/_cleanup_infra-openai.sh b/tests/integration/agentic/scripts/_cleanup_infra-openai.sh new file mode 100644 index 00000000..8bc62214 --- /dev/null +++ b/tests/integration/agentic/scripts/_cleanup_infra-openai.sh @@ -0,0 +1,15 @@ +#!/usr/bin/env bash +# Common infrastructure cleanup for OpenAI provider. +# Sourced by per-scenario cleanup scripts — do NOT run directly. +# +# Deletes "eval-" prefixed operator resources (reverse order of creation). + +OPERATOR_NS="${OPERATOR_NS:-openshift-lightspeed}" +TEST_NS="${TEST_NS:-lightspeed-evaluation-test}" + +oc delete proposals --all -n "$TEST_NS" --ignore-not-found +oc delete proposalapprovals --all -n "$TEST_NS" --ignore-not-found +oc delete sandboxtemplate eval-lightspeed-agent -n "$OPERATOR_NS" --ignore-not-found +oc delete agent eval-default --ignore-not-found +oc delete llmprovider eval-openai --ignore-not-found +oc delete secret eval-llm-credentials -n "$OPERATOR_NS" --ignore-not-found diff --git a/tests/integration/agentic/scripts/setup_proposal_fixtures-claude-vertex.sh b/tests/integration/agentic/scripts/_setup_infra-claude-vertex.sh old mode 100755 new mode 100644 similarity index 80% rename from tests/integration/agentic/scripts/setup_proposal_fixtures-claude-vertex.sh rename to tests/integration/agentic/scripts/_setup_infra-claude-vertex.sh index 70689a06..971839aa --- a/tests/integration/agentic/scripts/setup_proposal_fixtures-claude-vertex.sh +++ b/tests/integration/agentic/scripts/_setup_infra-claude-vertex.sh @@ -1,9 +1,9 @@ #!/usr/bin/env bash -set -euo pipefail - -# Deploy agentic infrastructure + OOMKill test workload for integration tests. -# All operator-level resources use an "eval-" prefix to avoid conflicts with -# existing cluster resources. +# Common infrastructure setup for Claude via Vertex AI. +# Sourced by per-scenario setup scripts — do NOT run directly. +# +# Deploys: namespace, Secret, LLMProvider, Agent, SandboxTemplate. +# All operator-level resources use an "eval-" prefix. # # Required env vars: # GCP_CREDENTIALS_FILE — path to GCP credentials JSON file @@ -15,8 +15,10 @@ set -euo pipefail # CLOUD_ML_REGION — default: global # AGENT_MODEL — default: claude-opus-4-6 -OPERATOR_NS="openshift-lightspeed" -TEST_NS="lightspeed-evaluation-test" +set -euo pipefail + +export OPERATOR_NS="openshift-lightspeed" +export TEST_NS="lightspeed-evaluation-test" SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" GCP_CREDENTIALS_FILE="${GCP_CREDENTIALS_FILE:-$HOME/.config/gcloud/application_default_credentials.json}" CLOUD_ML_REGION="${CLOUD_ML_REGION:-global}" @@ -34,19 +36,17 @@ if [ ! -f "$GCP_CREDENTIALS_FILE" ]; then exit 1 fi -# 1. Test namespace + OOMKill workload (static fixtures, already test-scoped) +# 1. Test namespace oc apply -f "$SCRIPT_DIR/../fixtures/namespace.yaml" -oc apply -f "$SCRIPT_DIR/../fixtures/oomkill-demo.yaml" - -# 2. Secret (GCP credentials) — prefixed name in operator namespace +# 2. Secret (GCP credentials) oc create secret generic eval-llm-credentials \ --from-file=credentials.json="$GCP_CREDENTIALS_FILE" \ --from-literal=ANTHROPIC_VERTEX_PROJECT_ID="$ANTHROPIC_VERTEX_PROJECT_ID" \ --from-literal=CLOUD_ML_REGION="$CLOUD_ML_REGION" \ -n "$OPERATOR_NS" --dry-run=client -o yaml | oc apply -f - -# 3. LLMProvider — references prefixed secret +# 3. LLMProvider cat </dev/null || true -sleep 10 -echo "Setup complete." +echo "Infrastructure setup complete (Claude/Vertex AI)." diff --git a/tests/integration/agentic/scripts/setup_proposal_fixtures-openai.sh b/tests/integration/agentic/scripts/_setup_infra-openai.sh old mode 100755 new mode 100644 similarity index 67% rename from tests/integration/agentic/scripts/setup_proposal_fixtures-openai.sh rename to tests/integration/agentic/scripts/_setup_infra-openai.sh index 34110ca1..dac09a7e --- a/tests/integration/agentic/scripts/setup_proposal_fixtures-openai.sh +++ b/tests/integration/agentic/scripts/_setup_infra-openai.sh @@ -1,10 +1,9 @@ #!/usr/bin/env bash -set -euo pipefail - -# Deploy agentic infrastructure + OOMKill test workload for integration tests -# using OpenAI as the LLM provider. -# All operator-level resources use an "eval-" prefix to avoid conflicts with -# existing cluster resources. +# Common infrastructure setup for OpenAI provider. +# Sourced by per-scenario setup scripts — do NOT run directly. +# +# Deploys: namespace, Secret, LLMProvider, Agent, SandboxTemplate. +# All operator-level resources use an "eval-" prefix. # # Required env vars: # OPENAI_API_KEY — OpenAI API key @@ -12,13 +11,11 @@ set -euo pipefail # # Optional env vars: # AGENT_MODEL — default: gpt-5.2 -# -# To test with Claude via Vertex AI instead, use -# setup_proposal_fixtures-claude-vertex.sh and set the corresponding env vars -# (GCP_CREDENTIALS_FILE, ANTHROPIC_VERTEX_PROJECT_ID, SANDBOX_IMAGE). -OPERATOR_NS="openshift-lightspeed" -TEST_NS="lightspeed-evaluation-test" +set -euo pipefail + +export OPERATOR_NS="openshift-lightspeed" +export TEST_NS="lightspeed-evaluation-test" AGENT_MODEL="${AGENT_MODEL:-gpt-5.2}" SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" @@ -30,13 +27,10 @@ for var in OPENAI_API_KEY SANDBOX_IMAGE; do fi done -# 1. Test namespace + OOMKill workload (static fixtures, already test-scoped) +# 1. Test namespace oc apply -f "$SCRIPT_DIR/../fixtures/namespace.yaml" -oc apply -f "$SCRIPT_DIR/../fixtures/oomkill-demo.yaml" -# 2. Secret (OpenAI API key + provider override) — prefixed name in operator namespace -# LIGHTSPEED_AGENT_PROVIDER and OPENAI_MODEL are injected via envFrom so the -# sandbox picks the OpenAI provider instead of defaulting to claude. +# 2. Secret (OpenAI API key + provider override) cat </dev/null || true -sleep 10 -echo "Setup complete." +echo "Infrastructure setup complete (OpenAI)." diff --git a/tests/integration/agentic/scripts/cleanup_crashloop_probe-claude-vertex.sh b/tests/integration/agentic/scripts/cleanup_crashloop_probe-claude-vertex.sh new file mode 100755 index 00000000..552f868d --- /dev/null +++ b/tests/integration/agentic/scripts/cleanup_crashloop_probe-claude-vertex.sh @@ -0,0 +1,15 @@ +#!/usr/bin/env bash +set -euo pipefail + +# Tear down CrashLoopBackOff (liveness probe) test workload and Claude/Vertex AI infrastructure. + +export OPERATOR_NS="openshift-lightspeed" +export TEST_NS="lightspeed-evaluation-test" +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +echo "Cleaning up CrashLoopBackOff probe integration test resources..." +oc delete deployment crashloop-probe-demo -n "$TEST_NS" --ignore-not-found + +# shellcheck source=_cleanup_infra-claude-vertex.sh +source "$SCRIPT_DIR/_cleanup_infra-claude-vertex.sh" +echo "Cleanup complete." diff --git a/tests/integration/agentic/scripts/cleanup_crashloop_probe-openai.sh b/tests/integration/agentic/scripts/cleanup_crashloop_probe-openai.sh new file mode 100755 index 00000000..09adcfcf --- /dev/null +++ b/tests/integration/agentic/scripts/cleanup_crashloop_probe-openai.sh @@ -0,0 +1,15 @@ +#!/usr/bin/env bash +set -euo pipefail + +# Tear down CrashLoopBackOff (liveness probe) test workload and OpenAI provider infrastructure. + +export OPERATOR_NS="openshift-lightspeed" +export TEST_NS="lightspeed-evaluation-test" +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +echo "Cleaning up CrashLoopBackOff probe integration test resources..." +oc delete deployment crashloop-probe-demo -n "$TEST_NS" --ignore-not-found + +# shellcheck source=_cleanup_infra-openai.sh +source "$SCRIPT_DIR/_cleanup_infra-openai.sh" +echo "Cleanup complete." diff --git a/tests/integration/agentic/scripts/cleanup_oomkill-claude-vertex.sh b/tests/integration/agentic/scripts/cleanup_oomkill-claude-vertex.sh new file mode 100755 index 00000000..65eb9879 --- /dev/null +++ b/tests/integration/agentic/scripts/cleanup_oomkill-claude-vertex.sh @@ -0,0 +1,15 @@ +#!/usr/bin/env bash +set -euo pipefail + +# Tear down OOMKill test workload and Claude/Vertex AI provider infrastructure. + +export OPERATOR_NS="openshift-lightspeed" +export TEST_NS="lightspeed-evaluation-test" +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +echo "Cleaning up OOMKill integration test resources..." +oc delete deployment oomkill-demo -n "$TEST_NS" --ignore-not-found + +# shellcheck source=_cleanup_infra-claude-vertex.sh +source "$SCRIPT_DIR/_cleanup_infra-claude-vertex.sh" +echo "Cleanup complete." diff --git a/tests/integration/agentic/scripts/cleanup_oomkill-openai.sh b/tests/integration/agentic/scripts/cleanup_oomkill-openai.sh new file mode 100755 index 00000000..6729724c --- /dev/null +++ b/tests/integration/agentic/scripts/cleanup_oomkill-openai.sh @@ -0,0 +1,15 @@ +#!/usr/bin/env bash +set -euo pipefail + +# Tear down OOMKill test workload and OpenAI provider infrastructure. + +export OPERATOR_NS="openshift-lightspeed" +export TEST_NS="lightspeed-evaluation-test" +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +echo "Cleaning up OOMKill integration test resources..." +oc delete deployment oomkill-demo -n "$TEST_NS" --ignore-not-found + +# shellcheck source=_cleanup_infra-openai.sh +source "$SCRIPT_DIR/_cleanup_infra-openai.sh" +echo "Cleanup complete." diff --git a/tests/integration/agentic/scripts/cleanup_proposal_fixtures-claude-vertex.sh b/tests/integration/agentic/scripts/cleanup_proposal_fixtures-claude-vertex.sh deleted file mode 100755 index 4f84d111..00000000 --- a/tests/integration/agentic/scripts/cleanup_proposal_fixtures-claude-vertex.sh +++ /dev/null @@ -1,26 +0,0 @@ -#!/usr/bin/env bash -set -euo pipefail - -# Tear down integration test resources. Only deletes "eval-" prefixed operator -# resources to avoid touching anything that isn't ours. - -OPERATOR_NS="openshift-lightspeed" -TEST_NS="lightspeed-evaluation-test" - -echo "Cleaning up integration test resources..." - -# Delete test workload + any leftover Proposals in test namespace -oc delete deployment oomkill-demo -n "$TEST_NS" --ignore-not-found -oc delete proposals --all -n "$TEST_NS" --ignore-not-found -oc delete proposalapprovals --all -n "$TEST_NS" --ignore-not-found - -# Delete prefixed operator resources (reverse order of creation) -oc delete sandboxtemplate eval-lightspeed-agent -n "$OPERATOR_NS" --ignore-not-found -oc delete agent eval-default --ignore-not-found -oc delete llmprovider eval-vertex-ai --ignore-not-found -oc delete secret eval-llm-credentials -n "$OPERATOR_NS" --ignore-not-found - -# Delete test namespace (cascades remaining namespaced resources) -oc delete namespace "$TEST_NS" --ignore-not-found - -echo "Cleanup complete." diff --git a/tests/integration/agentic/scripts/cleanup_proposal_fixtures-openai.sh b/tests/integration/agentic/scripts/cleanup_proposal_fixtures-openai.sh deleted file mode 100755 index 550b95d5..00000000 --- a/tests/integration/agentic/scripts/cleanup_proposal_fixtures-openai.sh +++ /dev/null @@ -1,22 +0,0 @@ -#!/usr/bin/env bash -set -euo pipefail - -# Tear down integration test resources deployed by setup_proposal_fixtures.sh -# (OpenAI provider). Only deletes "eval-" prefixed operator resources to avoid -# touching anything that isn't ours. - -OPERATOR_NS="openshift-lightspeed" -TEST_NS="lightspeed-evaluation-test" - -echo "Cleaning up integration test resources..." - -# Delete test workload (Proposal cleanup is handled by the driver via cleanup_proposals) -oc delete deployment oomkill-demo -n "$TEST_NS" --ignore-not-found - -# Delete prefixed operator resources (reverse order of creation) -oc delete sandboxtemplate eval-lightspeed-agent -n "$OPERATOR_NS" --ignore-not-found -oc delete agent eval-default --ignore-not-found -oc delete llmprovider eval-openai --ignore-not-found -oc delete secret eval-llm-credentials -n "$OPERATOR_NS" --ignore-not-found - -echo "Cleanup complete." diff --git a/tests/integration/agentic/scripts/setup_crashloop_probe-claude-vertex.sh b/tests/integration/agentic/scripts/setup_crashloop_probe-claude-vertex.sh new file mode 100755 index 00000000..71677724 --- /dev/null +++ b/tests/integration/agentic/scripts/setup_crashloop_probe-claude-vertex.sh @@ -0,0 +1,17 @@ +#!/usr/bin/env bash +set -euo pipefail + +# Deploy CrashLoopBackOff (misconfigured liveness probe) test workload +# with Claude/Vertex AI provider infrastructure. + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +# shellcheck source=_setup_infra-claude-vertex.sh +source "$SCRIPT_DIR/_setup_infra-claude-vertex.sh" + +oc apply -f "$SCRIPT_DIR/../fixtures/crashloop-probe-demo.yaml" + +echo "Waiting for crashloop-probe-demo pod to appear..." +oc wait --for=condition=Available=false deployment/crashloop-probe-demo \ + -n "$TEST_NS" --timeout=60s 2>/dev/null || true +sleep 10 +echo "Setup complete." diff --git a/tests/integration/agentic/scripts/setup_crashloop_probe-openai.sh b/tests/integration/agentic/scripts/setup_crashloop_probe-openai.sh new file mode 100755 index 00000000..0383f79c --- /dev/null +++ b/tests/integration/agentic/scripts/setup_crashloop_probe-openai.sh @@ -0,0 +1,17 @@ +#!/usr/bin/env bash +set -euo pipefail + +# Deploy CrashLoopBackOff (misconfigured liveness probe) test workload +# with OpenAI provider infrastructure. + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +# shellcheck source=_setup_infra-openai.sh +source "$SCRIPT_DIR/_setup_infra-openai.sh" + +oc apply -f "$SCRIPT_DIR/../fixtures/crashloop-probe-demo.yaml" + +echo "Waiting for crashloop-probe-demo pod to appear..." +oc wait --for=condition=Available=false deployment/crashloop-probe-demo \ + -n "$TEST_NS" --timeout=60s 2>/dev/null +sleep 10 +echo "Setup complete." diff --git a/tests/integration/agentic/scripts/setup_oomkill-claude-vertex.sh b/tests/integration/agentic/scripts/setup_oomkill-claude-vertex.sh new file mode 100755 index 00000000..16e4b98a --- /dev/null +++ b/tests/integration/agentic/scripts/setup_oomkill-claude-vertex.sh @@ -0,0 +1,16 @@ +#!/usr/bin/env bash +set -euo pipefail + +# Deploy OOMKill test workload with Claude/Vertex AI provider infrastructure. + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +# shellcheck source=_setup_infra-claude-vertex.sh +source "$SCRIPT_DIR/_setup_infra-claude-vertex.sh" + +oc apply -f "$SCRIPT_DIR/../fixtures/oomkill-demo.yaml" + +echo "Waiting for oomkill-demo pod to appear..." +oc wait --for=condition=Available=false deployment/oomkill-demo \ + -n "$TEST_NS" --timeout=60s 2>/dev/null || true +sleep 10 +echo "Setup complete." diff --git a/tests/integration/agentic/scripts/setup_oomkill-openai.sh b/tests/integration/agentic/scripts/setup_oomkill-openai.sh new file mode 100755 index 00000000..1080d733 --- /dev/null +++ b/tests/integration/agentic/scripts/setup_oomkill-openai.sh @@ -0,0 +1,33 @@ +#!/usr/bin/env bash +set -euo pipefail + +# Deploy OOMKill test workload with OpenAI provider infrastructure. + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +# shellcheck source=_setup_infra-openai.sh +source "$SCRIPT_DIR/_setup_infra-openai.sh" + +oc apply -f "$SCRIPT_DIR/../fixtures/oomkill-demo.yaml" + +echo "Waiting for oomkill-demo pod to reach OOMKilled or CrashLoopBackOff..." +TIMEOUT=120 +INTERVAL=5 +ELAPSED=0 +while [ "$ELAPSED" -lt "$TIMEOUT" ]; do + TERMINATED_REASON=$(oc get pod -l app=oomkill-demo -n "$TEST_NS" \ + -o jsonpath='{.items[0].status.containerStatuses[0].state.terminated.reason}' 2>/dev/null || true) + WAITING_REASON=$(oc get pod -l app=oomkill-demo -n "$TEST_NS" \ + -o jsonpath='{.items[0].status.containerStatuses[0].state.waiting.reason}' 2>/dev/null || true) + + if [ "$TERMINATED_REASON" = "OOMKilled" ] || [ "$WAITING_REASON" = "CrashLoopBackOff" ]; then + echo "Pod reached expected state (terminated=$TERMINATED_REASON, waiting=$WAITING_REASON)." + echo "Setup complete." + exit 0 + fi + + sleep "$INTERVAL" + ELAPSED=$((ELAPSED + INTERVAL)) +done + +echo "ERROR: Timed out after ${TIMEOUT}s waiting for OOMKilled or CrashLoopBackOff." >&2 +exit 1 diff --git a/tests/integration/system-config-agents-proposal.yaml b/tests/integration/system-config-agents-proposal.yaml index d0902a23..0fb0db82 100644 --- a/tests/integration/system-config-agents-proposal.yaml +++ b/tests/integration/system-config-agents-proposal.yaml @@ -1,3 +1,6 @@ +logging: + source_level: DEBUG + core: max_threads: 1 fail_on_invalid_data: true @@ -12,7 +15,7 @@ agents: namespace: lightspeed-evaluation-test auto_approve: true cleanup_proposals: true - timeout: 900 + timeout: 1200 poll_interval: 5 storage: diff --git a/tests/integration/test_evaluation_data_proposal.yaml b/tests/integration/test_evaluation_data_proposal.yaml index 27456b68..e597ecf2 100644 --- a/tests/integration/test_evaluation_data_proposal.yaml +++ b/tests/integration/test_evaluation_data_proposal.yaml @@ -1,13 +1,42 @@ -- conversation_group_id: proposal_full_lifecycle - description: Full lifecycle — analysis, execution, verification - tag: proposal - setup_script: agentic/scripts/setup_proposal_fixtures-openai.sh - cleanup_script: agentic/scripts/cleanup_proposal_fixtures-openai.sh +- conversation_group_id: proposal_analysis_only + description: Analysis-only — no execution or verification (query from proposal_spec.request) + tag: proposal_analysis_only + setup_script: agentic/scripts/setup_oomkill-openai.sh + cleanup_script: agentic/scripts/cleanup_oomkill-openai.sh conversation_metrics: [] conversation_metrics_metadata: {} turns: - turn_id: turn_1 response: null + proposal_spec: + request: >- + A pod named oomkill-demo in namespace lightspeed-evaluation-test + is in CrashLoopBackOff. Analyze the root cause. + targetNamespaces: + - lightspeed-evaluation-test + tools: + skills: + # TODO(rioloc): replace with a stable image + - image: quay.io/harpatil/agentic-skills:latest + paths: + - /skills/find-token + analysis: + agent: eval-default + expected_proposal_status: + phase: Completed + turn_metrics: + - custom:proposal_status + turn_metrics_metadata: {} + +- conversation_group_id: proposal_oomkill_openai + description: OOMKill full lifecycle — status check + LLM-as-judge evaluation + tag: proposal_oomkill + setup_script: agentic/scripts/setup_oomkill-openai.sh + cleanup_script: agentic/scripts/cleanup_oomkill-openai.sh + conversation_metrics: [] + conversation_metrics_metadata: {} + turns: + - turn_id: turn_1 proposal_spec: request: >- A pod named oomkill-demo in namespace lightspeed-evaluation-test @@ -29,24 +58,42 @@ agent: eval-default expected_proposal_status: phase: Completed + expected_outcome: >- + Root cause: the pod oomkill-demo is OOMKilled because its container + memory limit is too low. Remediation: increase the container memory + limit (e.g. to 512Mi) and verify the pod reaches Running state + without further OOMKill events. + expected_analysis_outcome: >- + The agent should identify that the pod oomkill-demo is being + OOMKilled because its container memory limit is set too low + (e.g. 128Mi), causing the kernel to terminate the process. + expected_execution_outcome: >- + The agent should increase the container memory limit (e.g. to + 512Mi) by patching the pod or its owning workload resource. + expected_verification_outcome: >- + The agent should verify the pod reaches Running state without + further OOMKill events. turn_metrics: - custom:proposal_status - turn_metrics_metadata: {} + - custom:proposal_evaluation_correctness + turn_metrics_metadata: + "custom:proposal_evaluation_correctness": + threshold: 0.75 -- conversation_group_id: proposal_analysis_only - description: Analysis-only — no execution or verification (query from proposal_spec.request) - tag: proposal - setup_script: agentic/scripts/setup_proposal_fixtures-openai.sh - cleanup_script: agentic/scripts/cleanup_proposal_fixtures-openai.sh +- conversation_group_id: proposal_oomkill_claude-vertex + description: Judge evaluation — LLM-as-judge on full lifecycle workflow + tag: proposal_oomkill + setup_script: agentic/scripts/setup_oomkill-claude-vertex.sh + cleanup_script: agentic/scripts/cleanup_oomkill-claude-vertex.sh conversation_metrics: [] conversation_metrics_metadata: {} turns: - turn_id: turn_1 - response: null proposal_spec: request: >- A pod named oomkill-demo in namespace lightspeed-evaluation-test - is in CrashLoopBackOff. Analyze the root cause. + is in CrashLoopBackOff due to OOMKill. Analyze the root cause, + fix the memory configuration, and verify the fix. targetNamespaces: - lightspeed-evaluation-test tools: @@ -57,8 +104,115 @@ - /skills/find-token analysis: agent: eval-default + execution: + agent: eval-default + verification: + agent: eval-default + expected_outcome: >- + Root cause: the pod oomkill-demo is OOMKilled because its container + memory limit is too low. Remediation: increase the container memory + limit (e.g. to 512Mi) and verify the pod reaches Running state + without further OOMKill events. + expected_analysis_outcome: >- + The agent should identify that the pod oomkill-demo is being + OOMKilled because its container memory limit is set too low + (e.g. 128Mi), causing the kernel to terminate the process. + expected_execution_outcome: >- + The agent should increase the container memory limit (e.g. to + 512Mi) by patching the pod or its owning workload resource. + expected_verification_outcome: >- + The agent should verify the pod reaches Running state without + further OOMKill events. + turn_metrics: + - custom:proposal_evaluation_correctness + turn_metrics_metadata: + "custom:proposal_evaluation_correctness": + threshold: 0.75 + +- conversation_group_id: proposal_crashloop_probe_openai + description: CrashLoopBackOff caused by misconfigured liveness probe — full lifecycle + tag: proposal_crashloop_probe + setup_script: agentic/scripts/setup_crashloop_probe-openai.sh + cleanup_script: agentic/scripts/cleanup_crashloop_probe-openai.sh + conversation_metrics: [] + conversation_metrics_metadata: {} + turns: + - turn_id: turn_1 + proposal_spec: + request: >- + A pod named crashloop-probe-demo in namespace lightspeed-evaluation-test + is in CrashLoopBackOff. The container keeps restarting. Analyze the + root cause, fix the issue, and verify the pod is stable. + targetNamespaces: + - lightspeed-evaluation-test + tools: + skills: + # TODO(rioloc): replace with a stable image + - image: quay.io/harpatil/agentic-skills:latest + paths: + - /skills/find-token + analysis: + agent: eval-default + execution: + agent: eval-default + verification: + agent: eval-default expected_proposal_status: phase: Completed + expected_outcome: >- + Root cause: the liveness probe on pod crashloop-probe-demo is + configured to check an endpoint that does not exist (/nonexistent-health), + causing the kubelet to kill and restart the container repeatedly. + Remediation: fix the liveness probe path to point to a valid health + endpoint (e.g. / or remove the probe) and verify the pod reaches + Running state without further restarts. turn_metrics: - custom:proposal_status - turn_metrics_metadata: {} + - custom:proposal_evaluation_correctness + turn_metrics_metadata: + "custom:proposal_evaluation_correctness": + threshold: 0.75 + +- conversation_group_id: proposal_crashloop_probe_claude-vertex + description: CrashLoopBackOff caused by misconfigured liveness probe — full lifecycle (Claude/Vertex) + tag: proposal_crashloop_probe + setup_script: agentic/scripts/setup_crashloop_probe-claude-vertex.sh + cleanup_script: agentic/scripts/cleanup_crashloop_probe-claude-vertex.sh + conversation_metrics: [] + conversation_metrics_metadata: {} + turns: + - turn_id: turn_1 + proposal_spec: + request: >- + A pod named crashloop-probe-demo in namespace lightspeed-evaluation-test + is in CrashLoopBackOff. The container keeps restarting. Analyze the + root cause, fix the issue, and verify the pod is stable. + targetNamespaces: + - lightspeed-evaluation-test + tools: + skills: + # TODO(rioloc): replace with a stable image + - image: quay.io/harpatil/agentic-skills:latest + paths: + - /skills/find-token + analysis: + agent: eval-default + execution: + agent: eval-default + verification: + agent: eval-default + expected_proposal_status: + phase: Completed + expected_outcome: >- + Root cause: the liveness probe on pod crashloop-probe-demo is + configured to check an endpoint that does not exist (/nonexistent-health), + causing the kubelet to kill and restart the container repeatedly. + Remediation: fix the liveness probe path to point to a valid health + endpoint (e.g. / or remove the probe) and verify the pod reaches + Running state without further restarts. + turn_metrics: + - custom:proposal_status + - custom:proposal_evaluation_correctness + turn_metrics_metadata: + "custom:proposal_evaluation_correctness": + threshold: 0.75 diff --git a/tests/integration/test_proposal_evaluation.py b/tests/integration/test_proposal_evaluation.py index 06238126..f30d9a15 100644 --- a/tests/integration/test_proposal_evaluation.py +++ b/tests/integration/test_proposal_evaluation.py @@ -8,10 +8,8 @@ - OPENAI_API_KEY, SANDBOX_IMAGE env vars set - Network connectivity to the cluster API -The default setup/cleanup scripts use OpenAI as the LLM provider. -To test with a different provider (e.g. Claude via Vertex AI), set the -corresponding env vars and point the eval data at the provider-specific -scripts (e.g. setup_proposal_fixtures-claude-vertex.sh). +Each scenario has its own setup/cleanup scripts that source a shared +infrastructure script per provider (e.g. _setup_infra-openai.sh). Run with: pytest tests/integration/test_proposal_evaluation.py -v -m agentic """ @@ -103,8 +101,8 @@ class TestProposalDriverEvaluation: """End-to-end tests for ProposalDriver evaluation pipeline.""" @pytest.mark.timeout(1200) - def test_full_lifecycle(self, tmp_path: Path) -> None: - """Test full Proposal lifecycle: analysis, execution, verification. + def test_oomkill_full_lifecycle(self, tmp_path: Path) -> None: + """Test OOMKill full lifecycle: status check + judge evaluation. Verifies: - Setup script deploys infrastructure and test workload @@ -126,9 +124,9 @@ def test_full_lifecycle(self, tmp_path: Path) -> None: ) all_data = validator.load_evaluation_data(str(PROPOSAL_EVAL_DATA_PATH)) eval_data = [ - d for d in all_data if d.conversation_group_id == "proposal_full_lifecycle" + d for d in all_data if d.conversation_group_id == "proposal_oomkill_openai" ] - assert len(eval_data) == 1, "Should find proposal_full_lifecycle data" + assert len(eval_data) == 1, "Should find proposal_oomkill_openai data" evaluate(system_config, eval_data) @@ -201,6 +199,40 @@ def test_analysis_only(self, tmp_path: Path) -> None: f"got reason={by_type['Verified'].get('reason')}" ) + @pytest.mark.timeout(1200) + def test_oomkill_claude_vertex(self, tmp_path: Path) -> None: + """Test OOMKill full lifecycle with Claude/Vertex AI provider. + + Verifies: + - ProposalDriver populates response with workflow summary + - custom:proposal_evaluation_correctness metric runs against response + - Pipeline completes without errors + """ + loader = ConfigLoader() + system_config = loader.load_system_config(str(PROPOSAL_CONFIG_PATH)) + system_config.storage = [ + FileBackendConfig(output_dir=str(tmp_path / "eval_output")) + ] + + validator = DataValidator( + api_enabled=True, + fail_on_invalid_data=system_config.core.fail_on_invalid_data, + ) + all_data = validator.load_evaluation_data(str(PROPOSAL_EVAL_DATA_PATH)) + eval_data = [ + d + for d in all_data + if d.conversation_group_id == "proposal_oomkill_claude-vertex" + ] + assert len(eval_data) == 1, "Should find proposal_oomkill_claude-vertex data" + + evaluate(system_config, eval_data) + + turn = eval_data[0].turns[0] + assert ( + turn.response and turn.response.strip() + ), "Response should be populated by ProposalDriver" + @pytest.mark.timeout(120) def test_timeout_handling(self, tmp_path: Path) -> None: """Test that a very short timeout is handled gracefully. diff --git a/tests/unit/core/metrics/custom/test_custom.py b/tests/unit/core/metrics/custom/test_custom.py index 51a002c7..9125c6d2 100644 --- a/tests/unit/core/metrics/custom/test_custom.py +++ b/tests/unit/core/metrics/custom/test_custom.py @@ -1,10 +1,15 @@ """Tests for custom metrics module.""" +# pylint: disable=protected-access + +import json + from pytest_mock import MockerFixture from lightspeed_evaluation.core.metrics.custom.custom import CustomMetrics from lightspeed_evaluation.core.metrics.manager import MetricLevel from lightspeed_evaluation.core.models import EvaluationScope, TurnData +from lightspeed_evaluation.core.system.exceptions import LLMError class TestCustomMetricsToolEval: @@ -175,3 +180,322 @@ def test_config_from_system_defaults_via_metric_manager( assert score == 1.0 assert "partial" in reason assert "unordered" in reason + + +def _make_custom_metrics(mocker: MockerFixture) -> CustomMetrics: + """Create a CustomMetrics instance with mocked LLM manager.""" + mock_llm_manager = mocker.Mock() + mock_llm_manager.get_model_name.return_value = "test-model" + mock_llm_manager.get_llm_params.return_value = {"parameters": {}} + return CustomMetrics(mock_llm_manager) + + +def _make_scope(turn_data: TurnData, is_conversation: bool = False) -> EvaluationScope: + """Create an EvaluationScope for a turn.""" + return EvaluationScope( + turn_idx=0, turn_data=turn_data, is_conversation=is_conversation + ) + + +METRIC_NAME = "proposal_evaluation_correctness" + +_LLM_RESPONSE_ALL_DIMS = json.dumps( + { + "reasoning": ( + "Diagnosis was accurate. Execution addressed root cause. " + "Verification confirmed fix." + ), + "diagnosis": 0.9, + "execution": 0.8, + "verification": 0.7, + "average": 0.80, + } +) + +_LLM_RESPONSE_NO_VERIFICATION = json.dumps( + { + "reasoning": "Diagnosis correct. Execution appropriate but no verification.", + "diagnosis": 0.9, + "execution": 0.8, + "verification": None, + "average": 0.85, + } +) + + +class TestParseProposalEvalResponse: + """Test _parse_proposal_eval_response parser.""" + + def test_all_dimensions(self, mocker: MockerFixture) -> None: + """Test parsing response with all three dimensions scored.""" + cm = _make_custom_metrics(mocker) + score, detail = cm._parse_proposal_eval_response(_LLM_RESPONSE_ALL_DIMS) + + assert score == 0.80 + assert "diagnosis=0.90" in detail + assert "execution=0.80" in detail + assert "verification=0.70" in detail + assert "avg=0.80" in detail + assert "Diagnosis was accurate" in detail + + def test_dimension_na(self, mocker: MockerFixture) -> None: + """Test parsing response with N/A dimension.""" + cm = _make_custom_metrics(mocker) + score, detail = cm._parse_proposal_eval_response(_LLM_RESPONSE_NO_VERIFICATION) + + assert score == 0.85 + assert "verification=N/A" in detail + assert "diagnosis=0.90" in detail + + def test_fallback_average_from_sub_scores(self, mocker: MockerFixture) -> None: + """Test average is computed from sub-scores when average key is missing.""" + cm = _make_custom_metrics(mocker) + response = json.dumps( + { + "reasoning": "ok", + "diagnosis": 0.9, + "execution": 0.7, + "verification": None, + } + ) + + score, detail = cm._parse_proposal_eval_response(response) + + assert score is not None + assert abs(score - 0.80) < 0.01 + assert "avg=0.80" in detail + + def test_invalid_json_returns_none(self, mocker: MockerFixture) -> None: + """Test that invalid JSON response returns None score.""" + cm = _make_custom_metrics(mocker) + score, detail = cm._parse_proposal_eval_response("I cannot evaluate this.") + + assert score is None + assert "Invalid JSON" in detail + + +class TestProposalEvaluationCorrectness: + """Test custom:proposal_evaluation_correctness metric.""" + + def test_returns_score_from_llm(self, mocker: MockerFixture) -> None: + """Test successful LLM evaluation returns parsed score.""" + cm = _make_custom_metrics(mocker) + mocker.patch.object(cm, "_call_llm", return_value=_LLM_RESPONSE_ALL_DIMS) + + turn = TurnData( + turn_id="t1", + query="Fix pod crash", + response="## Request\n\nFix pod crash\n\n## Analysis\n...", + expected_outcome="Root cause: OOMKilled. Remediation: increase memory limit.", + ) + score, reason = cm.evaluate(METRIC_NAME, None, _make_scope(turn)) + + assert score == 0.80 + assert "diagnosis=0.90" in reason + assert "avg=0.80" in reason + + def test_conversation_level_returns_none(self, mocker: MockerFixture) -> None: + """Test conversation-level scope returns None score.""" + cm = _make_custom_metrics(mocker) + turn = TurnData(turn_id="t1", query="q", response="r") + scope = _make_scope(turn, is_conversation=True) + + score, reason = cm.evaluate(METRIC_NAME, None, scope) + + assert score is None + assert "turn-level" in reason + + def test_missing_response_returns_none(self, mocker: MockerFixture) -> None: + """Test missing response returns None score.""" + cm = _make_custom_metrics(mocker) + turn = TurnData(turn_id="t1", query="Fix pod crash") + + score, reason = cm.evaluate(METRIC_NAME, None, _make_scope(turn)) + + assert score is None + assert "response" in reason.lower() + + def test_missing_expected_outcome_returns_none(self, mocker: MockerFixture) -> None: + """Test missing expected_outcome returns None score.""" + cm = _make_custom_metrics(mocker) + turn = TurnData( + turn_id="t1", + query="Fix pod crash", + response="## Analysis\nDiagnosis: memory too low", + ) + + score, reason = cm.evaluate(METRIC_NAME, None, _make_scope(turn)) + + assert score is None + assert "expected outcome" in reason.lower() + + def test_none_turn_data_returns_none(self, mocker: MockerFixture) -> None: + """Test None turn data returns None score.""" + cm = _make_custom_metrics(mocker) + scope = EvaluationScope(turn_idx=0, turn_data=None, is_conversation=False) + + score, reason = cm.evaluate(METRIC_NAME, None, scope) + + assert score is None + assert "TurnData" in reason + + def test_unparseable_llm_response(self, mocker: MockerFixture) -> None: + """Test unparseable LLM response returns None score.""" + cm = _make_custom_metrics(mocker) + mocker.patch.object(cm, "_call_llm", return_value="I cannot evaluate this.") + + turn = TurnData( + turn_id="t1", + query="q", + response="## Request\nq", + expected_outcome="Expected outcome", + ) + + score, reason = cm.evaluate(METRIC_NAME, None, _make_scope(turn)) + + assert score is None + assert "parse" in reason.lower() or "Could not" in reason + + def test_llm_error_handled(self, mocker: MockerFixture) -> None: + """Test LLM error is caught and returns None score.""" + cm = _make_custom_metrics(mocker) + mocker.patch.object(cm, "_call_llm", side_effect=LLMError("timeout")) + + turn = TurnData( + turn_id="t1", + query="q", + response="## Request\nq", + expected_outcome="Expected outcome", + ) + + score, reason = cm.evaluate(METRIC_NAME, None, _make_scope(turn)) + + assert score is None + assert "timeout" in reason + + def test_prompt_contains_query_response_and_expected( + self, mocker: MockerFixture + ) -> None: + """Test that the LLM prompt includes query, response, and expected.""" + cm = _make_custom_metrics(mocker) + call_spy = mocker.patch.object( + cm, "_call_llm", return_value=_LLM_RESPONSE_ALL_DIMS + ) + + turn = TurnData( + turn_id="t1", + query="Fix OOMKilled pod", + response="## Analysis\nDiagnosis: memory too low", + expected_outcome="Root cause: OOMKilled. Increase memory limit to 512Mi.", + ) + cm.evaluate(METRIC_NAME, None, _make_scope(turn)) + + prompt: str = call_spy.call_args[0][0] + assert "Fix OOMKilled pod" in prompt + assert "Diagnosis: memory too low" in prompt + assert "Increase memory limit to 512Mi" in prompt + + def test_prompt_contains_sre_persona(self, mocker: MockerFixture) -> None: + """Test that the prompt includes the SRE persona.""" + cm = _make_custom_metrics(mocker) + call_spy = mocker.patch.object( + cm, "_call_llm", return_value=_LLM_RESPONSE_ALL_DIMS + ) + + turn = TurnData( + turn_id="t1", + query="q", + response="r", + expected_outcome="e", + ) + cm.evaluate(METRIC_NAME, None, _make_scope(turn)) + + prompt: str = call_spy.call_args[0][0] + assert "senior Site Reliability Engineer" in prompt + + def test_prompt_contains_workflow_phases(self, mocker: MockerFixture) -> None: + """Test that the prompt includes workflow phases when set.""" + cm = _make_custom_metrics(mocker) + call_spy = mocker.patch.object( + cm, "_call_llm", return_value=_LLM_RESPONSE_ALL_DIMS + ) + + turn = TurnData( + turn_id="t1", + query="q", + response="r", + expected_outcome="e", + proposal_phases=["analysis", "execution"], + ) + cm.evaluate(METRIC_NAME, None, _make_scope(turn)) + + prompt: str = call_spy.call_args[0][0] + assert "Phases executed: analysis, execution" in prompt + + def test_prompt_workflow_phases_unknown_when_none( + self, mocker: MockerFixture + ) -> None: + """Test that missing proposal_phases produces fallback text.""" + cm = _make_custom_metrics(mocker) + call_spy = mocker.patch.object( + cm, "_call_llm", return_value=_LLM_RESPONSE_ALL_DIMS + ) + + turn = TurnData( + turn_id="t1", + query="q", + response="r", + expected_outcome="e", + ) + cm.evaluate(METRIC_NAME, None, _make_scope(turn)) + + prompt: str = call_spy.call_args[0][0] + assert "Phases executed: unknown" in prompt + + +class TestBuildWorkflowPhases: + """Test _build_workflow_phases helper.""" + + def test_with_phases(self, mocker: MockerFixture) -> None: + """Test phases list produces comma-separated string.""" + cm = _make_custom_metrics(mocker) + turn = TurnData( + turn_id="t1", + query="q", + proposal_phases=["analysis", "execution", "verification"], + ) + + result = cm._build_workflow_phases(turn) + + assert result == "Phases executed: analysis, execution, verification" + + def test_analysis_only(self, mocker: MockerFixture) -> None: + """Test single-phase list.""" + cm = _make_custom_metrics(mocker) + turn = TurnData( + turn_id="t1", + query="q", + proposal_phases=["analysis"], + ) + + result = cm._build_workflow_phases(turn) + + assert result == "Phases executed: analysis" + + def test_none_phases(self, mocker: MockerFixture) -> None: + """Test None proposal_phases produces fallback.""" + cm = _make_custom_metrics(mocker) + turn = TurnData(turn_id="t1", query="q") + + result = cm._build_workflow_phases(turn) + + assert "unknown" in result + + def test_empty_phases(self, mocker: MockerFixture) -> None: + """Test empty list produces fallback.""" + cm = _make_custom_metrics(mocker) + turn = TurnData(turn_id="t1", query="q", proposal_phases=[]) + + result = cm._build_workflow_phases(turn) + + assert "unknown" in result diff --git a/tests/unit/core/metrics/custom/test_proposal_eval.py b/tests/unit/core/metrics/custom/test_proposal_eval.py index 2c3875aa..cc756dff 100644 --- a/tests/unit/core/metrics/custom/test_proposal_eval.py +++ b/tests/unit/core/metrics/custom/test_proposal_eval.py @@ -3,10 +3,10 @@ from typing import Any, Optional from lightspeed_evaluation.core.metrics.custom.proposal_eval import ( - _derive_phase, evaluate_proposal_status, ) from lightspeed_evaluation.core.models import TurnData +from lightspeed_evaluation.core.proposal import derive_phase def _make_turn( @@ -61,7 +61,7 @@ class TestDerivePhase: def test_completed_analysis_only(self) -> None: """Analysis-only with Analyzed=True derives Completed.""" conditions = [{"type": "Analyzed", "status": "True"}] - assert _derive_phase(conditions, {"analysis": {}}) == "Completed" + assert derive_phase(conditions, {"analysis": {}}) == "Completed" def test_completed_full_lifecycle(self) -> None: """Full lifecycle with all conditions True derives Completed.""" @@ -71,7 +71,7 @@ def test_completed_full_lifecycle(self) -> None: {"type": "Verified", "status": "True"}, ] spec: dict[str, Any] = {"analysis": {}, "execution": {}, "verification": {}} - assert _derive_phase(conditions, spec) == "Completed" + assert derive_phase(conditions, spec) == "Completed" def test_completed_execution_no_verification(self) -> None: """Execution without verification with Executed=True derives Completed.""" @@ -80,7 +80,7 @@ def test_completed_execution_no_verification(self) -> None: {"type": "Executed", "status": "True"}, ] spec: dict[str, Any] = {"analysis": {}, "execution": {}} - assert _derive_phase(conditions, spec) == "Completed" + assert derive_phase(conditions, spec) == "Completed" def test_failed_condition(self) -> None: """Any condition with status False derives Failed.""" @@ -88,7 +88,7 @@ def test_failed_condition(self) -> None: {"type": "Analyzed", "status": "True"}, {"type": "Executed", "status": "False", "reason": "Error"}, ] - assert _derive_phase(conditions) == "Failed" + assert derive_phase(conditions) == "Failed" def test_retrying_execution_not_failed(self) -> None: """RetryingExecution reason does not count as failure.""" @@ -97,22 +97,22 @@ def test_retrying_execution_not_failed(self) -> None: {"type": "Verified", "status": "False", "reason": "RetryingExecution"}, ] spec: dict[str, Any] = {"analysis": {}, "execution": {}, "verification": {}} - assert _derive_phase(conditions, spec) == "InProgress" + assert derive_phase(conditions, spec) == "InProgress" def test_denied(self) -> None: """Denied=True derives Denied.""" conditions = [{"type": "Denied", "status": "True"}] - assert _derive_phase(conditions) == "Denied" + assert derive_phase(conditions) == "Denied" def test_escalated(self) -> None: """Escalated=True derives Escalated.""" conditions = [{"type": "Escalated", "status": "True"}] - assert _derive_phase(conditions) == "Escalated" + assert derive_phase(conditions) == "Escalated" def test_in_progress(self) -> None: """Unknown status derives InProgress.""" conditions = [{"type": "Analyzed", "status": "Unknown"}] - assert _derive_phase(conditions) == "InProgress" + assert derive_phase(conditions) == "InProgress" def test_no_proposal_spec_infers_last_step(self) -> None: """Without proposal_spec, infers last step from conditions.""" @@ -121,7 +121,7 @@ def test_no_proposal_spec_infers_last_step(self) -> None: {"type": "Executed", "status": "True"}, {"type": "Verified", "status": "True"}, ] - assert _derive_phase(conditions, None) == "Completed" + assert derive_phase(conditions, None) == "Completed" class TestPhaseCheck: diff --git a/tests/unit/pipeline/evaluation/test_driver.py b/tests/unit/pipeline/evaluation/test_driver.py index 78a85f5d..564ed780 100644 --- a/tests/unit/pipeline/evaluation/test_driver.py +++ b/tests/unit/pipeline/evaluation/test_driver.py @@ -14,6 +14,7 @@ AgentDriver, HttpApiDriver, ProposalDriver, + TerminalOutcome, ) from lightspeed_evaluation.pipeline.evaluation.registry import AgentDriverRegistry @@ -176,6 +177,83 @@ def test_validate_config_invalid(self) -> None: ) +class TestProposalDriverExecuteTurn: + """Unit tests for ProposalDriver.execute_turn outcome routing.""" + + @pytest.fixture() + def _proposal_driver(self, mocker: MockerFixture) -> ProposalDriver: + """Create a ProposalDriver with mocked infrastructure.""" + mocker.patch("shutil.which", return_value="/usr/bin/oc") + driver = ProposalDriver( + {"type": "proposal", "namespace": "ns", "timeout": 10, "poll_interval": 1}, + enabled=True, + ) + apply_result = mocker.Mock(returncode=0, stderr="") + mocker.patch.object(driver, "_apply", return_value=apply_result) + mocker.patch.object(driver, "_cleanup") + mocker.patch.object(driver, "_approve_when_ready", return_value=None) + mocker.patch.object(driver._amender, "amend", return_value=None) + return driver + + def _stub_terminal( + self, + mocker: MockerFixture, + driver: ProposalDriver, + outcome: TerminalOutcome, + ) -> None: + """Stub _get_status and _is_terminal to return a given outcome immediately.""" + mocker.patch.object( + driver, "_get_status", return_value=({"conditions": []}, None) + ) + mocker.patch.object(driver, "_is_terminal", return_value=outcome) + + def test_completed_returns_no_error( + self, mocker: MockerFixture, _proposal_driver: ProposalDriver + ) -> None: + """Test COMPLETED outcome returns no error.""" + self._stub_terminal(mocker, _proposal_driver, TerminalOutcome.COMPLETED) + turn = TurnData(turn_id="1", query="Q") + + error, _ = _proposal_driver.execute_turn(turn) + + assert error is None + + def test_failed_returns_error( + self, mocker: MockerFixture, _proposal_driver: ProposalDriver + ) -> None: + """Test FAILED outcome returns an error message.""" + self._stub_terminal(mocker, _proposal_driver, TerminalOutcome.FAILED) + turn = TurnData(turn_id="1", query="Q") + + error, _ = _proposal_driver.execute_turn(turn) + + assert error is not None + assert "execution failed" in error + + def test_denied_returns_no_error( + self, mocker: MockerFixture, _proposal_driver: ProposalDriver + ) -> None: + """Test DENIED outcome returns no error so metrics are still evaluated.""" + self._stub_terminal(mocker, _proposal_driver, TerminalOutcome.DENIED) + turn = TurnData(turn_id="1", query="Q") + + error, _ = _proposal_driver.execute_turn(turn) + + assert error is None + + def test_escalated_returns_error( + self, mocker: MockerFixture, _proposal_driver: ProposalDriver + ) -> None: + """Test ESCALATED outcome returns an error message.""" + self._stub_terminal(mocker, _proposal_driver, TerminalOutcome.ESCALATED) + turn = TurnData(turn_id="1", query="Q") + + error, _ = _proposal_driver.execute_turn(turn) + + assert error is not None + assert "escalated" in error + + class TestAgentDriverBase: """Unit tests for AgentDriver base class defaults.""" diff --git a/tests/unit/pipeline/evaluation/test_proposal_amender.py b/tests/unit/pipeline/evaluation/test_proposal_amender.py new file mode 100644 index 00000000..ffab7b9f --- /dev/null +++ b/tests/unit/pipeline/evaluation/test_proposal_amender.py @@ -0,0 +1,517 @@ +"""Unit tests for ProposalAmender module.""" + +import subprocess +from typing import Any, Optional + +from pytest_mock import MockerFixture + +from lightspeed_evaluation.core.models import TurnData +from lightspeed_evaluation.pipeline.evaluation.cli import CLIClient, KubeCLI +from lightspeed_evaluation.pipeline.evaluation.proposal_amender import ( + ProposalAmender, +) + + +class MockCLI(CLIClient): + """Mock CLIClient that returns pre-configured resources.""" + + def __init__(self, resources: Optional[dict[str, dict[str, Any]]] = None) -> None: + """Initialize with a map of resource name -> full CR dict.""" + super().__init__(timeout=30) + self._resources: dict[str, dict[str, Any]] = resources or {} + + def run( + self, + args: list[str], + stdin: Optional[str] = None, + ) -> subprocess.CompletedProcess[str]: + """Not used in amender tests.""" + return subprocess.CompletedProcess( + args=args, returncode=0, stdout="", stderr="" + ) + + def get_resource( + self, + resource_plural: str, + name: str, + ) -> tuple[dict[str, Any], Optional[str]]: + """Return pre-configured resource or error.""" + key = f"{resource_plural}/{name}" + if key in self._resources: + return self._resources[key], None + return {}, f"not found: {key}" + + def apply( + self, + manifest: dict[str, Any], + ) -> subprocess.CompletedProcess[str]: + """Not used in amender tests.""" + return subprocess.CompletedProcess(args=[], returncode=0, stdout="", stderr="") + + def delete(self, resource_plural: str, name: str) -> None: + """Not used in amender tests.""" + + +def _make_turn(query: str = "Fix pod crash") -> TurnData: + """Create a minimal TurnData for testing.""" + return TurnData(turn_id="t1", query=query) + + +def _get_results(turn: TurnData) -> dict[str, Any]: + """Extract proposal_results, failing if None.""" + assert turn.proposal_results is not None + return dict(turn.proposal_results) + + +def _get_response(turn: TurnData) -> str: + """Extract response, failing if None.""" + assert turn.response is not None + return str(turn.response) + + +DIAGNOSIS_STATUS: dict[str, Any] = { + "conditions": [{"type": "Completed", "status": "True", "reason": "Succeeded"}], + "options": [ + { + "title": "Increase memory limit", + "summary": "Bump memory to 512Mi", + "diagnosis": { + "summary": "OOMKilled due to low memory", + "confidence": "High", + "rootCause": "Memory limit 256Mi too low", + }, + "proposal": { + "description": "Patch deployment memory", + "actions": [ + {"type": "patch", "description": "Set memory limit to 512Mi"}, + ], + "risk": "Low", + "reversible": "Reversible", + "estimatedImpact": "Brief pod restart", + }, + }, + { + "title": "Scale horizontally", + "diagnosis": { + "summary": "High load causing OOM", + "confidence": "Medium", + "rootCause": "Single replica under load", + }, + "proposal": { + "description": "Add replicas", + "actions": [ + {"type": "scale", "description": "Scale to 3 replicas"}, + ], + "risk": "Low", + "reversible": "Reversible", + "estimatedImpact": "No downtime", + }, + }, + ], +} + +EXECUTION_STATUS: dict[str, Any] = { + "conditions": [{"type": "Completed", "status": "True", "reason": "Succeeded"}], + "actionsTaken": [ + { + "type": "patch", + "description": "Set memory limit to 512Mi", + "outcome": "Succeeded", + "output": "deployment.apps/web patched", + }, + ], + "verification": { + "conditionOutcome": "Improved", + "summary": "Pod running with new limits", + }, +} + +VERIFICATION_STATUS: dict[str, Any] = { + "conditions": [{"type": "Completed", "status": "True", "reason": "Succeeded"}], + "checks": [ + { + "name": "pod-running", + "source": "oc", + "value": "Running", + "result": "Passed", + }, + ], + "summary": "All checks passed", +} + + +class TestAmendAnalysisOnly: + """Test ProposalAmender with analysis-only workflow.""" + + def test_populates_proposal_results(self) -> None: + """Test amend populates proposal_results with analysis data.""" + cli = MockCLI({"analysisresults/ar-1": {"status": DIAGNOSIS_STATUS}}) + amender = ProposalAmender(cli) + turn = _make_turn() + status: dict[str, Any] = { + "conditions": [{"type": "Analyzed", "status": "True", "message": "Done"}], + "steps": { + "analysis": { + "results": [{"name": "ar-1", "outcome": "Succeeded"}], + }, + }, + } + + err = amender.amend(turn, status) + + assert err is None + assert turn.proposal_status == status + results = _get_results(turn) + assert "analysis" in results + assert len(results["analysis"]) == 1 + assert results["analysis"][0]["options"][0]["title"] == "Increase memory limit" + + def test_response_contains_diagnosis(self) -> None: + """Test Markdown summary includes diagnosis details.""" + cli = MockCLI({"analysisresults/ar-1": {"status": DIAGNOSIS_STATUS}}) + amender = ProposalAmender(cli) + turn = _make_turn() + status: dict[str, Any] = { + "conditions": [{"type": "Analyzed", "status": "True", "message": "Done"}], + "steps": { + "analysis": { + "results": [{"name": "ar-1", "outcome": "Succeeded"}], + }, + }, + } + + amender.amend(turn, status) + + response = _get_response(turn) + assert "OOMKilled due to low memory" in response + assert "Memory limit 256Mi too low" in response + assert "Confidence: High" in response + + def test_option_zero_marked_approved(self) -> None: + """Test option 0 is marked as (Approved) in summary.""" + cli = MockCLI({"analysisresults/ar-1": {"status": DIAGNOSIS_STATUS}}) + amender = ProposalAmender(cli) + turn = _make_turn() + status: dict[str, Any] = { + "conditions": [], + "steps": { + "analysis": { + "results": [{"name": "ar-1", "outcome": "Succeeded"}], + }, + }, + } + + amender.amend(turn, status) + + response = _get_response(turn) + assert "(Approved)" in response + assert "### Option 0 (Approved): Increase memory limit" in response + assert "### Option 1 : Scale horizontally" in response + + def test_no_execution_or_verification_in_results(self) -> None: + """Test analysis-only workflow has no execution/verification keys.""" + cli = MockCLI({"analysisresults/ar-1": {"status": DIAGNOSIS_STATUS}}) + amender = ProposalAmender(cli) + turn = _make_turn() + status: dict[str, Any] = { + "conditions": [], + "steps": { + "analysis": { + "results": [{"name": "ar-1", "outcome": "Succeeded"}], + }, + }, + } + + amender.amend(turn, status) + + results = _get_results(turn) + assert "execution" not in results + assert "verification" not in results + + def test_proposal_phases_analysis_only(self) -> None: + """Test proposal_phases contains only 'analysis' for analysis-only workflow.""" + cli = MockCLI({"analysisresults/ar-1": {"status": DIAGNOSIS_STATUS}}) + amender = ProposalAmender(cli) + turn = _make_turn() + status: dict[str, Any] = { + "conditions": [], + "steps": { + "analysis": { + "results": [{"name": "ar-1", "outcome": "Succeeded"}], + }, + }, + } + + amender.amend(turn, status) + + assert turn.proposal_phases == ["analysis"] + + +class TestAmendFullPipeline: + """Test ProposalAmender with analysis + execution + verification.""" + + def _make_cli(self) -> MockCLI: + """Create MockCLI with all three result CRs.""" + return MockCLI( + { + "analysisresults/ar-1": {"status": DIAGNOSIS_STATUS}, + "executionresults/er-1": {"status": EXECUTION_STATUS}, + "verificationresults/vr-1": {"status": VERIFICATION_STATUS}, + } + ) + + def _make_status(self) -> dict[str, Any]: + """Create proposal status with all three steps.""" + return { + "conditions": [ + {"type": "Analyzed", "status": "True"}, + {"type": "Executed", "status": "True"}, + {"type": "Verified", "status": "True", "message": "All passed"}, + ], + "steps": { + "analysis": { + "results": [{"name": "ar-1", "outcome": "Succeeded"}], + }, + "execution": { + "results": [{"name": "er-1", "outcome": "Succeeded"}], + }, + "verification": { + "results": [{"name": "vr-1", "outcome": "Succeeded"}], + }, + }, + } + + def test_all_results_populated(self) -> None: + """Test all three step results are populated.""" + amender = ProposalAmender(self._make_cli()) + turn = _make_turn() + + amender.amend(turn, self._make_status()) + + results = _get_results(turn) + assert "analysis" in results + assert "execution" in results + assert "verification" in results + assert len(results["analysis"]) == 1 + assert len(results["execution"]) == 1 + assert len(results["verification"]) == 1 + + def test_execution_in_summary(self) -> None: + """Test Markdown includes execution actions.""" + amender = ProposalAmender(self._make_cli()) + turn = _make_turn() + + amender.amend(turn, self._make_status()) + + response = _get_response(turn) + assert "## Execution" in response + assert "Set memory limit to 512Mi" in response + assert "Succeeded" in response + assert "deployment.apps/web patched" in response + + def test_verification_in_summary(self) -> None: + """Test Markdown includes verification checks.""" + amender = ProposalAmender(self._make_cli()) + turn = _make_turn() + + amender.amend(turn, self._make_status()) + + response = _get_response(turn) + assert "## Verification" in response + assert "pod-running" in response + assert "Passed" in response + assert "All checks passed" in response + + def test_proposal_phases_full_pipeline(self) -> None: + """Test proposal_phases lists all three phases.""" + amender = ProposalAmender(self._make_cli()) + turn = _make_turn() + + amender.amend(turn, self._make_status()) + + assert turn.proposal_phases == ["analysis", "execution", "verification"] + + +class TestAmendEdgeCases: + """Test ProposalAmender edge cases.""" + + def test_empty_steps(self) -> None: + """Test status with no steps produces empty results.""" + cli = MockCLI() + amender = ProposalAmender(cli) + turn = _make_turn() + status: dict[str, Any] = {"conditions": []} + + amender.amend(turn, status) + + assert not turn.proposal_results + assert not turn.proposal_phases + response = _get_response(turn) + assert "## Request" in response + + def test_step_with_no_results(self) -> None: + """Test step present but no result refs gives empty list.""" + cli = MockCLI() + amender = ProposalAmender(cli) + turn = _make_turn() + status: dict[str, Any] = { + "conditions": [], + "steps": { + "analysis": {"results": []}, + }, + } + + amender.amend(turn, status) + + results = _get_results(turn) + assert results["analysis"] == [] + + def test_failed_fetch_logged_and_skipped(self, mocker: MockerFixture) -> None: + """Test failed CR fetch is logged and result is skipped.""" + cli = MockCLI() + amender = ProposalAmender(cli) + turn = _make_turn() + status: dict[str, Any] = { + "conditions": [], + "steps": { + "analysis": { + "results": [{"name": "missing-cr", "outcome": "Succeeded"}], + }, + }, + } + mock_logger = mocker.patch( + "lightspeed_evaluation.pipeline.evaluation.proposal_amender.logger" + ) + + err = amender.amend(turn, status) + + assert err is None + results = _get_results(turn) + assert results["analysis"] == [] + mock_logger.warning.assert_called_once() + + def test_analysis_with_failure_reason(self) -> None: + """Test analysis result with failureReason.""" + failed_status: dict[str, Any] = { + "conditions": [{"type": "Completed", "status": "False"}], + "failureReason": "LLM timeout", + } + cli = MockCLI({"analysisresults/ar-fail": {"status": failed_status}}) + amender = ProposalAmender(cli) + turn = _make_turn() + status: dict[str, Any] = { + "conditions": [{"type": "Analyzed", "status": "False"}], + "steps": { + "analysis": { + "results": [{"name": "ar-fail", "outcome": "Failed"}], + }, + }, + } + + amender.amend(turn, status) + + response = _get_response(turn) + assert "LLM timeout" in response + + def test_escalation_results(self) -> None: + """Test escalation step results are captured.""" + esc_status: dict[str, Any] = { + "conditions": [], + "summary": "Requires manual intervention", + "content": "Cluster admin must review", + } + cli = MockCLI({"escalationresults/esc-1": {"status": esc_status}}) + amender = ProposalAmender(cli) + turn = _make_turn() + status: dict[str, Any] = { + "conditions": [{"type": "Escalated", "status": "True"}], + "steps": { + "escalation": { + "results": [{"name": "esc-1", "outcome": "Succeeded"}], + }, + }, + } + + amender.amend(turn, status) + + results = _get_results(turn) + assert "escalation" in results + response = _get_response(turn) + assert "Requires manual intervention" in response + assert "Cluster admin must review" in response + + def test_outcome_section_from_conditions(self) -> None: + """Test outcome section uses condition messages.""" + cli = MockCLI() + amender = ProposalAmender(cli) + turn = _make_turn() + status: dict[str, Any] = { + "conditions": [ + {"type": "Analyzed", "status": "True", "message": "Analysis complete"}, + {"type": "Executed", "status": "True", "message": "Execution done"}, + ], + } + + amender.amend(turn, status) + + response = _get_response(turn) + assert "## Outcome" in response + assert "Analysis complete; Execution done" in response + + def test_subprocess_error_caught_by_amend(self, mocker: MockerFixture) -> None: + """Test that subprocess errors are caught by the broadened except clause.""" + cli = MockCLI() + amender = ProposalAmender(cli) + turn = _make_turn() + status: dict[str, Any] = { + "conditions": [], + "steps": { + "analysis": { + "results": [{"name": "ar-1", "outcome": "Succeeded"}], + }, + }, + } + mocker.patch.object( + cli, + "get_resource", + side_effect=subprocess.SubprocessError("unexpected"), + ) + + err = amender.amend(turn, status) + + assert err is not None + assert "ProposalAmender error" in err + + +class TestKubeCLITimeoutHandling: + """Test KubeCLI normalizes subprocess.TimeoutExpired to CompletedProcess.""" + + def test_run_returns_completed_process_on_timeout( + self, mocker: MockerFixture + ) -> None: + """Test that TimeoutExpired is caught and a failing CompletedProcess returned.""" + mocker.patch( + "subprocess.run", + side_effect=subprocess.TimeoutExpired(cmd="oc get pods", timeout=30), + ) + + cli = KubeCLI(cli_path="oc", namespace="test-ns", timeout=30) + result = cli.run(["get", "pods"]) + + assert result.returncode == 1 + assert result.stdout == "" + assert "timed out after 30s" in result.stderr + + def test_get_resource_returns_error_on_timeout(self, mocker: MockerFixture) -> None: + """Test that get_resource returns an error tuple when run times out.""" + mocker.patch( + "subprocess.run", + side_effect=subprocess.TimeoutExpired(cmd="oc get pods", timeout=30), + ) + + cli = KubeCLI(cli_path="oc", namespace="test-ns", timeout=30) + resource, err = cli.get_resource("pods", "my-pod") + + assert resource == {} + assert err is not None + assert "timed out" in err diff --git a/tests/unit/pipeline/evaluation/test_proposal_driver.py b/tests/unit/pipeline/evaluation/test_proposal_driver.py index 32d5fb22..3bcbc022 100644 --- a/tests/unit/pipeline/evaluation/test_proposal_driver.py +++ b/tests/unit/pipeline/evaluation/test_proposal_driver.py @@ -57,6 +57,7 @@ def test_valid_config_all_fields(self) -> None: "auto_approve": False, "cleanup_proposals": False, "timeout": 60, + "cli_timeout": 15, "poll_interval": 5, } ) @@ -64,6 +65,7 @@ def test_valid_config_all_fields(self) -> None: assert config.auto_approve is False assert config.cleanup_proposals is False assert config.timeout == 60 + assert config.cli_timeout == 15 assert config.poll_interval == 5 def test_valid_config_defaults(self) -> None: @@ -72,6 +74,7 @@ def test_valid_config_defaults(self) -> None: assert config.auto_approve is True assert config.cleanup_proposals is True assert config.timeout == 900 + assert config.cli_timeout == 30 assert config.poll_interval == 2 def test_missing_namespace(self) -> None: @@ -94,6 +97,16 @@ def test_invalid_timeout_negative(self) -> None: with pytest.raises(ValidationError): ProposalAgentConfig.model_validate({**VALID_CONFIG, "timeout": -1}) + def test_invalid_cli_timeout_zero(self) -> None: + """Test cli_timeout=0 raises ValidationError.""" + with pytest.raises(ValidationError): + ProposalAgentConfig.model_validate({**VALID_CONFIG, "cli_timeout": 0}) + + def test_invalid_cli_timeout_negative(self) -> None: + """Test negative cli_timeout raises ValidationError.""" + with pytest.raises(ValidationError): + ProposalAgentConfig.model_validate({**VALID_CONFIG, "cli_timeout": -1}) + def test_invalid_poll_interval_zero(self) -> None: """Test poll_interval=0 raises ValidationError.""" with pytest.raises(ValidationError): @@ -476,7 +489,9 @@ def test_happy_path_completed( assert error is None assert conv_id is None - assert turn.response == "Analysis done; Passed" + response = str(turn.response) + assert "Analysis done" in response + assert "Passed" in response assert turn.proposal_status == terminal_status driver._cleanup.assert_called_once_with("eval-abcd1234") @@ -584,14 +599,13 @@ def test_failed_terminal( error, _ = driver.execute_turn(turn) assert error is not None - assert "Failed" in error + assert "failed" in error assert turn.proposal_status == status - assert turn.response == "LLM error" def test_denied_terminal( self, mocker: MockerFixture, driver: ProposalDriver ) -> None: - """Test denied proposal returns error.""" + """Test denied proposal populates turn data and returns no error.""" mock_time = mocker.patch(f"{MODULE}.time") mock_time.monotonic.side_effect = [0.0, 0.0, 0.0, 1.0] @@ -607,8 +621,8 @@ def test_denied_terminal( turn = TurnData(turn_id="t1", query="Q", proposal_spec=SPEC_FULL) error, _ = driver.execute_turn(turn) - assert error is not None - assert "Denied" in error + assert error is None + assert turn.proposal_status == status def test_get_status_error( self, mocker: MockerFixture, driver: ProposalDriver