Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 34 additions & 1 deletion src/lightspeed_evaluation/core/llm/deepeval.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,9 @@

import litellm
from deepeval.models import LiteLLMModel
from tenacity import stop_after_attempt

from lightspeed_evaluation.core.constants import DEFAULT_LLM_RETRIES
from lightspeed_evaluation.core.llm.litellm_patch import setup_litellm_ssl

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -46,15 +48,46 @@ def __init__(self, model_name: str, llm_params: dict[str, Any]):
# LiteLLMModel stores **kwargs in self.kwargs and merges them into
# every litellm.completion() call
# Note: Forbidden keys are rejected at LLMParametersConfig load time

# Override DeepEval's hardcoded retry logic with user configuration
# DeepEval uses @retry decorators that capture MAX_RETRIES at import time
# We must patch the retry decorators after import but before instantiation
num_retries = self.llm_params.get("num_retries", DEFAULT_LLM_RETRIES)

self._patch_deepeval_retries(num_retries)

Comment thread
xmican10 marked this conversation as resolved.
self.llm_model = LiteLLMModel(
model=self.model_name,
timeout=self.llm_params.get("timeout"),
num_retries=self.llm_params.get("num_retries"),
**self.llm_params.get("parameters", {}),
)

print(f"✅ DeepEval LLM Manager: {self.model_name}")

def _patch_deepeval_retries(self, max_retries: int) -> None:
"""Monkey-patch DeepEval's retry decorators to use configured max_retries.

DeepEval's @retry decorators capture MAX_RETRIES at import time.
We patch the 'stop' attribute on each retry decorator to use our value.
"""
# Patch the stop condition on all retry-decorated methods
for method_name in [
"generate",
"a_generate",
"generate_raw_response",
"a_generate_raw_response",
"generate_samples",
]:
method = getattr(LiteLLMModel, method_name)
method.retry.stop = stop_after_attempt( # pylint: disable=no-member
Comment on lines +81 to +82
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I really don't like this but I can't figure out a better solution :-(

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

bug_test_geval_score_mismatch.yaml was alredy removed, the tests do cover the case :-)

max_retries
Comment thread
xmican10 marked this conversation as resolved.
)

logger.info(
"Patched DeepEval retry logic: max_retries=%d",
max_retries,
)

def setup_ssl_verify(self) -> None:
"""Setup SSL verification based on LLM parameters.

Expand Down
26 changes: 22 additions & 4 deletions src/lightspeed_evaluation/core/metrics/deepeval.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,17 +87,34 @@ def _build_conversational_test_case(self, conv_data: Any) -> ConversationalTestC

return ConversationalTestCase(turns=turns)

def _evaluate_metric(self, metric: Any, test_case: Any) -> tuple[float, str]:
def _evaluate_metric(self, metric: Any, test_case: Any) -> tuple[float | None, str]:
"""Evaluate and get result."""
metric.measure(test_case)
self.llm_manager.flush_deepevals_pending_tasks()

score = metric.score
reason = (
metric.reason
if hasattr(metric, "reason") and metric.reason
else f"Score: {metric.score:.2f}"
else f"Score: {score:.2f}" if score is not None else "No score returned"
)
return metric.score, reason

# CRITICAL: Warn if score is None (indicates evaluation failure)
# None scores indicate evaluation failures that need investigation:
# - Rate limiting (429 errors)
# - LLM judge returning malformed JSON that fails parsing
# - Timeout errors from LLM provider
# - API quota/credits exhausted
if score is None:
logger.warning(
"%s metric returned None score. "
"This typically indicates LLM judge failure (rate limiting, timeout, "
"invalid JSON response, or quota exhausted). Reason: %s",
metric.__class__.__name__,
reason,
)

return score, reason

def evaluate(
self,
Expand All @@ -117,7 +134,8 @@ def evaluate(
scope: EvaluationScope containing turn info and conversation flag

Returns:
Tuple of (score, reason)
tuple[float | None, str]: Tuple of (score, reason).
Score is in [0, 1] or None if evaluation failed.
"""
# Route to standard DeepEval metrics
if metric_name in self.supported_metrics:
Expand Down
37 changes: 33 additions & 4 deletions src/lightspeed_evaluation/core/metrics/geval.py
Original file line number Diff line number Diff line change
Expand Up @@ -292,17 +292,34 @@ def _evaluate_turn( # pylint: disable=R0913,R0917
# Create test case for a single turn
test_case = LLMTestCase(**test_case_kwargs)

# Evaluate (DeepEval normalizes score to [0, 1]; pass through as-is)
# Evaluate with retry (DeepEval normalizes score to [0, 1]; pass through as-is)
try:
metric.measure(test_case)
self.deepeval_llm_manager.flush_deepevals_pending_tasks()

score = metric.score if metric.score is not None else 0.0
# Extract score and reason
score = metric.score
reason = (
str(metric.reason)
if hasattr(metric, "reason") and metric.reason
else "No reason provided"
)

# CRITICAL: Warn if score is None (indicates evaluation failure)
# None scores indicate evaluation failures that need investigation:
# - Rate limiting (429 errors after all retries exhausted)
# - LLM judge returning malformed JSON that fails parsing
# - Timeout errors from LLM provider
# - API quota/credits exhausted
# Warning helps identify these failures for debugging.
if score is None:
logger.warning(
"GEval turn-level metric returned None score. "
"This typically indicates LLM judge failure (rate limiting, timeout, "
"invalid JSON response, or quota exhausted). Reason: %s",
reason,
)

return score, reason
except Exception as e: # pylint: disable=W0718
logger.error(
Expand Down Expand Up @@ -405,17 +422,29 @@ def _evaluate_conversation( # pylint: disable=R0913,R0917,R0914
actual_output="\n".join(conversation_output),
)

# Evaluate (DeepEval normalizes score to [0, 1]; pass through as-is)
# Evaluate with retry (DeepEval normalizes score to [0, 1]; pass through as-is)
try:
metric.measure(test_case)
self.deepeval_llm_manager.flush_deepevals_pending_tasks()

score = metric.score if metric.score is not None else 0.0
# Extract score and reason
score = metric.score
reason = (
str(metric.reason)
if hasattr(metric, "reason") and metric.reason
else "No reason provided"
)

# CRITICAL: Warn if score is None (indicates evaluation failure)
# See turn-level evaluation for detailed explanation of why this matters
if score is None:
logger.warning(
"GEval conversation-level metric returned None score. "
"This typically indicates LLM judge failure (rate limiting, timeout, "
"invalid JSON response, or quota exhausted). Reason: %s",
reason,
)

return score, reason
except Exception as e: # pylint: disable=W0718
logger.error(
Expand Down
61 changes: 61 additions & 0 deletions tests/unit/core/llm/test_deepeval_manager.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
"""Unit tests for DeepEval LLM Manager."""

import logging

import pytest
from pytest_mock import MockerFixture

Expand Down Expand Up @@ -95,3 +97,62 @@ def test_drop_params_always_enabled(self, mocker: MockerFixture) -> None:
DeepEvalLLMManager("gpt-4", {})

assert mock_litellm.drop_params is True

def test_patch_deepeval_retries_called_with_configured_value(
self, mocker: MockerFixture
) -> None:
"""Test that _patch_deepeval_retries patches LiteLLMModel retry logic."""
# Mock LiteLLMModel with methods that have retry decorators
mock_generate = mocker.Mock()
mock_generate.retry = mocker.Mock()
mock_a_generate = mocker.Mock()
mock_a_generate.retry = mocker.Mock()
mock_generate_raw = mocker.Mock()
mock_generate_raw.retry = mocker.Mock()
mock_a_generate_raw = mocker.Mock()
mock_a_generate_raw.retry = mocker.Mock()
mock_generate_samples = mocker.Mock()
mock_generate_samples.retry = mocker.Mock()

mock_litellm_class = mocker.patch(
"lightspeed_evaluation.core.llm.deepeval.LiteLLMModel"
)
mock_litellm_class.generate = mock_generate
mock_litellm_class.a_generate = mock_a_generate
mock_litellm_class.generate_raw_response = mock_generate_raw
mock_litellm_class.a_generate_raw_response = mock_a_generate_raw
mock_litellm_class.generate_samples = mock_generate_samples

# Mock stop_after_attempt to return a mock stop condition
mock_stop_condition = mocker.Mock()
mock_stop_after_attempt = mocker.patch(
"lightspeed_evaluation.core.llm.deepeval.stop_after_attempt",
return_value=mock_stop_condition,
)

params = {"num_retries": 5}
DeepEvalLLMManager("gpt-4", params)

# Verify stop_after_attempt was called 5 times (once per method) with the configured retries
assert mock_stop_after_attempt.call_count == 5
mock_stop_after_attempt.assert_called_with(5)

# Verify each method's retry.stop was set to the new stop condition
assert mock_generate.retry.stop == mock_stop_condition
assert mock_a_generate.retry.stop == mock_stop_condition
assert mock_generate_raw.retry.stop == mock_stop_condition
assert mock_a_generate_raw.retry.stop == mock_stop_condition
assert mock_generate_samples.retry.stop == mock_stop_condition

def test_patch_deepeval_retries_logs_operation(
self, mocker: MockerFixture, caplog: pytest.LogCaptureFixture
) -> None:
"""Test that retry patching logs the max_retries value."""
mocker.patch("lightspeed_evaluation.core.llm.deepeval.LiteLLMModel")

params = {"num_retries": 7}

with caplog.at_level(logging.INFO):
DeepEvalLLMManager("gpt-4", params)

assert "Patched DeepEval retry logic: max_retries=7" in caplog.text
66 changes: 66 additions & 0 deletions tests/unit/core/metrics/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,12 @@
"""Pytest configuration and fixtures for metrics tests."""

import sys
from typing import Any

import pytest
from pytest_mock import MockerFixture

from lightspeed_evaluation.core.metrics.deepeval import DeepEvalMetrics
from lightspeed_evaluation.core.metrics.nlp import NLPMetrics
from lightspeed_evaluation.core.models import EvaluationScope, SystemConfig, TurnData

Expand Down Expand Up @@ -154,3 +156,67 @@ def mock_similarity_scorer(mocker: MockerFixture) -> MockerFixture:
return_value=mock_scorer_instance,
)
return mock_scorer_instance


@pytest.fixture
def mock_llm_manager(mocker: MockerFixture) -> Any:
"""Create a mock LLMManager for DeepEval tests."""
mock_manager = mocker.MagicMock()
mock_config = mocker.MagicMock()
mock_config.cache_enabled = False
mock_manager.get_config.return_value = mock_config
mock_manager.get_model_name.return_value = "gpt-4"
mock_manager.get_llm_params.return_value = {"num_retries": 3}
return mock_manager


@pytest.fixture
def mock_metric_manager(mocker: MockerFixture) -> Any:
"""Create a mock MetricManager."""
return mocker.MagicMock()


@pytest.fixture
def mock_deepeval_llm_manager(mocker: MockerFixture) -> Any:
"""Create a mock DeepEvalLLMManager."""
mock_manager = mocker.MagicMock()
mock_llm = mocker.MagicMock()
mock_manager.get_llm.return_value = mock_llm
mock_manager.flush_deepevals_pending_tasks.return_value = None
return mock_manager


@pytest.fixture
def mock_conv_data(mocker: MockerFixture) -> Any:
"""Create mock conversation data with turns."""
turn1 = mocker.MagicMock()
turn1.query = "What is AI?"
turn1.response = "AI stands for Artificial Intelligence."

turn2 = mocker.MagicMock()
turn2.query = "Can you explain more?"
turn2.response = "AI is the simulation of human intelligence by machines."

conv_data = mocker.MagicMock()
conv_data.turns = [turn1, turn2]
return conv_data


@pytest.fixture
def deepeval_metrics(
mock_llm_manager: Any,
mock_metric_manager: Any,
mock_deepeval_llm_manager: Any,
mocker: MockerFixture,
) -> DeepEvalMetrics:
"""Create DeepEvalMetrics instance with mocked dependencies."""
mocker.patch(
"lightspeed_evaluation.core.metrics.deepeval.DeepEvalLLMManager",
return_value=mock_deepeval_llm_manager,
)
mocker.patch("lightspeed_evaluation.core.metrics.deepeval.GEvalHandler")

return DeepEvalMetrics(
llm_manager=mock_llm_manager,
metric_manager=mock_metric_manager,
)
Loading
Loading