Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,10 @@ update-deps: ## Check pyproject.toml for changes, update the lock file if needed
uv sync --group dev

check-types: ## Checks type hints in sources
uv run mypy --explicit-package-bases --disallow-untyped-calls --disallow-untyped-defs --disallow-incomplete-defs src/ lsc_agent_eval/src/ tests
uv run mypy src/ lsc_agent_eval/src/ tests

black-check:
uv run black src tests script lsc_agent_eval --check
uv run black --check src tests script lsc_agent_eval

black-format:
uv run black src tests script lsc_agent_eval
Expand Down Expand Up @@ -118,7 +118,7 @@ shellcheck: ## Run shellcheck

pylint:
uv run pylint src
uv run pylint --disable=R0801 lsc_agent_eval/src tests
uv run pylint lsc_agent_eval/src tests

pyright:
uv run pyright src lsc_agent_eval/src tests
Expand All @@ -130,4 +130,4 @@ ruff:
uv run ruff check src tests script lsc_agent_eval

bandit: ## Security scanning with Bandit
uv run bandit -r src/lightspeed_evaluation -ll
uv run bandit -c pyproject.toml -r src/lightspeed_evaluation -ll
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from ..utils.exceptions import AgentAPIError, JudgeModelError, ScriptExecutionError
from ..utils.prompt import ANSWER_CORRECTNESS_PROMPT, INTENT_DETECTION_PROMPT
from .tool_call_eval import compare_tool_calls
from .utils import create_evaluation_results
from .utils import EvalResultItem, create_evaluation_results

if TYPE_CHECKING:
from ..utils.api_client import AgentHttpClient
Expand Down Expand Up @@ -42,12 +42,13 @@ def run_evaluation( # pylint: disable=too-many-arguments,too-many-positional-ar
"""Run multiple evaluations based on configuration."""
try:
# Query the agent once
api_input = {
api_input: dict[str, str] = {
"query": data_config.eval_query,
"provider": agent_provider,
"model": agent_model,
"conversation_id": conversation_id,
}
if conversation_id is not None:
api_input["conversation_id"] = conversation_id

if endpoint_type == "streaming":
agent_response = self.agent_client.streaming_query_agent(api_input)
Expand All @@ -61,7 +62,7 @@ def run_evaluation( # pylint: disable=too-many-arguments,too-many-positional-ar
tool_calls = agent_response.get("tool_calls", [])

# Run all evaluations
evaluation_results = []
evaluation_results: list[EvalResultItem] = []
for eval_type in data_config.eval_types:
try:
success = self._evaluate_single_type(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

import json
import logging
from datetime import datetime
from datetime import UTC, datetime
from pathlib import Path

import pandas as pd
Expand Down Expand Up @@ -32,7 +32,7 @@ def save_results(self, result_dir: str) -> None:
output_dir = Path(result_dir)
output_dir.mkdir(parents=True, exist_ok=True)

timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
timestamp = datetime.now(UTC).strftime("%Y%m%d_%H%M%S")
csv_file = output_dir / f"agent_goal_eval_results_{timestamp}.csv"
json_file = output_dir / f"agent_goal_eval_summary_{timestamp}.json"

Expand Down
20 changes: 14 additions & 6 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ dev = [
"pytest-cov>=6.0.0,<=6.2.1",
"pytest-mock==3.15.1",
"pytest-timeout==2.4.0",
"types-PyYAML>=6.0.0",
]

[project.scripts]
Expand All @@ -75,32 +76,39 @@ generate_answers = "generate_answers.generate_answers:main"
# Note: torch[cpu] variant configuration removed for uv compatibility
# Modern PyTorch versions are available on PyPI directly

[tool.isort]
src_paths = ["src", "tests"]

[tool.black]
line-length = 88

[tool.pydocstyle]
convention = "google"

[tool.mypy]
disable_error_code = ["union-attr", "return-value", "arg-type", "import-untyped"]
ignore_missing_imports = true
plugins = ["pydantic.mypy"]
explicit_package_bases = true
disallow_untyped_calls = true
disallow_untyped_defs = true
disallow_incomplete_defs = true
ignore_missing_imports = true

[tool.pydantic-mypy]
init_forbid_extra = true
init_typed = true
warn_required_dynamic_aliases = true

[tool.pylint.MASTER]
source-roots = ["src", "script", "tests"]
load-plugins = ["pylint_pydantic"]
init-hook = "import sys; sys.path.append('.')"
[tool.pylint."MESSAGES CONTROL"]
disable = ["R0801"]

[tool.pyright]
extraPaths = ["./src"]

[tool.ruff]
line-length = 88
[tool.ruff.lint]
extend-select = ["TID251"]
extend-select = ["TID251", "UP006", "UP007", "UP010", "UP017", "UP035", "RUF100", "B009", "B010", "DTZ005", "D202", "I001", "PLR1733"]
[tool.ruff.lint.flake8-tidy-imports.banned-api]
unittest = { msg = "use pytest instead of unittest" }
"unittest.mock" = { msg = "use pytest-mock instead of unittest.mock" }
Expand Down
20 changes: 11 additions & 9 deletions requirements-all-extras.txt
Original file line number Diff line number Diff line change
Expand Up @@ -117,13 +117,13 @@ googleapis-common-protos==1.75.0
# grpcio-status
greenlet==3.5.1 ; platform_machine == 'AMD64' or platform_machine == 'WIN32' or platform_machine == 'aarch64' or platform_machine == 'amd64' or platform_machine == 'ppc64le' or platform_machine == 'win32' or platform_machine == 'x86_64'
# via sqlalchemy
grpcio==1.80.0
grpcio==1.81.0
# via
# deepeval
# google-ai-generativelanguage
# google-api-core
# grpcio-status
grpcio-status==1.80.0
grpcio-status==1.81.0
# via google-api-core
h11==0.16.0
# via httpcore
Expand All @@ -148,7 +148,7 @@ huggingface-hub==1.16.1
# sentence-transformers
# tokenizers
# transformers
idna==3.16
idna==3.17
# via
# anyio
# httpx
Expand Down Expand Up @@ -210,7 +210,7 @@ langchain-huggingface==1.2.2
# via langchain
langchain-openai==1.1.10
# via ragas
langchain-protocol==0.0.15
langchain-protocol==0.0.16
# via langchain-core
langgraph==1.1.10
# via langchain
Expand All @@ -222,7 +222,7 @@ langgraph-prebuilt==1.0.13
# via langgraph
langgraph-sdk==0.3.15
# via langgraph
langsmith==0.8.5
langsmith==0.8.8
# via
# langchain-community
# langchain-core
Expand Down Expand Up @@ -338,7 +338,7 @@ proto-plus==1.28.0
# via
# google-ai-generativelanguage
# google-api-core
protobuf==6.33.6
protobuf==7.35.0
# via
# google-ai-generativelanguage
# google-api-core
Expand Down Expand Up @@ -455,7 +455,7 @@ rich==14.3.4
# typer
rouge-score==0.1.2
# via lightspeed-evaluation
rpds-py==0.30.0
rpds-py==2026.5.1
# via
# jsonschema
# referencing
Expand All @@ -477,7 +477,7 @@ seaborn==0.13.2
# via lightspeed-evaluation
sentence-transformers==5.2.3
# via lightspeed-evaluation
sentry-sdk==2.60.0
sentry-sdk==2.61.1
# via deepeval
setuptools==82.0.1
# via
Expand Down Expand Up @@ -534,7 +534,7 @@ tqdm==4.67.1
# transformers
transformers==5.9.0
# via sentence-transformers
typer==0.26.1
typer==0.26.5
# via
# deepeval
# huggingface-hub
Expand Down Expand Up @@ -578,6 +578,8 @@ uuid-utils==0.16.0
# via
# langchain-core
# langsmith
websockets==16.0
# via langsmith
wheel==0.47.0
# via deepeval
xxhash==3.7.0
Expand Down
20 changes: 11 additions & 9 deletions requirements-local-embeddings.txt
Original file line number Diff line number Diff line change
Expand Up @@ -113,13 +113,13 @@ googleapis-common-protos==1.75.0
# grpcio-status
greenlet==3.5.1 ; platform_machine == 'AMD64' or platform_machine == 'WIN32' or platform_machine == 'aarch64' or platform_machine == 'amd64' or platform_machine == 'ppc64le' or platform_machine == 'win32' or platform_machine == 'x86_64'
# via sqlalchemy
grpcio==1.80.0
grpcio==1.81.0
# via
# deepeval
# google-ai-generativelanguage
# google-api-core
# grpcio-status
grpcio-status==1.80.0
grpcio-status==1.81.0
# via google-api-core
h11==0.16.0
# via httpcore
Expand All @@ -144,7 +144,7 @@ huggingface-hub==1.16.1
# sentence-transformers
# tokenizers
# transformers
idna==3.16
idna==3.17
# via
# anyio
# httpx
Expand Down Expand Up @@ -204,7 +204,7 @@ langchain-huggingface==1.2.2
# via langchain
langchain-openai==1.1.10
# via ragas
langchain-protocol==0.0.15
langchain-protocol==0.0.16
# via langchain-core
langgraph==1.1.10
# via langchain
Expand All @@ -216,7 +216,7 @@ langgraph-prebuilt==1.0.13
# via langgraph
langgraph-sdk==0.3.15
# via langgraph
langsmith==0.8.5
langsmith==0.8.8
# via
# langchain-community
# langchain-core
Expand Down Expand Up @@ -324,7 +324,7 @@ proto-plus==1.28.0
# via
# google-ai-generativelanguage
# google-api-core
protobuf==6.33.6
protobuf==7.35.0
# via
# google-ai-generativelanguage
# google-api-core
Expand Down Expand Up @@ -435,7 +435,7 @@ rich==14.3.4
# instructor
# ragas
# typer
rpds-py==0.30.0
rpds-py==2026.5.1
# via
# jsonschema
# referencing
Expand All @@ -455,7 +455,7 @@ seaborn==0.13.2
# via lightspeed-evaluation
sentence-transformers==5.2.3
# via lightspeed-evaluation
sentry-sdk==2.60.0
sentry-sdk==2.61.1
# via deepeval
setuptools==82.0.1
# via
Expand Down Expand Up @@ -508,7 +508,7 @@ tqdm==4.67.1
# transformers
transformers==5.9.0
# via sentence-transformers
typer==0.26.1
typer==0.26.5
# via
# deepeval
# huggingface-hub
Expand Down Expand Up @@ -552,6 +552,8 @@ uuid-utils==0.16.0
# via
# langchain-core
# langsmith
websockets==16.0
# via langsmith
wheel==0.47.0
# via deepeval
xxhash==3.7.0
Expand Down
20 changes: 11 additions & 9 deletions requirements-nlp-metrics.txt
Original file line number Diff line number Diff line change
Expand Up @@ -115,13 +115,13 @@ googleapis-common-protos==1.75.0
# grpcio-status
greenlet==3.5.1 ; platform_machine == 'AMD64' or platform_machine == 'WIN32' or platform_machine == 'aarch64' or platform_machine == 'amd64' or platform_machine == 'ppc64le' or platform_machine == 'win32' or platform_machine == 'x86_64'
# via sqlalchemy
grpcio==1.80.0
grpcio==1.81.0
# via
# deepeval
# google-ai-generativelanguage
# google-api-core
# grpcio-status
grpcio-status==1.80.0
grpcio-status==1.81.0
# via google-api-core
h11==0.16.0
# via httpcore
Expand All @@ -144,7 +144,7 @@ huggingface-hub==1.16.1
# datasets
# langchain-huggingface
# tokenizers
idna==3.16
idna==3.17
# via
# anyio
# httpx
Expand Down Expand Up @@ -203,7 +203,7 @@ langchain-huggingface==1.2.2
# via langchain
langchain-openai==1.1.10
# via ragas
langchain-protocol==0.0.15
langchain-protocol==0.0.16
# via langchain-core
langgraph==1.1.10
# via langchain
Expand All @@ -215,7 +215,7 @@ langgraph-prebuilt==1.0.13
# via langgraph
langgraph-sdk==0.3.15
# via langgraph
langsmith==0.8.5
langsmith==0.8.8
# via
# langchain-community
# langchain-core
Expand Down Expand Up @@ -323,7 +323,7 @@ proto-plus==1.28.0
# via
# google-ai-generativelanguage
# google-api-core
protobuf==6.33.6
protobuf==7.35.0
# via
# google-ai-generativelanguage
# google-api-core
Expand Down Expand Up @@ -438,7 +438,7 @@ rich==14.3.4
# typer
rouge-score==0.1.2
# via lightspeed-evaluation
rpds-py==0.30.0
rpds-py==2026.5.1
# via
# jsonschema
# referencing
Expand All @@ -452,7 +452,7 @@ scipy==1.16.2
# scikit-network
seaborn==0.13.2
# via lightspeed-evaluation
sentry-sdk==2.60.0
sentry-sdk==2.61.1
# via deepeval
setuptools==82.0.1
# via deepeval
Expand Down Expand Up @@ -498,7 +498,7 @@ tqdm==4.67.1
# nltk
# openai
# ragas
typer==0.26.1
typer==0.26.5
# via
# deepeval
# huggingface-hub
Expand Down Expand Up @@ -539,6 +539,8 @@ uuid-utils==0.16.0
# via
# langchain-core
# langsmith
websockets==16.0
# via langsmith
wheel==0.47.0
# via deepeval
xxhash==3.7.0
Expand Down
Loading
Loading