Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions backend/app/api/docs/assessment/get_dataset.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
Get a single assessment dataset by ID.

Optionally include a signed URL to download the original uploaded file.

Pass `limit_rows=N` (1-100) to additionally include a lightweight preview
of the dataset's column headers and the first N data rows. When omitted,
the underlying file is not fetched and the response stays small.
40 changes: 38 additions & 2 deletions backend/app/api/routes/assessment/datasets.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""Assessment dataset endpoints."""

import logging
from typing import Annotated

from fastapi import APIRouter, Depends, File, Form, HTTPException, Query, UploadFile

Expand All @@ -12,8 +13,14 @@
get_assessment_dataset_by_id,
list_assessment_datasets,
)
from app.models.assessment import AssessmentDatasetResponse
from app.models.assessment import (
AssessmentDatasetPreview,
AssessmentDatasetResponse,
)
from app.models.evaluation import EvaluationDataset
from app.services.assessment.dataset import (
preview_dataset as preview_assessment_dataset,
)
from app.services.assessment.dataset import upload_dataset as upload_assessment_dataset
from app.services.assessment.validators import validate_dataset_file
from app.utils import APIResponse, load_description
Expand All @@ -26,6 +33,7 @@
def _dataset_to_response(
dataset: EvaluationDataset,
signed_url: str | None = None,
preview: AssessmentDatasetPreview | None = None,
) -> AssessmentDatasetResponse:
metadata = dataset.dataset_metadata or {}
return AssessmentDatasetResponse(
Expand All @@ -36,6 +44,7 @@ def _dataset_to_response(
file_extension=metadata.get("file_extension"),
object_store_url=dataset.object_store_url,
signed_url=signed_url,
preview=preview,
)


Expand Down Expand Up @@ -111,6 +120,18 @@ def get_dataset(
include_signed_url: bool = Query(
False, description="Include a signed URL for downloading the raw file from S3"
),
limit_rows: Annotated[
int | None,
Query(
ge=1,
le=100,
description=(
"If set, fetch the underlying file and include a preview of the "
"first N data rows plus column headers. Skip to avoid the file "
"download."
),
),
] = None,
) -> APIResponse[AssessmentDatasetResponse]:
"""Get a specific assessment dataset."""
dataset = get_assessment_dataset_by_id(
Expand All @@ -127,8 +148,23 @@ def get_dataset(
)
signed_url = storage.get_signed_url(dataset.object_store_url)

preview: AssessmentDatasetPreview | None = None
if limit_rows is not None:
headers, rows = preview_assessment_dataset(
session=session,
dataset=dataset,
project_id=auth_context.project_.id,
limit=limit_rows,
)
preview = AssessmentDatasetPreview(
headers=headers,
rows=rows,
returned_rows=len(rows),
truncated=len(rows) >= limit_rows,
)
Comment on lines +152 to +164
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor | ⚡ Quick win

truncated is over-reported at the limit boundary.

Line 160 uses len(rows) >= limit_rows, so exact-limit results are marked truncated even when no more rows exist.

Suggested fix
-    if limit_rows is not None:
+    if limit_rows is not None:
         headers, rows = preview_assessment_dataset(
             session=session,
             dataset=dataset,
             project_id=auth_context.project_.id,
-            limit=limit_rows,
+            limit=limit_rows + 1,
         )
+        is_truncated = len(rows) > limit_rows
+        rows = rows[:limit_rows]
         preview = AssessmentDatasetPreview(
             headers=headers,
             rows=rows,
             returned_rows=len(rows),
-            truncated=len(rows) >= limit_rows,
+            truncated=is_truncated,
         )
🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@backend/app/api/routes/assessment/datasets.py` around lines 149 - 161, The
truncated flag is over-reported because the code treats len(rows) >= limit_rows
as truncated; to fix, request one extra row from preview_assessment_dataset
(call with limit=limit_rows + 1), set truncated = len(rows) > limit_rows, and if
truncated trim rows to the original limit_rows before constructing
AssessmentDatasetPreview (use the existing names session, dataset, limit_rows,
preview_assessment_dataset, headers, rows, and AssessmentDatasetPreview).


return APIResponse.success_response(
data=_dataset_to_response(dataset, signed_url=signed_url)
data=_dataset_to_response(dataset, signed_url=signed_url, preview=preview)
)


Expand Down
2 changes: 1 addition & 1 deletion backend/app/crud/assessment/batch.py
Original file line number Diff line number Diff line change
Expand Up @@ -332,7 +332,7 @@ def build_google_jsonl(

jsonl_data.append(
{
"metadata": {"key": f"row_{idx}"},
"key": f"row_{idx}",
"request": request,
}
)
Expand Down
10 changes: 10 additions & 0 deletions backend/app/models/assessment.py
Original file line number Diff line number Diff line change
Expand Up @@ -333,6 +333,15 @@ class AssessmentExportRow(BaseModel):
updated_at: datetime


class AssessmentDatasetPreview(BaseModel):
    """Lightweight preview of a dataset's columns and first N rows."""

    # Column names taken from the file's header row.
    headers: list[str]
    # The first N data rows, each cell rendered as a string.
    rows: list[list[str]]
    # Number of rows actually included in `rows`.
    returned_rows: int = 0
    # True when the preview hit the requested row limit (more rows may exist).
    truncated: bool = False


class AssessmentDatasetResponse(BaseModel):
"""Response model for assessment dataset."""

Expand All @@ -343,3 +352,4 @@ class AssessmentDatasetResponse(BaseModel):
file_extension: str | None = None
object_store_url: str | None = None
signed_url: str | None = None
preview: AssessmentDatasetPreview | None = None
104 changes: 104 additions & 0 deletions backend/app/services/assessment/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,110 @@ def _count_rows(content: bytes, file_ext: str) -> int:
return _count_csv_rows(content)


def _stringify(value: object) -> str:
if value is None:
return ""
return str(value)


def _preview_csv(content: bytes, limit: int) -> tuple[list[str], list[list[str]]]:
for encoding in ("utf-8-sig", "utf-8", "latin-1"):
try:
text = content.decode(encoding)
break
except (UnicodeDecodeError, ValueError):
continue
else:
text = content.decode("utf-8", errors="replace")

reader = csv.reader(io.StringIO(text))
header = next(reader, None) or []
headers = [_stringify(cell) for cell in header]

rows: list[list[str]] = []
for row in reader:
if not any(cell.strip() for cell in row):
continue
rows.append([_stringify(cell) for cell in row])
if len(rows) >= limit:
break
return headers, rows


def _preview_excel(content: bytes, limit: int) -> tuple[list[str], list[list[str]]]:
    """Extract headers and up to `limit` non-empty data rows from XLSX bytes.

    The workbook is opened read-only with computed values (``data_only=True``)
    and is always closed, even if parsing fails partway through.
    """
    import openpyxl

    workbook = openpyxl.load_workbook(
        io.BytesIO(content), read_only=True, data_only=True
    )
    try:
        sheet = workbook.active
        if sheet is None:
            return [], []

        row_iter = sheet.iter_rows(values_only=True)
        first = next(row_iter, None)
        headers = [_stringify(cell) for cell in (first or ())]

        collected: list[list[str]] = []
        for raw in row_iter:
            # Skip rows that are missing or contain only empty cells.
            if not raw or all(cell is None for cell in raw):
                continue
            collected.append([_stringify(cell) for cell in raw])
            if len(collected) >= limit:
                break
        return headers, collected
    finally:
        workbook.close()


def preview_dataset(
session: Session,
dataset: EvaluationDataset,
project_id: int,
limit: int,
) -> tuple[list[str], list[list[str]]]:
"""Return the first `limit` data rows (plus header) of a dataset file."""
if not dataset.object_store_url:
raise HTTPException(
status_code=404, detail="Dataset has no underlying file to preview."
)

file_ext = (dataset.dataset_metadata or {}).get("file_extension")
if file_ext == ".xls":
raise HTTPException(
status_code=422,
detail="Legacy Excel format (.xls) is not supported.",
)

storage = get_cloud_storage(session=session, project_id=project_id)
try:
content = storage.get(dataset.object_store_url)
except Exception as e:
logger.warning(
f"[preview_dataset] Failed to fetch file | dataset_id={dataset.id} | {e}",
exc_info=True,
)
raise HTTPException(
status_code=502, detail="Failed to fetch dataset file from storage."
) from e

try:
if file_ext == ".xlsx":
return _preview_excel(content, limit)
return _preview_csv(content, limit)
Comment on lines +197 to +219
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor | ⚡ Quick win

Reject unknown/missing file extensions instead of defaulting to CSV.

Line 219 defaults to CSV parsing for non-.xlsx values. If metadata is missing/incorrect, preview can return garbage instead of a clear 422.

Suggested fix
-    file_ext = (dataset.dataset_metadata or {}).get("file_extension")
+    file_ext = ((dataset.dataset_metadata or {}).get("file_extension") or "").lower()
     if file_ext == ".xls":
         raise HTTPException(
             status_code=422,
             detail="Legacy Excel format (.xls) is not supported.",
         )
+    if file_ext not in {".csv", ".xlsx"}:
+        raise HTTPException(
+            status_code=422,
+            detail="Unsupported or missing dataset file extension for preview.",
+        )
...
-        if file_ext == ".xlsx":
+        if file_ext == ".xlsx":
             return _preview_excel(content, limit)
         return _preview_csv(content, limit)
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
file_ext = (dataset.dataset_metadata or {}).get("file_extension")
if file_ext == ".xls":
raise HTTPException(
status_code=422,
detail="Legacy Excel format (.xls) is not supported.",
)
storage = get_cloud_storage(session=session, project_id=project_id)
try:
content = storage.get(dataset.object_store_url)
except Exception as e:
logger.warning(
f"[preview_dataset] Failed to fetch file | dataset_id={dataset.id} | {e}",
exc_info=True,
)
raise HTTPException(
status_code=502, detail="Failed to fetch dataset file from storage."
) from e
try:
if file_ext == ".xlsx":
return _preview_excel(content, limit)
return _preview_csv(content, limit)
file_ext = ((dataset.dataset_metadata or {}).get("file_extension") or "").lower()
if file_ext == ".xls":
raise HTTPException(
status_code=422,
detail="Legacy Excel format (.xls) is not supported.",
)
if file_ext not in {".csv", ".xlsx"}:
raise HTTPException(
status_code=422,
detail="Unsupported or missing dataset file extension for preview.",
)
storage = get_cloud_storage(session=session, project_id=project_id)
try:
content = storage.get(dataset.object_store_url)
except Exception as e:
logger.warning(
f"[preview_dataset] Failed to fetch file | dataset_id={dataset.id} | {e}",
exc_info=True,
)
raise HTTPException(
status_code=502, detail="Failed to fetch dataset file from storage."
) from e
try:
if file_ext == ".xlsx":
return _preview_excel(content, limit)
return _preview_csv(content, limit)
🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@backend/app/services/assessment/dataset.py` around lines 197 - 219, The
current preview logic defaults to CSV for any non-".xlsx" file_ext which can
silently mis-handle missing/invalid metadata; update the preview path in the
preview function (where file_ext is derived) to validate file_ext explicitly
(normalize with .lower() and strip), and only allow known extensions like
".xlsx" and ".csv"; if file_ext is None or not in the allowed set, raise
HTTPException(status_code=422, detail="Unsupported or missing file extension.")
instead of calling _preview_csv, otherwise call _preview_excel for ".xlsx" and
_preview_csv for ".csv".

except InvalidFileException as e:
raise HTTPException(status_code=422, detail="Invalid XLSX file content.") from e
except Exception as e:
logger.warning(
f"[preview_dataset] Failed to parse file | dataset_id={dataset.id} | {e}",
exc_info=True,
)
raise HTTPException(
status_code=422, detail="Unable to parse dataset file for preview."
) from e


def upload_dataset(
session: Session,
file_content: bytes,
Expand Down
2 changes: 1 addition & 1 deletion backend/app/tests/assessment/test_batch.py
Original file line number Diff line number Diff line change
Expand Up @@ -419,7 +419,7 @@ def test_build_openai_and_google_jsonl(self) -> None:
google_params={"temperature": 0.2, "instructions": "system"},
)
assert len(google_jsonl) == 1
assert google_jsonl[0]["metadata"]["key"] == "row_0"
assert google_jsonl[0]["key"] == "row_0"
assert google_jsonl[0]["request"]["systemInstruction"] == {
"parts": [{"text": "system"}]
}
Loading