Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions backend/app/api/docs/assessment/get_dataset.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
Get a single assessment dataset by ID.

Optionally include a signed URL to download the original uploaded file.

Pass `limit_rows=N` (1-100) to additionally include a lightweight preview
of the dataset's column headers and the first N data rows. When omitted,
the underlying file is not fetched and the response stays small.
37 changes: 35 additions & 2 deletions backend/app/api/routes/assessment/datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,14 @@
get_assessment_dataset_by_id,
list_assessment_datasets,
)
from app.models.assessment import AssessmentDatasetResponse
from app.models.assessment import (
AssessmentDatasetPreview,
AssessmentDatasetResponse,
)
from app.models.evaluation import EvaluationDataset
from app.services.assessment.dataset import (
preview_dataset as preview_assessment_dataset,
)
from app.services.assessment.dataset import upload_dataset as upload_assessment_dataset
from app.services.assessment.validators import validate_dataset_file
from app.utils import APIResponse, load_description
Expand All @@ -26,6 +32,7 @@
def _dataset_to_response(
dataset: EvaluationDataset,
signed_url: str | None = None,
preview: AssessmentDatasetPreview | None = None,
) -> AssessmentDatasetResponse:
metadata = dataset.dataset_metadata or {}
return AssessmentDatasetResponse(
Expand All @@ -36,6 +43,7 @@ def _dataset_to_response(
file_extension=metadata.get("file_extension"),
object_store_url=dataset.object_store_url,
signed_url=signed_url,
preview=preview,
)


Expand Down Expand Up @@ -111,6 +119,16 @@ def get_dataset(
include_signed_url: bool = Query(
False, description="Include a signed URL for downloading the raw file from S3"
),
limit_rows: int
| None = Query(
None,
ge=1,
le=100,
description=(
"If set, fetch the underlying file and include a preview of the first "
"N data rows plus column headers. Skip to avoid the file download."
),
),
) -> APIResponse[AssessmentDatasetResponse]:
"""Get a specific assessment dataset."""
dataset = get_assessment_dataset_by_id(
Expand All @@ -127,8 +145,23 @@ def get_dataset(
)
signed_url = storage.get_signed_url(dataset.object_store_url)

preview: AssessmentDatasetPreview | None = None
if limit_rows is not None:
headers, rows = preview_assessment_dataset(
session=session,
dataset=dataset,
project_id=auth_context.project_.id,
limit=limit_rows,
)
preview = AssessmentDatasetPreview(
headers=headers,
rows=rows,
returned_rows=len(rows),
truncated=len(rows) >= limit_rows,
)
Comment thread
Ayush8923 marked this conversation as resolved.

return APIResponse.success_response(
data=_dataset_to_response(dataset, signed_url=signed_url)
data=_dataset_to_response(dataset, signed_url=signed_url, preview=preview)
)


Expand Down
2 changes: 1 addition & 1 deletion backend/app/crud/assessment/batch.py
Original file line number Diff line number Diff line change
Expand Up @@ -332,7 +332,7 @@ def build_google_jsonl(

jsonl_data.append(
{
"metadata": {"key": f"row_{idx}"},
"key": f"row_{idx}",
"request": request,
}
)
Expand Down
10 changes: 10 additions & 0 deletions backend/app/models/assessment.py
Original file line number Diff line number Diff line change
Expand Up @@ -333,6 +333,15 @@ class AssessmentExportRow(BaseModel):
updated_at: datetime


class AssessmentDatasetPreview(BaseModel):
    """Lightweight preview of a dataset's columns and first N rows.

    Returned by the dataset detail endpoint when a row limit is requested;
    cell values are pre-stringified by the preview service (None becomes "").
    """

    # Column names taken from the file's header row (empty list if none).
    headers: list[str]
    # First N non-blank data rows; each inner list aligns with `headers`.
    rows: list[list[str]]
    # Number of rows actually included in `rows`.
    returned_rows: int = 0
    # True when `rows` reached the requested limit (the file may hold more rows).
    truncated: bool = False


class AssessmentDatasetResponse(BaseModel):
"""Response model for assessment dataset."""

Expand All @@ -343,3 +352,4 @@ class AssessmentDatasetResponse(BaseModel):
file_extension: str | None = None
object_store_url: str | None = None
signed_url: str | None = None
preview: AssessmentDatasetPreview | None = None
104 changes: 104 additions & 0 deletions backend/app/services/assessment/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,110 @@ def _count_rows(content: bytes, file_ext: str) -> int:
return _count_csv_rows(content)


def _stringify(value: object) -> str:
if value is None:
return ""
return str(value)


def _preview_csv(content: bytes, limit: int) -> tuple[list[str], list[list[str]]]:
    """Decode CSV bytes and return (headers, first `limit` non-blank data rows)."""
    text: str | None = None
    # Try common encodings in order; BOM-aware utf-8 first.
    for candidate in ("utf-8-sig", "utf-8", "latin-1"):
        try:
            text = content.decode(candidate)
        except (UnicodeDecodeError, ValueError):
            continue
        else:
            break
    if text is None:
        # Defensive fallback: never let decoding itself fail.
        text = content.decode("utf-8", errors="replace")

    reader = csv.reader(io.StringIO(text))
    first_row = next(reader, None)
    column_names = [_stringify(cell) for cell in (first_row or [])]

    preview: list[list[str]] = []
    for record in reader:
        # Skip rows whose every cell is empty or whitespace-only.
        if not any(cell.strip() for cell in record):
            continue
        preview.append([_stringify(cell) for cell in record])
        if len(preview) >= limit:
            break
    return column_names, preview


def _preview_excel(content: bytes, limit: int) -> tuple[list[str], list[list[str]]]:
    """Parse XLSX bytes and return (headers, first `limit` non-empty data rows)."""
    import openpyxl

    workbook = None
    try:
        workbook = openpyxl.load_workbook(
            io.BytesIO(content), read_only=True, data_only=True
        )
        sheet = workbook.active
        if sheet is None:
            return [], []

        value_rows = sheet.iter_rows(values_only=True)
        header_row = next(value_rows, None) or ()
        column_names = [_stringify(cell) for cell in header_row]

        preview: list[list[str]] = []
        for record in value_rows:
            # Skip rows that are entirely empty (every cell is None).
            if not record or not any(cell is not None for cell in record):
                continue
            preview.append([_stringify(cell) for cell in record])
            if len(preview) >= limit:
                break
        return column_names, preview
    finally:
        # read_only workbooks keep the underlying zip handle open until closed.
        if workbook is not None:
            workbook.close()


def preview_dataset(
    session: Session,
    dataset: EvaluationDataset,
    project_id: int,
    limit: int,
) -> tuple[list[str], list[list[str]]]:
    """Return the column headers and first `limit` data rows of a dataset file.

    Args:
        session: DB session used to resolve the project's cloud storage.
        dataset: Dataset row whose stored file should be previewed.
        project_id: Project whose storage configuration is used for the fetch.
        limit: Maximum number of data rows (excluding the header) to return.

    Raises:
        HTTPException: 404 if the dataset has no stored file, 422 for
            unsupported or unparseable file content, 502 if the storage
            fetch fails.
    """
    if not dataset.object_store_url:
        raise HTTPException(
            status_code=404, detail="Dataset has no underlying file to preview."
        )

    file_ext = (dataset.dataset_metadata or {}).get("file_extension")
    if file_ext == ".xls":
        # Fail fast before downloading: the legacy binary format is unsupported.
        raise HTTPException(
            status_code=422,
            detail="Legacy Excel format (.xls) is not supported.",
        )

    storage = get_cloud_storage(session=session, project_id=project_id)
    try:
        content = storage.get(dataset.object_store_url)
    except Exception as e:
        # Lazy %-style args: message is only formatted if the record is emitted.
        logger.warning(
            "[preview_dataset] Failed to fetch file | dataset_id=%s | %s",
            dataset.id,
            e,
            exc_info=True,
        )
        raise HTTPException(
            status_code=502, detail="Failed to fetch dataset file from storage."
        ) from e

    try:
        if file_ext == ".xlsx":
            return _preview_excel(content, limit)
        # Everything else (.csv and unknown extensions) is parsed as CSV.
        return _preview_csv(content, limit)
    except InvalidFileException as e:
        raise HTTPException(status_code=422, detail="Invalid XLSX file content.") from e
    except Exception as e:
        logger.warning(
            "[preview_dataset] Failed to parse file | dataset_id=%s | %s",
            dataset.id,
            e,
            exc_info=True,
        )
        raise HTTPException(
            status_code=422, detail="Unable to parse dataset file for preview."
        ) from e


def upload_dataset(
session: Session,
file_content: bytes,
Expand Down
2 changes: 1 addition & 1 deletion backend/app/tests/assessment/test_batch.py
Original file line number Diff line number Diff line change
Expand Up @@ -419,7 +419,7 @@ def test_build_openai_and_google_jsonl(self) -> None:
google_params={"temperature": 0.2, "instructions": "system"},
)
assert len(google_jsonl) == 1
assert google_jsonl[0]["metadata"]["key"] == "row_0"
assert google_jsonl[0]["key"] == "row_0"
assert google_jsonl[0]["request"]["systemInstruction"] == {
"parts": [{"text": "system"}]
}
Loading