Skip to content
4 changes: 4 additions & 0 deletions src/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,10 @@ def load_routing_configuration(file: Path = _config_file) -> TomlTable:
return typing.cast("TomlTable", _load_configuration(file)["routing"])


def load_run_configuration(file: Path = _config_file) -> TomlTable:
return typing.cast("TomlTable", _load_configuration(file).get("run", {}))
Comment thread
PGijsbers marked this conversation as resolved.
Outdated


@functools.cache
def load_database_configuration(file: Path = _config_file) -> TomlTable:
configuration = _load_configuration(file)
Expand Down
3 changes: 3 additions & 0 deletions src/config.toml
Original file line number Diff line number Diff line change
Expand Up @@ -37,3 +37,6 @@ database="openml"
[routing]
minio_url="http://minio:9000/"
server_url="http://php-api:80/"

[run]
evaluation_engine_ids = [1]
2 changes: 1 addition & 1 deletion src/database/flows.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ async def get(id_: int, expdb: AsyncConnection) -> Row | None:
row = await expdb.execute(
text(
"""
SELECT *, uploadDate as upload_date
SELECT *, uploadDate as upload_date, fullName AS full_name
FROM implementation
WHERE id = :flow_id
""",
Expand Down
179 changes: 179 additions & 0 deletions src/database/runs.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,185 @@ async def exist(id_: int, expdb: AsyncConnection) -> bool:
return bool(row.one_or_none())


async def get(run_id: int, expdb: AsyncConnection) -> Row | None:
"""Fetch the core run row from the `run` table.

Returns the row if found, or None if no run with `run_id` exists.
The `error_message` column is NULL when the run completed without errors.
"""
row = await expdb.execute(
text(
"""
SELECT `rid`, `uploader`, `setup`, `task_id`, `error_message`
FROM `run`
WHERE `rid` = :run_id
""",
),
parameters={"run_id": run_id},
)
return row.one_or_none()


async def get_uploader_name(uploader_id: int, userdb: AsyncConnection) -> str | None:
"""Fetch the display name of a user from the openml database.

Queries the `users` table in the separate openml DB and concatenates
first_name + ' ' + last_name. Returns None if the user does not exist.
"""
row = await userdb.execute(
text(
"""
SELECT CONCAT(`first_name`, ' ', `last_name`) AS `name`
FROM `users`
WHERE `id` = :uploader_id
""",
),
parameters={"uploader_id": uploader_id},
)
result = row.one_or_none()
return result.name if result else None
Comment thread
coderabbitai[bot] marked this conversation as resolved.
Outdated

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we could rather extend the database.users.User class to keep around more information than just the identifier, instead of creating separate method to fetch different attributes.

  • extend the User class with first_name and last_name attribute, and a full_name property.
  • refactor get_user_id_for to return a user:
    • rename to get_user
    • allow fetching by either apikey or id
    • return an instantiated User.

then you can use that method instead.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is really a good thought, will extend User class and use it.



async def get_tags(run_id: int, expdb: AsyncConnection) -> list[str]:
"""Fetch all tags associated with a run from the `run_tag` table.

The `id` column in `run_tag` refers to the run ID
"""
rows = await expdb.execute(
text(
"""
SELECT `tag`
FROM `run_tag`
WHERE `id` = :run_id
""",
),
parameters={"run_id": run_id},
)
return [row.tag for row in rows.all()]


async def get_input_data(run_id: int, expdb: AsyncConnection) -> list[Row]:
"""Fetch the dataset(s) used as input for a run, with name and url.

Joins `input_data` with `dataset` to include the dataset name and ARFF URL.
"""
rows = await expdb.execute(
text(
"""
SELECT `id`.`data` AS `did`, `d`.`name`, `d`.`url`
FROM `input_data` `id`
JOIN `dataset` `d` ON `id`.`data` = `d`.`did`
WHERE `id`.`run` = :run_id
""",
),
parameters={"run_id": run_id},
)
return cast("list[Row]", rows.all())


async def get_output_files(run_id: int, expdb: AsyncConnection) -> list[Row]:
"""Fetch output files attached to a run from the `runfile` table.

Typical entries include the description XML and predictions ARFF.
The `field` column holds the file label (e.g. "description", "predictions").

Note: the PHP response includes a deprecated `did` field hardcoded to "-1"
for each file. This implementation omits it entirely.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a detail of the API layer, not the database layer. Can move the comment there.

"""
rows = await expdb.execute(
text(
"""
SELECT `file_id`, `field`
FROM `runfile`
WHERE `source` = :run_id
""",
),
parameters={"run_id": run_id},
)
return cast("list[Row]", rows.all())


async def get_evaluations(
run_id: int,
expdb: AsyncConnection,
*,
evaluation_engine_ids: list[int],
) -> list[Row]:
"""Fetch evaluation metric results for a run.

Joins `evaluation` with `math_function` to resolve the metric name
(the `evaluation` table stores only a `function_id`, not the name directly).

Filters by `evaluation_engine_id IN (...)`. The list is configurable
via `config.toml [run] evaluation_engine_ids`.
Dynamic named parameters are used for aiomysql compatibility.
"""
if not evaluation_engine_ids:
return []

# Build :eid_0, :eid_1, ... placeholders — one per engine ID.
eid_params = {f"eid_{i}": eid for i, eid in enumerate(evaluation_engine_ids)}
placeholders = ", ".join(f":eid_{i}" for i in range(len(evaluation_engine_ids)))

query = text(
f"""
SELECT `m`.`name`, `e`.`value`, `e`.`array_data`
FROM `evaluation` `e`
JOIN `math_function` `m` ON `e`.`function_id` = `m`.`id`
WHERE `e`.`source` = :run_id
AND `e`.`evaluation_engine_id` IN ({placeholders})
""", # noqa: S608 # placeholders are trusted integer params, not user input
)
rows = await expdb.execute(
query,
parameters={"run_id": run_id, **eid_params},
)
return cast("list[Row]", rows.all())


async def get_task_type(task_id: int, expdb: AsyncConnection) -> str | None:
"""Fetch the human-readable task type name for the task associated with a run.

Joins `task` and `task_type` on `ttid` to resolve the name
(e.g. "Supervised Classification").
"""
row = await expdb.execute(
text(
"""
SELECT `tt`.`name`
FROM `task` `t`
JOIN `task_type` `tt` ON `t`.`ttid` = `tt`.`ttid`
WHERE `t`.`task_id` = :task_id
""",
),
parameters={"task_id": task_id},
)
result = row.one_or_none()
return result.name if result else None


async def get_task_evaluation_measure(task_id: int, expdb: AsyncConnection) -> str | None:
"""Fetch the evaluation measure configured for a task, if any.

Queries `task_inputs` for the row where `input = 'evaluation_measures'`.
Returns None (not an empty string) when no such row exists, so callers
can treat a falsy result uniformly.
"""
row = await expdb.execute(
text(
"""
SELECT `value`
FROM `task_inputs`
WHERE `task_id` = :task_id
AND `input` = 'evaluation_measures'
""",
),
parameters={"task_id": task_id},
)
result = row.one_or_none()
return result.value if result else None

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please move these to the tasks module. I will have a look at consolidating these if appropriate.



async def get_trace(run_id: int, expdb: AsyncConnection) -> Sequence[Row]:
"""Get trace rows for a run from the trace table."""
rows = await expdb.execute(
Expand Down
140 changes: 137 additions & 3 deletions src/routers/openml/runs.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,29 @@
"""Endpoints for run-related data."""

from typing import Annotated
import asyncio
from typing import TYPE_CHECKING, Annotated, cast

from fastapi import APIRouter, Depends

if TYPE_CHECKING:
from sqlalchemy import Row
from sqlalchemy.ext.asyncio import AsyncConnection

import config
import database.flows
import database.runs
import database.setups
from core.errors import RunNotFoundError, RunTraceNotFoundError
from routers.dependencies import expdb_connection
from schemas.runs import RunTrace, TraceIteration
from routers.dependencies import expdb_connection, userdb_connection
from schemas.runs import (
EvaluationScore,
InputDataset,
OutputFile,
ParameterSetting,
Run,
RunTrace,
TraceIteration,
)

router = APIRouter(prefix="/run", tags=["run"])

Expand Down Expand Up @@ -42,3 +57,122 @@ async def get_run_trace(
for row in trace_rows
],
)


@router.post(
path="/{run_id}",
description="Provided for convenience, same as `GET` endpoint.",
response_model_exclude_none=True,
)
@router.get("/{run_id}", response_model_exclude_none=True)
async def get_run(
Comment thread
sourcery-ai[bot] marked this conversation as resolved.
Outdated
run_id: int,
expdb: Annotated[AsyncConnection, Depends(expdb_connection)],
userdb: Annotated[AsyncConnection, Depends(userdb_connection)],
) -> Run:
"""Get full metadata for a run by ID.

No authentication or visibility check is performed — all runs are
publicly accessible.
"""
# Core run record — all other data depends on uploader, setup, and task_id.
run = await database.runs.get(run_id, expdb)
if run is None:
msg = f"Run {run_id} not found."
# Reuse RunNotFoundError and pass code=236 at the call site for
# backward compat with the PHP GET /run/{id} error code
raise RunNotFoundError(msg, code=236)

# Evaluation engine IDs come from config.toml [run] so they can be
# extended when a new evaluation engine is deployed, without code changes.
engine_ids: list[int] = config.load_run_configuration().get("evaluation_engine_ids", [1])

# Fetch all independent data concurrently.
(
uploader_name,
tags,
input_data_rows,
output_file_rows,
evaluation_rows,
task_type,
task_evaluation_measure,
setup,
parameter_rows,
) = cast(
"tuple[str | None, list[str], list[Row], list[Row], list[Row], str | None, str"
"| None, Row | None, list[Row]]",
await asyncio.gather(
database.runs.get_uploader_name(run.uploader, userdb),
database.runs.get_tags(run_id, expdb),
database.runs.get_input_data(run_id, expdb),
database.runs.get_output_files(run_id, expdb),
database.runs.get_evaluations(run_id, expdb, evaluation_engine_ids=engine_ids),
database.runs.get_task_type(run.task_id, expdb),
database.runs.get_task_evaluation_measure(run.task_id, expdb),
database.setups.get(run.setup, expdb),
database.setups.get_parameters(run.setup, expdb),
Comment thread
PGijsbers marked this conversation as resolved.
),
)

# Flow is fetched after the gather because it requires setup.implementation_id.
# flows.get() selects fullName AS full_name for reliable case-insensitive access.
flow = await database.flows.get(setup.implementation_id, expdb) if setup else None

# Build parameter_setting list from the denormalised parameter rows
# returned by database.setups.get_parameters (which already JOINs input + implementation).
parameter_settings = [
ParameterSetting(
name=p["name"],
value=p["value"],
component=p["flow_id"], # implementation_id of the sub-flow owning this param
)
for p in parameter_rows
]

input_datasets = [
InputDataset(did=row.did, name=row.name, url=row.url) for row in input_data_rows
]

# runfile.field is the file label (e.g. "description", "predictions")
output_files = [OutputFile(file_id=row.file_id, name=row.field) for row in output_file_rows]

evaluations = [
EvaluationScore(
name=row.name,
# Whole-number floats (e.g. counts) are converted to int to match PHP's
# integer representation. e.g. 253.0 → 253, 0.0 → 0.
value=int(row.value)
if isinstance(row.value, float) and row.value.is_integer()
else row.value,
Comment thread
sourcery-ai[bot] marked this conversation as resolved.
Outdated
array_data=row.array_data,
)
for row in evaluation_rows
]

# Normalise task_evaluation_measure: empty string → None so the field is
# excluded entirely by response_model_exclude_none=True (matches PHP behaviour
# of returning "" but we opt to omit rather than return an empty string).
normalised_measure = task_evaluation_measure or None

# error_message is NULL in the DB when the run has no error.
# The PHP response returns an empty array [] in that case.
error_messages = [run.error_message] if run.error_message else []

return Run(
run_id=run_id,
uploader=run.uploader,
uploader_name=uploader_name,
task_id=run.task_id,
task_type=task_type,
task_evaluation_measure=normalised_measure,
flow_id=setup.implementation_id if setup else 0,
flow_name=flow.full_name if flow else None,
setup_id=run.setup,
setup_string=setup.setup_string if setup else None,
parameter_setting=parameter_settings,
error_message=error_messages,
tag=tags,
# Preserve PHP envelope structure for backward compat
input_data={"dataset": input_datasets},
output_data={"file": output_files, "evaluation": evaluations},
)
Comment thread
coderabbitai[bot] marked this conversation as resolved.
Loading
Loading