From 61bd8f3b8922ff5a3b2cc36004d715fb6c2273c7 Mon Sep 17 00:00:00 2001 From: Alex Jones Date: Thu, 30 Apr 2026 15:48:18 +0100 Subject: [PATCH 1/2] fix: timed out jobs now go through RuntimeBackend._finish_job This fixes an issue inherited from the old base launcher in DVSim, but only for the new runtime backend interface (i.e. not backported to the launchers, due to their fragility). Jobs that are killed after they time out directly emit a completion event instead of going through the regular `_finish_job` call, which mean that we do not get the job runtime / simulated time reported in the final completed spec, which is not correct. To alleviate this issue, change the LocalRuntimeBackend logic to always go through this `RuntimeBackend._finish_job` interface, where an exit code of `None` is now used to denote the fact that the process did not finish naturally and instead timed out. This case is the first thing checked before any log statuses, and so will always be reported as a timeout failure without the different backends needing to implement it manually (which should reduce some duplication in the future, and potential for inconsistencies in the failure messages). Signed-off-by: Alex Jones --- src/dvsim/runtime/backend.py | 17 ++++++++++++++++- src/dvsim/runtime/local.py | 10 +++++----- 2 files changed, 21 insertions(+), 6 deletions(-) diff --git a/src/dvsim/runtime/backend.py b/src/dvsim/runtime/backend.py index 1c6f355d..7f22203d 100644 --- a/src/dvsim/runtime/backend.py +++ b/src/dvsim/runtime/backend.py @@ -218,11 +218,21 @@ def _prepare_launch(self, job: JobSpec) -> None: self._make_job_output_directory(job) def _finish_job( - self, handle: JobHandle, exit_code: int, runtime: float | None + self, handle: JobHandle, exit_code: int | None, runtime: float | None ) -> tuple[JobStatus, JobStatusInfo | None]: """Determine the outcome of a job that ran to completion, and parse extra log info. Updates the handle with any extracted job runtime & simulation time info. + + Args: + handle: The handle to the job that finished. + exit_code: The exit code if the job finished gracefully, or None if it was terminated. + runtime: The amount of time taken to complete the job, if known. + + Returns: + A tuple containing the final job status and optionally an additional context/reason + object describing the job completion. + """ if handle.spec.dry_run: return JobStatus.PASSED, None @@ -241,6 +251,11 @@ def _finish_job( handle.simulated_time.set(*simulated_time.get()) # Determine the final status from the logs and exit code. + if exit_code is None: + return JobStatus.KILLED, JobStatusInfo( + message=f"Job timed out after {handle.spec.timeout_mins} minutes" + ) + status, reason = log_results.get_status_from_logs() if status is not None: return status, reason diff --git a/src/dvsim/runtime/local.py b/src/dvsim/runtime/local.py index 8d7a128f..8fe4efbd 100644 --- a/src/dvsim/runtime/local.py +++ b/src/dvsim/runtime/local.py @@ -109,19 +109,17 @@ async def _monitor_job(self, handle: LocalJobHandle) -> None: ] status = JobStatus.KILLED reason = None + exit_code = None try: exit_code = await asyncio.wait_for( handle.process.wait(), timeout=handle.spec.timeout_secs ) - runtime = time.monotonic() - handle.start_time - status, reason = self._finish_job(handle, exit_code, runtime) except asyncio.TimeoutError: await self._kill_job(handle) - status = JobStatus.KILLED - timeout_message = f"Job timed out after {handle.spec.timeout_mins} minutes" - reason = JobStatusInfo(message=timeout_message) finally: + runtime = time.monotonic() - handle.start_time + # Explicitly cancel reader tasks and wait for them to finish before closing the log # file. We first give them a second to finish naturally to reduce log loss. with contextlib.suppress(asyncio.TimeoutError): @@ -131,6 +129,8 @@ async def _monitor_job(self, handle: LocalJobHandle) -> None: task.cancel() await asyncio.gather(*reader_tasks, return_exceptions=True) + status, reason = self._finish_job(handle, exit_code, runtime) + if handle.log_file: handle.log_file.close() if handle.kill_requested: From e9a0d72747e5e4bad90bf3c05db7b95d593b2cdc Mon Sep 17 00:00:00 2001 From: Alex Jones Date: Thu, 30 Apr 2026 15:52:47 +0100 Subject: [PATCH 2/2] fix: local job timeout reported as job being killed This was a small bug introduced in the new local runtime backend. Originally the `_kill_job` method was just used by `_kill_many` (for when a SIGINT/SIGTERM is sent to the scheduler), so I used this `kill_requested` flag on the handle to determine when to override the kill reason. However, I then reused this function for the timeout case to kill the ongoing job, without realising that this flag was still being set in that function. In reality, to fix this issue we should just set it directly in `kill_many` to denote that we are manually trying to kill a job. Signed-off-by: Alex Jones --- src/dvsim/runtime/local.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/dvsim/runtime/local.py b/src/dvsim/runtime/local.py index 8fe4efbd..e01e82fe 100644 --- a/src/dvsim/runtime/local.py +++ b/src/dvsim/runtime/local.py @@ -322,7 +322,6 @@ async def _kill_job(self, handle: LocalJobHandle) -> None: return if proc.returncode is None: - handle.kill_requested = True try: self._send_kill_signal(proc, signal.SIGTERM) except ProcessLookupError: @@ -366,6 +365,7 @@ async def kill_many(self, handles: Iterable[JobHandle]) -> None: msg = f"Local backend expected handle of type LocalJobHandle, not `{type(handle)}`." raise TypeError(msg) if handle.process and not handle.kill_requested and handle.process.returncode is None: + handle.kill_requested = True tasks.append(asyncio.create_task(self._kill_job(handle))) if tasks: