From 61bd8f3b8922ff5a3b2cc36004d715fb6c2273c7 Mon Sep 17 00:00:00 2001
From: Alex Jones <alex.jones@lowrisc.org>
Date: Thu, 30 Apr 2026 15:48:18 +0100
Subject: [PATCH 1/2] fix: timed out jobs now go through
 RuntimeBackend._finish_job

This fixes an issue inherited from the old base launcher in DVSim, but
only for the new runtime backend interface (i.e. not backported to the
launchers, due to their fragility).

Jobs that are killed after they time out directly emit a completion
event instead of going through the regular `_finish_job` call, which
mean that we do not get the job runtime / simulated time reported in the
final completed spec, which is not correct.

To alleviate this issue, change the LocalRuntimeBackend logic to always
go through this `RuntimeBackend._finish_job` interface, where an exit
code of `None` is now used to denote the fact that the process did not
finish naturally and instead timed out. This case is the first thing
checked before any log statuses, and so will always be reported as a
timeout failure without the different backends needing to implement it
manually (which should reduce some duplication in the future, and
potential for inconsistencies in the failure messages).

Signed-off-by: Alex Jones <alex.jones@lowrisc.org>
---
 src/dvsim/runtime/backend.py | 17 ++++++++++++++++-
 src/dvsim/runtime/local.py   | 10 +++++-----
 2 files changed, 21 insertions(+), 6 deletions(-)

diff --git a/src/dvsim/runtime/backend.py b/src/dvsim/runtime/backend.py
index 1c6f355d..7f22203d 100644
--- a/src/dvsim/runtime/backend.py
+++ b/src/dvsim/runtime/backend.py
@@ -218,11 +218,21 @@ def _prepare_launch(self, job: JobSpec) -> None:
         self._make_job_output_directory(job)
 
     def _finish_job(
-        self, handle: JobHandle, exit_code: int, runtime: float | None
+        self, handle: JobHandle, exit_code: int | None, runtime: float | None
     ) -> tuple[JobStatus, JobStatusInfo | None]:
         """Determine the outcome of a job that ran to completion, and parse extra log info.
 
         Updates the handle with any extracted job runtime & simulation time info.
+
+        Args:
+            handle: The handle to the job that finished.
+            exit_code: The exit code if the job finished gracefully, or None if it was terminated.
+            runtime: The amount of time taken to complete the job, if known.
+
+        Returns:
+            A tuple containing the final job status and optionally an additional context/reason
+            object describing the job completion.
+
         """
         if handle.spec.dry_run:
             return JobStatus.PASSED, None
@@ -241,6 +251,11 @@ def _finish_job(
             handle.simulated_time.set(*simulated_time.get())
 
         # Determine the final status from the logs and exit code.
+        if exit_code is None:
+            return JobStatus.KILLED, JobStatusInfo(
+                message=f"Job timed out after {handle.spec.timeout_mins} minutes"
+            )
+
         status, reason = log_results.get_status_from_logs()
         if status is not None:
             return status, reason
diff --git a/src/dvsim/runtime/local.py b/src/dvsim/runtime/local.py
index 8d7a128f..8fe4efbd 100644
--- a/src/dvsim/runtime/local.py
+++ b/src/dvsim/runtime/local.py
@@ -109,19 +109,17 @@ async def _monitor_job(self, handle: LocalJobHandle) -> None:
         ]
         status = JobStatus.KILLED
         reason = None
+        exit_code = None
 
         try:
             exit_code = await asyncio.wait_for(
                 handle.process.wait(), timeout=handle.spec.timeout_secs
             )
-            runtime = time.monotonic() - handle.start_time
-            status, reason = self._finish_job(handle, exit_code, runtime)
         except asyncio.TimeoutError:
             await self._kill_job(handle)
-            status = JobStatus.KILLED
-            timeout_message = f"Job timed out after {handle.spec.timeout_mins} minutes"
-            reason = JobStatusInfo(message=timeout_message)
         finally:
+            runtime = time.monotonic() - handle.start_time
+
             # Explicitly cancel reader tasks and wait for them to finish before closing the log
             # file. We first give them a second to finish naturally to reduce log loss.
             with contextlib.suppress(asyncio.TimeoutError):
@@ -131,6 +129,8 @@ async def _monitor_job(self, handle: LocalJobHandle) -> None:
                     task.cancel()
             await asyncio.gather(*reader_tasks, return_exceptions=True)
 
+            status, reason = self._finish_job(handle, exit_code, runtime)
+
             if handle.log_file:
                 handle.log_file.close()
             if handle.kill_requested:

From e9a0d72747e5e4bad90bf3c05db7b95d593b2cdc Mon Sep 17 00:00:00 2001
From: Alex Jones <alex.jones@lowrisc.org>
Date: Thu, 30 Apr 2026 15:52:47 +0100
Subject: [PATCH 2/2] fix: local job timeout reported as job being killed

This was a small bug introduced in the new local runtime backend.
Originally the `_kill_job` method was just used by `_kill_many` (for
when a SIGINT/SIGTERM is sent to the scheduler), so I used this
`kill_requested` flag on the handle to determine when to override the
kill reason.

However, I then reused this function for the timeout case to kill the
ongoing job, without realising that this flag was still being set in
that function. In reality, to fix this issue we should just set it
directly in `kill_many` to denote that we are manually trying to kill a
job.

Signed-off-by: Alex Jones <alex.jones@lowrisc.org>
---
 src/dvsim/runtime/local.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/dvsim/runtime/local.py b/src/dvsim/runtime/local.py
index 8fe4efbd..e01e82fe 100644
--- a/src/dvsim/runtime/local.py
+++ b/src/dvsim/runtime/local.py
@@ -322,7 +322,6 @@ async def _kill_job(self, handle: LocalJobHandle) -> None:
             return
 
         if proc.returncode is None:
-            handle.kill_requested = True
             try:
                 self._send_kill_signal(proc, signal.SIGTERM)
             except ProcessLookupError:
@@ -366,6 +365,7 @@ async def kill_many(self, handles: Iterable[JobHandle]) -> None:
                 msg = f"Local backend expected handle of type LocalJobHandle, not `{type(handle)}`."
                 raise TypeError(msg)
             if handle.process and not handle.kill_requested and handle.process.returncode is None:
+                handle.kill_requested = True
                 tasks.append(asyncio.create_task(self._kill_job(handle)))
 
         if tasks: