From 6118fee973dc213bd6d0853cc5046dd31eb92667 Mon Sep 17 00:00:00 2001
From: Simon Heybrock <simon.heybrock@ess.eu>
Date: Thu, 19 Mar 2026 13:30:22 +0000
Subject: [PATCH 01/16] Add adaptive message batching to reduce overhead under
 load
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Under sustained load the fixed per-batch overhead (workflow graph evaluation,
serialization, thread dispatch) dominates cycle time. AdaptiveMessageBatcher
wraps SimpleMessageBatcher and widens the batch window (1s → 2s → 4s → 8s)
when consecutive non-empty batches indicate the system cannot keep up, then
de-escalates when idle cycles show spare capacity. This is lossless — the
dashboard simply updates less frequently, but no data is dropped.

The current batch interval is reported in ServiceStatus and is always shown
in the backend status widget, next to the job and message counts.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 src/ess/livedata/core/job.py                  |   1 +
 src/ess/livedata/core/message_batcher.py      | 122 ++++++++++
 .../livedata/core/orchestrating_processor.py  |  15 +-
 .../widgets/backend_status_widget.py          |   5 +-
 src/ess/livedata/kafka/x5f2_compat.py         |   5 +
 tests/core/message_batcher_test.py            | 222 +++++++++++++++++-
 tests/kafka/status_message_test.py            |  14 ++
 7 files changed, 380 insertions(+), 4 deletions(-)

diff --git a/src/ess/livedata/core/job.py b/src/ess/livedata/core/job.py
index dc7d92f32..edbab5781 100644
--- a/src/ess/livedata/core/job.py
+++ b/src/ess/livedata/core/job.py
@@ -121,6 +121,7 @@ class ServiceStatus:
     active_job_count: int
     messages_processed: int
     error: str | None = None
+    batch_interval_s: float = 1.0
 
 
 def _add_time_coords(
diff --git a/src/ess/livedata/core/message_batcher.py b/src/ess/livedata/core/message_batcher.py
index 5cc925fb7..7b895e9a3 100644
--- a/src/ess/livedata/core/message_batcher.py
+++ b/src/ess/livedata/core/message_batcher.py
@@ -1,12 +1,17 @@
 # SPDX-License-Identifier: BSD-3-Clause
 # Copyright (c) 2025 Scipp contributors (https://github.com/scipp)
+import time
 from abc import ABC, abstractmethod
 from dataclasses import dataclass
 from numbers import Number
 from typing import Any
 
+import structlog
+
 from ess.livedata.core.message import Message
 
+logger = structlog.get_logger(__name__)
+
 
 @dataclass(slots=True, kw_only=True)
 class MessageBatch:
@@ -23,6 +28,26 @@ def batch(self, messages: list[Message[Any]]) -> MessageBatch | None:
         If no batch can be created (batch incomplete), return None.
         """
 
+    def report_batch(self, message_count: int | None) -> None:  # noqa: B027
+        """Report the outcome of the last processing cycle.
+
+        Called by the processor after each cycle. Batchers that support adaptive
+        behavior override this to adjust their batch length. The default is a
+        no-op.
+
+        Parameters
+        ----------
+        message_count:
+            Number of messages in the processed batch. ``None`` if the batcher
+            returned ``None`` (idle cycle). 0 indicates an empty batch from a
+            time gap.
+        """
+
+    @property
+    def batch_length_s(self) -> float:
+        """Current effective batch length in seconds."""
+        return 1.0
+
 
 class NaiveMessageBatcher(MessageBatcher):
     """
@@ -35,9 +60,14 @@ def __init__(
         self, batch_length_s: float = 1.0, pulse_length_s: float = 1.0 / 14
     ) -> None:
         # Batch length is currently ignored.
+        self._batch_length_s = batch_length_s
         self._batch_length_ns = int(batch_length_s * 1_000_000_000)
         self._pulse_length_ns = int(pulse_length_s * 1_000_000_000)
 
+    @property
+    def batch_length_s(self) -> float:
+        return self._batch_length_s
+
     def batch(self, messages: list[Message[Any]]) -> MessageBatch | None:
         # Filter messages with incompatible (broken) timestamps to avoid issues below.
         messages = [msg for msg in messages if isinstance(msg.timestamp, Number)]
@@ -81,10 +111,15 @@ class SimpleMessageBatcher(MessageBatcher):
     """
 
     def __init__(self, batch_length_s: float = 1.0) -> None:
+        self._batch_length_s_value = batch_length_s
         self._batch_length_ns = int(batch_length_s * 1_000_000_000)
         self._active_batch: MessageBatch | None = None
         self._future_messages: list[Message[Any]] = []
 
+    @property
+    def batch_length_s(self) -> float:
+        return self._batch_length_s_value
+
     def batch(self, messages: list[Message[Any]]) -> MessageBatch | None:
         # Filter messages with incompatible (broken) timestamps to avoid issues below.
         messages = [msg for msg in messages if isinstance(msg.timestamp, Number)]
@@ -143,3 +178,90 @@ def _split_messages(
         before = [msg for msg in messages if msg.timestamp < timestamp]
         after = [msg for msg in messages if msg.timestamp >= timestamp]
         return before, after
+
+
+ESCALATION_THRESHOLD = 5
+DEESCALATION_IDLE_WINDOWS = 3
+
+
+@dataclass(frozen=True)
+class AdaptiveBatcherState:
+    """State snapshot of an AdaptiveMessageBatcher for status reporting."""
+
+    level: int
+    batch_length_s: float
+
+
+class AdaptiveMessageBatcher(MessageBatcher):
+    """A message batcher that dynamically adjusts its batch length based on load.
+
+    Wraps a ``SimpleMessageBatcher`` and monitors the pattern of batch results
+    to detect sustained overload. When the system cannot keep up with the current
+    batch window, consecutive non-empty batches accumulate and the batcher
+    escalates to a longer window. When the system demonstrates spare capacity
+    (consecutive idle cycles), it de-escalates.
+
+    Each escalation level doubles the batch window from the base length.
+    """
+
+    def __init__(self, base_batch_length_s: float = 1.0, max_level: int = 3) -> None:
+        self._base_batch_length_s = base_batch_length_s
+        self._max_level = max_level
+        self._level = 0
+        self._consecutive_batches = 0
+        self._last_nonempty_batch_time: float | None = None
+        self._inner = SimpleMessageBatcher(batch_length_s=base_batch_length_s)
+
+    def batch(self, messages: list[Message[Any]]) -> MessageBatch | None:
+        return self._inner.batch(messages)
+
+    def report_batch(self, message_count: int | None) -> None:
+        if message_count is None:
+            # Idle cycle (batcher returned None). De-escalate only after being
+            # idle for multiple batch windows, not just multiple process-loop
+            # iterations. This prevents oscillation when the system barely keeps
+            # up: the short inter-batch gap at 0.1s sleep intervals would
+            # otherwise produce many rapid idle reports.
+            self._consecutive_batches = 0
+            if self._level > 0 and self._last_nonempty_batch_time is not None:
+                idle_s = time.monotonic() - self._last_nonempty_batch_time
+                idle_windows = idle_s / self.batch_length_s
+                if idle_windows >= DEESCALATION_IDLE_WINDOWS:
+                    self._set_level(self._level - 1)
+                    self._last_nonempty_batch_time = time.monotonic()
+        elif message_count == 0:
+            # Empty batch from time gap — not a load signal
+            pass
+        else:
+            # Non-empty batch
+            self._last_nonempty_batch_time = time.monotonic()
+            self._consecutive_batches += 1
+            if (
+                self._consecutive_batches >= ESCALATION_THRESHOLD
+                and self._level < self._max_level
+            ):
+                self._set_level(self._level + 1)
+                self._consecutive_batches = 0
+
+    def _set_level(self, new_level: int) -> None:
+        old_length = self.batch_length_s
+        self._level = new_level
+        new_length = self.batch_length_s
+        logger.warning(
+            'adaptive_batch_level_change',
+            old_batch_length_s=old_length,
+            new_batch_length_s=new_length,
+            level=new_level,
+        )
+        self._inner = SimpleMessageBatcher(batch_length_s=new_length)
+
+    @property
+    def batch_length_s(self) -> float:
+        return self._base_batch_length_s * (2**self._level)
+
+    @property
+    def state(self) -> AdaptiveBatcherState:
+        return AdaptiveBatcherState(
+            level=self._level,
+            batch_length_s=self.batch_length_s,
+        )
diff --git a/src/ess/livedata/core/orchestrating_processor.py b/src/ess/livedata/core/orchestrating_processor.py
index d871302c0..be043f588 100644
--- a/src/ess/livedata/core/orchestrating_processor.py
+++ b/src/ess/livedata/core/orchestrating_processor.py
@@ -28,7 +28,11 @@
     Tin,
     Tout,
 )
-from .message_batcher import MessageBatch, MessageBatcher, SimpleMessageBatcher
+from .message_batcher import (
+    AdaptiveMessageBatcher,
+    MessageBatch,
+    MessageBatcher,
+)
 
 logger = structlog.get_logger(__name__)
 
@@ -112,7 +116,7 @@ def __init__(
             job_factory=JobFactory(instrument=instrument), job_threads=job_threads
         )
         self._job_manager_adapter = JobManagerAdapter(job_manager=self._job_manager)
-        self._message_batcher = message_batcher or SimpleMessageBatcher()
+        self._message_batcher = message_batcher or AdaptiveMessageBatcher()
         self._config_processor = ConfigProcessor(
             job_manager_adapter=self._job_manager_adapter
         )
@@ -172,6 +176,7 @@ def process(self) -> None:
 
         message_batch = self._message_batcher.batch(data_messages)
         if message_batch is None:
+            self._report_batch(None)
             self._empty_batches += 1
             self._maybe_log_metrics()
             self._sink.publish_messages(result_messages)
@@ -220,6 +225,7 @@ def process(self) -> None:
             else:
                 valid_results.append(result)
 
+        self._report_batch(len(message_batch.messages))
         self._batches_processed += 1
         self._maybe_log_metrics()
 
@@ -248,6 +254,10 @@ def _report_status(self) -> None:
 
         self._sink.publish_messages(messages)
 
+    def _report_batch(self, message_count: int | None) -> None:
+        """Forward batch outcome to the batcher for adaptive behavior."""
+        self._message_batcher.report_batch(message_count)
+
     def _get_service_status(self, job_statuses: list[JobStatus]) -> ServiceStatus:
         """Get the current service status for heartbeat publishing."""
         return ServiceStatus(
@@ -259,6 +269,7 @@ def _get_service_status(self, job_statuses: list[JobStatus]) -> ServiceStatus:
             active_job_count=len(job_statuses),
             messages_processed=self._messages_processed,
             error=self._service_error,
+            batch_interval_s=self._message_batcher.batch_length_s,
         )
 
     def _maybe_log_metrics(self) -> None:
diff --git a/src/ess/livedata/dashboard/widgets/backend_status_widget.py b/src/ess/livedata/dashboard/widgets/backend_status_widget.py
index f079e6ad2..272b3c04d 100644
--- a/src/ess/livedata/dashboard/widgets/backend_status_widget.py
+++ b/src/ess/livedata/dashboard/widgets/backend_status_widget.py
@@ -190,7 +190,10 @@ def update(
         # Stats
         jobs_text = f"Jobs: {status.active_job_count}"
         msgs_text = f"Msgs: {_format_messages(status.messages_processed)}"
-        self._stats_pane.object = f"<span>{jobs_text} | {msgs_text}</span>"
+        batch_text = f"Batch: {status.batch_interval_s:.0f}s"
+        self._stats_pane.object = (
+            f"<span>{jobs_text} | {msgs_text} | {batch_text}</span>"
+        )
 
     def _calculate_uptime(self, started_at_ns: int) -> float:
         """Calculate uptime in seconds from started_at timestamp."""
diff --git a/src/ess/livedata/kafka/x5f2_compat.py b/src/ess/livedata/kafka/x5f2_compat.py
index 9617f77a0..a0d6a63b6 100644
--- a/src/ess/livedata/kafka/x5f2_compat.py
+++ b/src/ess/livedata/kafka/x5f2_compat.py
@@ -195,6 +195,9 @@ class ServiceStatusPayload(pydantic.BaseModel):
         description="Total messages processed since startup"
     )
     error: str | None = pydantic.Field(default=None, description="Error message if any")
+    batch_interval_s: float = pydantic.Field(
+        default=1.0, description="Current batch interval in seconds"
+    )
 
 
 class ServiceStatusJSON(pydantic.BaseModel):
@@ -280,6 +283,7 @@ def from_service_status(
                     active_job_count=status.active_job_count,
                     messages_processed=status.messages_processed,
                     error=status.error,
+                    batch_interval_s=status.batch_interval_s,
                 ),
             ),
         )
@@ -296,6 +300,7 @@ def to_service_status(self) -> ServiceStatus:
             active_job_count=message.active_job_count,
             messages_processed=message.messages_processed,
             error=message.error,
+            batch_interval_s=message.batch_interval_s,
         )
 
 
diff --git a/tests/core/message_batcher_test.py b/tests/core/message_batcher_test.py
index ce27bee24..de4765ee2 100644
--- a/tests/core/message_batcher_test.py
+++ b/tests/core/message_batcher_test.py
@@ -1,7 +1,14 @@
 # SPDX-License-Identifier: BSD-3-Clause
 # Copyright (c) 2025 Scipp contributors (https://github.com/scipp)
+from unittest.mock import patch
+
 from ess.livedata.core.message import Message, StreamId, StreamKind
-from ess.livedata.core.message_batcher import SimpleMessageBatcher
+from ess.livedata.core.message_batcher import (
+    DEESCALATION_IDLE_WINDOWS,
+    ESCALATION_THRESHOLD,
+    AdaptiveMessageBatcher,
+    SimpleMessageBatcher,
+)
 
 
 def make_message(timestamp_ns: int, value: str = "test") -> Message[str]:
@@ -385,3 +392,216 @@ def test_large_gap_single_call_returns_first_empty_batch(self):
         assert next_batch.start_time == 1000 + batch_length_ns
         assert next_batch.end_time == 1000 + 2 * batch_length_ns
         assert len(next_batch.messages) == 0
+
+
+class FakeClock:
+    """Fake monotonic clock for testing time-based de-escalation."""
+
+    def __init__(self, start: float = 0.0) -> None:
+        self.now = start
+
+    def __call__(self) -> float:
+        return self.now
+
+    def advance(self, seconds: float) -> None:
+        self.now += seconds
+
+
+class TestAdaptiveMessageBatcher:
+    def test_initial_state_is_level_zero(self):
+        batcher = AdaptiveMessageBatcher(base_batch_length_s=1.0, max_level=2)
+        assert batcher.state.level == 0
+        assert batcher.state.batch_length_s == 1.0
+
+    def test_delegates_to_inner_batcher(self):
+        batcher = AdaptiveMessageBatcher(base_batch_length_s=1.0)
+        msg = make_message(1000)
+        batch = batcher.batch([msg])
+        assert batch is not None
+        assert batch.messages == [msg]
+
+    def test_escalates_after_consecutive_non_empty_batches(self):
+        batcher = AdaptiveMessageBatcher(base_batch_length_s=1.0, max_level=2)
+
+        for _ in range(ESCALATION_THRESHOLD):
+            batcher.report_batch(100)
+
+        assert batcher.state.level == 1
+        assert batcher.state.batch_length_s == 2.0
+
+    def test_does_not_escalate_before_threshold(self):
+        batcher = AdaptiveMessageBatcher(base_batch_length_s=1.0, max_level=2)
+
+        for _ in range(ESCALATION_THRESHOLD - 1):
+            batcher.report_batch(100)
+
+        assert batcher.state.level == 0
+
+    def test_escalation_capped_at_max_level(self):
+        batcher = AdaptiveMessageBatcher(base_batch_length_s=1.0, max_level=2)
+
+        # Escalate to max
+        for _ in range(ESCALATION_THRESHOLD):
+            batcher.report_batch(100)
+        assert batcher.state.level == 1
+
+        for _ in range(ESCALATION_THRESHOLD):
+            batcher.report_batch(100)
+        assert batcher.state.level == 2
+
+        # Further non-empty batches should not exceed max
+        for _ in range(ESCALATION_THRESHOLD * 2):
+            batcher.report_batch(100)
+        assert batcher.state.level == 2
+        assert batcher.state.batch_length_s == 4.0
+
+    def test_deescalates_after_idle_duration(self):
+        clock = FakeClock()
+        batcher = AdaptiveMessageBatcher(base_batch_length_s=1.0, max_level=2)
+
+        with patch('ess.livedata.core.message_batcher.time.monotonic', clock):
+            # Escalate to level 1 (batch_length = 2s)
+            for _ in range(ESCALATION_THRESHOLD):
+                batcher.report_batch(100)
+            assert batcher.state.level == 1
+
+            # Idle for just under the threshold — no de-escalation
+            clock.advance(DEESCALATION_IDLE_WINDOWS * 2.0 - 0.1)
+            batcher.report_batch(None)
+            assert batcher.state.level == 1
+
+            # Cross the threshold
+            clock.advance(0.2)
+            batcher.report_batch(None)
+            assert batcher.state.level == 0
+
+    def test_does_not_deescalate_below_zero(self):
+        clock = FakeClock()
+        batcher = AdaptiveMessageBatcher(base_batch_length_s=1.0, max_level=2)
+
+        with patch('ess.livedata.core.message_batcher.time.monotonic', clock):
+            # Many idle cycles at level 0 should stay at 0
+            clock.advance(100.0)
+            batcher.report_batch(None)
+            assert batcher.state.level == 0
+
+    def test_idle_cycle_resets_consecutive_batches(self):
+        batcher = AdaptiveMessageBatcher(base_batch_length_s=1.0, max_level=2)
+
+        # Almost reach escalation threshold
+        for _ in range(ESCALATION_THRESHOLD - 1):
+            batcher.report_batch(100)
+
+        # One idle cycle resets
+        batcher.report_batch(None)
+
+        # Need full threshold again
+        for _ in range(ESCALATION_THRESHOLD - 1):
+            batcher.report_batch(100)
+        assert batcher.state.level == 0
+
+    def test_non_empty_batch_resets_idle_timer(self):
+        clock = FakeClock()
+        batcher = AdaptiveMessageBatcher(base_batch_length_s=1.0, max_level=2)
+
+        with patch('ess.livedata.core.message_batcher.time.monotonic', clock):
+            # Escalate to level 1 (batch_length = 2s)
+            for _ in range(ESCALATION_THRESHOLD):
+                batcher.report_batch(100)
+            assert batcher.state.level == 1
+
+            # Almost reach de-escalation time
+            clock.advance(DEESCALATION_IDLE_WINDOWS * 2.0 - 0.1)
+            batcher.report_batch(None)
+            assert batcher.state.level == 1
+
+            # A non-empty batch resets the idle timer
+            batcher.report_batch(100)
+
+            # Now need the full idle duration again
+            clock.advance(DEESCALATION_IDLE_WINDOWS * 2.0 - 0.1)
+            batcher.report_batch(None)
+            assert batcher.state.level == 1
+
+    def test_empty_batches_excluded_from_both_counters(self):
+        batcher = AdaptiveMessageBatcher(base_batch_length_s=1.0, max_level=2)
+
+        # Interleave empty batches (count=0) with non-empty — should not reset
+        for _ in range(ESCALATION_THRESHOLD - 1):
+            batcher.report_batch(100)
+            batcher.report_batch(0)  # empty batch from time gap
+
+        batcher.report_batch(100)
+        assert batcher.state.level == 1
+
+    def test_empty_batches_do_not_contribute_to_escalation(self):
+        batcher = AdaptiveMessageBatcher(base_batch_length_s=1.0, max_level=2)
+
+        # Only empty batches should not escalate
+        for _ in range(ESCALATION_THRESHOLD * 3):
+            batcher.report_batch(0)
+        assert batcher.state.level == 0
+
+    def test_multi_level_escalation_and_deescalation(self):
+        clock = FakeClock()
+        batcher = AdaptiveMessageBatcher(base_batch_length_s=1.0, max_level=3)
+
+        with patch('ess.livedata.core.message_batcher.time.monotonic', clock):
+            # Escalate to level 2
+            for _ in range(ESCALATION_THRESHOLD):
+                batcher.report_batch(100)
+            assert batcher.state.level == 1
+            assert batcher.state.batch_length_s == 2.0
+
+            for _ in range(ESCALATION_THRESHOLD):
+                batcher.report_batch(100)
+            assert batcher.state.level == 2
+            assert batcher.state.batch_length_s == 4.0
+
+            # De-escalate one level at a time
+            # At level 2 (4s window), need 3 x4s = 12s idle
+            clock.advance(DEESCALATION_IDLE_WINDOWS * 4.0)
+            batcher.report_batch(None)
+            assert batcher.state.level == 1
+            assert batcher.state.batch_length_s == 2.0
+
+            # At level 1 (2s window), need 3 x2s = 6s idle
+            clock.advance(DEESCALATION_IDLE_WINDOWS * 2.0)
+            batcher.report_batch(None)
+            assert batcher.state.level == 0
+            assert batcher.state.batch_length_s == 1.0
+
+    def test_state_reflects_custom_base_length(self):
+        batcher = AdaptiveMessageBatcher(base_batch_length_s=0.5, max_level=2)
+        assert batcher.state.batch_length_s == 0.5
+
+        for _ in range(ESCALATION_THRESHOLD):
+            batcher.report_batch(100)
+        assert batcher.state.batch_length_s == 1.0
+
+        for _ in range(ESCALATION_THRESHOLD):
+            batcher.report_batch(100)
+        assert batcher.state.batch_length_s == 2.0
+
+    def test_no_oscillation_when_barely_keeping_up(self):
+        """At 8s window, rapid idle cycles between batches should not de-escalate."""
+        clock = FakeClock()
+        batcher = AdaptiveMessageBatcher(base_batch_length_s=1.0, max_level=3)
+
+        with patch('ess.livedata.core.message_batcher.time.monotonic', clock):
+            # Escalate to level 3 (8s window)
+            for _ in range(ESCALATION_THRESHOLD * 3):
+                batcher.report_batch(100)
+            assert batcher.state.level == 3
+
+            # Simulate "barely keeping up": process batch in 7s, then 1s of idle
+            for _ in range(10):
+                clock.advance(7.0)  # batch processing
+                batcher.report_batch(100)
+                # Rapid idle cycles during the 1s gap (0.1s each)
+                for _ in range(10):
+                    clock.advance(0.1)
+                    batcher.report_batch(None)
+
+            # Should stay at level 3 — the 1s inter-batch gaps are not enough
+            assert batcher.state.level == 3
diff --git a/tests/kafka/status_message_test.py b/tests/kafka/status_message_test.py
index 10e88dc2f..90ca2f530 100644
--- a/tests/kafka/status_message_test.py
+++ b/tests/kafka/status_message_test.py
@@ -868,6 +868,20 @@ def test_service_status_x5f2_all_states(self):
             converted = x5f2_to_service_status(x5f2_data)
             assert converted.state == state, f"Failed for state {state}"
 
+    def test_service_status_x5f2_batch_interval_round_trip(self):
+        """Test that batch_interval_s survives x5f2 encode/decode."""
+        original = make_service_status(batch_interval_s=4.0)
+        x5f2_data = service_status_to_x5f2(original)
+        converted = x5f2_to_service_status(x5f2_data)
+        assert converted.batch_interval_s == 4.0
+
+    def test_service_status_x5f2_batch_interval_default(self):
+        """Test that batch_interval_s defaults to 1.0 for backward compatibility."""
+        original = make_service_status()
+        x5f2_data = service_status_to_x5f2(original)
+        converted = x5f2_to_service_status(x5f2_data)
+        assert converted.batch_interval_s == 1.0
+
 
 class TestX5f2ToStatusDiscriminator:
     """Test x5f2_to_status function.

From 92b938c514ebac127e98ccfd9403be758f600fad Mon Sep 17 00:00:00 2001
From: Simon Heybrock <simon.heybrock@ess.eu>
Date: Fri, 20 Mar 2026 06:03:15 +0000
Subject: [PATCH 02/16] Replace count-based adaptive batching with
 processing-time-aware MIAD strategy

The old count-based heuristic used consecutive non-empty batches as a proxy
for overload, causing slow escalation, inability to de-escalate under light
continuous load, and false escalation when processing fits within the window.

The new strategy (MIAD: multiplicative increase, additive decrease) uses
actual processing_time_s feedback:
- Escalation: after 2 consecutive batches where processing exceeds the window
- De-escalation: after 5 consecutive batches with <70% window utilization
- Idle fallback: wall-clock de-escalation after 3 idle windows (unchanged)
---
 src/ess/livedata/core/message_batcher.py |  81 ++++++++----
 tests/core/message_batcher_test.py       | 153 ++++++++++++++---------
 2 files changed, 151 insertions(+), 83 deletions(-)

diff --git a/src/ess/livedata/core/message_batcher.py b/src/ess/livedata/core/message_batcher.py
index 7b895e9a3..6c2372073 100644
--- a/src/ess/livedata/core/message_batcher.py
+++ b/src/ess/livedata/core/message_batcher.py
@@ -28,7 +28,11 @@ def batch(self, messages: list[Message[Any]]) -> MessageBatch | None:
         If no batch can be created (batch incomplete), return None.
         """
 
-    def report_batch(self, message_count: int | None) -> None:  # noqa: B027
+    def report_batch(  # noqa: B027
+        self,
+        message_count: int | None,
+        processing_time_s: float = 0.0,
+    ) -> None:
         """Report the outcome of the last processing cycle.
 
         Called by the processor after each cycle. Batchers that support adaptive
@@ -41,6 +45,10 @@ def report_batch(self, message_count: int | None) -> None:  # noqa: B027
             Number of messages in the processed batch. ``None`` if the batcher
             returned ``None`` (idle cycle). 0 indicates an empty batch from a
             time gap.
+        processing_time_s:
+            Wall-clock time spent processing the batch (preprocessing, workflow
+            execution, serialization). Used by adaptive batchers to detect
+            overload. Ignored for idle cycles.
         """
 
     @property
@@ -180,7 +188,10 @@ def _split_messages(
         return before, after
 
 
-ESCALATION_THRESHOLD = 5
+ESCALATION_OVERLOAD_THRESHOLD = 2
+ESCALATION_LEVEL_JUMP = 1
+DEESCALATION_HEADROOM_RATIO = 0.7
+DEESCALATION_UNDERLOAD_THRESHOLD = 5
 DEESCALATION_IDLE_WINDOWS = 3
 
 
@@ -195,11 +206,11 @@ class AdaptiveBatcherState:
 class AdaptiveMessageBatcher(MessageBatcher):
     """A message batcher that dynamically adjusts its batch length based on load.
 
-    Wraps a ``SimpleMessageBatcher`` and monitors the pattern of batch results
-    to detect sustained overload. When the system cannot keep up with the current
-    batch window, consecutive non-empty batches accumulate and the batcher
-    escalates to a longer window. When the system demonstrates spare capacity
-    (consecutive idle cycles), it de-escalates.
+    Wraps a ``SimpleMessageBatcher`` and uses processing-time feedback to detect
+    overload. When processing consistently exceeds the batch window, the batcher
+    escalates to a longer window (multiplicative increase). When processing
+    completes with significant headroom, it de-escalates (additive decrease).
+    Idle periods also trigger de-escalation via a wall-clock fallback.
 
     Each escalation level doubles the batch window from the base length.
     """
@@ -208,21 +219,24 @@ def __init__(self, base_batch_length_s: float = 1.0, max_level: int = 3) -> None
         self._base_batch_length_s = base_batch_length_s
         self._max_level = max_level
         self._level = 0
-        self._consecutive_batches = 0
+        self._consecutive_overloaded = 0
+        self._consecutive_underloaded = 0
         self._last_nonempty_batch_time: float | None = None
         self._inner = SimpleMessageBatcher(batch_length_s=base_batch_length_s)
 
     def batch(self, messages: list[Message[Any]]) -> MessageBatch | None:
         return self._inner.batch(messages)
 
-    def report_batch(self, message_count: int | None) -> None:
+    def report_batch(
+        self,
+        message_count: int | None,
+        processing_time_s: float = 0.0,
+    ) -> None:
         if message_count is None:
-            # Idle cycle (batcher returned None). De-escalate only after being
-            # idle for multiple batch windows, not just multiple process-loop
-            # iterations. This prevents oscillation when the system barely keeps
-            # up: the short inter-batch gap at 0.1s sleep intervals would
-            # otherwise produce many rapid idle reports.
-            self._consecutive_batches = 0
+            # Idle cycle — reset both counters. Fall back to wall-clock
+            # de-escalation when data stops entirely.
+            self._consecutive_overloaded = 0
+            self._consecutive_underloaded = 0
             if self._level > 0 and self._last_nonempty_batch_time is not None:
                 idle_s = time.monotonic() - self._last_nonempty_batch_time
                 idle_windows = idle_s / self.batch_length_s
@@ -233,15 +247,36 @@ def report_batch(self, message_count: int | None) -> None:
             # Empty batch from time gap — not a load signal
             pass
         else:
-            # Non-empty batch
+            # Non-empty batch — use processing time to decide
             self._last_nonempty_batch_time = time.monotonic()
-            self._consecutive_batches += 1
-            if (
-                self._consecutive_batches >= ESCALATION_THRESHOLD
-                and self._level < self._max_level
-            ):
-                self._set_level(self._level + 1)
-                self._consecutive_batches = 0
+
+            if processing_time_s > self.batch_length_s:
+                # Overloaded: processing exceeded the batch window
+                self._consecutive_overloaded += 1
+                self._consecutive_underloaded = 0
+                if (
+                    self._consecutive_overloaded >= ESCALATION_OVERLOAD_THRESHOLD
+                    and self._level < self._max_level
+                ):
+                    new_level = min(
+                        self._level + ESCALATION_LEVEL_JUMP, self._max_level
+                    )
+                    self._set_level(new_level)
+                    self._consecutive_overloaded = 0
+            elif processing_time_s < self.batch_length_s * DEESCALATION_HEADROOM_RATIO:
+                # Underloaded: significant headroom
+                self._consecutive_underloaded += 1
+                self._consecutive_overloaded = 0
+                if (
+                    self._consecutive_underloaded >= DEESCALATION_UNDERLOAD_THRESHOLD
+                    and self._level > 0
+                ):
+                    self._set_level(self._level - 1)
+                    self._consecutive_underloaded = 0
+            else:
+                # In between — processing fits but without much headroom
+                self._consecutive_overloaded = 0
+                self._consecutive_underloaded = 0
 
     def _set_level(self, new_level: int) -> None:
         old_length = self.batch_length_s
diff --git a/tests/core/message_batcher_test.py b/tests/core/message_batcher_test.py
index de4765ee2..82a9cff52 100644
--- a/tests/core/message_batcher_test.py
+++ b/tests/core/message_batcher_test.py
@@ -4,8 +4,10 @@
 
 from ess.livedata.core.message import Message, StreamId, StreamKind
 from ess.livedata.core.message_batcher import (
+    DEESCALATION_HEADROOM_RATIO,
     DEESCALATION_IDLE_WINDOWS,
-    ESCALATION_THRESHOLD,
+    DEESCALATION_UNDERLOAD_THRESHOLD,
+    ESCALATION_OVERLOAD_THRESHOLD,
     AdaptiveMessageBatcher,
     SimpleMessageBatcher,
 )
@@ -407,6 +409,14 @@ def advance(self, seconds: float) -> None:
         self.now += seconds
 
 
+def _escalate_to_level(batcher: AdaptiveMessageBatcher, level: int) -> None:
+    """Drive the batcher to the given level by reporting overloaded batches."""
+    while batcher.state.level < level:
+        window = batcher.batch_length_s
+        for _ in range(ESCALATION_OVERLOAD_THRESHOLD):
+            batcher.report_batch(100, processing_time_s=window * 1.5)
+
+
 class TestAdaptiveMessageBatcher:
     def test_initial_state_is_level_zero(self):
         batcher = AdaptiveMessageBatcher(base_batch_length_s=1.0, max_level=2)
@@ -420,11 +430,11 @@ def test_delegates_to_inner_batcher(self):
         assert batch is not None
         assert batch.messages == [msg]
 
-    def test_escalates_after_consecutive_non_empty_batches(self):
+    def test_escalates_after_consecutive_overloaded_batches(self):
         batcher = AdaptiveMessageBatcher(base_batch_length_s=1.0, max_level=2)
 
-        for _ in range(ESCALATION_THRESHOLD):
-            batcher.report_batch(100)
+        for _ in range(ESCALATION_OVERLOAD_THRESHOLD):
+            batcher.report_batch(100, processing_time_s=1.5)
 
         assert batcher.state.level == 1
         assert batcher.state.batch_length_s == 2.0
@@ -432,37 +442,37 @@ def test_escalates_after_consecutive_non_empty_batches(self):
     def test_does_not_escalate_before_threshold(self):
         batcher = AdaptiveMessageBatcher(base_batch_length_s=1.0, max_level=2)
 
-        for _ in range(ESCALATION_THRESHOLD - 1):
-            batcher.report_batch(100)
+        for _ in range(ESCALATION_OVERLOAD_THRESHOLD - 1):
+            batcher.report_batch(100, processing_time_s=1.5)
 
         assert batcher.state.level == 0
 
-    def test_escalation_capped_at_max_level(self):
+    def test_does_not_escalate_when_processing_fits(self):
         batcher = AdaptiveMessageBatcher(base_batch_length_s=1.0, max_level=2)
 
-        # Escalate to max
-        for _ in range(ESCALATION_THRESHOLD):
-            batcher.report_batch(100)
-        assert batcher.state.level == 1
+        for _ in range(20):
+            batcher.report_batch(100, processing_time_s=0.8)
 
-        for _ in range(ESCALATION_THRESHOLD):
-            batcher.report_batch(100)
-        assert batcher.state.level == 2
+        assert batcher.state.level == 0
+
+    def test_escalation_capped_at_max_level(self):
+        batcher = AdaptiveMessageBatcher(base_batch_length_s=1.0, max_level=2)
 
-        # Further non-empty batches should not exceed max
-        for _ in range(ESCALATION_THRESHOLD * 2):
-            batcher.report_batch(100)
+        _escalate_to_level(batcher, 2)
         assert batcher.state.level == 2
         assert batcher.state.batch_length_s == 4.0
 
+        # Further overloaded batches should not exceed max
+        for _ in range(ESCALATION_OVERLOAD_THRESHOLD * 2):
+            batcher.report_batch(100, processing_time_s=10.0)
+        assert batcher.state.level == 2
+
     def test_deescalates_after_idle_duration(self):
         clock = FakeClock()
         batcher = AdaptiveMessageBatcher(base_batch_length_s=1.0, max_level=2)
 
         with patch('ess.livedata.core.message_batcher.time.monotonic', clock):
-            # Escalate to level 1 (batch_length = 2s)
-            for _ in range(ESCALATION_THRESHOLD):
-                batcher.report_batch(100)
+            _escalate_to_level(batcher, 1)
             assert batcher.state.level == 1
 
             # Idle for just under the threshold — no de-escalation
@@ -480,24 +490,23 @@ def test_does_not_deescalate_below_zero(self):
         batcher = AdaptiveMessageBatcher(base_batch_length_s=1.0, max_level=2)
 
         with patch('ess.livedata.core.message_batcher.time.monotonic', clock):
-            # Many idle cycles at level 0 should stay at 0
             clock.advance(100.0)
             batcher.report_batch(None)
             assert batcher.state.level == 0
 
-    def test_idle_cycle_resets_consecutive_batches(self):
+    def test_idle_cycle_resets_overload_counter(self):
         batcher = AdaptiveMessageBatcher(base_batch_length_s=1.0, max_level=2)
 
         # Almost reach escalation threshold
-        for _ in range(ESCALATION_THRESHOLD - 1):
-            batcher.report_batch(100)
+        for _ in range(ESCALATION_OVERLOAD_THRESHOLD - 1):
+            batcher.report_batch(100, processing_time_s=1.5)
 
         # One idle cycle resets
         batcher.report_batch(None)
 
         # Need full threshold again
-        for _ in range(ESCALATION_THRESHOLD - 1):
-            batcher.report_batch(100)
+        for _ in range(ESCALATION_OVERLOAD_THRESHOLD - 1):
+            batcher.report_batch(100, processing_time_s=1.5)
         assert batcher.state.level == 0
 
     def test_non_empty_batch_resets_idle_timer(self):
@@ -505,9 +514,7 @@ def test_non_empty_batch_resets_idle_timer(self):
         batcher = AdaptiveMessageBatcher(base_batch_length_s=1.0, max_level=2)
 
         with patch('ess.livedata.core.message_batcher.time.monotonic', clock):
-            # Escalate to level 1 (batch_length = 2s)
-            for _ in range(ESCALATION_THRESHOLD):
-                batcher.report_batch(100)
+            _escalate_to_level(batcher, 1)
             assert batcher.state.level == 1
 
             # Almost reach de-escalation time
@@ -516,56 +523,71 @@ def test_non_empty_batch_resets_idle_timer(self):
             assert batcher.state.level == 1
 
             # A non-empty batch resets the idle timer
-            batcher.report_batch(100)
+            batcher.report_batch(100, processing_time_s=1.5)
 
             # Now need the full idle duration again
             clock.advance(DEESCALATION_IDLE_WINDOWS * 2.0 - 0.1)
             batcher.report_batch(None)
             assert batcher.state.level == 1
 
-    def test_empty_batches_excluded_from_both_counters(self):
+    def test_empty_batches_excluded_from_counters(self):
         batcher = AdaptiveMessageBatcher(base_batch_length_s=1.0, max_level=2)
 
-        # Interleave empty batches (count=0) with non-empty — should not reset
-        for _ in range(ESCALATION_THRESHOLD - 1):
-            batcher.report_batch(100)
-            batcher.report_batch(0)  # empty batch from time gap
+        # Interleave empty batches with overloaded — should not reset counter
+        for _ in range(ESCALATION_OVERLOAD_THRESHOLD - 1):
+            batcher.report_batch(100, processing_time_s=1.5)
+            batcher.report_batch(0)
 
-        batcher.report_batch(100)
+        batcher.report_batch(100, processing_time_s=1.5)
         assert batcher.state.level == 1
 
     def test_empty_batches_do_not_contribute_to_escalation(self):
         batcher = AdaptiveMessageBatcher(base_batch_length_s=1.0, max_level=2)
 
-        # Only empty batches should not escalate
-        for _ in range(ESCALATION_THRESHOLD * 3):
+        for _ in range(ESCALATION_OVERLOAD_THRESHOLD * 3):
             batcher.report_batch(0)
         assert batcher.state.level == 0
 
+    def test_deescalates_under_sustained_light_load(self):
+        """De-escalation via underload: processing uses less than headroom ratio."""
+        batcher = AdaptiveMessageBatcher(base_batch_length_s=1.0, max_level=2)
+        _escalate_to_level(batcher, 1)
+        assert batcher.state.level == 1
+
+        # Report underloaded batches (processing < 70% of 2s window)
+        underloaded_time = batcher.batch_length_s * DEESCALATION_HEADROOM_RATIO - 0.1
+        for _ in range(DEESCALATION_UNDERLOAD_THRESHOLD):
+            batcher.report_batch(100, processing_time_s=underloaded_time)
+
+        assert batcher.state.level == 0
+
+    def test_does_not_deescalate_without_enough_headroom(self):
+        """No de-escalation when processing uses most of the window."""
+        batcher = AdaptiveMessageBatcher(base_batch_length_s=1.0, max_level=2)
+        _escalate_to_level(batcher, 1)
+        assert batcher.state.level == 1
+
+        # Processing at 80% of 2s window — above headroom threshold
+        for _ in range(DEESCALATION_UNDERLOAD_THRESHOLD * 3):
+            batcher.report_batch(100, processing_time_s=1.6)
+
+        assert batcher.state.level == 1
+
     def test_multi_level_escalation_and_deescalation(self):
         clock = FakeClock()
         batcher = AdaptiveMessageBatcher(base_batch_length_s=1.0, max_level=3)
 
         with patch('ess.livedata.core.message_batcher.time.monotonic', clock):
-            # Escalate to level 2
-            for _ in range(ESCALATION_THRESHOLD):
-                batcher.report_batch(100)
-            assert batcher.state.level == 1
-            assert batcher.state.batch_length_s == 2.0
-
-            for _ in range(ESCALATION_THRESHOLD):
-                batcher.report_batch(100)
+            _escalate_to_level(batcher, 2)
             assert batcher.state.level == 2
             assert batcher.state.batch_length_s == 4.0
 
-            # De-escalate one level at a time
-            # At level 2 (4s window), need 3 x4s = 12s idle
+            # De-escalate via idle — one level at a time
             clock.advance(DEESCALATION_IDLE_WINDOWS * 4.0)
             batcher.report_batch(None)
             assert batcher.state.level == 1
             assert batcher.state.batch_length_s == 2.0
 
-            # At level 1 (2s window), need 3 x2s = 6s idle
             clock.advance(DEESCALATION_IDLE_WINDOWS * 2.0)
             batcher.report_batch(None)
             assert batcher.state.level == 0
@@ -575,12 +597,10 @@ def test_state_reflects_custom_base_length(self):
         batcher = AdaptiveMessageBatcher(base_batch_length_s=0.5, max_level=2)
         assert batcher.state.batch_length_s == 0.5
 
-        for _ in range(ESCALATION_THRESHOLD):
-            batcher.report_batch(100)
+        _escalate_to_level(batcher, 1)
         assert batcher.state.batch_length_s == 1.0
 
-        for _ in range(ESCALATION_THRESHOLD):
-            batcher.report_batch(100)
+        _escalate_to_level(batcher, 2)
         assert batcher.state.batch_length_s == 2.0
 
     def test_no_oscillation_when_barely_keeping_up(self):
@@ -589,19 +609,32 @@ def test_no_oscillation_when_barely_keeping_up(self):
         batcher = AdaptiveMessageBatcher(base_batch_length_s=1.0, max_level=3)
 
         with patch('ess.livedata.core.message_batcher.time.monotonic', clock):
-            # Escalate to level 3 (8s window)
-            for _ in range(ESCALATION_THRESHOLD * 3):
-                batcher.report_batch(100)
+            _escalate_to_level(batcher, 3)
             assert batcher.state.level == 3
 
             # Simulate "barely keeping up": process batch in 7s, then 1s of idle
             for _ in range(10):
-                clock.advance(7.0)  # batch processing
-                batcher.report_batch(100)
-                # Rapid idle cycles during the 1s gap (0.1s each)
+                clock.advance(7.0)
+                batcher.report_batch(100, processing_time_s=7.0)
                 for _ in range(10):
                     clock.advance(0.1)
                     batcher.report_batch(None)
 
-            # Should stay at level 3 — the 1s inter-batch gaps are not enough
             assert batcher.state.level == 3
+
+    def test_overload_resets_underload_counter(self):
+        batcher = AdaptiveMessageBatcher(base_batch_length_s=1.0, max_level=2)
+        _escalate_to_level(batcher, 1)
+
+        # Almost enough underloaded batches
+        underloaded_time = batcher.batch_length_s * DEESCALATION_HEADROOM_RATIO - 0.1
+        for _ in range(DEESCALATION_UNDERLOAD_THRESHOLD - 1):
+            batcher.report_batch(100, processing_time_s=underloaded_time)
+
+        # One overloaded batch resets the counter
+        batcher.report_batch(100, processing_time_s=batcher.batch_length_s + 0.1)
+
+        # Need full threshold again
+        for _ in range(DEESCALATION_UNDERLOAD_THRESHOLD - 1):
+            batcher.report_batch(100, processing_time_s=underloaded_time)
+        assert batcher.state.level == 1

From 9c0e80b32e1ff8095397f469ededcec5957c39f0 Mon Sep 17 00:00:00 2001
From: Simon Heybrock <simon.heybrock@ess.eu>
Date: Fri, 20 Mar 2026 06:07:51 +0000
Subject: [PATCH 03/16] Initial adaptive batching scenario tests

---
 .../core/adaptive_batching_scenarios_test.py  | 854 ++++++++++++++++++
 1 file changed, 854 insertions(+)
 create mode 100644 tests/core/adaptive_batching_scenarios_test.py

diff --git a/tests/core/adaptive_batching_scenarios_test.py b/tests/core/adaptive_batching_scenarios_test.py
new file mode 100644
index 000000000..04955896d
--- /dev/null
+++ b/tests/core/adaptive_batching_scenarios_test.py
@@ -0,0 +1,854 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright (c) 2025 Scipp contributors (https://github.com/scipp)
+"""Scenario-based tests for adaptive message batching strategies.
+
+These tests simulate realistic load patterns by running a processing loop that
+feeds batch outcomes back into a ``MessageBatcher``.  They assert on observable
+properties — escalation time, maximum backlog, oscillation — rather than on
+implementation internals, so they remain valid as the strategy evolves.
+
+The simulation model:
+- Time advances discretely per processing cycle.
+- Each cycle, the batcher's current ``batch_length_s`` determines how much
+  wall-clock data is covered.
+- A ``processing_cost`` function returns how long the batch *takes* to process,
+  based on the batch window and a per-batch overhead.
+- If processing takes longer than the batch window, backlog accumulates
+  (the system falls behind real-time).
+- Random jitter is optionally added to processing times.
+
+All acceptance thresholds are collected in :data:`LIMITS` so that tuning the
+strategy and its acceptable bounds can be done in one place.
+"""
+
+from __future__ import annotations
+
+import random
+from dataclasses import dataclass, field
+from typing import Protocol
+from unittest.mock import patch
+
+from ess.livedata.core.message_batcher import (
+    AdaptiveMessageBatcher,
+    MessageBatcher,
+)
+
+# ===========================================================================
+# Acceptance limits — one place to view and adjust all thresholds
+# ===========================================================================
+
+# Each scenario test references a key from this dict.  When tuning the
+# batching strategy or its parameters, start here: tighten the bounds,
+# run the tests, and iterate.
+#
+# Convention:
+#   max_*    — upper bound (test asserts  value <= limit)
+#   min_*    — lower bound (test asserts  value >= limit)
+#
+# All time values are in seconds.
+
+LIMITS: dict[str, dict[str, float]] = {
+    # -- Step-function escalation (shutter open) --------------------------
+    "step_function_escalation": {
+        "max_time_to_first_escalation_s": 10.0,
+    },
+    "step_function_backlog": {
+        "max_backlog_s": 5.0,
+        "max_final_backlog_s": 1.0,
+    },
+    "severe_step_function": {
+        "min_level_reached": 2,
+    },
+    # -- No escalation when not needed ------------------------------------
+    "light_load": {
+        "max_level": 0,
+    },
+    "gc_jitter": {
+        "max_level": 0,
+    },
+    # -- No oscillation ---------------------------------------------------
+    "steady_load_oscillation": {
+        "max_oscillations": 0,
+    },
+    "boundary_oscillation": {
+        "max_oscillations": 4,
+    },
+    # -- Creeping overload ------------------------------------------------
+    "creeping_overload": {
+        "min_level_reached": 1,
+    },
+    "creeping_overload_backlog": {
+        "max_backlog_s": 5.0,
+    },
+    "mild_creeping_overload": {
+        "max_level": 1,
+    },
+    # -- De-escalation ----------------------------------------------------
+    "deescalation_to_idle": {
+        "max_final_level": 0,
+    },
+    "deescalation_to_light_load": {
+        "max_final_level": 0,
+    },
+    # -- Realistic shutter ------------------------------------------------
+    "shutter_open_close": {
+        "min_level_reached": 1,
+        "max_final_level": 0,
+        "max_backlog_s": 10.0,
+    },
+    "repeated_shutter_cycles": {
+        "max_final_level": 0,
+    },
+    # -- Backlog draining -------------------------------------------------
+    "backlog_drains": {
+        "max_final_backlog_s": 1.0,
+    },
+    # -- Processing-time awareness ----------------------------------------
+    "fast_escalation_clear_overload": {
+        "max_time_to_first_escalation_s": 5.0,
+    },
+    "no_escalation_when_fits": {
+        "max_level": 0,
+    },
+}
+
+
+# ---------------------------------------------------------------------------
+# Simulation infrastructure
+# ---------------------------------------------------------------------------
+
+
+class ProcessingCostFn(Protocol):
+    """Return the processing time in seconds for a batch window at a wall time."""
+
+    def __call__(
+        self, batch_window_s: float, wall_time_s: float
+    ) -> float: ...
+
+
+@dataclass
+class CycleRecord:
+    """A single processing-loop iteration."""
+
+    wall_time_s: float
+    batch_window_s: float
+    processing_time_s: float
+    backlog_s: float
+    level: int
+
+
+@dataclass
+class SimulationResult:
+    """Aggregate outcome of a simulation run."""
+
+    cycles: list[CycleRecord] = field(default_factory=list)
+
+    @property
+    def max_backlog_s(self) -> float:
+        if not self.cycles:
+            return 0.0
+        return max(c.backlog_s for c in self.cycles)
+
+    @property
+    def final_backlog_s(self) -> float:
+        return self.cycles[-1].backlog_s if self.cycles else 0.0
+
+    @property
+    def final_level(self) -> int:
+        return self.cycles[-1].level if self.cycles else 0
+
+    @property
+    def max_level(self) -> int:
+        if not self.cycles:
+            return 0
+        return max(c.level for c in self.cycles)
+
+    @property
+    def total_wall_time_s(self) -> float:
+        return self.cycles[-1].wall_time_s if self.cycles else 0.0
+
+    def time_at_level(self, level: int) -> float:
+        """Sum of processing time over cycles recorded at a given level."""
+        return sum(
+            c.processing_time_s
+            for c in self.cycles
+            if c.level == level
+        )
+
+    def level_changes(self) -> list[tuple[float, int, int]]:
+        """List of (wall_time, old_level, new_level) transitions."""
+        return [
+            (
+                self.cycles[i].wall_time_s,
+                self.cycles[i - 1].level,
+                self.cycles[i].level,
+            )
+            for i in range(1, len(self.cycles))
+            if self.cycles[i].level != self.cycles[i - 1].level
+        ]
+
+    def first_escalation_time_s(self) -> float | None:
+        """Wall time of the first escalation, or None."""
+        for t, old, new in self.level_changes():
+            if new > old:
+                return t
+        return None
+
+    def oscillation_count(self) -> int:
+        """Number of direction changes (up->down or down->up)."""
+        changes = self.level_changes()
+        if len(changes) < 2:
+            return 0
+        directions = [
+            1 if new > old else -1 for _, old, new in changes
+        ]
+        return sum(
+            1
+            for i in range(1, len(directions))
+            if directions[i] != directions[i - 1]
+        )
+
+
+class FakeClock:
+    """Deterministic monotonic clock for simulation."""
+
+    def __init__(self, start: float = 0.0) -> None:
+        self.now = start
+
+    def __call__(self) -> float:
+        return self.now
+
+    def advance(self, seconds: float) -> None:
+        self.now += seconds
+
+
+def simulate(
+    batcher: MessageBatcher,
+    duration_s: float,
+    cost_fn: ProcessingCostFn,
+    clock: FakeClock,
+    *,
+    idle_poll_interval_s: float = 0.1,
+) -> SimulationResult:
+    """Run a simulated processing loop.
+
+    The loop mimics ``OrchestratingProcessor.process()``:
+    1. Read the batcher's current window size.
+    2. Compute how long processing takes (via ``cost_fn``).
+    3. Advance the clock; if processing > window, backlog accumulates.
+    4. Otherwise spare time first drains backlog; the rest becomes idle polls.
+    5. Report the batch outcome and record the cycle.
+    """
+    result = SimulationResult()
+    backlog_s = 0.0
+
+    while clock.now < duration_s:
+        window = batcher.batch_length_s
+        processing_time = cost_fn(window, clock.now)
+
+        if processing_time <= 0:
+            clock.advance(idle_poll_interval_s)
+            batcher.report_batch(None, processing_time_s=0.0)
+            level = _get_level(batcher)
+            result.cycles.append(CycleRecord(
+                wall_time_s=clock.now,
+                batch_window_s=window,
+                processing_time_s=0.0,
+                backlog_s=backlog_s,
+                level=level,
+            ))
+            continue
+
+        clock.advance(processing_time)
+
+        if processing_time > window:
+            backlog_s += processing_time - window
+        else:
+            spare = window - processing_time
+            drained = min(spare, backlog_s)
+            backlog_s -= drained
+            remaining_idle = spare - drained
+            if remaining_idle > 0:
+                n_idle = int(remaining_idle / idle_poll_interval_s)
+                for _ in range(n_idle):
+                    clock.advance(idle_poll_interval_s)
+                    batcher.report_batch(
+                        None, processing_time_s=0.0
+                    )
+
+        batcher.report_batch(
+            100, processing_time_s=processing_time
+        )
+
+        level = _get_level(batcher)
+        result.cycles.append(CycleRecord(
+            wall_time_s=clock.now,
+            batch_window_s=window,
+            processing_time_s=processing_time,
+            backlog_s=backlog_s,
+            level=level,
+        ))
+
+    return result
+
+
+def _get_level(batcher: MessageBatcher) -> int:
+    if hasattr(batcher, 'state'):
+        return batcher.state.level
+    return 0
+
+
+# ---------------------------------------------------------------------------
+# Processing cost models
+# ---------------------------------------------------------------------------
+
+
+def constant_overhead_cost(
+    overhead_s: float,
+    per_second_cost: float,
+    *,
+    jitter_fraction: float = 0.0,
+    rng: random.Random | None = None,
+) -> ProcessingCostFn:
+    """Fixed overhead + linear data cost, with optional jitter.
+
+    ``processing_time = overhead_s + per_second_cost * window + jitter``
+
+    The system keeps up when ``overhead_s < window * (1 - per_second_cost)``.
+    """
+    _rng = rng or random.Random(42)
+
+    def cost(batch_window_s: float, wall_time_s: float) -> float:
+        base = overhead_s + per_second_cost * batch_window_s
+        if jitter_fraction > 0:
+            jitter = _rng.gauss(0, jitter_fraction * base)
+            base = max(0.01, base + jitter)
+        return base
+
+    return cost
+
+
+def step_function_cost(
+    step_time_s: float,
+    before: ProcessingCostFn,
+    after: ProcessingCostFn,
+) -> ProcessingCostFn:
+    """Switch cost functions at a given wall-clock time."""
+
+    def cost(batch_window_s: float, wall_time_s: float) -> float:
+        if wall_time_s < step_time_s:
+            return before(batch_window_s, wall_time_s)
+        return after(batch_window_s, wall_time_s)
+
+    return cost
+
+
+def idle_cost() -> ProcessingCostFn:
+    """No data to process."""
+
+    def cost(batch_window_s: float, wall_time_s: float) -> float:
+        return 0.0
+
+    return cost
+
+
+def creeping_cost(
+    overhead_s: float,
+    per_second_cost_start: float,
+    per_second_cost_end: float,
+    ramp_duration_s: float,
+    ramp_start_s: float = 0.0,
+    *,
+    jitter_fraction: float = 0.0,
+    rng: random.Random | None = None,
+) -> ProcessingCostFn:
+    """Processing cost that linearly ramps up over time."""
+    _rng = rng or random.Random(42)
+
+    def cost(batch_window_s: float, wall_time_s: float) -> float:
+        elapsed = max(0.0, wall_time_s - ramp_start_s)
+        frac = (
+            min(1.0, elapsed / ramp_duration_s)
+            if ramp_duration_s > 0
+            else 1.0
+        )
+        rate_range = per_second_cost_end - per_second_cost_start
+        per_s = per_second_cost_start + frac * rate_range
+        base = overhead_s + per_s * batch_window_s
+        if jitter_fraction > 0:
+            jitter = _rng.gauss(0, jitter_fraction * base)
+            base = max(0.01, base + jitter)
+        return base
+
+    return cost
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+
+def run_scenario(
+    batcher: MessageBatcher,
+    duration_s: float,
+    cost_fn: ProcessingCostFn,
+) -> SimulationResult:
+    clock = FakeClock()
+    with patch(
+        'ess.livedata.core.message_batcher.time.monotonic', clock
+    ):
+        return simulate(batcher, duration_s, cost_fn, clock)
+
+
+def make_default_batcher(**kwargs) -> AdaptiveMessageBatcher:
+    defaults = {"base_batch_length_s": 1.0, "max_level": 3}
+    defaults.update(kwargs)
+    return AdaptiveMessageBatcher(**defaults)
+
+
+# ===========================================================================
+# Scenario tests
+# ===========================================================================
+
+
+class TestStepFunctionEscalation:
+    """Shutter-open scenario: sudden jump from idle to high load."""
+
+    def test_escalates_within_bounded_time(self):
+        """After a step increase in load, the batcher must escalate quickly."""
+        lim = LIMITS["step_function_escalation"]
+        batcher = make_default_batcher()
+
+        # 10s idle, then overhead-dominated load with jitter
+        # At 1s window: 0.8 + 0.3 = 1.1s → overloaded
+        cost = step_function_cost(
+            step_time_s=10.0,
+            before=idle_cost(),
+            after=constant_overhead_cost(
+                overhead_s=0.8,
+                per_second_cost=0.3,
+                jitter_fraction=0.1,
+                rng=random.Random(123),
+            ),
+        )
+
+        result = run_scenario(batcher, 120.0, cost)
+
+        first_esc = result.first_escalation_time_s()
+        assert first_esc is not None, "Batcher never escalated"
+        time_to_escalate = first_esc - 10.0
+        assert time_to_escalate < lim["max_time_to_first_escalation_s"], (
+            f"Took {time_to_escalate:.1f}s to first escalate after step "
+            f"(limit: {lim['max_time_to_first_escalation_s']}s)"
+        )
+
+    def test_limits_backlog(self):
+        """Backlog during escalation must remain bounded.
+
+        At 1s window: 0.6 + 0.6 = 1.2s (20% over budget).
+        At 2s window: 0.6 + 1.2 = 1.8s (OK).
+        """
+        lim = LIMITS["step_function_backlog"]
+        batcher = make_default_batcher()
+
+        cost = step_function_cost(
+            step_time_s=5.0,
+            before=idle_cost(),
+            after=constant_overhead_cost(
+                overhead_s=0.6, per_second_cost=0.6
+            ),
+        )
+
+        result = run_scenario(batcher, 120.0, cost)
+
+        assert result.max_backlog_s < lim["max_backlog_s"], (
+            f"Backlog reached {result.max_backlog_s:.1f}s "
+            f"(limit: {lim['max_backlog_s']}s)"
+        )
+        assert result.final_backlog_s < lim["max_final_backlog_s"], (
+            f"Residual backlog {result.final_backlog_s:.1f}s "
+            f"(limit: {lim['max_final_backlog_s']}s)"
+        )
+
+    def test_severe_overload_reaches_adequate_level(self):
+        """Under severe overload, the batcher must reach a high enough level.
+
+        At 1s: 1.8 + 0.2 = 2.0s (2x overloaded).
+        At 2s: 1.8 + 0.4 = 2.2s (1.1x overloaded).
+        At 4s: 1.8 + 0.8 = 2.6s < 4s (OK).
+        """
+        lim = LIMITS["severe_step_function"]
+        batcher = make_default_batcher()
+
+        cost = step_function_cost(
+            step_time_s=5.0,
+            before=idle_cost(),
+            after=constant_overhead_cost(
+                overhead_s=1.8, per_second_cost=0.2
+            ),
+        )
+
+        result = run_scenario(batcher, 120.0, cost)
+        assert result.max_level >= lim["min_level_reached"], (
+            f"Only reached level {result.max_level} "
+            f"(need >= {lim['min_level_reached']})"
+        )
+
+
+class TestNoEscalationWhenNotNeeded:
+    """The batcher must not escalate when the system keeps up."""
+
+    def test_no_escalation_under_light_load(self):
+        """Fast processing should never trigger escalation."""
+        lim = LIMITS["light_load"]
+        batcher = make_default_batcher()
+        cost = constant_overhead_cost(
+            overhead_s=0.1, per_second_cost=0.1
+        )
+
+        result = run_scenario(batcher, 60.0, cost)
+        assert result.max_level <= lim["max_level"], (
+            f"Escalated to level {result.max_level} under light load "
+            f"(limit: {lim['max_level']})"
+        )
+
+    def test_no_escalation_with_gc_jitter(self):
+        """Occasional GC/scheduling spikes should not cause escalation.
+
+        Processing is fast on average (0.3s) with large relative jitter
+        (sigma = 0.15s); batches exceeding the 1s window are extremely rare.
+        """
+        lim = LIMITS["gc_jitter"]
+        batcher = make_default_batcher()
+        cost = constant_overhead_cost(
+            overhead_s=0.2,
+            per_second_cost=0.1,
+            jitter_fraction=0.5,
+            rng=random.Random(999),
+        )
+
+        result = run_scenario(batcher, 120.0, cost)
+        assert result.max_level <= lim["max_level"], (
+            f"Escalated to level {result.max_level} from jitter alone "
+            f"(limit: {lim['max_level']})"
+        )
+
+
+class TestNoOscillation:
+    """The batcher must not oscillate between levels."""
+
+    def test_no_oscillation_at_steady_load(self):
+        """Constant load near the threshold should stabilize."""
+        lim = LIMITS["steady_load_oscillation"]
+        batcher = make_default_batcher()
+
+        # Processing at ~90% of 1s window
+        cost = constant_overhead_cost(
+            overhead_s=0.5, per_second_cost=0.4
+        )
+
+        result = run_scenario(batcher, 120.0, cost)
+        assert result.oscillation_count() <= lim["max_oscillations"], (
+            f"Oscillated {result.oscillation_count()} times "
+            f"(limit: {lim['max_oscillations']})"
+        )
+
+    def test_limited_oscillation_at_boundary(self):
+        """Processing right at the window with jitter: bounded oscillation."""
+        lim = LIMITS["boundary_oscillation"]
+        batcher = make_default_batcher()
+
+        # Mean processing = 1.0s = window, jitter +-10%
+        cost = constant_overhead_cost(
+            overhead_s=0.5,
+            per_second_cost=0.5,
+            jitter_fraction=0.1,
+            rng=random.Random(42),
+        )
+
+        result = run_scenario(batcher, 180.0, cost)
+        assert result.oscillation_count() <= lim["max_oscillations"], (
+            f"Oscillated {result.oscillation_count()} times "
+            f"(limit: {lim['max_oscillations']})"
+        )
+
+
+class TestCreepingOverload:
+    """Load that gradually increases past processing capacity."""
+
+    def test_eventually_escalates(self):
+        """As cost ramps up, the batcher must escalate."""
+        lim = LIMITS["creeping_overload"]
+        batcher = make_default_batcher()
+
+        # Ramp from 0.5s to 1.3s at 1s window over 60s
+        cost = creeping_cost(
+            overhead_s=0.3,
+            per_second_cost_start=0.2,
+            per_second_cost_end=1.0,
+            ramp_duration_s=60.0,
+        )
+
+        result = run_scenario(batcher, 120.0, cost)
+        assert result.max_level >= lim["min_level_reached"], (
+            f"Only reached level {result.max_level} "
+            f"(need >= {lim['min_level_reached']})"
+        )
+
+    def test_limits_backlog(self):
+        """Backlog from creeping overload should remain bounded."""
+        lim = LIMITS["creeping_overload_backlog"]
+        batcher = make_default_batcher()
+
+        cost = creeping_cost(
+            overhead_s=0.3,
+            per_second_cost_start=0.2,
+            per_second_cost_end=1.0,
+            ramp_duration_s=60.0,
+        )
+
+        result = run_scenario(batcher, 120.0, cost)
+        assert result.max_backlog_s < lim["max_backlog_s"], (
+            f"Backlog reached {result.max_backlog_s:.1f}s "
+            f"(limit: {lim['max_backlog_s']}s)"
+        )
+
+    def test_mild_overload_does_not_over_escalate(self):
+        """A slow creep to barely over 1x should not jump to max level.
+
+        overhead=0.3, per_s ramps 0.5 -> 0.8 over 60s.
+        At 1s window: 0.3 + 0.8 = 1.1s -> needs escalation.
+        At 2s window: 0.3 + 0.8*2 = 1.9s < 2s -> stable at level 1.
+        """
+        lim = LIMITS["mild_creeping_overload"]
+        batcher = make_default_batcher()
+
+        cost = creeping_cost(
+            overhead_s=0.3,
+            per_second_cost_start=0.5,
+            per_second_cost_end=0.8,
+            ramp_duration_s=60.0,
+        )
+
+        result = run_scenario(batcher, 180.0, cost)
+        assert result.max_level <= lim["max_level"], (
+            f"Over-escalated to level {result.max_level} "
+            f"(limit: {lim['max_level']})"
+        )
+
+
+class TestDeescalation:
+    """The batcher must de-escalate when load subsides."""
+
+    def test_deescalates_after_load_drops_to_idle(self):
+        """After high load followed by idle, must return to level 0."""
+        lim = LIMITS["deescalation_to_idle"]
+        batcher = make_default_batcher()
+
+        # 30s high load, then idle
+        cost = step_function_cost(
+            step_time_s=0.0,
+            before=idle_cost(),
+            after=step_function_cost(
+                step_time_s=30.0,
+                before=constant_overhead_cost(
+                    overhead_s=0.8, per_second_cost=0.3
+                ),
+                after=idle_cost(),
+            ),
+        )
+
+        result = run_scenario(batcher, 120.0, cost)
+        assert result.final_level <= lim["max_final_level"], (
+            f"Final level {result.final_level} "
+            f"(limit: {lim['max_final_level']})"
+        )
+
+    def test_deescalates_after_step_down_to_light_load(self):
+        """When load decreases to light (but non-zero), must de-escalate.
+
+        This requires the batcher to de-escalate even when data is flowing
+        continuously, not just when the system goes fully idle.
+        """
+        lim = LIMITS["deescalation_to_light_load"]
+        batcher = make_default_batcher()
+
+        # Heavy load for 40s, then light load
+        cost = step_function_cost(
+            step_time_s=0.0,
+            before=idle_cost(),
+            after=step_function_cost(
+                step_time_s=40.0,
+                before=constant_overhead_cost(
+                    overhead_s=0.9, per_second_cost=0.05
+                ),
+                after=constant_overhead_cost(
+                    overhead_s=0.1, per_second_cost=0.1
+                ),
+            ),
+        )
+
+        result = run_scenario(batcher, 180.0, cost)
+        assert result.final_level <= lim["max_final_level"], (
+            f"Final level {result.final_level} "
+            f"(limit: {lim['max_final_level']})"
+        )
+
+
+class TestRealisticShutterScenario:
+    """End-to-end shutter open/close simulation with noise."""
+
+    def test_shutter_open_close_cycle(self):
+        """Idle -> shutter open (high load) -> shutter close (idle).
+
+        Must handle the full cycle: escalation, stable operation,
+        de-escalation back to base.
+        """
+        lim = LIMITS["shutter_open_close"]
+        batcher = make_default_batcher()
+
+        rng = random.Random(42)
+        cost = step_function_cost(
+            step_time_s=10.0,
+            before=idle_cost(),
+            after=step_function_cost(
+                step_time_s=70.0,
+                before=constant_overhead_cost(
+                    overhead_s=0.7,
+                    per_second_cost=0.4,
+                    jitter_fraction=0.15,
+                    rng=rng,
+                ),
+                after=idle_cost(),
+            ),
+        )
+
+        result = run_scenario(batcher, 180.0, cost)
+
+        assert result.max_level >= lim["min_level_reached"], (
+            f"Only reached level {result.max_level} during shutter open "
+            f"(need >= {lim['min_level_reached']})"
+        )
+        assert result.final_level <= lim["max_final_level"], (
+            f"Final level {result.final_level} after shutter close "
+            f"(limit: {lim['max_final_level']})"
+        )
+        assert result.max_backlog_s < lim["max_backlog_s"], (
+            f"Backlog reached {result.max_backlog_s:.1f}s "
+            f"(limit: {lim['max_backlog_s']}s)"
+        )
+
+    def test_repeated_shutter_cycles(self):
+        """Multiple on/off cycles should not cause runaway escalation."""
+        lim = LIMITS["repeated_shutter_cycles"]
+        batcher = make_default_batcher()
+
+        rng = random.Random(42)
+        high = constant_overhead_cost(
+            overhead_s=0.7,
+            per_second_cost=0.4,
+            jitter_fraction=0.1,
+            rng=rng,
+        )
+        low = idle_cost()
+
+        # 20s on / 20s off cycles
+        def cost(
+            batch_window_s: float, wall_time_s: float
+        ) -> float:
+            cycle_pos = wall_time_s % 40.0
+            if cycle_pos < 20.0:
+                return high(batch_window_s, wall_time_s)
+            return low(batch_window_s, wall_time_s)
+
+        result = run_scenario(batcher, 200.0, cost)
+
+        assert result.final_level <= lim["max_final_level"], (
+            f"Stuck at level {result.final_level} after repeated cycles "
+            f"(limit: {lim['max_final_level']})"
+        )
+
+
+class TestBacklogDraining:
+    """Once the batcher escalates, accumulated backlog should drain."""
+
+    def test_backlog_drains_after_escalation(self):
+        """Sustained load triggering escalation should drain the backlog.
+
+        At 1s: 0.6 + 0.6 = 1.2s (overloaded).
+        At 2s: 0.6 + 1.2 = 1.8s (OK, surplus drains backlog).
+        """
+        lim = LIMITS["backlog_drains"]
+        batcher = make_default_batcher()
+
+        cost = constant_overhead_cost(
+            overhead_s=0.6, per_second_cost=0.6
+        )
+
+        result = run_scenario(batcher, 120.0, cost)
+
+        assert result.final_backlog_s < lim["max_final_backlog_s"], (
+            f"Backlog not drained: {result.final_backlog_s:.1f}s "
+            f"(limit: {lim['max_final_backlog_s']}s)"
+        )
+
+    def test_backlog_does_not_grow_indefinitely(self):
+        """Even under sustained load, the backlog must peak and decrease.
+
+        At 1s: 0.8 + 0.3 = 1.1s (overloaded).
+        At 2s: 0.8 + 0.6 = 1.4s (OK).
+        """
+        batcher = make_default_batcher()
+
+        cost = constant_overhead_cost(
+            overhead_s=0.8, per_second_cost=0.3
+        )
+
+        result = run_scenario(batcher, 120.0, cost)
+
+        peak_idx = max(
+            range(len(result.cycles)),
+            key=lambda i: result.cycles[i].backlog_s,
+        )
+        assert peak_idx < len(result.cycles) - 1, (
+            "Backlog was still at peak at end of simulation"
+        )
+
+
+class TestProcessingTimeAwareness:
+    """The batcher should use processing_time_s for faster decisions."""
+
+    def test_fast_escalation_on_clear_overload(self):
+        """When processing demonstrably exceeds the batch window,
+        escalation should be fast."""
+        lim = LIMITS["fast_escalation_clear_overload"]
+        batcher = make_default_batcher()
+
+        # Clear overload: 1.5x the window at every level
+        cost = constant_overhead_cost(
+            overhead_s=0.0, per_second_cost=1.5
+        )
+
+        result = run_scenario(batcher, 60.0, cost)
+
+        first_esc = result.first_escalation_time_s()
+        assert first_esc is not None, "Never escalated under overload"
+        assert first_esc < lim["max_time_to_first_escalation_s"], (
+            f"First escalation at {first_esc:.1f}s "
+            f"(limit: {lim['max_time_to_first_escalation_s']}s)"
+        )
+
+    def test_no_escalation_when_processing_fits(self):
+        """No escalation if processing completes within the window."""
+        lim = LIMITS["no_escalation_when_fits"]
+        batcher = make_default_batcher()
+
+        cost = constant_overhead_cost(
+            overhead_s=0.1, per_second_cost=0.3
+        )
+
+        result = run_scenario(batcher, 60.0, cost)
+        assert result.max_level <= lim["max_level"], (
+            f"Escalated to {result.max_level} despite fitting "
+            f"(limit: {lim['max_level']})"
+        )

From a2b92320c8faadef5e5bf47824e2c1338958f1be Mon Sep 17 00:00:00 2001
From: Simon Heybrock <simon.heybrock@ess.eu>
Date: Fri, 20 Mar 2026 06:09:19 +0000
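Subject: [PATCH 04/16] Add batch time reporting

Measure how long each non-empty batch takes to process, using
time.monotonic() from just before message pre-processing until the batch
outcome is reported, and forward it to the message batcher as
processing_time_s via report_batch(). Empty batches report a processing
time of 0.0.

Also ignore ruff S311 under tests/, since the test simulations use
deterministic seeded RNGs.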
---
 pyproject.toml                                  |  1 +
 .../livedata/core/orchestrating_processor.py    | 17 +++++++++++++----
 2 files changed, 14 insertions(+), 4 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 84a515a4b..14d6719af 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -141,6 +141,7 @@ pydocstyle.convention = "numpy"
 # those files have an increased risk of relying on import order
 "tests/*" = [
     "S101",  # asserts are fine in tests
+    "S311",  # deterministic seeded RNG is fine for test simulations
     "B018",  # 'useless expressions' are ok because some tests just check for exceptions
     "TID251",  # tests can use stdlib logging
 ]
diff --git a/src/ess/livedata/core/orchestrating_processor.py b/src/ess/livedata/core/orchestrating_processor.py
index be043f588..6652dc8e2 100644
--- a/src/ess/livedata/core/orchestrating_processor.py
+++ b/src/ess/livedata/core/orchestrating_processor.py
@@ -176,7 +176,7 @@ def process(self) -> None:
 
         message_batch = self._message_batcher.batch(data_messages)
         if message_batch is None:
-            self._report_batch(None)
+            self._report_batch(None, processing_time_s=0.0)
             self._empty_batches += 1
             self._maybe_log_metrics()
             self._sink.publish_messages(result_messages)
@@ -187,6 +187,8 @@ def process(self) -> None:
                 time.sleep(0.1)
             return
 
+        batch_start = time.monotonic()
+
         # Pre-process message batch
         workflow_data = self._message_preprocessor.preprocess_messages(message_batch)
 
@@ -225,7 +227,10 @@ def process(self) -> None:
             else:
                 valid_results.append(result)
 
-        self._report_batch(len(message_batch.messages))
+        processing_time_s = time.monotonic() - batch_start
+        self._report_batch(
+            len(message_batch.messages), processing_time_s=processing_time_s
+        )
         self._batches_processed += 1
         self._maybe_log_metrics()
 
@@ -254,9 +259,13 @@ def _report_status(self) -> None:
 
         self._sink.publish_messages(messages)
 
-    def _report_batch(self, message_count: int | None) -> None:
+    def _report_batch(
+        self, message_count: int | None, processing_time_s: float = 0.0
+    ) -> None:
         """Forward batch outcome to the batcher for adaptive behavior."""
-        self._message_batcher.report_batch(message_count)
+        self._message_batcher.report_batch(
+            message_count, processing_time_s=processing_time_s
+        )
 
     def _get_service_status(self, job_statuses: list[JobStatus]) -> ServiceStatus:
         """Get the current service status for heartbeat publishing."""

From 57847521863d08f81a19358039ed5358cccd9110 Mon Sep 17 00:00:00 2001
From: Simon Heybrock <simon.heybrock@ess.eu>
Date: Fri, 20 Mar 2026 06:38:07 +0000
Subject: [PATCH 05/16] Strengthen adaptive batching scenario tests

Add precondition assertions to transition tests (de-escalation, backlog
draining, repeated shutter) to prevent false positives when the expected
intermediate state is never reached. Parameterize light-load tests across
utilization levels (20%-85%), jitter tests across RNG seeds, and severity
tests across four overload intensities with min/max level bounds.

Add stabilization-after-escalation test and consolidate related thin
tests. Extract cyclic_cost helper.

The precondition guards exposed a real issue: the old
test_deescalates_after_step_down_to_light_load never actually triggered
escalation (0.95s < 1s window), passing trivially. With a corrected cost
function, the test reveals a batcher limitation where idle poll cycles
between batches reset the consecutive-underloaded counter, preventing
de-escalation under continuous light load. Marked xfail(strict=True).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .../core/adaptive_batching_scenarios_test.py  | 413 ++++++++++++------
 1 file changed, 280 insertions(+), 133 deletions(-)

diff --git a/tests/core/adaptive_batching_scenarios_test.py b/tests/core/adaptive_batching_scenarios_test.py
index 04955896d..9b1c2358b 100644
--- a/tests/core/adaptive_batching_scenarios_test.py
+++ b/tests/core/adaptive_batching_scenarios_test.py
@@ -28,6 +28,8 @@
 from typing import Protocol
 from unittest.mock import patch
 
+import pytest
+
 from ess.livedata.core.message_batcher import (
     AdaptiveMessageBatcher,
     MessageBatcher,
@@ -56,13 +58,34 @@
         "max_backlog_s": 5.0,
         "max_final_backlog_s": 1.0,
     },
-    "severe_step_function": {
-        "min_level_reached": 2,
+    # -- Escalation reaches appropriate level for given severity ----------
+    # overhead_s=0.6, per_s=0.6 -> at 1s: 1.2 (overloaded), at 2s: 1.8 (OK)
+    "severity_moderate": {
+        "min_level": 1,
+        "max_level": 1,
     },
-    # -- No escalation when not needed ------------------------------------
-    "light_load": {
-        "max_level": 0,
+    # overhead_s=0.8, per_s=0.3 -> at 1s: 1.1, at 2s: 1.4 (OK)
+    "severity_overhead_dominated": {
+        "min_level": 1,
+        "max_level": 1,
+    },
+    # overhead_s=1.8, per_s=0.2 -> at 1s: 2.0, at 2s: 2.2, at 4s: 2.6 (OK)
+    "severity_severe": {
+        "min_level": 2,
+        "max_level": 3,
     },
+    # overhead_s=0.5, per_s=1.5 -> at 1s: 2.0, at 2s: 3.5, at 4s: 6.5, at 8s: 12.5
+    # Overloaded at every level — must reach max.
+    "severity_extreme": {
+        "min_level": 3,
+        "max_level": 3,
+    },
+    # -- No escalation when not needed ------------------------------------
+    # Parameterized across utilization levels.
+    "light_load_20pct": {"max_level": 0},
+    "light_load_60pct": {"max_level": 0},
+    "light_load_80pct": {"max_level": 0},
+    "light_load_85pct": {"max_level": 0},
     "gc_jitter": {
         "max_level": 0,
     },
@@ -76,18 +99,19 @@
     # -- Creeping overload ------------------------------------------------
     "creeping_overload": {
         "min_level_reached": 1,
-    },
-    "creeping_overload_backlog": {
         "max_backlog_s": 5.0,
     },
     "mild_creeping_overload": {
+        "min_level_reached": 1,
         "max_level": 1,
     },
     # -- De-escalation ----------------------------------------------------
     "deescalation_to_idle": {
+        "min_level_during_load": 1,
         "max_final_level": 0,
     },
     "deescalation_to_light_load": {
+        "min_level_during_load": 1,
         "max_final_level": 0,
     },
     # -- Realistic shutter ------------------------------------------------
@@ -97,12 +121,19 @@
         "max_backlog_s": 10.0,
     },
     "repeated_shutter_cycles": {
+        "min_level_reached": 1,
         "max_final_level": 0,
     },
     # -- Backlog draining -------------------------------------------------
     "backlog_drains": {
+        "min_level_reached": 1,
+        "min_peak_backlog_s": 0.1,
         "max_final_backlog_s": 1.0,
     },
+    "backlog_peaks_and_decreases": {
+        "min_level_reached": 1,
+        "min_peak_backlog_s": 0.1,
+    },
     # -- Processing-time awareness ----------------------------------------
     "fast_escalation_clear_overload": {
         "max_time_to_first_escalation_s": 5.0,
@@ -110,6 +141,10 @@
     "no_escalation_when_fits": {
         "max_level": 0,
     },
+    # -- Stabilization after escalation -----------------------------------
+    "stabilization_after_step": {
+        "max_oscillations": 0,
+    },
 }
 
 
@@ -121,9 +156,7 @@
 class ProcessingCostFn(Protocol):
     """Returns the processing time (seconds) for a batch of given window."""
 
-    def __call__(
-        self, batch_window_s: float, wall_time_s: float
-    ) -> float: ...
+    def __call__(self, batch_window_s: float, wall_time_s: float) -> float: ...
 
 
 @dataclass
@@ -169,11 +202,7 @@ def total_wall_time_s(self) -> float:
 
     def time_at_level(self, level: int) -> float:
         """Total wall time spent at a given level."""
-        return sum(
-            c.processing_time_s
-            for c in self.cycles
-            if c.level == level
-        )
+        return sum(c.processing_time_s for c in self.cycles if c.level == level)
 
     def level_changes(self) -> list[tuple[float, int, int]]:
         """List of (wall_time, old_level, new_level) transitions."""
@@ -199,15 +228,15 @@ def oscillation_count(self) -> int:
         changes = self.level_changes()
         if len(changes) < 2:
             return 0
-        directions = [
-            1 if new > old else -1 for _, old, new in changes
-        ]
+        directions = [1 if new > old else -1 for _, old, new in changes]
         return sum(
-            1
-            for i in range(1, len(directions))
-            if directions[i] != directions[i - 1]
+            1 for i in range(1, len(directions)) if directions[i] != directions[i - 1]
         )
 
+    def cycles_after(self, wall_time_s: float) -> list[CycleRecord]:
+        """All cycles with wall_time_s > the given time."""
+        return [c for c in self.cycles if c.wall_time_s > wall_time_s]
+
 
 class FakeClock:
     """Deterministic monotonic clock for simulation."""
@@ -250,13 +279,15 @@ def simulate(
             clock.advance(idle_poll_interval_s)
             batcher.report_batch(None, processing_time_s=0.0)
             level = _get_level(batcher)
-            result.cycles.append(CycleRecord(
-                wall_time_s=clock.now,
-                batch_window_s=window,
-                processing_time_s=0.0,
-                backlog_s=backlog_s,
-                level=level,
-            ))
+            result.cycles.append(
+                CycleRecord(
+                    wall_time_s=clock.now,
+                    batch_window_s=window,
+                    processing_time_s=0.0,
+                    backlog_s=backlog_s,
+                    level=level,
+                )
+            )
             continue
 
         clock.advance(processing_time)
@@ -272,22 +303,20 @@ def simulate(
                 n_idle = int(remaining_idle / idle_poll_interval_s)
                 for _ in range(n_idle):
                     clock.advance(idle_poll_interval_s)
-                    batcher.report_batch(
-                        None, processing_time_s=0.0
-                    )
+                    batcher.report_batch(None, processing_time_s=0.0)
 
-        batcher.report_batch(
-            100, processing_time_s=processing_time
-        )
+        batcher.report_batch(100, processing_time_s=processing_time)
 
         level = _get_level(batcher)
-        result.cycles.append(CycleRecord(
-            wall_time_s=clock.now,
-            batch_window_s=window,
-            processing_time_s=processing_time,
-            backlog_s=backlog_s,
-            level=level,
-        ))
+        result.cycles.append(
+            CycleRecord(
+                wall_time_s=clock.now,
+                batch_window_s=window,
+                processing_time_s=processing_time,
+                backlog_s=backlog_s,
+                level=level,
+            )
+        )
 
     return result
 
@@ -367,11 +396,7 @@ def creeping_cost(
 
     def cost(batch_window_s: float, wall_time_s: float) -> float:
         elapsed = max(0.0, wall_time_s - ramp_start_s)
-        frac = (
-            min(1.0, elapsed / ramp_duration_s)
-            if ramp_duration_s > 0
-            else 1.0
-        )
+        frac = min(1.0, elapsed / ramp_duration_s) if ramp_duration_s > 0 else 1.0
         rate_range = per_second_cost_end - per_second_cost_start
         per_s = per_second_cost_start + frac * rate_range
         base = overhead_s + per_s * batch_window_s
@@ -383,6 +408,24 @@ def cost(batch_window_s: float, wall_time_s: float) -> float:
     return cost
 
 
+def cyclic_cost(
+    on_duration_s: float,
+    off_duration_s: float,
+    on_cost: ProcessingCostFn,
+    off_cost: ProcessingCostFn,
+) -> ProcessingCostFn:
+    """Alternating on/off cost function with configurable duty cycle."""
+    period = on_duration_s + off_duration_s
+
+    def cost(batch_window_s: float, wall_time_s: float) -> float:
+        cycle_pos = wall_time_s % period
+        if cycle_pos < on_duration_s:
+            return on_cost(batch_window_s, wall_time_s)
+        return off_cost(batch_window_s, wall_time_s)
+
+    return cost
+
+
 # ---------------------------------------------------------------------------
 # Helpers
 # ---------------------------------------------------------------------------
@@ -394,9 +437,7 @@ def run_scenario(
     cost_fn: ProcessingCostFn,
 ) -> SimulationResult:
     clock = FakeClock()
-    with patch(
-        'ess.livedata.core.message_batcher.time.monotonic', clock
-    ):
+    with patch('ess.livedata.core.message_batcher.time.monotonic', clock):
         return simulate(batcher, duration_s, cost_fn, clock)
 
 
@@ -420,7 +461,7 @@ def test_escalates_within_bounded_time(self):
         batcher = make_default_batcher()
 
         # 10s idle, then overhead-dominated load with jitter
-        # At 1s window: 0.8 + 0.3 = 1.1s → overloaded
+        # At 1s window: 0.8 + 0.3 = 1.1s -> overloaded
         cost = step_function_cost(
             step_time_s=10.0,
             before=idle_cost(),
@@ -454,13 +495,15 @@ def test_limits_backlog(self):
         cost = step_function_cost(
             step_time_s=5.0,
             before=idle_cost(),
-            after=constant_overhead_cost(
-                overhead_s=0.6, per_second_cost=0.6
-            ),
+            after=constant_overhead_cost(overhead_s=0.6, per_second_cost=0.6),
         )
 
         result = run_scenario(batcher, 120.0, cost)
 
+        assert result.max_level >= 1, (
+            "Precondition: load must trigger escalation for backlog test "
+            "to be meaningful"
+        )
         assert result.max_backlog_s < lim["max_backlog_s"], (
             f"Backlog reached {result.max_backlog_s:.1f}s "
             f"(limit: {lim['max_backlog_s']}s)"
@@ -470,53 +513,133 @@ def test_limits_backlog(self):
             f"(limit: {lim['max_final_backlog_s']}s)"
         )
 
-    def test_severe_overload_reaches_adequate_level(self):
-        """Under severe overload, the batcher must reach a high enough level.
+    @pytest.mark.parametrize(
+        ("overhead_s", "per_second_cost", "limits_key"),
+        [
+            pytest.param(
+                0.6,
+                0.6,
+                "severity_moderate",
+                id="moderate: overhead=0.6 per_s=0.6",
+            ),
+            pytest.param(
+                0.8,
+                0.3,
+                "severity_overhead_dominated",
+                id="overhead-dominated: overhead=0.8 per_s=0.3",
+            ),
+            pytest.param(
+                1.8,
+                0.2,
+                "severity_severe",
+                id="severe: overhead=1.8 per_s=0.2",
+            ),
+            pytest.param(
+                0.5,
+                1.5,
+                "severity_extreme",
+                id="extreme: overhead=0.5 per_s=1.5",
+            ),
+        ],
+    )
+    def test_reaches_appropriate_level_for_severity(
+        self, overhead_s, per_second_cost, limits_key
+    ):
+        """The batcher must reach an appropriate level for the overload severity,
+        without over-escalating.
 
-        At 1s: 1.8 + 0.2 = 2.0s (2x overloaded).
-        At 2s: 1.8 + 0.4 = 2.2s (1.1x overloaded).
-        At 4s: 1.8 + 0.8 = 2.6s < 4s (OK).
+        The limits table specifies both a minimum and maximum level for each
+        severity, ensuring the response is proportional.
         """
-        lim = LIMITS["severe_step_function"]
+        lim = LIMITS[limits_key]
         batcher = make_default_batcher()
 
         cost = step_function_cost(
             step_time_s=5.0,
             before=idle_cost(),
             after=constant_overhead_cost(
-                overhead_s=1.8, per_second_cost=0.2
+                overhead_s=overhead_s, per_second_cost=per_second_cost
             ),
         )
 
         result = run_scenario(batcher, 120.0, cost)
-        assert result.max_level >= lim["min_level_reached"], (
-            f"Only reached level {result.max_level} "
-            f"(need >= {lim['min_level_reached']})"
+        assert result.max_level >= lim["min_level"], (
+            f"Only reached level {result.max_level} (need >= {lim['min_level']})"
+        )
+        assert result.max_level <= lim["max_level"], (
+            f"Over-escalated to level {result.max_level} (limit: {lim['max_level']})"
+        )
+
+    def test_stabilizes_after_escalation(self):
+        """After reaching the correct level, the batcher must not oscillate."""
+        lim = LIMITS["stabilization_after_step"]
+        batcher = make_default_batcher()
+
+        # At 1s: 0.8+0.3=1.1 (overloaded). At 2s: 0.8+0.6=1.4 (OK).
+        cost = step_function_cost(
+            step_time_s=5.0,
+            before=idle_cost(),
+            after=constant_overhead_cost(overhead_s=0.8, per_second_cost=0.3),
+        )
+
+        result = run_scenario(batcher, 120.0, cost)
+
+        assert result.max_level >= 1, "Precondition: must have escalated"
+        # After the initial transient, the level should be stable.
+        late_cycles = result.cycles_after(60.0)
+        assert late_cycles, "Simulation too short for stabilization check"
+        late_levels = {c.level for c in late_cycles}
+        assert len(late_levels) == 1, (
+            f"Not stabilized: levels {sorted(late_levels)} observed "
+            f"in second half of simulation"
+        )
+        assert result.oscillation_count() <= lim["max_oscillations"], (
+            f"Oscillated {result.oscillation_count()} times "
+            f"(limit: {lim['max_oscillations']})"
         )
 
 
 class TestNoEscalationWhenNotNeeded:
     """The batcher must not escalate when the system keeps up."""
 
-    def test_no_escalation_under_light_load(self):
-        """Fast processing should never trigger escalation."""
-        lim = LIMITS["light_load"]
+    @pytest.mark.parametrize(
+        ("overhead_s", "per_second_cost", "limits_key"),
+        [
+            pytest.param(0.1, 0.1, "light_load_20pct", id="20% utilization"),
+            pytest.param(0.3, 0.3, "light_load_60pct", id="60% utilization"),
+            pytest.param(0.4, 0.4, "light_load_80pct", id="80% utilization"),
+            pytest.param(0.3, 0.55, "light_load_85pct", id="85% utilization"),
+        ],
+    )
+    def test_no_escalation_under_light_load(
+        self, overhead_s, per_second_cost, limits_key
+    ):
+        """Processing that fits within the window should never trigger escalation,
+        even at high utilization.
+        """
+        lim = LIMITS[limits_key]
         batcher = make_default_batcher()
         cost = constant_overhead_cost(
-            overhead_s=0.1, per_second_cost=0.1
+            overhead_s=overhead_s, per_second_cost=per_second_cost
         )
 
         result = run_scenario(batcher, 60.0, cost)
         assert result.max_level <= lim["max_level"], (
-            f"Escalated to level {result.max_level} under light load "
+            f"Escalated to level {result.max_level} at "
+            f"{overhead_s + per_second_cost:.0%} utilization "
             f"(limit: {lim['max_level']})"
         )
 
-    def test_no_escalation_with_gc_jitter(self):
+    @pytest.mark.parametrize(
+        "seed",
+        [pytest.param(s, id=f"seed={s}") for s in (42, 999, 12345)],
+    )
+    def test_no_escalation_with_gc_jitter(self, seed):
         """Occasional GC/scheduling spikes should not cause escalation.
 
         Processing is fast on average (0.3s) but with significant jitter
-        that occasionally exceeds the 1s window.
+        that occasionally exceeds the 1s window.  Tested with multiple RNG
+        seeds to avoid seed-dependent false confidence.
         """
         lim = LIMITS["gc_jitter"]
         batcher = make_default_batcher()
@@ -524,13 +647,13 @@ def test_no_escalation_with_gc_jitter(self):
             overhead_s=0.2,
             per_second_cost=0.1,
             jitter_fraction=0.5,
-            rng=random.Random(999),
+            rng=random.Random(seed),
         )
 
         result = run_scenario(batcher, 120.0, cost)
         assert result.max_level <= lim["max_level"], (
             f"Escalated to level {result.max_level} from jitter alone "
-            f"(limit: {lim['max_level']})"
+            f"(seed={seed}, limit: {lim['max_level']})"
         )
 
 
@@ -543,9 +666,7 @@ def test_no_oscillation_at_steady_load(self):
         batcher = make_default_batcher()
 
         # Processing at ~90% of 1s window
-        cost = constant_overhead_cost(
-            overhead_s=0.5, per_second_cost=0.4
-        )
+        cost = constant_overhead_cost(overhead_s=0.5, per_second_cost=0.4)
 
         result = run_scenario(batcher, 120.0, cost)
         assert result.oscillation_count() <= lim["max_oscillations"], (
@@ -576,12 +697,14 @@ def test_limited_oscillation_at_boundary(self):
 class TestCreepingOverload:
     """Load that gradually increases past processing capacity."""
 
-    def test_eventually_escalates(self):
-        """As cost ramps up, the batcher must escalate."""
+    def test_eventually_escalates_and_limits_backlog(self):
+        """As cost ramps up, the batcher must escalate and keep backlog bounded.
+
+        Ramp from 0.5s to 1.3s at 1s window over 60s.
+        """
         lim = LIMITS["creeping_overload"]
         batcher = make_default_batcher()
 
-        # Ramp from 0.5s to 1.3s at 1s window over 60s
         cost = creeping_cost(
             overhead_s=0.3,
             per_second_cost_start=0.2,
@@ -594,27 +717,13 @@ def test_eventually_escalates(self):
             f"Only reached level {result.max_level} "
             f"(need >= {lim['min_level_reached']})"
         )
-
-    def test_limits_backlog(self):
-        """Backlog from creeping overload should remain bounded."""
-        lim = LIMITS["creeping_overload_backlog"]
-        batcher = make_default_batcher()
-
-        cost = creeping_cost(
-            overhead_s=0.3,
-            per_second_cost_start=0.2,
-            per_second_cost_end=1.0,
-            ramp_duration_s=60.0,
-        )
-
-        result = run_scenario(batcher, 120.0, cost)
         assert result.max_backlog_s < lim["max_backlog_s"], (
             f"Backlog reached {result.max_backlog_s:.1f}s "
             f"(limit: {lim['max_backlog_s']}s)"
         )
 
     def test_mild_overload_does_not_over_escalate(self):
-        """A slow creep to barely over 1x should not jump to max level.
+        """A slow creep to barely over 1x should escalate but not beyond level 1.
 
         overhead=0.3, per_s ramps 0.5 -> 0.8 over 60s.
         At 1s window: 0.3 + 0.8 = 1.1s -> needs escalation.
@@ -631,9 +740,12 @@ def test_mild_overload_does_not_over_escalate(self):
         )
 
         result = run_scenario(batcher, 180.0, cost)
+        assert result.max_level >= lim["min_level_reached"], (
+            f"Only reached level {result.max_level} — mild overload should "
+            f"still trigger escalation (need >= {lim['min_level_reached']})"
+        )
         assert result.max_level <= lim["max_level"], (
-            f"Over-escalated to level {result.max_level} "
-            f"(limit: {lim['max_level']})"
+            f"Over-escalated to level {result.max_level} (limit: {lim['max_level']})"
         )
 
 
@@ -641,7 +753,10 @@ class TestDeescalation:
     """The batcher must de-escalate when load subsides."""
 
     def test_deescalates_after_load_drops_to_idle(self):
-        """After high load followed by idle, must return to level 0."""
+        """After high load followed by idle, must return to level 0.
+
+        At 1s window: 0.8 + 0.3 = 1.1s (overloaded, triggers escalation).
+        """
         lim = LIMITS["deescalation_to_idle"]
         batcher = make_default_batcher()
 
@@ -651,24 +766,39 @@ def test_deescalates_after_load_drops_to_idle(self):
             before=idle_cost(),
             after=step_function_cost(
                 step_time_s=30.0,
-                before=constant_overhead_cost(
-                    overhead_s=0.8, per_second_cost=0.3
-                ),
+                before=constant_overhead_cost(overhead_s=0.8, per_second_cost=0.3),
                 after=idle_cost(),
             ),
         )
 
         result = run_scenario(batcher, 120.0, cost)
-        assert result.final_level <= lim["max_final_level"], (
-            f"Final level {result.final_level} "
-            f"(limit: {lim['max_final_level']})"
+        assert result.max_level >= lim["min_level_during_load"], (
+            f"Precondition: batcher must have escalated during high-load phase "
+            f"(reached level {result.max_level}, "
+            f"need >= {lim['min_level_during_load']})"
         )
-
+        assert result.final_level <= lim["max_final_level"], (
+            f"Final level {result.final_level} (limit: {lim['max_final_level']})"
+        )
+
+    @pytest.mark.xfail(
+        reason=(
+            "Known limitation: idle poll cycles between batches reset the "
+            "consecutive-underloaded counter, preventing de-escalation under "
+            "continuous light load. At level 1 (2s window) with 0.3s "
+            "processing, the 1.7s of spare time generates ~17 idle polls "
+            "that each reset _consecutive_underloaded to 0."
+        ),
+        strict=True,
+    )
     def test_deescalates_after_step_down_to_light_load(self):
         """When load decreases to light (but non-zero), must de-escalate.
 
         This requires the batcher to de-escalate even when data is flowing
         continuously, not just when the system goes fully idle.
+
+        Heavy phase at 1s window: 0.8 + 0.3 = 1.1s (overloaded).
+        Light phase at 1s window: 0.1 + 0.1 = 0.2s (well within budget).
         """
         lim = LIMITS["deescalation_to_light_load"]
         batcher = make_default_batcher()
@@ -679,19 +809,19 @@ def test_deescalates_after_step_down_to_light_load(self):
             before=idle_cost(),
             after=step_function_cost(
                 step_time_s=40.0,
-                before=constant_overhead_cost(
-                    overhead_s=0.9, per_second_cost=0.05
-                ),
-                after=constant_overhead_cost(
-                    overhead_s=0.1, per_second_cost=0.1
-                ),
+                before=constant_overhead_cost(overhead_s=0.8, per_second_cost=0.3),
+                after=constant_overhead_cost(overhead_s=0.1, per_second_cost=0.1),
             ),
         )
 
         result = run_scenario(batcher, 180.0, cost)
+        assert result.max_level >= lim["min_level_during_load"], (
+            f"Precondition: batcher must have escalated during heavy-load "
+            f"phase (reached level {result.max_level}, "
+            f"need >= {lim['min_level_during_load']})"
+        )
         assert result.final_level <= lim["max_final_level"], (
-            f"Final level {result.final_level} "
-            f"(limit: {lim['max_final_level']})"
+            f"Final level {result.final_level} (limit: {lim['max_final_level']})"
         )
 
 
@@ -739,7 +869,11 @@ def test_shutter_open_close_cycle(self):
         )
 
     def test_repeated_shutter_cycles(self):
-        """Multiple on/off cycles should not cause runaway escalation."""
+        """Multiple on/off cycles should not cause runaway escalation.
+
+        Each on-phase must trigger escalation, and each off-phase must
+        allow de-escalation back to base.
+        """
         lim = LIMITS["repeated_shutter_cycles"]
         batcher = make_default_batcher()
 
@@ -750,19 +884,21 @@ def test_repeated_shutter_cycles(self):
             jitter_fraction=0.1,
             rng=rng,
         )
-        low = idle_cost()
 
-        # 20s on / 20s off cycles
-        def cost(
-            batch_window_s: float, wall_time_s: float
-        ) -> float:
-            cycle_pos = wall_time_s % 40.0
-            if cycle_pos < 20.0:
-                return high(batch_window_s, wall_time_s)
-            return low(batch_window_s, wall_time_s)
+        cost = cyclic_cost(
+            on_duration_s=20.0,
+            off_duration_s=20.0,
+            on_cost=high,
+            off_cost=idle_cost(),
+        )
 
         result = run_scenario(batcher, 200.0, cost)
 
+        assert result.max_level >= lim["min_level_reached"], (
+            f"Precondition: at least one on-phase must trigger escalation "
+            f"(reached level {result.max_level}, "
+            f"need >= {lim['min_level_reached']})"
+        )
         assert result.final_level <= lim["max_final_level"], (
             f"Stuck at level {result.final_level} after repeated cycles "
             f"(limit: {lim['max_final_level']})"
@@ -781,12 +917,20 @@ def test_backlog_drains_after_escalation(self):
         lim = LIMITS["backlog_drains"]
         batcher = make_default_batcher()
 
-        cost = constant_overhead_cost(
-            overhead_s=0.6, per_second_cost=0.6
-        )
+        cost = constant_overhead_cost(overhead_s=0.6, per_second_cost=0.6)
 
         result = run_scenario(batcher, 120.0, cost)
 
+        assert result.max_level >= lim["min_level_reached"], (
+            f"Precondition: escalation must occur for backlog draining to be "
+            f"meaningful (reached level {result.max_level}, "
+            f"need >= {lim['min_level_reached']})"
+        )
+        assert result.max_backlog_s >= lim["min_peak_backlog_s"], (
+            f"Precondition: meaningful backlog must build up before draining "
+            f"(peak was {result.max_backlog_s:.2f}s, "
+            f"need >= {lim['min_peak_backlog_s']}s)"
+        )
         assert result.final_backlog_s < lim["max_final_backlog_s"], (
             f"Backlog not drained: {result.final_backlog_s:.1f}s "
             f"(limit: {lim['max_final_backlog_s']}s)"
@@ -798,14 +942,21 @@ def test_backlog_does_not_grow_indefinitely(self):
         At 1s: 0.8 + 0.3 = 1.1s (overloaded).
         At 2s: 0.8 + 0.6 = 1.4s (OK).
         """
+        lim = LIMITS["backlog_peaks_and_decreases"]
         batcher = make_default_batcher()
 
-        cost = constant_overhead_cost(
-            overhead_s=0.8, per_second_cost=0.3
-        )
+        cost = constant_overhead_cost(overhead_s=0.8, per_second_cost=0.3)
 
         result = run_scenario(batcher, 120.0, cost)
 
+        assert result.max_level >= lim["min_level_reached"], (
+            "Precondition: escalation must occur to test backlog draining"
+        )
+        assert result.max_backlog_s >= lim["min_peak_backlog_s"], (
+            f"Precondition: meaningful backlog must build up "
+            f"(peak was {result.max_backlog_s:.2f}s)"
+        )
+
         peak_idx = max(
             range(len(result.cycles)),
             key=lambda i: result.cycles[i].backlog_s,
@@ -825,9 +976,7 @@ def test_fast_escalation_on_clear_overload(self):
         batcher = make_default_batcher()
 
         # Clear overload: 1.5x the window at every level
-        cost = constant_overhead_cost(
-            overhead_s=0.0, per_second_cost=1.5
-        )
+        cost = constant_overhead_cost(overhead_s=0.0, per_second_cost=1.5)
 
         result = run_scenario(batcher, 60.0, cost)
 
@@ -843,9 +992,7 @@ def test_no_escalation_when_processing_fits(self):
         lim = LIMITS["no_escalation_when_fits"]
         batcher = make_default_batcher()
 
-        cost = constant_overhead_cost(
-            overhead_s=0.1, per_second_cost=0.3
-        )
+        cost = constant_overhead_cost(overhead_s=0.1, per_second_cost=0.3)
 
         result = run_scenario(batcher, 60.0, cost)
         assert result.max_level <= lim["max_level"], (

From 50d358943301e2e99450b9ef9e0f89c2002e59f1 Mon Sep 17 00:00:00 2001
From: Simon Heybrock <simon.heybrock@ess.eu>
Date: Fri, 20 Mar 2026 06:43:40 +0000
Subject: [PATCH 06/16] Add failing tests for de-escalation under continuous
 load

The batcher cannot de-escalate when data is flowing continuously
because idle poll cycles between batches reset the consecutive-
underloaded counter. Four tests cover the scope of this issue (three
new, plus the existing light-load test whose strict xfail marker is
removed):

- Light continuous load after heavy phase (level 1 -> 0)
- Moderate continuous load after heavy phase (level 1 -> 0)
- Multi-level de-escalation (level 2+ -> 0)
- Partial de-escalation (level 2+ -> 1)

All four fail, confirming the limitation. The higher the escalation
level, the worse it gets: a larger window means more spare time,
more idle polls, and less chance for the underload counter to
accumulate.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .../core/adaptive_batching_scenarios_test.py  | 115 ++++++++++++++++--
 1 file changed, 105 insertions(+), 10 deletions(-)

diff --git a/tests/core/adaptive_batching_scenarios_test.py b/tests/core/adaptive_batching_scenarios_test.py
index 9b1c2358b..0d806edb7 100644
--- a/tests/core/adaptive_batching_scenarios_test.py
+++ b/tests/core/adaptive_batching_scenarios_test.py
@@ -114,6 +114,18 @@
         "min_level_during_load": 1,
         "max_final_level": 0,
     },
+    "deescalation_moderate_load": {
+        "min_level_during_load": 1,
+        "max_final_level": 0,
+    },
+    "multi_level_deescalation": {
+        "min_level_during_load": 2,
+        "max_final_level": 0,
+    },
+    "partial_deescalation": {
+        "min_level_during_load": 2,
+        "max_final_level": 1,
+    },
     # -- Realistic shutter ------------------------------------------------
     "shutter_open_close": {
         "min_level_reached": 1,
@@ -781,16 +793,6 @@ def test_deescalates_after_load_drops_to_idle(self):
             f"Final level {result.final_level} (limit: {lim['max_final_level']})"
         )
 
-    @pytest.mark.xfail(
-        reason=(
-            "Known limitation: idle poll cycles between batches reset the "
-            "consecutive-underloaded counter, preventing de-escalation under "
-            "continuous light load. At level 1 (2s window) with 0.3s "
-            "processing, the 1.7s of spare time generates ~17 idle polls "
-            "that each reset _consecutive_underloaded to 0."
-        ),
-        strict=True,
-    )
     def test_deescalates_after_step_down_to_light_load(self):
         """When load decreases to light (but non-zero), must de-escalate.
 
@@ -824,6 +826,99 @@ def test_deescalates_after_step_down_to_light_load(self):
             f"Final level {result.final_level} (limit: {lim['max_final_level']})"
         )
 
+    def test_deescalates_under_moderate_continuous_load(self):
+        """De-escalation must work when processing fills a moderate fraction
+        of the escalated window, not just under near-idle conditions.
+
+        Heavy phase at 1s window: 0.8 + 0.3 = 1.1s (overloaded, escalate to level 1).
+        Moderate phase at 2s window: 0.3 + 0.3*2 = 0.9s (45% utilization, headroom).
+        Moderate phase at 1s window: 0.3 + 0.3 = 0.6s (fits at base level).
+        """
+        lim = LIMITS["deescalation_moderate_load"]
+        batcher = make_default_batcher()
+
+        cost = step_function_cost(
+            step_time_s=0.0,
+            before=idle_cost(),
+            after=step_function_cost(
+                step_time_s=40.0,
+                before=constant_overhead_cost(overhead_s=0.8, per_second_cost=0.3),
+                after=constant_overhead_cost(overhead_s=0.3, per_second_cost=0.3),
+            ),
+        )
+
+        result = run_scenario(batcher, 180.0, cost)
+        assert result.max_level >= lim["min_level_during_load"], (
+            f"Precondition: batcher must have escalated during heavy-load "
+            f"phase (reached level {result.max_level}, "
+            f"need >= {lim['min_level_during_load']})"
+        )
+        assert result.final_level <= lim["max_final_level"], (
+            f"Final level {result.final_level} (limit: {lim['max_final_level']})"
+        )
+
+    def test_multi_level_deescalation(self):
+        """After reaching level 2+, a drop to light load should step back
+        through all levels to 0.
+
+        Heavy phase at 1s: 1.8 + 0.2 = 2.0s (2x overloaded, needs level 2+).
+        At 4s: 1.8 + 0.8 = 2.6s (OK, escalation stops at level 2).
+        Light phase at 4s: 0.1 + 0.4 = 0.5s (well within any window).
+        """
+        lim = LIMITS["multi_level_deescalation"]
+        batcher = make_default_batcher()
+
+        cost = step_function_cost(
+            step_time_s=0.0,
+            before=idle_cost(),
+            after=step_function_cost(
+                step_time_s=60.0,
+                before=constant_overhead_cost(overhead_s=1.8, per_second_cost=0.2),
+                after=constant_overhead_cost(overhead_s=0.1, per_second_cost=0.1),
+            ),
+        )
+
+        result = run_scenario(batcher, 240.0, cost)
+        assert result.max_level >= lim["min_level_during_load"], (
+            f"Precondition: must reach level {lim['min_level_during_load']}+ "
+            f"during heavy phase (reached {result.max_level})"
+        )
+        assert result.final_level <= lim["max_final_level"], (
+            f"Final level {result.final_level} after load dropped "
+            f"(limit: {lim['max_final_level']})"
+        )
+
+    def test_partial_deescalation(self):
+        """Load drops from severe to moderate: should settle at level 1, not
+        stay stuck at the peak level.
+
+        Severe phase at 1s: 1.8 + 0.2 = 2.0s (needs level 2).
+        Moderate phase at 1s: 0.6 + 0.6 = 1.2s (needs level 1).
+        Moderate phase at 2s: 0.6 + 1.2 = 1.8s (fits at level 1).
+        """
+        lim = LIMITS["partial_deescalation"]
+        batcher = make_default_batcher()
+
+        cost = step_function_cost(
+            step_time_s=0.0,
+            before=idle_cost(),
+            after=step_function_cost(
+                step_time_s=60.0,
+                before=constant_overhead_cost(overhead_s=1.8, per_second_cost=0.2),
+                after=constant_overhead_cost(overhead_s=0.6, per_second_cost=0.6),
+            ),
+        )
+
+        result = run_scenario(batcher, 240.0, cost)
+        assert result.max_level >= lim["min_level_during_load"], (
+            f"Precondition: must reach level {lim['min_level_during_load']}+ "
+            f"during severe phase (reached {result.max_level})"
+        )
+        assert result.final_level <= lim["max_final_level"], (
+            f"Final level {result.final_level} after load reduced "
+            f"(limit: {lim['max_final_level']})"
+        )
+
 
 class TestRealisticShutterScenario:
     """End-to-end shutter open/close simulation with noise."""

From b8735dc2ef121c9d773027842f0a8f8eaa90b8e3 Mon Sep 17 00:00:00 2001
From: Simon Heybrock <simon.heybrock@ess.eu>
Date: Fri, 20 Mar 2026 06:46:01 +0000
Subject: [PATCH 07/16] Fix de-escalation under continuous load

Idle poll cycles (report_batch(None)) between batches were resetting
the consecutive-overloaded and consecutive-underloaded counters.  At
higher escalation levels, the large batch window means most of each
cycle is spare time filled with idle polls, which prevented the
underload counter from ever reaching the de-escalation threshold.

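The fix: idle polls no longer reset consecutive counters.  Genuine
idleness is already handled by the wall-clock fallback path, and the
overload counter is properly reset by non-overloaded real batches.

The partial de-escalation scenario is adjusted (per_second_cost 0.6 ->
0.5) so that the moderate phase sits at 65% utilization at level 2 and
has headroom to de-escalate to level 1; at 0.6 it would sit at 75%.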

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 src/ess/livedata/core/message_batcher.py      |  9 +++++----
 .../core/adaptive_batching_scenarios_test.py  |  9 ++++++---
 tests/core/message_batcher_test.py            | 20 ++++++++++++++++---
 3 files changed, 28 insertions(+), 10 deletions(-)

diff --git a/src/ess/livedata/core/message_batcher.py b/src/ess/livedata/core/message_batcher.py
index 6c2372073..91273cd4c 100644
--- a/src/ess/livedata/core/message_batcher.py
+++ b/src/ess/livedata/core/message_batcher.py
@@ -233,10 +233,11 @@ def report_batch(
         processing_time_s: float = 0.0,
     ) -> None:
         if message_count is None:
-            # Idle cycle — reset both counters. Fall back to wall-clock
-            # de-escalation when data stops entirely.
-            self._consecutive_overloaded = 0
-            self._consecutive_underloaded = 0
+            # Idle cycle — no load signal, leave consecutive counters
+            # untouched.  Genuine idleness is handled by the wall-clock
+            # fallback below; resetting counters here would prevent
+            # de-escalation under continuous light load where idle polls
+            # between batches outnumber real reports.
             if self._level > 0 and self._last_nonempty_batch_time is not None:
                 idle_s = time.monotonic() - self._last_nonempty_batch_time
                 idle_windows = idle_s / self.batch_length_s
diff --git a/tests/core/adaptive_batching_scenarios_test.py b/tests/core/adaptive_batching_scenarios_test.py
index 0d806edb7..cc52abaf4 100644
--- a/tests/core/adaptive_batching_scenarios_test.py
+++ b/tests/core/adaptive_batching_scenarios_test.py
@@ -893,8 +893,11 @@ def test_partial_deescalation(self):
         stay stuck at the peak level.
 
         Severe phase at 1s: 1.8 + 0.2 = 2.0s (needs level 2).
-        Moderate phase at 1s: 0.6 + 0.6 = 1.2s (needs level 1).
-        Moderate phase at 2s: 0.6 + 1.2 = 1.8s (fits at level 1).
+        Moderate phase at 1s: 0.6 + 0.5 = 1.1s (overloaded, needs level 1).
+        Moderate phase at 2s: 0.6 + 1.0 = 1.6s (fits, 80% — no headroom to
+            de-escalate further).
+        Moderate phase at 4s: 0.6 + 2.0 = 2.6s (65% — has headroom, should
+            de-escalate from level 2).
         """
         lim = LIMITS["partial_deescalation"]
         batcher = make_default_batcher()
@@ -905,7 +908,7 @@ def test_partial_deescalation(self):
             after=step_function_cost(
                 step_time_s=60.0,
                 before=constant_overhead_cost(overhead_s=1.8, per_second_cost=0.2),
-                after=constant_overhead_cost(overhead_s=0.6, per_second_cost=0.6),
+                after=constant_overhead_cost(overhead_s=0.6, per_second_cost=0.5),
             ),
         )
 
diff --git a/tests/core/message_batcher_test.py b/tests/core/message_batcher_test.py
index 82a9cff52..4cc2e559d 100644
--- a/tests/core/message_batcher_test.py
+++ b/tests/core/message_batcher_test.py
@@ -494,21 +494,35 @@ def test_does_not_deescalate_below_zero(self):
             batcher.report_batch(None)
             assert batcher.state.level == 0
 
-    def test_idle_cycle_resets_overload_counter(self):
+    def test_underloaded_batch_resets_overload_counter(self):
         batcher = AdaptiveMessageBatcher(base_batch_length_s=1.0, max_level=2)
 
         # Almost reach escalation threshold
         for _ in range(ESCALATION_OVERLOAD_THRESHOLD - 1):
             batcher.report_batch(100, processing_time_s=1.5)
 
-        # One idle cycle resets
-        batcher.report_batch(None)
+        # One underloaded batch resets the overload counter
+        batcher.report_batch(100, processing_time_s=0.3)
 
         # Need full threshold again
         for _ in range(ESCALATION_OVERLOAD_THRESHOLD - 1):
             batcher.report_batch(100, processing_time_s=1.5)
         assert batcher.state.level == 0
 
+    def test_idle_cycles_do_not_reset_overload_counter(self):
+        batcher = AdaptiveMessageBatcher(base_batch_length_s=1.0, max_level=2)
+
+        # Almost reach escalation threshold
+        for _ in range(ESCALATION_OVERLOAD_THRESHOLD - 1):
+            batcher.report_batch(100, processing_time_s=1.5)
+
+        # Idle cycles (polling between batches) do not reset counters
+        batcher.report_batch(None)
+
+        # One more overloaded batch completes the threshold
+        batcher.report_batch(100, processing_time_s=1.5)
+        assert batcher.state.level == 1
+
     def test_non_empty_batch_resets_idle_timer(self):
         clock = FakeClock()
         batcher = AdaptiveMessageBatcher(base_batch_length_s=1.0, max_level=2)

From bd58a0bd111687f95f8fe90b599684bfcdfc675d Mon Sep 17 00:00:00 2001
From: Simon Heybrock <simon.heybrock@ess.eu>
Date: Fri, 20 Mar 2026 07:02:35 +0000
Subject: [PATCH 08/16] Add tests for de-escalation dead zone, sticky jitter,
 and time gaps
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Three blind spots in the scenario test suite:

Dead zone (70-100% utilization at escalated level): De-escalation
requires utilization below 70% of the current window. When processing
fills 70-100% of the escalated window, the batcher is neither
overloaded nor underloaded, so it stays stuck even if a lower level
would suffice. Test documents this as current behavior — if the
strategy is improved to probe lower levels, the expected final level
should change.

Jitter-induced sticky escalation: When mean processing equals the
batch window, jitter causes escalation (~25% chance of two consecutive
overloaded batches). At the escalated level, processing lands in the
dead zone, making escalation permanent. Test documents this.

Time-gap batches (message_count=0): The SimpleMessageBatcher can
return empty batches during data gaps. Tests verify these don't
disrupt ongoing escalation or de-escalation.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .../core/adaptive_batching_scenarios_test.py  | 183 ++++++++++++++++++
 1 file changed, 183 insertions(+)

diff --git a/tests/core/adaptive_batching_scenarios_test.py b/tests/core/adaptive_batching_scenarios_test.py
index cc52abaf4..cfa6ef59c 100644
--- a/tests/core/adaptive_batching_scenarios_test.py
+++ b/tests/core/adaptive_batching_scenarios_test.py
@@ -157,6 +157,27 @@
     "stabilization_after_step": {
         "max_oscillations": 0,
     },
+    # -- Dead zone (70-100% utilization at escalated level) ---------------
+    # Documents limitation: batcher cannot de-escalate when processing
+    # fills the dead zone, even if a lower level would suffice.
+    "dead_zone_stuck": {
+        "min_level_during_load": 2,
+        "min_final_level": 2,
+    },
+    # -- Jitter-induced sticky escalation ---------------------------------
+    # Jitter at the exact boundary causes escalation that becomes permanent
+    # because the escalated level lands in the dead zone.
+    "jitter_sticky_escalation": {
+        "min_level": 1,
+        "min_final_level": 1,
+    },
+    # -- Time-gap batches (message_count=0) -------------------------------
+    "time_gaps_during_escalation": {
+        "min_level_reached": 1,
+    },
+    "time_gaps_during_deescalation": {
+        "max_final_level": 0,
+    },
 }
 
 
@@ -1097,3 +1118,165 @@ def test_no_escalation_when_processing_fits(self):
             f"Escalated to {result.max_level} despite fitting "
             f"(limit: {lim['max_level']})"
         )
+
+
+class TestDeescalationDeadZone:
+    """The 70-100% utilization dead zone where de-escalation cannot trigger.
+
+    When processing fills 70-100% of the escalated window, it falls in the
+    "in between" zone: not overloaded (processing < window) and not
+    underloaded (processing >= 0.7 * window).  Both consecutive counters
+    are reset every cycle, so neither escalation nor de-escalation can
+    trigger — even if a lower level would handle the load fine.
+    """
+
+    def test_stuck_in_dead_zone_after_load_drop(self):
+        """After severe overload, a moderate load that lands in the dead zone
+        at the escalated level keeps the batcher stuck, even though a lower
+        level would work.
+
+        Severe phase (reaches level 2):
+            Level 0 (1s): 2.0 + 0.3 = 2.3s (overloaded).
+            Level 1 (2s): 2.0 + 0.6 = 2.6s (overloaded).
+            Level 2 (4s): 2.0 + 1.2 = 3.2s (80%, dead zone — stable).
+
+        Moderate phase (stuck at level 2):
+            Level 2 (4s): 0.5 + 2.4 = 2.9s (72.5%, dead zone — stuck).
+            Level 1 (2s): 0.5 + 1.2 = 1.7s (would fit at 85%).
+            Level 0 (1s): 0.5 + 0.6 = 1.1s (would be overloaded).
+        """
+        lim = LIMITS["dead_zone_stuck"]
+        batcher = make_default_batcher()
+
+        cost = step_function_cost(
+            step_time_s=0.0,
+            before=idle_cost(),
+            after=step_function_cost(
+                step_time_s=60.0,
+                before=constant_overhead_cost(overhead_s=2.0, per_second_cost=0.3),
+                after=constant_overhead_cost(overhead_s=0.5, per_second_cost=0.6),
+            ),
+        )
+
+        result = run_scenario(batcher, 240.0, cost)
+
+        assert result.max_level >= lim["min_level_during_load"], (
+            f"Precondition: must reach level {lim['min_level_during_load']}+ "
+            f"during severe phase (reached {result.max_level})"
+        )
+        # Documents the limitation: batcher stays at level 2 despite level 1
+        # being sufficient.  If the strategy is improved to probe lower levels,
+        # this assertion should change to max_final_level: 1.
+        assert result.final_level >= lim["min_final_level"], (
+            f"Final level {result.final_level} — expected to stay stuck "
+            f"at level {lim['min_final_level']}+ (dead zone)"
+        )
+
+
+class TestJitterInducedStickyEscalation:
+    """Jitter at the exact boundary can cause permanent escalation.
+
+    When mean processing equals the batch window, jitter causes roughly
+    half the batches to be overloaded.  Two consecutive overloaded batches
+    (~25% probability per pair) trigger escalation.  At the escalated level,
+    processing lands in the dead zone (70-100% utilization), preventing
+    de-escalation.  The batcher stays at level 1 permanently.
+    """
+
+    def test_jitter_at_boundary_causes_sticky_escalation(self):
+        """Mean processing = window with 10% jitter: escalates and stays.
+
+        At 1s window: 0.5 + 0.5 = 1.0s mean, jitter +/-10%.
+            ~50% of cycles are overloaded (processing > 1.0).
+            P(2 consecutive overloaded) ~ 25%, so escalation is very likely.
+
+        At 2s window: 0.5 + 1.0 = 1.5s mean (75% utilization).
+            In the dead zone (>70%), so de-escalation never triggers.
+        """
+        lim = LIMITS["jitter_sticky_escalation"]
+        batcher = make_default_batcher()
+
+        cost = constant_overhead_cost(
+            overhead_s=0.5,
+            per_second_cost=0.5,
+            jitter_fraction=0.1,
+            rng=random.Random(42),
+        )
+
+        result = run_scenario(batcher, 180.0, cost)
+
+        assert result.max_level >= lim["min_level"], (
+            f"Expected escalation from boundary jitter "
+            f"(reached level {result.max_level})"
+        )
+        # Documents the limitation: once escalated, stays at level 1 due to
+        # the dead zone.  If the strategy is improved to handle this case,
+        # the expected final level should be 0.
+        assert result.final_level >= lim["min_final_level"], (
+            f"Expected to stay at level {lim['min_final_level']}+ "
+            f"(dead zone prevents de-escalation)"
+        )
+
+
+class TestTimeGapBatches:
+    """Time-gap batches (message_count=0) should not disrupt adaptive behavior.
+
+    The ``SimpleMessageBatcher`` can return empty batches when there is a
+    time gap in the data stream.  The ``AdaptiveMessageBatcher`` treats
+    these as a no-op, which means they should not interfere with ongoing
+    escalation or de-escalation.
+    """
+
+    def test_time_gaps_do_not_disrupt_escalation(self):
+        """Interleaving empty (time-gap) batches with overloaded batches
+        should not prevent escalation.
+
+        Drives ``report_batch`` directly, alternating real overloaded batches
+        with time-gap reports (``message_count=0``) in between.
+        """
+        lim = LIMITS["time_gaps_during_escalation"]
+        clock = FakeClock()
+        batcher = make_default_batcher()
+
+        with patch('ess.livedata.core.message_batcher.time.monotonic', clock):
+            for _ in range(20):
+                # Overloaded real batch
+                clock.advance(1.5)
+                batcher.report_batch(100, processing_time_s=1.5)
+                # Time-gap empty batch (should be a no-op)
+                batcher.report_batch(0)
+
+        assert batcher.state.level >= lim["min_level_reached"], (
+            f"Time gaps prevented escalation: only reached level "
+            f"{batcher.state.level} (need >= {lim['min_level_reached']})"
+        )
+
+    def test_time_gaps_do_not_disrupt_deescalation(self):
+        """Interleaving empty (time-gap) batches with underloaded batches
+        should not prevent de-escalation.
+        """
+        lim = LIMITS["time_gaps_during_deescalation"]
+        clock = FakeClock()
+        batcher = make_default_batcher()
+
+        with patch('ess.livedata.core.message_batcher.time.monotonic', clock):
+            # Escalate to level 1
+            for _ in range(3):
+                window = batcher.batch_length_s
+                clock.advance(window * 1.5)
+                batcher.report_batch(100, processing_time_s=window * 1.5)
+            assert batcher.state.level >= 1, "Precondition: must escalate"
+
+            # Underloaded batches interleaved with time gaps
+            for _ in range(20):
+                window = batcher.batch_length_s
+                processing = window * 0.3
+                clock.advance(processing)
+                batcher.report_batch(100, processing_time_s=processing)
+                # Time-gap empty batch
+                batcher.report_batch(0)
+
+        assert batcher.state.level <= lim["max_final_level"], (
+            f"Time gaps prevented de-escalation: stuck at level "
+            f"{batcher.state.level} (limit: {lim['max_final_level']})"
+        )

From 218eeeba73033f1556ae877f141732d124037ab6 Mon Sep 17 00:00:00 2001
From: Simon Heybrock <simon.heybrock@ess.eu>
Date: Fri, 20 Mar 2026 08:00:05 +0000
Subject: [PATCH 09/16] Add cosmic background to shutter tests, test
 severe-to-cosmic path
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The shutter-closed phase is not idle: cosmic background produces a
continuous stream of ev44 messages with very few events. This means
wall-clock idle de-escalation never applies; the batcher must
de-escalate via the underload counter.

Update existing shutter tests to use cosmic background (overhead=0.2,
per_s=0.01) instead of idle_cost() for the off-phase, making the
simulation more realistic.

Add test_severe_overload_to_cosmic_background: after reaching level
2+ from severe overload, shutter close drops to cosmic background.
Verifies de-escalation through all levels back to 0 via the underload
path — the most operationally important recovery scenario.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .../core/adaptive_batching_scenarios_test.py  | 66 +++++++++++++++++--
 1 file changed, 59 insertions(+), 7 deletions(-)

diff --git a/tests/core/adaptive_batching_scenarios_test.py b/tests/core/adaptive_batching_scenarios_test.py
index cfa6ef59c..cbbed1f9c 100644
--- a/tests/core/adaptive_batching_scenarios_test.py
+++ b/tests/core/adaptive_batching_scenarios_test.py
@@ -136,6 +136,10 @@
         "min_level_reached": 1,
         "max_final_level": 0,
     },
+    "severe_to_cosmic_background": {
+        "min_level_during_load": 2,
+        "max_final_level": 0,
+    },
     # -- Backlog draining -------------------------------------------------
     "backlog_drains": {
         "min_level_reached": 1,
@@ -948,18 +952,22 @@ class TestRealisticShutterScenario:
     """End-to-end shutter open/close simulation with noise."""
 
     def test_shutter_open_close_cycle(self):
-        """Idle -> shutter open (high load) -> shutter close (idle).
+        """Cosmic background -> shutter open (high load) -> shutter close
+        (cosmic background).
 
         Must handle the full cycle: escalation, stable operation,
-        de-escalation back to base.
+        de-escalation back to base.  The shutter-closed phase is not idle:
+        cosmic background produces a continuous stream of ev44 messages
+        with very few events, resulting in overhead-dominated processing.
         """
         lim = LIMITS["shutter_open_close"]
         batcher = make_default_batcher()
 
         rng = random.Random(42)
+        cosmic = constant_overhead_cost(overhead_s=0.2, per_second_cost=0.01)
         cost = step_function_cost(
             step_time_s=10.0,
-            before=idle_cost(),
+            before=cosmic,
             after=step_function_cost(
                 step_time_s=70.0,
                 before=constant_overhead_cost(
@@ -968,7 +976,7 @@ def test_shutter_open_close_cycle(self):
                     jitter_fraction=0.15,
                     rng=rng,
                 ),
-                after=idle_cost(),
+                after=cosmic,
             ),
         )
 
@@ -990,8 +998,8 @@ def test_shutter_open_close_cycle(self):
     def test_repeated_shutter_cycles(self):
         """Multiple on/off cycles should not cause runaway escalation.
 
-        Each on-phase must trigger escalation, and each off-phase must
-        allow de-escalation back to base.
+        Each on-phase must trigger escalation, and each off-phase (cosmic
+        background) must allow de-escalation back to base.
         """
         lim = LIMITS["repeated_shutter_cycles"]
         batcher = make_default_batcher()
@@ -1003,12 +1011,13 @@ def test_repeated_shutter_cycles(self):
             jitter_fraction=0.1,
             rng=rng,
         )
+        cosmic = constant_overhead_cost(overhead_s=0.2, per_second_cost=0.01)
 
         cost = cyclic_cost(
             on_duration_s=20.0,
             off_duration_s=20.0,
             on_cost=high,
-            off_cost=idle_cost(),
+            off_cost=cosmic,
         )
 
         result = run_scenario(batcher, 200.0, cost)
@@ -1023,6 +1032,49 @@ def test_repeated_shutter_cycles(self):
             f"(limit: {lim['max_final_level']})"
         )
 
+    def test_severe_overload_to_cosmic_background(self):
+        """After severe overload reaching level 2+, shutter close drops load
+        to cosmic background.  Must de-escalate through all levels back to 0.
+
+        This is the most operationally important de-escalation path: ev44
+        messages keep flowing with very few events (cosmic rays), so the
+        system is never truly idle.  Wall-clock idle de-escalation does not
+        apply; the batcher must de-escalate via the underload counter.
+
+        Severe phase (overhead-dominated):
+            Level 0 (1s): 2.0 + 0.3 = 2.3s (overloaded).
+            Level 1 (2s): 2.0 + 0.6 = 2.6s (overloaded).
+            Level 2 (4s): 2.0 + 1.2 = 3.2s (80%, dead zone — stable).
+
+        Cosmic background phase (overhead-dominated, near-zero data cost):
+            Level 2 (4s): 0.2 + 0.04 = 0.24s (6% utilization).
+            Level 1 (2s): 0.2 + 0.02 = 0.22s (11% utilization).
+            Level 0 (1s): 0.2 + 0.01 = 0.21s (21% utilization).
+            All levels are well below the 70% headroom threshold.
+        """
+        lim = LIMITS["severe_to_cosmic_background"]
+        batcher = make_default_batcher()
+
+        cost = step_function_cost(
+            step_time_s=0.0,
+            before=idle_cost(),
+            after=step_function_cost(
+                step_time_s=60.0,
+                before=constant_overhead_cost(overhead_s=2.0, per_second_cost=0.3),
+                after=constant_overhead_cost(overhead_s=0.2, per_second_cost=0.01),
+            ),
+        )
+
+        result = run_scenario(batcher, 240.0, cost)
+        assert result.max_level >= lim["min_level_during_load"], (
+            f"Precondition: must reach level {lim['min_level_during_load']}+ "
+            f"during severe phase (reached {result.max_level})"
+        )
+        assert result.final_level <= lim["max_final_level"], (
+            f"Final level {result.final_level} after shutter close to cosmic "
+            f"background (limit: {lim['max_final_level']})"
+        )
+
 
 class TestBacklogDraining:
     """Once the batcher escalates, accumulated backlog should drain."""

From 92b5d178bc760fe47b8563f5d9970ef63fe6c3d2 Mon Sep 17 00:00:00 2001
From: Simon Heybrock <simon.heybrock@ess.eu>
Date: Fri, 20 Mar 2026 08:34:31 +0000
Subject: [PATCH 10/16] Replace 2x de-escalation with sqrt(2) half-steps

Escalation still doubles the window (+2 half-steps), but de-escalation
now reduces by a factor of 1/sqrt(2) (-1 half-step). Two de-escalation
steps undo one escalation, providing natural damping.

The batch window lives on a fixed grid of base * sqrt(2)^n values,
avoiding floating-point drift. The asymmetric step sizes allow:
- Faster convergence to the right level (smaller probing steps down)
- Reduced dead zone (can explore windows between the old 2x levels)
- Lower consecutive-underload threshold (3 instead of 5) since each
  step is safer

Tuning changes:
- DEESCALATION_HEADROOM_RATIO: 0.7 -> 0.75
- DEESCALATION_UNDERLOAD_THRESHOLD: 5 -> 3
- ESCALATION_HALF_STEPS replaces ESCALATION_LEVEL_JUMP

The dead zone test now shows the batcher de-escalates from level 4
(4s) to level 3 (2.83s), previously stuck at level 4 (4s).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 src/ess/livedata/core/message_batcher.py      |  54 +++++----
 .../core/adaptive_batching_scenarios_test.py  |  45 ++++----
 tests/core/message_batcher_test.py            | 103 ++++++++++--------
 3 files changed, 112 insertions(+), 90 deletions(-)

diff --git a/src/ess/livedata/core/message_batcher.py b/src/ess/livedata/core/message_batcher.py
index 91273cd4c..1085f5502 100644
--- a/src/ess/livedata/core/message_batcher.py
+++ b/src/ess/livedata/core/message_batcher.py
@@ -189,11 +189,13 @@ def _split_messages(
 
 
 ESCALATION_OVERLOAD_THRESHOLD = 2
-ESCALATION_LEVEL_JUMP = 1
-DEESCALATION_HEADROOM_RATIO = 0.7
-DEESCALATION_UNDERLOAD_THRESHOLD = 5
+ESCALATION_HALF_STEPS = 2
+DEESCALATION_HEADROOM_RATIO = 0.75
+DEESCALATION_UNDERLOAD_THRESHOLD = 3
 DEESCALATION_IDLE_WINDOWS = 3
 
+_SQRT2 = 2**0.5
+
 
 @dataclass(frozen=True)
 class AdaptiveBatcherState:
@@ -207,18 +209,21 @@ class AdaptiveMessageBatcher(MessageBatcher):
     """A message batcher that dynamically adjusts its batch length based on load.
 
     Wraps a ``SimpleMessageBatcher`` and uses processing-time feedback to detect
-    overload. When processing consistently exceeds the batch window, the batcher
-    escalates to a longer window (multiplicative increase). When processing
-    completes with significant headroom, it de-escalates (additive decrease).
-    Idle periods also trigger de-escalation via a wall-clock fallback.
+    overload.  When processing consistently exceeds the batch window, the batcher
+    escalates by doubling the window (+2 half-steps).  When processing completes
+    with headroom, it de-escalates by a factor of 1/sqrt(2) (-1 half-step).
 
-    Each escalation level doubles the batch window from the base length.
+    The asymmetric step sizes mean two de-escalation steps undo one escalation,
+    providing natural damping.  The batch window is always on the grid
+    ``base * sqrt(2)^n``, avoiding floating-point drift.
+
+    Idle periods also trigger de-escalation via a wall-clock fallback.
     """
 
     def __init__(self, base_batch_length_s: float = 1.0, max_level: int = 3) -> None:
         self._base_batch_length_s = base_batch_length_s
-        self._max_level = max_level
-        self._level = 0
+        self._max_half_steps = max_level * 2
+        self._half_step = 0
         self._consecutive_overloaded = 0
         self._consecutive_underloaded = 0
         self._last_nonempty_batch_time: float | None = None
@@ -238,11 +243,11 @@ def report_batch(
             # fallback below; resetting counters here would prevent
             # de-escalation under continuous light load where idle polls
             # between batches outnumber real reports.
-            if self._level > 0 and self._last_nonempty_batch_time is not None:
+            if self._half_step > 0 and self._last_nonempty_batch_time is not None:
                 idle_s = time.monotonic() - self._last_nonempty_batch_time
                 idle_windows = idle_s / self.batch_length_s
                 if idle_windows >= DEESCALATION_IDLE_WINDOWS:
-                    self._set_level(self._level - 1)
+                    self._set_half_step(self._half_step - 1)
                     self._last_nonempty_batch_time = time.monotonic()
         elif message_count == 0:
             # Empty batch from time gap — not a load signal
@@ -257,47 +262,48 @@ def report_batch(
                 self._consecutive_underloaded = 0
                 if (
                     self._consecutive_overloaded >= ESCALATION_OVERLOAD_THRESHOLD
-                    and self._level < self._max_level
+                    and self._half_step < self._max_half_steps
                 ):
-                    new_level = min(
-                        self._level + ESCALATION_LEVEL_JUMP, self._max_level
+                    new = min(
+                        self._half_step + ESCALATION_HALF_STEPS,
+                        self._max_half_steps,
                     )
-                    self._set_level(new_level)
+                    self._set_half_step(new)
                     self._consecutive_overloaded = 0
             elif processing_time_s < self.batch_length_s * DEESCALATION_HEADROOM_RATIO:
-                # Underloaded: significant headroom
+                # Underloaded: headroom available
                 self._consecutive_underloaded += 1
                 self._consecutive_overloaded = 0
                 if (
                     self._consecutive_underloaded >= DEESCALATION_UNDERLOAD_THRESHOLD
-                    and self._level > 0
+                    and self._half_step > 0
                 ):
-                    self._set_level(self._level - 1)
+                    self._set_half_step(self._half_step - 1)
                     self._consecutive_underloaded = 0
             else:
                 # In between — processing fits but without much headroom
                 self._consecutive_overloaded = 0
                 self._consecutive_underloaded = 0
 
-    def _set_level(self, new_level: int) -> None:
+    def _set_half_step(self, new_half_step: int) -> None:
         old_length = self.batch_length_s
-        self._level = new_level
+        self._half_step = new_half_step
         new_length = self.batch_length_s
         logger.warning(
             'adaptive_batch_level_change',
             old_batch_length_s=old_length,
             new_batch_length_s=new_length,
-            level=new_level,
+            level=self._half_step,
         )
         self._inner = SimpleMessageBatcher(batch_length_s=new_length)
 
     @property
     def batch_length_s(self) -> float:
-        return self._base_batch_length_s * (2**self._level)
+        return self._base_batch_length_s * _SQRT2**self._half_step
 
     @property
     def state(self) -> AdaptiveBatcherState:
         return AdaptiveBatcherState(
-            level=self._level,
+            level=self._half_step,
             batch_length_s=self.batch_length_s,
         )
diff --git a/tests/core/adaptive_batching_scenarios_test.py b/tests/core/adaptive_batching_scenarios_test.py
index cbbed1f9c..423476e51 100644
--- a/tests/core/adaptive_batching_scenarios_test.py
+++ b/tests/core/adaptive_batching_scenarios_test.py
@@ -59,26 +59,31 @@
         "max_final_backlog_s": 1.0,
     },
     # -- Escalation reaches appropriate level for given severity ----------
-    # overhead_s=0.6, per_s=0.6 -> at 1s: 1.2 (overloaded), at 2s: 1.8 (OK)
+    # Levels are half-steps: window = base * sqrt(2)^level.
+    # Escalation jumps +2 (x2), de-escalation drops -1 (x1/sqrt(2)).
+    #   level 0: 1.0s   level 3: 2.83s   level 6: 8.0s
+    #   level 1: 1.41s  level 4: 4.0s
+    #   level 2: 2.0s   level 5: 5.66s
+    #
+    # overhead_s=0.6, per_s=0.6 -> at 1s: 1.2, at 2s: 1.8 (OK at level 2)
     "severity_moderate": {
-        "min_level": 1,
-        "max_level": 1,
+        "min_level": 2,
+        "max_level": 2,
     },
-    # overhead_s=0.8, per_s=0.3 -> at 1s: 1.1, at 2s: 1.4 (OK)
+    # overhead_s=0.8, per_s=0.3 -> at 1s: 1.1, at 1.41s: 1.22 (OK at level 1)
     "severity_overhead_dominated": {
         "min_level": 1,
-        "max_level": 1,
+        "max_level": 2,
     },
-    # overhead_s=1.8, per_s=0.2 -> at 1s: 2.0, at 2s: 2.2, at 4s: 2.6 (OK)
+    # overhead_s=1.8, per_s=0.2 -> needs level 3+ (2.83s window: 2.37s OK)
     "severity_severe": {
-        "min_level": 2,
-        "max_level": 3,
+        "min_level": 3,
+        "max_level": 5,
     },
-    # overhead_s=0.5, per_s=1.5 -> at 1s: 2.0, at 2s: 3.5, at 4s: 6.5, at 8s: 12.5
-    # Overloaded at every level — must reach max.
+    # overhead_s=0.5, per_s=1.5 -> overloaded at every level, must reach max.
     "severity_extreme": {
-        "min_level": 3,
-        "max_level": 3,
+        "min_level": 6,
+        "max_level": 6,
     },
     # -- No escalation when not needed ------------------------------------
     # Parameterized across utilization levels.
@@ -103,7 +108,7 @@
     },
     "mild_creeping_overload": {
         "min_level_reached": 1,
-        "max_level": 1,
+        "max_level": 2,
     },
     # -- De-escalation ----------------------------------------------------
     "deescalation_to_idle": {
@@ -119,12 +124,12 @@
         "max_final_level": 0,
     },
     "multi_level_deescalation": {
-        "min_level_during_load": 2,
+        "min_level_during_load": 3,
         "max_final_level": 0,
     },
     "partial_deescalation": {
-        "min_level_during_load": 2,
-        "max_final_level": 1,
+        "min_level_during_load": 3,
+        "max_final_level": 2,
     },
     # -- Realistic shutter ------------------------------------------------
     "shutter_open_close": {
@@ -137,7 +142,7 @@
         "max_final_level": 0,
     },
     "severe_to_cosmic_background": {
-        "min_level_during_load": 2,
+        "min_level_during_load": 3,
         "max_final_level": 0,
     },
     # -- Backlog draining -------------------------------------------------
@@ -159,14 +164,14 @@
     },
     # -- Stabilization after escalation -----------------------------------
     "stabilization_after_step": {
-        "max_oscillations": 0,
+        "max_oscillations": 1,
     },
     # -- Dead zone (70-100% utilization at escalated level) ---------------
     # Documents limitation: batcher cannot de-escalate when processing
     # fills the dead zone, even if a lower level would suffice.
     "dead_zone_stuck": {
-        "min_level_during_load": 2,
-        "min_final_level": 2,
+        "min_level_during_load": 4,
+        "min_final_level": 3,
     },
     # -- Jitter-induced sticky escalation ---------------------------------
     # Jitter at the exact boundary causes escalation that becomes permanent
diff --git a/tests/core/message_batcher_test.py b/tests/core/message_batcher_test.py
index 4cc2e559d..75106365b 100644
--- a/tests/core/message_batcher_test.py
+++ b/tests/core/message_batcher_test.py
@@ -2,6 +2,8 @@
 # Copyright (c) 2025 Scipp contributors (https://github.com/scipp)
 from unittest.mock import patch
 
+import pytest
+
 from ess.livedata.core.message import Message, StreamId, StreamKind
 from ess.livedata.core.message_batcher import (
     DEESCALATION_HEADROOM_RATIO,
@@ -436,8 +438,8 @@ def test_escalates_after_consecutive_overloaded_batches(self):
         for _ in range(ESCALATION_OVERLOAD_THRESHOLD):
             batcher.report_batch(100, processing_time_s=1.5)
 
-        assert batcher.state.level == 1
-        assert batcher.state.batch_length_s == 2.0
+        assert batcher.state.level == 2
+        assert batcher.state.batch_length_s == pytest.approx(2.0, rel=1e-5)
 
     def test_does_not_escalate_before_threshold(self):
         batcher = AdaptiveMessageBatcher(base_batch_length_s=1.0, max_level=2)
@@ -458,32 +460,32 @@ def test_does_not_escalate_when_processing_fits(self):
     def test_escalation_capped_at_max_level(self):
         batcher = AdaptiveMessageBatcher(base_batch_length_s=1.0, max_level=2)
 
-        _escalate_to_level(batcher, 2)
-        assert batcher.state.level == 2
-        assert batcher.state.batch_length_s == 4.0
+        _escalate_to_level(batcher, 4)
+        assert batcher.state.level == 4
+        assert batcher.state.batch_length_s == pytest.approx(4.0, rel=1e-5)
 
         # Further overloaded batches should not exceed max
         for _ in range(ESCALATION_OVERLOAD_THRESHOLD * 2):
             batcher.report_batch(100, processing_time_s=10.0)
-        assert batcher.state.level == 2
+        assert batcher.state.level == 4
 
     def test_deescalates_after_idle_duration(self):
         clock = FakeClock()
         batcher = AdaptiveMessageBatcher(base_batch_length_s=1.0, max_level=2)
 
         with patch('ess.livedata.core.message_batcher.time.monotonic', clock):
-            _escalate_to_level(batcher, 1)
-            assert batcher.state.level == 1
+            _escalate_to_level(batcher, 2)
+            assert batcher.state.level == 2
 
             # Idle for just under the threshold — no de-escalation
             clock.advance(DEESCALATION_IDLE_WINDOWS * 2.0 - 0.1)
             batcher.report_batch(None)
-            assert batcher.state.level == 1
+            assert batcher.state.level == 2
 
             # Cross the threshold
             clock.advance(0.2)
             batcher.report_batch(None)
-            assert batcher.state.level == 0
+            assert batcher.state.level == 1
 
     def test_does_not_deescalate_below_zero(self):
         clock = FakeClock()
@@ -521,20 +523,20 @@ def test_idle_cycles_do_not_reset_overload_counter(self):
 
         # One more overloaded batch completes the threshold
         batcher.report_batch(100, processing_time_s=1.5)
-        assert batcher.state.level == 1
+        assert batcher.state.level == 2
 
     def test_non_empty_batch_resets_idle_timer(self):
         clock = FakeClock()
         batcher = AdaptiveMessageBatcher(base_batch_length_s=1.0, max_level=2)
 
         with patch('ess.livedata.core.message_batcher.time.monotonic', clock):
-            _escalate_to_level(batcher, 1)
-            assert batcher.state.level == 1
+            _escalate_to_level(batcher, 2)
+            assert batcher.state.level == 2
 
             # Almost reach de-escalation time
             clock.advance(DEESCALATION_IDLE_WINDOWS * 2.0 - 0.1)
             batcher.report_batch(None)
-            assert batcher.state.level == 1
+            assert batcher.state.level == 2
 
             # A non-empty batch resets the idle timer
             batcher.report_batch(100, processing_time_s=1.5)
@@ -542,7 +544,7 @@ def test_non_empty_batch_resets_idle_timer(self):
             # Now need the full idle duration again
             clock.advance(DEESCALATION_IDLE_WINDOWS * 2.0 - 0.1)
             batcher.report_batch(None)
-            assert batcher.state.level == 1
+            assert batcher.state.level == 2
 
     def test_empty_batches_excluded_from_counters(self):
         batcher = AdaptiveMessageBatcher(base_batch_length_s=1.0, max_level=2)
@@ -553,7 +555,7 @@ def test_empty_batches_excluded_from_counters(self):
             batcher.report_batch(0)
 
         batcher.report_batch(100, processing_time_s=1.5)
-        assert batcher.state.level == 1
+        assert batcher.state.level == 2
 
     def test_empty_batches_do_not_contribute_to_escalation(self):
         batcher = AdaptiveMessageBatcher(base_batch_length_s=1.0, max_level=2)
@@ -565,57 +567,66 @@ def test_empty_batches_do_not_contribute_to_escalation(self):
     def test_deescalates_under_sustained_light_load(self):
         """De-escalation via underload: processing uses less than headroom ratio."""
         batcher = AdaptiveMessageBatcher(base_batch_length_s=1.0, max_level=2)
-        _escalate_to_level(batcher, 1)
-        assert batcher.state.level == 1
+        _escalate_to_level(batcher, 2)
+        assert batcher.state.level == 2
 
-        # Report underloaded batches (processing < 70% of 2s window)
+        # Report underloaded batches (processing < 75% of 2s window)
         underloaded_time = batcher.batch_length_s * DEESCALATION_HEADROOM_RATIO - 0.1
         for _ in range(DEESCALATION_UNDERLOAD_THRESHOLD):
             batcher.report_batch(100, processing_time_s=underloaded_time)
 
-        assert batcher.state.level == 0
+        assert batcher.state.level == 1
 
     def test_does_not_deescalate_without_enough_headroom(self):
         """No de-escalation when processing uses most of the window."""
         batcher = AdaptiveMessageBatcher(base_batch_length_s=1.0, max_level=2)
-        _escalate_to_level(batcher, 1)
-        assert batcher.state.level == 1
+        _escalate_to_level(batcher, 2)
+        assert batcher.state.level == 2
+        window = batcher.batch_length_s
 
-        # Processing at 80% of 2s window — above headroom threshold
+        # Processing at 80% of window — above headroom threshold (75%)
         for _ in range(DEESCALATION_UNDERLOAD_THRESHOLD * 3):
-            batcher.report_batch(100, processing_time_s=1.6)
+            batcher.report_batch(100, processing_time_s=window * 0.8)
 
-        assert batcher.state.level == 1
+        assert batcher.state.level == 2
 
     def test_multi_level_escalation_and_deescalation(self):
         clock = FakeClock()
         batcher = AdaptiveMessageBatcher(base_batch_length_s=1.0, max_level=3)
 
         with patch('ess.livedata.core.message_batcher.time.monotonic', clock):
-            _escalate_to_level(batcher, 2)
-            assert batcher.state.level == 2
-            assert batcher.state.batch_length_s == 4.0
-
-            # De-escalate via idle — one level at a time
-            clock.advance(DEESCALATION_IDLE_WINDOWS * 4.0)
+            _escalate_to_level(batcher, 4)
+            assert batcher.state.level == 4
+            current_length = batcher.batch_length_s
+            assert current_length == pytest.approx(4.0, rel=1e-5)
+
+            # De-escalate via idle — one half-step at a time
+            # Report idle with enough elapsed time to trigger de-escalation
+            # Add small epsilon to avoid floating-point comparison issues
+            clock.advance(DEESCALATION_IDLE_WINDOWS * current_length + 0.01)
             batcher.report_batch(None)
-            assert batcher.state.level == 1
-            assert batcher.state.batch_length_s == 2.0
+            assert batcher.state.level == 3
+            # _last_nonempty_batch_time was reset when we de-escalated above,
+            # so we can measure the next idle period from here
+
+            current_length = batcher.batch_length_s
+            assert current_length == pytest.approx(2.828, rel=1e-2)
 
-            clock.advance(DEESCALATION_IDLE_WINDOWS * 2.0)
+            # Report idle again to trigger the next de-escalation
+            clock.advance(DEESCALATION_IDLE_WINDOWS * current_length + 0.01)
             batcher.report_batch(None)
-            assert batcher.state.level == 0
-            assert batcher.state.batch_length_s == 1.0
+            assert batcher.state.level == 2
+            assert batcher.state.batch_length_s == pytest.approx(2.0, rel=1e-5)
 
     def test_state_reflects_custom_base_length(self):
         batcher = AdaptiveMessageBatcher(base_batch_length_s=0.5, max_level=2)
-        assert batcher.state.batch_length_s == 0.5
-
-        _escalate_to_level(batcher, 1)
-        assert batcher.state.batch_length_s == 1.0
+        assert batcher.state.batch_length_s == pytest.approx(0.5, rel=1e-5)
 
         _escalate_to_level(batcher, 2)
-        assert batcher.state.batch_length_s == 2.0
+        assert batcher.state.batch_length_s == pytest.approx(1.0, rel=1e-5)
+
+        _escalate_to_level(batcher, 4)
+        assert batcher.state.batch_length_s == pytest.approx(2.0, rel=1e-5)
 
     def test_no_oscillation_when_barely_keeping_up(self):
         """At 8s window, rapid idle cycles between batches should not de-escalate."""
@@ -623,8 +634,8 @@ def test_no_oscillation_when_barely_keeping_up(self):
         batcher = AdaptiveMessageBatcher(base_batch_length_s=1.0, max_level=3)
 
         with patch('ess.livedata.core.message_batcher.time.monotonic', clock):
-            _escalate_to_level(batcher, 3)
-            assert batcher.state.level == 3
+            _escalate_to_level(batcher, 6)
+            assert batcher.state.level == 6
 
             # Simulate "barely keeping up": process batch in 7s, then 1s of idle
             for _ in range(10):
@@ -634,11 +645,11 @@ def test_no_oscillation_when_barely_keeping_up(self):
                     clock.advance(0.1)
                     batcher.report_batch(None)
 
-            assert batcher.state.level == 3
+            assert batcher.state.level == 6
 
     def test_overload_resets_underload_counter(self):
         batcher = AdaptiveMessageBatcher(base_batch_length_s=1.0, max_level=2)
-        _escalate_to_level(batcher, 1)
+        _escalate_to_level(batcher, 2)
 
         # Almost enough underloaded batches
         underloaded_time = batcher.batch_length_s * DEESCALATION_HEADROOM_RATIO - 0.1
@@ -651,4 +662,4 @@ def test_overload_resets_underload_counter(self):
         # Need full threshold again
         for _ in range(DEESCALATION_UNDERLOAD_THRESHOLD - 1):
             batcher.report_batch(100, processing_time_s=underloaded_time)
-        assert batcher.state.level == 1
+        assert batcher.state.level == 2

From b7e97145f574fec78ea468f38c681ca8cbb99100 Mon Sep 17 00:00:00 2001
From: Simon Heybrock <simon.heybrock@ess.eu>
Date: Fri, 20 Mar 2026 09:26:04 +0000
Subject: [PATCH 11/16] Inline _report_batch wrapper in orchestrating_processor

---
 src/ess/livedata/core/orchestrating_processor.py | 12 ++----------
 1 file changed, 2 insertions(+), 10 deletions(-)

diff --git a/src/ess/livedata/core/orchestrating_processor.py b/src/ess/livedata/core/orchestrating_processor.py
index 6652dc8e2..35842f159 100644
--- a/src/ess/livedata/core/orchestrating_processor.py
+++ b/src/ess/livedata/core/orchestrating_processor.py
@@ -176,7 +176,7 @@ def process(self) -> None:
 
         message_batch = self._message_batcher.batch(data_messages)
         if message_batch is None:
-            self._report_batch(None, processing_time_s=0.0)
+            self._message_batcher.report_batch(None, processing_time_s=0.0)
             self._empty_batches += 1
             self._maybe_log_metrics()
             self._sink.publish_messages(result_messages)
@@ -228,7 +228,7 @@ def process(self) -> None:
                 valid_results.append(result)
 
         processing_time_s = time.monotonic() - batch_start
-        self._report_batch(
+        self._message_batcher.report_batch(
             len(message_batch.messages), processing_time_s=processing_time_s
         )
         self._batches_processed += 1
@@ -259,14 +259,6 @@ def _report_status(self) -> None:
 
         self._sink.publish_messages(messages)
 
-    def _report_batch(
-        self, message_count: int | None, processing_time_s: float = 0.0
-    ) -> None:
-        """Forward batch outcome to the batcher for adaptive behavior."""
-        self._message_batcher.report_batch(
-            message_count, processing_time_s=processing_time_s
-        )
-
     def _get_service_status(self, job_statuses: list[JobStatus]) -> ServiceStatus:
         """Get the current service status for heartbeat publishing."""
         return ServiceStatus(

From 4badafa6c4122ca501046a18f0635b3f9b1da56c Mon Sep 17 00:00:00 2001
From: Simon Heybrock <simon.heybrock@ess.eu>
Date: Fri, 20 Mar 2026 09:38:34 +0000
Subject: [PATCH 12/16] Fix message loss during adaptive batch level
 transitions

_set_half_step replaced the inner SimpleMessageBatcher on every level
change, discarding messages buffered in _active_batch and _future_messages.
Update the batch length in place instead so the current active batch
completes normally and only the next boundary uses the new length.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 src/ess/livedata/core/message_batcher.py | 11 ++-
 tests/core/message_batcher_test.py       | 92 ++++++++++++++++++++++++
 2 files changed, 102 insertions(+), 1 deletion(-)

diff --git a/src/ess/livedata/core/message_batcher.py b/src/ess/livedata/core/message_batcher.py
index 1085f5502..f41c1fe0e 100644
--- a/src/ess/livedata/core/message_batcher.py
+++ b/src/ess/livedata/core/message_batcher.py
@@ -128,6 +128,15 @@ def __init__(self, batch_length_s: float = 1.0) -> None:
     def batch_length_s(self) -> float:
         return self._batch_length_s_value
 
+    def set_batch_length(self, batch_length_s: float) -> None:
+        """Update the batch length for future batches.
+
+        The current active batch keeps its boundaries and completes normally.
+        Only the next batch boundary will use the new length.
+        """
+        self._batch_length_s_value = batch_length_s
+        self._batch_length_ns = int(batch_length_s * 1_000_000_000)
+
     def batch(self, messages: list[Message[Any]]) -> MessageBatch | None:
         # Filter messages with incompatible (broken) timestamps to avoid issues below.
         messages = [msg for msg in messages if isinstance(msg.timestamp, Number)]
@@ -295,7 +304,7 @@ def _set_half_step(self, new_half_step: int) -> None:
             new_batch_length_s=new_length,
             level=self._half_step,
         )
-        self._inner = SimpleMessageBatcher(batch_length_s=new_length)
+        self._inner.set_batch_length(new_length)
 
     @property
     def batch_length_s(self) -> float:
diff --git a/tests/core/message_batcher_test.py b/tests/core/message_batcher_test.py
index 75106365b..4dc6d5c7b 100644
--- a/tests/core/message_batcher_test.py
+++ b/tests/core/message_batcher_test.py
@@ -647,6 +647,98 @@ def test_no_oscillation_when_barely_keeping_up(self):
 
             assert batcher.state.level == 6
 
+    def test_escalation_preserves_buffered_active_messages(self):
+        """Messages in the active batch must survive escalation."""
+        batcher = AdaptiveMessageBatcher(base_batch_length_s=1.0, max_level=3)
+
+        # Establish timeline
+        initial = batcher.batch([make_message(0, "init")])
+        assert initial is not None
+        # Inner: active_batch=[0, 1e9), messages=[], future=[]
+
+        # Buffer a message in active batch (no future → returns None)
+        buffered = make_message(500_000_000, "buffered")
+        assert batcher.batch([buffered]) is None
+        # Inner: active_batch messages=[buffered], future=[]
+
+        # Trigger escalation — inner batcher must keep its buffered messages
+        for _ in range(ESCALATION_OVERLOAD_THRESHOLD):
+            batcher.report_batch(100, processing_time_s=1.5)
+        assert batcher.state.level == 2
+
+        # Drain all batches with a far-future trigger
+        trigger = make_message(5_000_000_000, "trigger")
+        all_values: set[str] = set()
+        batch = batcher.batch([trigger])
+        while batch is not None:
+            all_values.update(m.value for m in batch.messages)
+            batch = batcher.batch([])
+
+        assert "buffered" in all_values, (
+            "Active batch message dropped during escalation"
+        )
+
+    def test_escalation_preserves_future_messages(self):
+        """Messages in future_messages must survive escalation."""
+        batcher = AdaptiveMessageBatcher(base_batch_length_s=1.0, max_level=3)
+
+        # Establish timeline
+        batcher.batch([make_message(0, "init")])
+
+        # Send a far-future message: completes active batch, stays in _future
+        far_future = make_message(3_000_000_000, "far_future")
+        batch = batcher.batch([far_future])
+        assert batch is not None  # completed (empty) active batch
+        # Inner: active=[1e9, 2e9) msgs=[], future=[far_future(3e9)]
+
+        # Trigger escalation
+        for _ in range(ESCALATION_OVERLOAD_THRESHOLD):
+            batcher.report_batch(100, processing_time_s=1.5)
+        assert batcher.state.level == 2
+
+        # Drain with another trigger
+        trigger = make_message(10_000_000_000, "trigger")
+        all_values: set[str] = set()
+        batch = batcher.batch([trigger])
+        while batch is not None:
+            all_values.update(m.value for m in batch.messages)
+            batch = batcher.batch([])
+
+        assert "far_future" in all_values, "Future message dropped during escalation"
+
+    def test_deescalation_preserves_buffered_messages(self):
+        """Messages in the active batch must survive de-escalation."""
+        clock = FakeClock()
+        batcher = AdaptiveMessageBatcher(base_batch_length_s=1.0, max_level=3)
+
+        with patch('ess.livedata.core.message_batcher.time.monotonic', clock):
+            _escalate_to_level(batcher, 2)
+            assert batcher.state.level == 2
+
+            # Establish timeline at escalated batch length (~2s)
+            batcher.batch([make_message(0, "init")])
+
+            # Buffer a message
+            buffered = make_message(500_000_000, "buffered")
+            assert batcher.batch([buffered]) is None
+
+            # Trigger de-escalation via idle
+            clock.advance(DEESCALATION_IDLE_WINDOWS * batcher.batch_length_s + 0.1)
+            batcher.report_batch(None)
+            assert batcher.state.level == 1
+
+            # Drain
+            trigger = make_message(10_000_000_000, "trigger")
+            all_values: set[str] = set()
+            batch = batcher.batch([trigger])
+            while batch is not None:
+                all_values.update(m.value for m in batch.messages)
+                batch = batcher.batch([])
+
+            assert "buffered" in all_values, (
+                "Active batch message dropped during de-escalation"
+            )
+
     def test_overload_resets_underload_counter(self):
         batcher = AdaptiveMessageBatcher(base_batch_length_s=1.0, max_level=2)
         _escalate_to_level(batcher, 2)

From 014e067f64a07014430f2de40a80feb33bb446f0 Mon Sep 17 00:00:00 2001
From: Simon Heybrock <simon.heybrock@ess.eu>
Date: Fri, 20 Mar 2026 10:44:24 +0000
Subject: [PATCH 13/16] Improve adaptive batching scenario tests
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Fix dead-zone threshold references in docstrings (70% → 75% to match
  DEESCALATION_HEADROOM_RATIO=0.75)
- Fix dead_zone_stuck docstring: 72.5% utilization at level 4 is
  underloaded (< 75%), not dead zone; batcher de-escalates to level 3
  where it gets stuck at ~78%
- Normalize level terminology to consistently mean half-steps matching
  state.level (e.g., level 4 = 4.0s window, not "level 2")
- Rework test_no_oscillation_at_steady_load to use a load that actually
  triggers escalation, preventing the test from being vacuously true
- Add test for non-default base_batch_length_s=2.0 to catch scaling bugs
- Add re-escalation assertion to repeated_shutter_cycles to verify the
  batcher re-escalates during subsequent on-phases
- Replace unittest.mock.patch with clock injection via constructor
  parameter on AdaptiveMessageBatcher, eliminating mock usage in both
  test files
---
 src/ess/livedata/core/message_batcher.py      |  15 +-
 .../core/adaptive_batching_scenarios_test.py  | 324 +++++++++++-------
 tests/core/message_batcher_test.py            | 196 +++++------
 3 files changed, 302 insertions(+), 233 deletions(-)

diff --git a/src/ess/livedata/core/message_batcher.py b/src/ess/livedata/core/message_batcher.py
index f41c1fe0e..e1ac6b7fc 100644
--- a/src/ess/livedata/core/message_batcher.py
+++ b/src/ess/livedata/core/message_batcher.py
@@ -2,6 +2,7 @@
 # Copyright (c) 2025 Scipp contributors (https://github.com/scipp)
 import time
 from abc import ABC, abstractmethod
+from collections.abc import Callable
 from dataclasses import dataclass
 from numbers import Number
 from typing import Any
@@ -229,13 +230,19 @@ class AdaptiveMessageBatcher(MessageBatcher):
     Idle periods also trigger de-escalation via a wall-clock fallback.
     """
 
-    def __init__(self, base_batch_length_s: float = 1.0, max_level: int = 3) -> None:
+    def __init__(
+        self,
+        base_batch_length_s: float = 1.0,
+        max_level: int = 3,
+        clock: Callable[[], float] = time.monotonic,
+    ) -> None:
         self._base_batch_length_s = base_batch_length_s
         self._max_half_steps = max_level * 2
         self._half_step = 0
         self._consecutive_overloaded = 0
         self._consecutive_underloaded = 0
         self._last_nonempty_batch_time: float | None = None
+        self._clock = clock
         self._inner = SimpleMessageBatcher(batch_length_s=base_batch_length_s)
 
     def batch(self, messages: list[Message[Any]]) -> MessageBatch | None:
@@ -253,17 +260,17 @@ def report_batch(
             # de-escalation under continuous light load where idle polls
             # between batches outnumber real reports.
             if self._half_step > 0 and self._last_nonempty_batch_time is not None:
-                idle_s = time.monotonic() - self._last_nonempty_batch_time
+                idle_s = self._clock() - self._last_nonempty_batch_time
                 idle_windows = idle_s / self.batch_length_s
                 if idle_windows >= DEESCALATION_IDLE_WINDOWS:
                     self._set_half_step(self._half_step - 1)
-                    self._last_nonempty_batch_time = time.monotonic()
+                    self._last_nonempty_batch_time = self._clock()
         elif message_count == 0:
             # Empty batch from time gap — not a load signal
             pass
         else:
             # Non-empty batch — use processing time to decide
-            self._last_nonempty_batch_time = time.monotonic()
+            self._last_nonempty_batch_time = self._clock()
 
             if processing_time_s > self.batch_length_s:
                 # Overloaded: processing exceeded the batch window
diff --git a/tests/core/adaptive_batching_scenarios_test.py b/tests/core/adaptive_batching_scenarios_test.py
index 423476e51..71e9c2875 100644
--- a/tests/core/adaptive_batching_scenarios_test.py
+++ b/tests/core/adaptive_batching_scenarios_test.py
@@ -26,7 +26,6 @@
 import random
 from dataclasses import dataclass, field
 from typing import Protocol
-from unittest.mock import patch
 
 import pytest
 
@@ -85,6 +84,12 @@
         "min_level": 6,
         "max_level": 6,
     },
+    # -- Non-default base batch length ------------------------------------
+    # overhead_s=1.2, per_s=0.6 -> at 2s: 2.4, at 4s: 3.6 (OK at level 2)
+    "non_default_base": {
+        "min_level": 2,
+        "max_level": 2,
+    },
     # -- No escalation when not needed ------------------------------------
     # Parameterized across utilization levels.
     "light_load_20pct": {"max_level": 0},
@@ -140,6 +145,7 @@
     "repeated_shutter_cycles": {
         "min_level_reached": 1,
         "max_final_level": 0,
+        "min_escalation_events": 2,
     },
     "severe_to_cosmic_background": {
         "min_level_during_load": 3,
@@ -473,20 +479,22 @@ def cost(batch_window_s: float, wall_time_s: float) -> float:
 # ---------------------------------------------------------------------------
 
 
+def make_default_batcher(
+    **kwargs,
+) -> tuple[AdaptiveMessageBatcher, FakeClock]:
+    clock = kwargs.pop("clock", None) or FakeClock()
+    defaults = {"base_batch_length_s": 1.0, "max_level": 3, "clock": clock}
+    defaults.update(kwargs)
+    return AdaptiveMessageBatcher(**defaults), clock
+
+
 def run_scenario(
-    batcher: MessageBatcher,
+    batcher: AdaptiveMessageBatcher,
     duration_s: float,
     cost_fn: ProcessingCostFn,
+    clock: FakeClock,
 ) -> SimulationResult:
-    clock = FakeClock()
-    with patch('ess.livedata.core.message_batcher.time.monotonic', clock):
-        return simulate(batcher, duration_s, cost_fn, clock)
-
-
-def make_default_batcher(**kwargs) -> AdaptiveMessageBatcher:
-    defaults = {"base_batch_length_s": 1.0, "max_level": 3}
-    defaults.update(kwargs)
-    return AdaptiveMessageBatcher(**defaults)
+    return simulate(batcher, duration_s, cost_fn, clock)
 
 
 # ===========================================================================
@@ -500,7 +508,7 @@ class TestStepFunctionEscalation:
     def test_escalates_within_bounded_time(self):
         """After a step increase in load, the batcher must escalate quickly."""
         lim = LIMITS["step_function_escalation"]
-        batcher = make_default_batcher()
+        batcher, clock = make_default_batcher()
 
         # 10s idle, then overhead-dominated load with jitter
         # At 1s window: 0.8 + 0.3 = 1.1s -> overloaded
@@ -515,7 +523,7 @@ def test_escalates_within_bounded_time(self):
             ),
         )
 
-        result = run_scenario(batcher, 120.0, cost)
+        result = run_scenario(batcher, 120.0, cost, clock)
 
         first_esc = result.first_escalation_time_s()
         assert first_esc is not None, "Batcher never escalated"
@@ -532,7 +540,7 @@ def test_limits_backlog(self):
         At 2s window: 0.6 + 1.2 = 1.8s (OK).
         """
         lim = LIMITS["step_function_backlog"]
-        batcher = make_default_batcher()
+        batcher, clock = make_default_batcher()
 
         cost = step_function_cost(
             step_time_s=5.0,
@@ -540,7 +548,7 @@ def test_limits_backlog(self):
             after=constant_overhead_cost(overhead_s=0.6, per_second_cost=0.6),
         )
 
-        result = run_scenario(batcher, 120.0, cost)
+        result = run_scenario(batcher, 120.0, cost, clock)
 
         assert result.max_level >= 1, (
             "Precondition: load must trigger escalation for backlog test "
@@ -594,7 +602,7 @@ def test_reaches_appropriate_level_for_severity(
         severity, ensuring the response is proportional.
         """
         lim = LIMITS[limits_key]
-        batcher = make_default_batcher()
+        batcher, clock = make_default_batcher()
 
         cost = step_function_cost(
             step_time_s=5.0,
@@ -604,7 +612,7 @@ def test_reaches_appropriate_level_for_severity(
             ),
         )
 
-        result = run_scenario(batcher, 120.0, cost)
+        result = run_scenario(batcher, 120.0, cost, clock)
         assert result.max_level >= lim["min_level"], (
             f"Only reached level {result.max_level} (need >= {lim['min_level']})"
         )
@@ -615,7 +623,7 @@ def test_reaches_appropriate_level_for_severity(
     def test_stabilizes_after_escalation(self):
         """After reaching the correct level, the batcher must not oscillate."""
         lim = LIMITS["stabilization_after_step"]
-        batcher = make_default_batcher()
+        batcher, clock = make_default_batcher()
 
         # At 1s: 0.8+0.3=1.1 (overloaded). At 2s: 0.8+0.6=1.4 (OK).
         cost = step_function_cost(
@@ -624,7 +632,7 @@ def test_stabilizes_after_escalation(self):
             after=constant_overhead_cost(overhead_s=0.8, per_second_cost=0.3),
         )
 
-        result = run_scenario(batcher, 120.0, cost)
+        result = run_scenario(batcher, 120.0, cost, clock)
 
         assert result.max_level >= 1, "Precondition: must have escalated"
         # After the initial transient, the level should be stable.
@@ -641,6 +649,36 @@ def test_stabilizes_after_escalation(self):
         )
 
 
+class TestNonDefaultBaseBatchLength:
+    """Verify scaling with a non-default base batch length."""
+
+    def test_escalation_with_doubled_base(self):
+        """With base=2.0, the level grid shifts: level 0 = 2s, level 2 = 4s.
+
+        The batcher must scale correctly — a bug that hardcodes sqrt(2)^level
+        without multiplying by the base would produce wrong batch windows.
+
+        Level 0 (2.0s): 1.2 + 1.2 = 2.4s (overloaded).
+        Level 2 (4.0s): 1.2 + 2.4 = 3.6s (90%, dead zone — stable).
+        """
+        lim = LIMITS["non_default_base"]
+        batcher, clock = make_default_batcher(base_batch_length_s=2.0)
+
+        cost = step_function_cost(
+            step_time_s=5.0,
+            before=idle_cost(),
+            after=constant_overhead_cost(overhead_s=1.2, per_second_cost=0.6),
+        )
+
+        result = run_scenario(batcher, 120.0, cost, clock)
+        assert result.max_level >= lim["min_level"], (
+            f"Only reached level {result.max_level} (need >= {lim['min_level']})"
+        )
+        assert result.max_level <= lim["max_level"], (
+            f"Over-escalated to level {result.max_level} (limit: {lim['max_level']})"
+        )
+
+
 class TestNoEscalationWhenNotNeeded:
     """The batcher must not escalate when the system keeps up."""
 
@@ -660,12 +698,12 @@ def test_no_escalation_under_light_load(
         even at high utilization.
         """
         lim = LIMITS[limits_key]
-        batcher = make_default_batcher()
+        batcher, clock = make_default_batcher()
         cost = constant_overhead_cost(
             overhead_s=overhead_s, per_second_cost=per_second_cost
         )
 
-        result = run_scenario(batcher, 60.0, cost)
+        result = run_scenario(batcher, 60.0, cost, clock)
         assert result.max_level <= lim["max_level"], (
             f"Escalated to level {result.max_level} at "
             f"{overhead_s + per_second_cost:.0%} utilization "
@@ -684,7 +722,7 @@ def test_no_escalation_with_gc_jitter(self, seed):
         seeds to avoid seed-dependent false confidence.
         """
         lim = LIMITS["gc_jitter"]
-        batcher = make_default_batcher()
+        batcher, clock = make_default_batcher()
         cost = constant_overhead_cost(
             overhead_s=0.2,
             per_second_cost=0.1,
@@ -692,7 +730,7 @@ def test_no_escalation_with_gc_jitter(self, seed):
             rng=random.Random(seed),
         )
 
-        result = run_scenario(batcher, 120.0, cost)
+        result = run_scenario(batcher, 120.0, cost, clock)
         assert result.max_level <= lim["max_level"], (
             f"Escalated to level {result.max_level} from jitter alone "
             f"(seed={seed}, limit: {lim['max_level']})"
@@ -703,14 +741,22 @@ class TestNoOscillation:
     """The batcher must not oscillate between levels."""
 
     def test_no_oscillation_at_steady_load(self):
-        """Constant load near the threshold should stabilize."""
+        """Constant overload that triggers escalation should stabilize without
+        oscillating.
+
+        Level 0 (1.0s): 0.6 + 0.6 = 1.2s (overloaded, escalates).
+        Level 2 (2.0s): 0.6 + 1.2 = 1.8s (90%, dead zone — stable).
+        """
         lim = LIMITS["steady_load_oscillation"]
-        batcher = make_default_batcher()
+        batcher, clock = make_default_batcher()
 
-        # Processing at ~90% of 1s window
-        cost = constant_overhead_cost(overhead_s=0.5, per_second_cost=0.4)
+        cost = constant_overhead_cost(overhead_s=0.6, per_second_cost=0.6)
 
-        result = run_scenario(batcher, 120.0, cost)
+        result = run_scenario(batcher, 120.0, cost, clock)
+        assert result.max_level >= 1, (
+            "Precondition: load must trigger escalation for oscillation test "
+            "to be meaningful"
+        )
         assert result.oscillation_count() <= lim["max_oscillations"], (
             f"Oscillated {result.oscillation_count()} times "
             f"(limit: {lim['max_oscillations']})"
@@ -719,7 +765,7 @@ def test_no_oscillation_at_steady_load(self):
     def test_limited_oscillation_at_boundary(self):
         """Processing right at the window with jitter: bounded oscillation."""
         lim = LIMITS["boundary_oscillation"]
-        batcher = make_default_batcher()
+        batcher, clock = make_default_batcher()
 
         # Mean processing = 1.0s = window, jitter +-10%
         cost = constant_overhead_cost(
@@ -729,7 +775,7 @@ def test_limited_oscillation_at_boundary(self):
             rng=random.Random(42),
         )
 
-        result = run_scenario(batcher, 180.0, cost)
+        result = run_scenario(batcher, 180.0, cost, clock)
         assert result.oscillation_count() <= lim["max_oscillations"], (
             f"Oscillated {result.oscillation_count()} times "
             f"(limit: {lim['max_oscillations']})"
@@ -745,7 +791,7 @@ def test_eventually_escalates_and_limits_backlog(self):
         Ramp from 0.5s to 1.3s at 1s window over 60s.
         """
         lim = LIMITS["creeping_overload"]
-        batcher = make_default_batcher()
+        batcher, clock = make_default_batcher()
 
         cost = creeping_cost(
             overhead_s=0.3,
@@ -754,7 +800,7 @@ def test_eventually_escalates_and_limits_backlog(self):
             ramp_duration_s=60.0,
         )
 
-        result = run_scenario(batcher, 120.0, cost)
+        result = run_scenario(batcher, 120.0, cost, clock)
         assert result.max_level >= lim["min_level_reached"], (
             f"Only reached level {result.max_level} "
             f"(need >= {lim['min_level_reached']})"
@@ -765,14 +811,14 @@ def test_eventually_escalates_and_limits_backlog(self):
         )
 
     def test_mild_overload_does_not_over_escalate(self):
-        """A slow creep to barely over 1x should escalate but not beyond level 1.
+        """A slow creep to barely over 1x should escalate but not beyond level 2.
 
         overhead=0.3, per_s ramps 0.5 -> 0.8 over 60s.
-        At 1s window: 0.3 + 0.8 = 1.1s -> needs escalation.
-        At 2s window: 0.3 + 0.8*2 = 1.9s < 2s -> stable at level 1.
+        Level 0 (1.0s): 0.3 + 0.8 = 1.1s (overloaded).
+        Level 2 (2.0s): 0.3 + 1.6 = 1.9s (95%, dead zone — stable).
         """
         lim = LIMITS["mild_creeping_overload"]
-        batcher = make_default_batcher()
+        batcher, clock = make_default_batcher()
 
         cost = creeping_cost(
             overhead_s=0.3,
@@ -781,7 +827,7 @@ def test_mild_overload_does_not_over_escalate(self):
             ramp_duration_s=60.0,
         )
 
-        result = run_scenario(batcher, 180.0, cost)
+        result = run_scenario(batcher, 180.0, cost, clock)
         assert result.max_level >= lim["min_level_reached"], (
             f"Only reached level {result.max_level} — mild overload should "
             f"still trigger escalation (need >= {lim['min_level_reached']})"
@@ -800,7 +846,7 @@ def test_deescalates_after_load_drops_to_idle(self):
         At 1s window: 0.8 + 0.3 = 1.1s (overloaded, triggers escalation).
         """
         lim = LIMITS["deescalation_to_idle"]
-        batcher = make_default_batcher()
+        batcher, clock = make_default_batcher()
 
         # 30s high load, then idle
         cost = step_function_cost(
@@ -813,7 +859,7 @@ def test_deescalates_after_load_drops_to_idle(self):
             ),
         )
 
-        result = run_scenario(batcher, 120.0, cost)
+        result = run_scenario(batcher, 120.0, cost, clock)
         assert result.max_level >= lim["min_level_during_load"], (
             f"Precondition: batcher must have escalated during high-load phase "
             f"(reached level {result.max_level}, "
@@ -833,7 +879,7 @@ def test_deescalates_after_step_down_to_light_load(self):
         Light phase at 1s window: 0.1 + 0.1 = 0.2s (well within budget).
         """
         lim = LIMITS["deescalation_to_light_load"]
-        batcher = make_default_batcher()
+        batcher, clock = make_default_batcher()
 
         # Heavy load for 40s, then light load
         cost = step_function_cost(
@@ -846,7 +892,7 @@ def test_deescalates_after_step_down_to_light_load(self):
             ),
         )
 
-        result = run_scenario(batcher, 180.0, cost)
+        result = run_scenario(batcher, 180.0, cost, clock)
         assert result.max_level >= lim["min_level_during_load"], (
             f"Precondition: batcher must have escalated during heavy-load "
             f"phase (reached level {result.max_level}, "
@@ -860,12 +906,14 @@ def test_deescalates_under_moderate_continuous_load(self):
         """De-escalation must work when processing fills a moderate fraction
         of the escalated window, not just under near-idle conditions.
 
-        Heavy phase at 1s window: 0.8 + 0.3 = 1.1s (overloaded, escalate to level 1).
-        Moderate phase at 2s window: 0.3 + 0.3*2 = 0.9s (45% utilization, headroom).
-        Moderate phase at 1s window: 0.3 + 0.3 = 0.6s (fits at base level).
+        Heavy phase at level 0 (1.0s): 0.8 + 0.3 = 1.1s
+            (overloaded, escalates to level 2).
+        Moderate phase at level 2 (2.0s): 0.3 + 0.6 = 0.9s (45%, underloaded).
+        Moderate phase at level 0 (1.0s): 0.3 + 0.3 = 0.6s (fits at base
+            level).
         """
         lim = LIMITS["deescalation_moderate_load"]
-        batcher = make_default_batcher()
+        batcher, clock = make_default_batcher()
 
         cost = step_function_cost(
             step_time_s=0.0,
@@ -877,7 +925,7 @@ def test_deescalates_under_moderate_continuous_load(self):
             ),
         )
 
-        result = run_scenario(batcher, 180.0, cost)
+        result = run_scenario(batcher, 180.0, cost, clock)
         assert result.max_level >= lim["min_level_during_load"], (
             f"Precondition: batcher must have escalated during heavy-load "
             f"phase (reached level {result.max_level}, "
@@ -888,15 +936,17 @@ def test_deescalates_under_moderate_continuous_load(self):
         )
 
     def test_multi_level_deescalation(self):
-        """After reaching level 2+, a drop to light load should step back
+        """After reaching level 3+, a drop to light load should step back
         through all levels to 0.
 
-        Heavy phase at 1s: 1.8 + 0.2 = 2.0s (2x overloaded, needs level 2+).
-        At 4s: 1.8 + 0.8 = 2.6s (OK, escalation stops at level 2).
-        Light phase at 4s: 0.1 + 0.4 = 0.5s (well within any window).
+        Heavy phase:
+            Level 0 (1.0s): 1.8 + 0.2 = 2.0s (overloaded).
+            Level 2 (2.0s): 1.8 + 0.4 = 2.2s (overloaded).
+            Level 4 (4.0s): 1.8 + 0.8 = 2.6s (65%, underloaded; settles at level 3).
+        Light phase at any level: 0.1 + 0.1*w = well within any window.
         """
         lim = LIMITS["multi_level_deescalation"]
-        batcher = make_default_batcher()
+        batcher, clock = make_default_batcher()
 
         cost = step_function_cost(
             step_time_s=0.0,
@@ -908,7 +958,7 @@ def test_multi_level_deescalation(self):
             ),
         )
 
-        result = run_scenario(batcher, 240.0, cost)
+        result = run_scenario(batcher, 240.0, cost, clock)
         assert result.max_level >= lim["min_level_during_load"], (
             f"Precondition: must reach level {lim['min_level_during_load']}+ "
             f"during heavy phase (reached {result.max_level})"
@@ -919,18 +969,21 @@ def test_multi_level_deescalation(self):
         )
 
     def test_partial_deescalation(self):
-        """Load drops from severe to moderate: should settle at level 1, not
-        stay stuck at the peak level.
-
-        Severe phase at 1s: 1.8 + 0.2 = 2.0s (needs level 2).
-        Moderate phase at 1s: 0.6 + 0.5 = 1.1s (overloaded, needs level 1).
-        Moderate phase at 2s: 0.6 + 1.0 = 1.6s (fits, 80% — no headroom to
-            de-escalate further).
-        Moderate phase at 4s: 0.6 + 2.0 = 2.6s (65% — has headroom, should
-            de-escalate from level 2).
+        """Load drops from severe to moderate: should partially de-escalate,
+        not stay stuck at the peak level.
+
+        Severe phase (escalates to level 4):
+            Level 0 (1.0s): 1.8 + 0.2 = 2.0s (overloaded).
+            Level 2 (2.0s): 1.8 + 0.4 = 2.2s (overloaded).
+            Level 4 (4.0s): 1.8 + 0.8 = 2.6s (65%, underloaded).
+            De-escalates to level 3 (2.83s): 1.8 + 0.57 = 2.37s (84%, dead zone).
+
+        Moderate phase (de-escalates from level 3 to level 2):
+            Level 3 (2.83s): 0.6 + 1.41 = 2.01s (71%, underloaded).
+            Level 2 (2.0s): 0.6 + 1.0 = 1.6s (80%, dead zone — stuck).
         """
         lim = LIMITS["partial_deescalation"]
-        batcher = make_default_batcher()
+        batcher, clock = make_default_batcher()
 
         cost = step_function_cost(
             step_time_s=0.0,
@@ -942,7 +995,7 @@ def test_partial_deescalation(self):
             ),
         )
 
-        result = run_scenario(batcher, 240.0, cost)
+        result = run_scenario(batcher, 240.0, cost, clock)
         assert result.max_level >= lim["min_level_during_load"], (
             f"Precondition: must reach level {lim['min_level_during_load']}+ "
             f"during severe phase (reached {result.max_level})"
@@ -966,7 +1019,7 @@ def test_shutter_open_close_cycle(self):
         with very few events, resulting in overhead-dominated processing.
         """
         lim = LIMITS["shutter_open_close"]
-        batcher = make_default_batcher()
+        batcher, clock = make_default_batcher()
 
         rng = random.Random(42)
         cosmic = constant_overhead_cost(overhead_s=0.2, per_second_cost=0.01)
@@ -985,7 +1038,7 @@ def test_shutter_open_close_cycle(self):
             ),
         )
 
-        result = run_scenario(batcher, 180.0, cost)
+        result = run_scenario(batcher, 180.0, cost, clock)
 
         assert result.max_level >= lim["min_level_reached"], (
             f"Only reached level {result.max_level} during shutter open "
@@ -1007,7 +1060,7 @@ def test_repeated_shutter_cycles(self):
         background) must allow de-escalation back to base.
         """
         lim = LIMITS["repeated_shutter_cycles"]
-        batcher = make_default_batcher()
+        batcher, clock = make_default_batcher()
 
         rng = random.Random(42)
         high = constant_overhead_cost(
@@ -1025,7 +1078,7 @@ def test_repeated_shutter_cycles(self):
             off_cost=cosmic,
         )
 
-        result = run_scenario(batcher, 200.0, cost)
+        result = run_scenario(batcher, 200.0, cost, clock)
 
         assert result.max_level >= lim["min_level_reached"], (
             f"Precondition: at least one on-phase must trigger escalation "
@@ -1036,9 +1089,17 @@ def test_repeated_shutter_cycles(self):
             f"Stuck at level {result.final_level} after repeated cycles "
             f"(limit: {lim['max_final_level']})"
         )
+        escalation_events = sum(
+            1 for _, old, new in result.level_changes() if new > old
+        )
+        assert escalation_events >= lim["min_escalation_events"], (
+            f"Only {escalation_events} escalation event(s) — expected the batcher "
+            f"to re-escalate during subsequent on-phases "
+            f"(need >= {lim['min_escalation_events']})"
+        )
 
     def test_severe_overload_to_cosmic_background(self):
-        """After severe overload reaching level 2+, shutter close drops load
+        """After severe overload reaching level 3+, shutter close drops load
         to cosmic background.  Must de-escalate through all levels back to 0.
 
         This is the most operationally important de-escalation path: ev44
@@ -1047,18 +1108,18 @@ def test_severe_overload_to_cosmic_background(self):
         apply; the batcher must de-escalate via the underload counter.
 
         Severe phase (overhead-dominated):
-            Level 0 (1s): 2.0 + 0.3 = 2.3s (overloaded).
-            Level 1 (2s): 2.0 + 0.6 = 2.6s (overloaded).
-            Level 2 (4s): 2.0 + 1.2 = 3.2s (80%, dead zone — stable).
+            Level 0 (1.0s): 2.0 + 0.3 = 2.3s (overloaded).
+            Level 2 (2.0s): 2.0 + 0.6 = 2.6s (overloaded).
+            Level 4 (4.0s): 2.0 + 1.2 = 3.2s (80%, dead zone — stable).
 
         Cosmic background phase (overhead-dominated, near-zero data cost):
-            Level 2 (4s): 0.2 + 0.04 = 0.24s (6% utilization).
-            Level 1 (2s): 0.2 + 0.02 = 0.22s (11% utilization).
-            Level 0 (1s): 0.2 + 0.01 = 0.21s (21% utilization).
-            All levels are well below the 70% headroom threshold.
+            Level 4 (4.0s): 0.2 + 0.04 = 0.24s (6% utilization).
+            Level 2 (2.0s): 0.2 + 0.02 = 0.22s (11% utilization).
+            Level 0 (1.0s): 0.2 + 0.01 = 0.21s (21% utilization).
+            All levels are well below the 75% headroom threshold.
         """
         lim = LIMITS["severe_to_cosmic_background"]
-        batcher = make_default_batcher()
+        batcher, clock = make_default_batcher()
 
         cost = step_function_cost(
             step_time_s=0.0,
@@ -1070,7 +1131,7 @@ def test_severe_overload_to_cosmic_background(self):
             ),
         )
 
-        result = run_scenario(batcher, 240.0, cost)
+        result = run_scenario(batcher, 240.0, cost, clock)
         assert result.max_level >= lim["min_level_during_load"], (
             f"Precondition: must reach level {lim['min_level_during_load']}+ "
             f"during severe phase (reached {result.max_level})"
@@ -1091,11 +1152,11 @@ def test_backlog_drains_after_escalation(self):
         At 2s: 0.6 + 1.2 = 1.8s (OK, surplus drains backlog).
         """
         lim = LIMITS["backlog_drains"]
-        batcher = make_default_batcher()
+        batcher, clock = make_default_batcher()
 
         cost = constant_overhead_cost(overhead_s=0.6, per_second_cost=0.6)
 
-        result = run_scenario(batcher, 120.0, cost)
+        result = run_scenario(batcher, 120.0, cost, clock)
 
         assert result.max_level >= lim["min_level_reached"], (
             f"Precondition: escalation must occur for backlog draining to be "
@@ -1119,11 +1180,11 @@ def test_backlog_does_not_grow_indefinitely(self):
         At 2s: 0.8 + 0.6 = 1.4s (OK).
         """
         lim = LIMITS["backlog_peaks_and_decreases"]
-        batcher = make_default_batcher()
+        batcher, clock = make_default_batcher()
 
         cost = constant_overhead_cost(overhead_s=0.8, per_second_cost=0.3)
 
-        result = run_scenario(batcher, 120.0, cost)
+        result = run_scenario(batcher, 120.0, cost, clock)
 
         assert result.max_level >= lim["min_level_reached"], (
             "Precondition: escalation must occur to test backlog draining"
@@ -1149,12 +1210,12 @@ def test_fast_escalation_on_clear_overload(self):
         """When processing demonstrably exceeds the batch window,
         escalation should be fast."""
         lim = LIMITS["fast_escalation_clear_overload"]
-        batcher = make_default_batcher()
+        batcher, clock = make_default_batcher()
 
         # Clear overload: 1.5x the window at every level
         cost = constant_overhead_cost(overhead_s=0.0, per_second_cost=1.5)
 
-        result = run_scenario(batcher, 60.0, cost)
+        result = run_scenario(batcher, 60.0, cost, clock)
 
         first_esc = result.first_escalation_time_s()
         assert first_esc is not None, "Never escalated under overload"
@@ -1166,11 +1227,11 @@ def test_fast_escalation_on_clear_overload(self):
     def test_no_escalation_when_processing_fits(self):
         """No escalation if processing completes within the window."""
         lim = LIMITS["no_escalation_when_fits"]
-        batcher = make_default_batcher()
+        batcher, clock = make_default_batcher()
 
         cost = constant_overhead_cost(overhead_s=0.1, per_second_cost=0.3)
 
-        result = run_scenario(batcher, 60.0, cost)
+        result = run_scenario(batcher, 60.0, cost, clock)
         assert result.max_level <= lim["max_level"], (
             f"Escalated to {result.max_level} despite fitting "
             f"(limit: {lim['max_level']})"
@@ -1178,11 +1239,11 @@ def test_no_escalation_when_processing_fits(self):
 
 
 class TestDeescalationDeadZone:
-    """The 70-100% utilization dead zone where de-escalation cannot trigger.
+    """The 75-100% utilization dead zone where de-escalation cannot trigger.
 
-    When processing fills 70-100% of the escalated window, it falls in the
+    When processing fills 75-100% of the escalated window, it falls in the
     "in between" zone: not overloaded (processing < window) and not
-    underloaded (processing >= 0.7 * window).  Both consecutive counters
+    underloaded (processing >= 0.75 * window).  Both consecutive counters
     are reset every cycle, so neither escalation nor de-escalation can
     trigger — even if a lower level would handle the load fine.
     """
@@ -1192,18 +1253,19 @@ def test_stuck_in_dead_zone_after_load_drop(self):
         at the escalated level keeps the batcher stuck, even though a lower
         level would work.
 
-        Severe phase (reaches level 2):
-            Level 0 (1s): 2.0 + 0.3 = 2.3s (overloaded).
-            Level 1 (2s): 2.0 + 0.6 = 2.6s (overloaded).
-            Level 2 (4s): 2.0 + 1.2 = 3.2s (80%, dead zone — stable).
+        Severe phase (reaches level 4):
+            Level 0 (1.0s): 2.0 + 0.3 = 2.3s (overloaded).
+            Level 2 (2.0s): 2.0 + 0.6 = 2.6s (overloaded).
+            Level 4 (4.0s): 2.0 + 1.2 = 3.2s (80%, dead zone).
 
-        Moderate phase (stuck at level 2):
-            Level 2 (4s): 0.5 + 2.4 = 2.9s (72.5%, dead zone — stuck).
-            Level 1 (2s): 0.5 + 1.2 = 1.7s (would fit at 85%).
-            Level 0 (1s): 0.5 + 0.6 = 1.1s (would be overloaded).
+        Moderate phase (de-escalates from level 4 to level 3, then stuck):
+            Level 4 (4.0s): 0.5 + 2.4 = 2.9s (72.5%, underloaded < 75%).
+            Level 3 (2.83s): 0.5 + 1.7 = 2.2s (78%, dead zone — stuck).
+            Level 2 (2.0s): 0.5 + 1.2 = 1.7s (would fit at 85%).
+            Level 0 (1.0s): 0.5 + 0.6 = 1.1s (would be overloaded).
         """
         lim = LIMITS["dead_zone_stuck"]
-        batcher = make_default_batcher()
+        batcher, clock = make_default_batcher()
 
         cost = step_function_cost(
             step_time_s=0.0,
@@ -1215,7 +1277,7 @@ def test_stuck_in_dead_zone_after_load_drop(self):
             ),
         )
 
-        result = run_scenario(batcher, 240.0, cost)
+        result = run_scenario(batcher, 240.0, cost, clock)
 
         assert result.max_level >= lim["min_level_during_load"], (
             f"Precondition: must reach level {lim['min_level_during_load']}+ "
@@ -1236,22 +1298,22 @@ class TestJitterInducedStickyEscalation:
     When mean processing equals the batch window, jitter causes roughly
     half the batches to be overloaded.  Two consecutive overloaded batches
     (~25% probability per pair) trigger escalation.  At the escalated level,
-    processing lands in the dead zone (70-100% utilization), preventing
-    de-escalation.  The batcher stays at level 1 permanently.
+    processing lands in the dead zone (75-100% utilization), preventing
+    de-escalation.  The batcher stays permanently escalated.
     """
 
     def test_jitter_at_boundary_causes_sticky_escalation(self):
         """Mean processing = window with 10% jitter: escalates and stays.
 
-        At 1s window: 0.5 + 0.5 = 1.0s mean, jitter +/-10%.
+        At level 0 (1s window): 0.5 + 0.5 = 1.0s mean, jitter +/-10%.
             ~50% of cycles are overloaded (processing > 1.0).
             P(2 consecutive overloaded) ~ 25%, so escalation is very likely.
 
-        At 2s window: 0.5 + 1.0 = 1.5s mean (75% utilization).
-            In the dead zone (>70%), so de-escalation never triggers.
+        At level 2 (2s window): 0.5 + 1.0 = 1.5s mean (75% utilization).
+            At the dead-zone boundary (>= 75%), so de-escalation never triggers.
         """
         lim = LIMITS["jitter_sticky_escalation"]
-        batcher = make_default_batcher()
+        batcher, clock = make_default_batcher()
 
         cost = constant_overhead_cost(
             overhead_s=0.5,
@@ -1260,7 +1322,7 @@ def test_jitter_at_boundary_causes_sticky_escalation(self):
             rng=random.Random(42),
         )
 
-        result = run_scenario(batcher, 180.0, cost)
+        result = run_scenario(batcher, 180.0, cost, clock)
 
         assert result.max_level >= lim["min_level"], (
             f"Expected escalation from boundary jitter "
@@ -1292,16 +1354,14 @@ def test_time_gaps_do_not_disrupt_escalation(self):
         and time gaps (processing_time=0 reported as message_count=0).
         """
         lim = LIMITS["time_gaps_during_escalation"]
-        clock = FakeClock()
-        batcher = make_default_batcher()
+        batcher, clock = make_default_batcher()
 
-        with patch('ess.livedata.core.message_batcher.time.monotonic', clock):
-            for _ in range(20):
-                # Overloaded real batch
-                clock.advance(1.5)
-                batcher.report_batch(100, processing_time_s=1.5)
-                # Time-gap empty batch (should be a no-op)
-                batcher.report_batch(0)
+        for _ in range(20):
+            # Overloaded real batch
+            clock.advance(1.5)
+            batcher.report_batch(100, processing_time_s=1.5)
+            # Time-gap empty batch (should be a no-op)
+            batcher.report_batch(0)
 
         assert batcher.state.level >= lim["min_level_reached"], (
             f"Time gaps prevented escalation: only reached level "
@@ -1313,25 +1373,23 @@ def test_time_gaps_do_not_disrupt_deescalation(self):
         should not prevent de-escalation.
         """
         lim = LIMITS["time_gaps_during_deescalation"]
-        clock = FakeClock()
-        batcher = make_default_batcher()
-
-        with patch('ess.livedata.core.message_batcher.time.monotonic', clock):
-            # Escalate to level 1
-            for _ in range(3):
-                window = batcher.batch_length_s
-                clock.advance(window * 1.5)
-                batcher.report_batch(100, processing_time_s=window * 1.5)
-            assert batcher.state.level >= 1, "Precondition: must escalate"
-
-            # Underloaded batches interleaved with time gaps
-            for _ in range(20):
-                window = batcher.batch_length_s
-                processing = window * 0.3
-                clock.advance(processing)
-                batcher.report_batch(100, processing_time_s=processing)
-                # Time-gap empty batch
-                batcher.report_batch(0)
+        batcher, clock = make_default_batcher()
+
+        # Escalate to level 1
+        for _ in range(3):
+            window = batcher.batch_length_s
+            clock.advance(window * 1.5)
+            batcher.report_batch(100, processing_time_s=window * 1.5)
+        assert batcher.state.level >= 1, "Precondition: must escalate"
+
+        # Underloaded batches interleaved with time gaps
+        for _ in range(20):
+            window = batcher.batch_length_s
+            processing = window * 0.3
+            clock.advance(processing)
+            batcher.report_batch(100, processing_time_s=processing)
+            # Time-gap empty batch
+            batcher.report_batch(0)
 
         assert batcher.state.level <= lim["max_final_level"], (
             f"Time gaps prevented de-escalation: stuck at level "
diff --git a/tests/core/message_batcher_test.py b/tests/core/message_batcher_test.py
index 4dc6d5c7b..e503db62b 100644
--- a/tests/core/message_batcher_test.py
+++ b/tests/core/message_batcher_test.py
@@ -1,7 +1,5 @@
 # SPDX-License-Identifier: BSD-3-Clause
 # Copyright (c) 2025 Scipp contributors (https://github.com/scipp)
-from unittest.mock import patch
-
 import pytest
 
 from ess.livedata.core.message import Message, StreamId, StreamKind
@@ -471,30 +469,32 @@ def test_escalation_capped_at_max_level(self):
 
     def test_deescalates_after_idle_duration(self):
         clock = FakeClock()
-        batcher = AdaptiveMessageBatcher(base_batch_length_s=1.0, max_level=2)
+        batcher = AdaptiveMessageBatcher(
+            base_batch_length_s=1.0, max_level=2, clock=clock
+        )
 
-        with patch('ess.livedata.core.message_batcher.time.monotonic', clock):
-            _escalate_to_level(batcher, 2)
-            assert batcher.state.level == 2
+        _escalate_to_level(batcher, 2)
+        assert batcher.state.level == 2
 
-            # Idle for just under the threshold — no de-escalation
-            clock.advance(DEESCALATION_IDLE_WINDOWS * 2.0 - 0.1)
-            batcher.report_batch(None)
-            assert batcher.state.level == 2
+        # Idle for just under the threshold — no de-escalation
+        clock.advance(DEESCALATION_IDLE_WINDOWS * 2.0 - 0.1)
+        batcher.report_batch(None)
+        assert batcher.state.level == 2
 
-            # Cross the threshold
-            clock.advance(0.2)
-            batcher.report_batch(None)
-            assert batcher.state.level == 1
+        # Cross the threshold
+        clock.advance(0.2)
+        batcher.report_batch(None)
+        assert batcher.state.level == 1
 
     def test_does_not_deescalate_below_zero(self):
         clock = FakeClock()
-        batcher = AdaptiveMessageBatcher(base_batch_length_s=1.0, max_level=2)
+        batcher = AdaptiveMessageBatcher(
+            base_batch_length_s=1.0, max_level=2, clock=clock
+        )
 
-        with patch('ess.livedata.core.message_batcher.time.monotonic', clock):
-            clock.advance(100.0)
-            batcher.report_batch(None)
-            assert batcher.state.level == 0
+        clock.advance(100.0)
+        batcher.report_batch(None)
+        assert batcher.state.level == 0
 
     def test_underloaded_batch_resets_overload_counter(self):
         batcher = AdaptiveMessageBatcher(base_batch_length_s=1.0, max_level=2)
@@ -527,24 +527,25 @@ def test_idle_cycles_do_not_reset_overload_counter(self):
 
     def test_non_empty_batch_resets_idle_timer(self):
         clock = FakeClock()
-        batcher = AdaptiveMessageBatcher(base_batch_length_s=1.0, max_level=2)
+        batcher = AdaptiveMessageBatcher(
+            base_batch_length_s=1.0, max_level=2, clock=clock
+        )
 
-        with patch('ess.livedata.core.message_batcher.time.monotonic', clock):
-            _escalate_to_level(batcher, 2)
-            assert batcher.state.level == 2
+        _escalate_to_level(batcher, 2)
+        assert batcher.state.level == 2
 
-            # Almost reach de-escalation time
-            clock.advance(DEESCALATION_IDLE_WINDOWS * 2.0 - 0.1)
-            batcher.report_batch(None)
-            assert batcher.state.level == 2
+        # Almost reach de-escalation time
+        clock.advance(DEESCALATION_IDLE_WINDOWS * 2.0 - 0.1)
+        batcher.report_batch(None)
+        assert batcher.state.level == 2
 
-            # A non-empty batch resets the idle timer
-            batcher.report_batch(100, processing_time_s=1.5)
+        # A non-empty batch resets the idle timer
+        batcher.report_batch(100, processing_time_s=1.5)
 
-            # Now need the full idle duration again
-            clock.advance(DEESCALATION_IDLE_WINDOWS * 2.0 - 0.1)
-            batcher.report_batch(None)
-            assert batcher.state.level == 2
+        # Now need the full idle duration again
+        clock.advance(DEESCALATION_IDLE_WINDOWS * 2.0 - 0.1)
+        batcher.report_batch(None)
+        assert batcher.state.level == 2
 
     def test_empty_batches_excluded_from_counters(self):
         batcher = AdaptiveMessageBatcher(base_batch_length_s=1.0, max_level=2)
@@ -592,31 +593,32 @@ def test_does_not_deescalate_without_enough_headroom(self):
 
     def test_multi_level_escalation_and_deescalation(self):
         clock = FakeClock()
-        batcher = AdaptiveMessageBatcher(base_batch_length_s=1.0, max_level=3)
+        batcher = AdaptiveMessageBatcher(
+            base_batch_length_s=1.0, max_level=3, clock=clock
+        )
+
+        _escalate_to_level(batcher, 4)
+        assert batcher.state.level == 4
+        current_length = batcher.batch_length_s
+        assert current_length == pytest.approx(4.0, rel=1e-5)
+
+        # De-escalate via idle — one half-step at a time
+        # Report idle with enough elapsed time to trigger de-escalation
+        # Add small epsilon to avoid floating-point comparison issues
+        clock.advance(DEESCALATION_IDLE_WINDOWS * current_length + 0.01)
+        batcher.report_batch(None)
+        assert batcher.state.level == 3
+        # _last_nonempty_batch_time was reset when we de-escalated above,
+        # so we can measure the next idle period from here
+
+        current_length = batcher.batch_length_s
+        assert current_length == pytest.approx(2.828, rel=1e-2)
 
-        with patch('ess.livedata.core.message_batcher.time.monotonic', clock):
-            _escalate_to_level(batcher, 4)
-            assert batcher.state.level == 4
-            current_length = batcher.batch_length_s
-            assert current_length == pytest.approx(4.0, rel=1e-5)
-
-            # De-escalate via idle — one half-step at a time
-            # Report idle with enough elapsed time to trigger de-escalation
-            # Add small epsilon to avoid floating-point comparison issues
-            clock.advance(DEESCALATION_IDLE_WINDOWS * current_length + 0.01)
-            batcher.report_batch(None)
-            assert batcher.state.level == 3
-            # _last_nonempty_batch_time was reset when we de-escalated above,
-            # so we can measure the next idle period from here
-
-            current_length = batcher.batch_length_s
-            assert current_length == pytest.approx(2.828, rel=1e-2)
-
-            # Report idle again to trigger the next de-escalation
-            clock.advance(DEESCALATION_IDLE_WINDOWS * current_length + 0.01)
-            batcher.report_batch(None)
-            assert batcher.state.level == 2
-            assert batcher.state.batch_length_s == pytest.approx(2.0, rel=1e-5)
+        # Report idle again to trigger the next de-escalation
+        clock.advance(DEESCALATION_IDLE_WINDOWS * current_length + 0.01)
+        batcher.report_batch(None)
+        assert batcher.state.level == 2
+        assert batcher.state.batch_length_s == pytest.approx(2.0, rel=1e-5)
 
     def test_state_reflects_custom_base_length(self):
         batcher = AdaptiveMessageBatcher(base_batch_length_s=0.5, max_level=2)
@@ -631,21 +633,22 @@ def test_state_reflects_custom_base_length(self):
     def test_no_oscillation_when_barely_keeping_up(self):
         """At 8s window, rapid idle cycles between batches should not de-escalate."""
         clock = FakeClock()
-        batcher = AdaptiveMessageBatcher(base_batch_length_s=1.0, max_level=3)
+        batcher = AdaptiveMessageBatcher(
+            base_batch_length_s=1.0, max_level=3, clock=clock
+        )
 
-        with patch('ess.livedata.core.message_batcher.time.monotonic', clock):
-            _escalate_to_level(batcher, 6)
-            assert batcher.state.level == 6
+        _escalate_to_level(batcher, 6)
+        assert batcher.state.level == 6
 
-            # Simulate "barely keeping up": process batch in 7s, then 1s of idle
+        # Simulate "barely keeping up": process batch in 7s, then 1s of idle
+        for _ in range(10):
+            clock.advance(7.0)
+            batcher.report_batch(100, processing_time_s=7.0)
             for _ in range(10):
-                clock.advance(7.0)
-                batcher.report_batch(100, processing_time_s=7.0)
-                for _ in range(10):
-                    clock.advance(0.1)
-                    batcher.report_batch(None)
+                clock.advance(0.1)
+                batcher.report_batch(None)
 
-            assert batcher.state.level == 6
+        assert batcher.state.level == 6
 
     def test_escalation_preserves_buffered_active_messages(self):
         """Messages in the active batch must survive escalation."""
@@ -709,35 +712,36 @@ def test_escalation_preserves_future_messages(self):
     def test_deescalation_preserves_buffered_messages(self):
         """Messages in the active batch must survive de-escalation."""
         clock = FakeClock()
-        batcher = AdaptiveMessageBatcher(base_batch_length_s=1.0, max_level=3)
+        batcher = AdaptiveMessageBatcher(
+            base_batch_length_s=1.0, max_level=3, clock=clock
+        )
+
+        _escalate_to_level(batcher, 2)
+        assert batcher.state.level == 2
 
-        with patch('ess.livedata.core.message_batcher.time.monotonic', clock):
-            _escalate_to_level(batcher, 2)
-            assert batcher.state.level == 2
-
-            # Establish timeline at escalated batch length (~2s)
-            batcher.batch([make_message(0, "init")])
-
-            # Buffer a message
-            buffered = make_message(500_000_000, "buffered")
-            assert batcher.batch([buffered]) is None
-
-            # Trigger de-escalation via idle
-            clock.advance(DEESCALATION_IDLE_WINDOWS * batcher.batch_length_s + 0.1)
-            batcher.report_batch(None)
-            assert batcher.state.level == 1
-
-            # Drain
-            trigger = make_message(10_000_000_000, "trigger")
-            all_values: set[str] = set()
-            batch = batcher.batch([trigger])
-            while batch is not None:
-                all_values.update(m.value for m in batch.messages)
-                batch = batcher.batch([])
-
-            assert "buffered" in all_values, (
-                "Active batch message dropped during de-escalation"
-            )
+        # Establish timeline at escalated batch length (~2s)
+        batcher.batch([make_message(0, "init")])
+
+        # Buffer a message
+        buffered = make_message(500_000_000, "buffered")
+        assert batcher.batch([buffered]) is None
+
+        # Trigger de-escalation via idle
+        clock.advance(DEESCALATION_IDLE_WINDOWS * batcher.batch_length_s + 0.1)
+        batcher.report_batch(None)
+        assert batcher.state.level == 1
+
+        # Drain
+        trigger = make_message(10_000_000_000, "trigger")
+        all_values: set[str] = set()
+        batch = batcher.batch([trigger])
+        while batch is not None:
+            all_values.update(m.value for m in batch.messages)
+            batch = batcher.batch([])
+
+        assert "buffered" in all_values, (
+            "Active batch message dropped during de-escalation"
+        )
 
     def test_overload_resets_underload_counter(self):
         batcher = AdaptiveMessageBatcher(base_batch_length_s=1.0, max_level=2)

From 17726e60208ecd4110a7b406da2aae8cb73b887a Mon Sep 17 00:00:00 2001
From: Simon Heybrock <simon.heybrock@ess.eu>
Date: Mon, 23 Mar 2026 04:57:05 +0000
Subject: [PATCH 14/16] Tune adaptive batching scenario test limits and
 strengthen GC jitter test
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Tighten 8 limits that had excessive headroom (e.g., max_backlog 5.0→1.0
when actual is 0.4) and loosen boundary_oscillation max_oscillations
4→5 to avoid flakiness (old limit exactly matched worst-case across
50 seeds).

Increase gc_jitter jitter_fraction from 0.5 to 1.2 so spikes actually
enter the dead zone and occasionally exceed the batch window, testing
that the batcher tolerates isolated overloads. Previously the test
never left the headroom zone and was equivalent to a light-load test.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .../core/adaptive_batching_scenarios_test.py  | 32 +++++++++++--------
 1 file changed, 19 insertions(+), 13 deletions(-)

diff --git a/tests/core/adaptive_batching_scenarios_test.py b/tests/core/adaptive_batching_scenarios_test.py
index 71e9c2875..ee822ebe3 100644
--- a/tests/core/adaptive_batching_scenarios_test.py
+++ b/tests/core/adaptive_batching_scenarios_test.py
@@ -54,8 +54,8 @@
         "max_time_to_first_escalation_s": 10.0,
     },
     "step_function_backlog": {
-        "max_backlog_s": 5.0,
-        "max_final_backlog_s": 1.0,
+        "max_backlog_s": 1.0,
+        "max_final_backlog_s": 0.5,
     },
     # -- Escalation reaches appropriate level for given severity ----------
     # Levels are half-steps: window = base * sqrt(2)^level.
@@ -104,12 +104,12 @@
         "max_oscillations": 0,
     },
     "boundary_oscillation": {
-        "max_oscillations": 4,
+        "max_oscillations": 5,
     },
     # -- Creeping overload ------------------------------------------------
     "creeping_overload": {
-        "min_level_reached": 1,
-        "max_backlog_s": 5.0,
+        "min_level_reached": 4,
+        "max_backlog_s": 3.5,
     },
     "mild_creeping_overload": {
         "min_level_reached": 1,
@@ -140,12 +140,12 @@
     "shutter_open_close": {
         "min_level_reached": 1,
         "max_final_level": 0,
-        "max_backlog_s": 10.0,
+        "max_backlog_s": 2.0,
     },
     "repeated_shutter_cycles": {
         "min_level_reached": 1,
         "max_final_level": 0,
-        "min_escalation_events": 2,
+        "min_escalation_events": 4,
     },
     "severe_to_cosmic_background": {
         "min_level_during_load": 3,
@@ -155,7 +155,7 @@
     "backlog_drains": {
         "min_level_reached": 1,
         "min_peak_backlog_s": 0.1,
-        "max_final_backlog_s": 1.0,
+        "max_final_backlog_s": 0.5,
     },
     "backlog_peaks_and_decreases": {
         "min_level_reached": 1,
@@ -163,7 +163,7 @@
     },
     # -- Processing-time awareness ----------------------------------------
     "fast_escalation_clear_overload": {
-        "max_time_to_first_escalation_s": 5.0,
+        "max_time_to_first_escalation_s": 4.0,
     },
     "no_escalation_when_fits": {
         "max_level": 0,
@@ -717,16 +717,22 @@ def test_no_escalation_under_light_load(
     def test_no_escalation_with_gc_jitter(self, seed):
         """Occasional GC/scheduling spikes should not cause escalation.
 
-        Processing is fast on average (0.3s) but with significant jitter
-        that occasionally exceeds the 1s window.  Tested with multiple RNG
-        seeds to avoid seed-dependent false confidence.
+        Processing is fast on average (0.3s) but with high jitter
+        (std = 1.2 * mean = 0.36s) that regularly sends individual batches
+        into the dead zone (75-100% of window) and occasionally past the
+        window entirely (~4 overloaded cycles per 120s run).
+
+        The batcher must tolerate these isolated spikes because its
+        escalation heuristic requires *consecutive* overloaded batches.
+        Tested with multiple RNG seeds to avoid seed-dependent false
+        confidence.
         """
         lim = LIMITS["gc_jitter"]
         batcher, clock = make_default_batcher()
         cost = constant_overhead_cost(
             overhead_s=0.2,
             per_second_cost=0.1,
-            jitter_fraction=0.5,
+            jitter_fraction=1.2,
             rng=random.Random(seed),
         )
 

From 0db01a94bc68469b2ecd60070e36ef066b069571 Mon Sep 17 00:00:00 2001
From: Simon Heybrock <simon.heybrock@ess.eu>
Date: Mon, 23 Mar 2026 05:10:59 +0000
Subject: [PATCH 15/16] Consolidate adaptive batching scenario tests
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Merge tests that run identical simulations but assert different facets
into single tests with all assertions. This removes 6 tests and a net
119 lines without losing any coverage.

Merges:
- step_function_backlog + severity_moderate → test_moderate_overload
- backlog_drains + steady_load_oscillation → test_moderate_overload_stabilizes_and_drains
- severity_overhead_dominated + stabilization_after_step + backlog_peaks_and_decreases → test_overhead_dominated_overload
- boundary_oscillation + jitter_sticky_escalation → test_boundary_jitter_escalates_and_sticks
- 3 de-escalation tests → parametrized test_deescalates_when_load_drops

Also removes no_escalation_when_fits (subsumed by light_load parametrization).
---
 .../core/adaptive_batching_scenarios_test.py  | 455 +++++++-----------
 1 file changed, 168 insertions(+), 287 deletions(-)

diff --git a/tests/core/adaptive_batching_scenarios_test.py b/tests/core/adaptive_batching_scenarios_test.py
index ee822ebe3..a6f1a412f 100644
--- a/tests/core/adaptive_batching_scenarios_test.py
+++ b/tests/core/adaptive_batching_scenarios_test.py
@@ -53,10 +53,6 @@
     "step_function_escalation": {
         "max_time_to_first_escalation_s": 10.0,
     },
-    "step_function_backlog": {
-        "max_backlog_s": 1.0,
-        "max_final_backlog_s": 0.5,
-    },
     # -- Escalation reaches appropriate level for given severity ----------
     # Levels are half-steps: window = base * sqrt(2)^level.
     # Escalation jumps +2 (x2), de-escalation drops -1 (x1/sqrt(2)).
@@ -65,14 +61,20 @@
     #   level 2: 2.0s   level 5: 5.66s
     #
     # overhead_s=0.6, per_s=0.6 -> at 1s: 1.2, at 2s: 1.8 (OK at level 2)
-    "severity_moderate": {
+    # Merged: level bounds + backlog bounds (same simulation).
+    "moderate_overload_step": {
         "min_level": 2,
         "max_level": 2,
+        "max_backlog_s": 1.0,
+        "max_final_backlog_s": 0.5,
     },
     # overhead_s=0.8, per_s=0.3 -> at 1s: 1.1, at 1.41s: 1.22 (OK at level 1)
-    "severity_overhead_dominated": {
+    # Merged: level bounds + stabilization + backlog-peaks (same simulation).
+    "overhead_dominated_step": {
         "min_level": 1,
         "max_level": 2,
+        "max_oscillations": 1,
+        "min_peak_backlog_s": 0.1,
     },
     # overhead_s=1.8, per_s=0.2 -> needs level 3+ (2.83s window: 2.37s OK)
     "severity_severe": {
@@ -99,12 +101,21 @@
     "gc_jitter": {
         "max_level": 0,
     },
-    # -- No oscillation ---------------------------------------------------
-    "steady_load_oscillation": {
+    # -- Steady overload --------------------------------------------------
+    # overhead_s=0.6, per_s=0.6 -> constant overload from t=0.
+    # Merged: oscillation + backlog draining (same simulation).
+    "steady_moderate_overload": {
         "max_oscillations": 0,
+        "min_level_reached": 1,
+        "min_peak_backlog_s": 0.1,
+        "max_final_backlog_s": 0.5,
     },
-    "boundary_oscillation": {
+    # overhead_s=0.5, per_s=0.5, jitter=10% -> mean = window exactly.
+    # Merged: oscillation bounds + sticky escalation (same simulation).
+    "boundary_jitter": {
         "max_oscillations": 5,
+        "min_level": 1,
+        "min_final_level": 1,
     },
     # -- Creeping overload ------------------------------------------------
     "creeping_overload": {
@@ -151,27 +162,10 @@
         "min_level_during_load": 3,
         "max_final_level": 0,
     },
-    # -- Backlog draining -------------------------------------------------
-    "backlog_drains": {
-        "min_level_reached": 1,
-        "min_peak_backlog_s": 0.1,
-        "max_final_backlog_s": 0.5,
-    },
-    "backlog_peaks_and_decreases": {
-        "min_level_reached": 1,
-        "min_peak_backlog_s": 0.1,
-    },
     # -- Processing-time awareness ----------------------------------------
     "fast_escalation_clear_overload": {
         "max_time_to_first_escalation_s": 4.0,
     },
-    "no_escalation_when_fits": {
-        "max_level": 0,
-    },
-    # -- Stabilization after escalation -----------------------------------
-    "stabilization_after_step": {
-        "max_oscillations": 1,
-    },
     # -- Dead zone (70-100% utilization at escalated level) ---------------
     # Documents limitation: batcher cannot de-escalate when processing
     # fills the dead zone, even if a lower level would suffice.
@@ -179,13 +173,6 @@
         "min_level_during_load": 4,
         "min_final_level": 3,
     },
-    # -- Jitter-induced sticky escalation ---------------------------------
-    # Jitter at the exact boundary causes escalation that becomes permanent
-    # because the escalated level lands in the dead zone.
-    "jitter_sticky_escalation": {
-        "min_level": 1,
-        "min_final_level": 1,
-    },
     # -- Time-gap batches (message_count=0) -------------------------------
     "time_gaps_during_escalation": {
         "min_level_reached": 1,
@@ -533,13 +520,13 @@ def test_escalates_within_bounded_time(self):
             f"(limit: {lim['max_time_to_first_escalation_s']}s)"
         )
 
-    def test_limits_backlog(self):
-        """Backlog during escalation must remain bounded.
+    def test_moderate_overload(self):
+        """Moderate overload after idle: correct level, bounded backlog.
 
-        At 1s window: 0.6 + 0.6 = 1.2s (20% over budget).
-        At 2s window: 0.6 + 1.2 = 1.8s (OK).
+        At 1s window: 0.6 + 0.6 = 1.2s (20% over budget, escalates).
+        At 2s window: 0.6 + 1.2 = 1.8s (90%, dead zone — stable at level 2).
         """
-        lim = LIMITS["step_function_backlog"]
+        lim = LIMITS["moderate_overload_step"]
         batcher, clock = make_default_batcher()
 
         cost = step_function_cost(
@@ -549,10 +536,11 @@ def test_limits_backlog(self):
         )
 
         result = run_scenario(batcher, 120.0, cost, clock)
-
-        assert result.max_level >= 1, (
-            "Precondition: load must trigger escalation for backlog test "
-            "to be meaningful"
+        assert result.max_level >= lim["min_level"], (
+            f"Only reached level {result.max_level} (need >= {lim['min_level']})"
+        )
+        assert result.max_level <= lim["max_level"], (
+            f"Over-escalated to level {result.max_level} (limit: {lim['max_level']})"
         )
         assert result.max_backlog_s < lim["max_backlog_s"], (
             f"Backlog reached {result.max_backlog_s:.1f}s "
@@ -563,21 +551,56 @@ def test_limits_backlog(self):
             f"(limit: {lim['max_final_backlog_s']}s)"
         )
 
+    def test_overhead_dominated_overload(self):
+        """Overhead-dominated overload: correct level, stabilization, backlog peak.
+
+        At 1s window: 0.8 + 0.3 = 1.1s (overloaded, escalates).
+        At 1.41s window: 0.8 + 0.42 = 1.22s (87%, dead zone — stable).
+        After escalation the backlog must peak and then decrease.
+        """
+        lim = LIMITS["overhead_dominated_step"]
+        batcher, clock = make_default_batcher()
+
+        cost = step_function_cost(
+            step_time_s=5.0,
+            before=idle_cost(),
+            after=constant_overhead_cost(overhead_s=0.8, per_second_cost=0.3),
+        )
+
+        result = run_scenario(batcher, 120.0, cost, clock)
+        assert result.max_level >= lim["min_level"], (
+            f"Only reached level {result.max_level} (need >= {lim['min_level']})"
+        )
+        assert result.max_level <= lim["max_level"], (
+            f"Over-escalated to level {result.max_level} (limit: {lim['max_level']})"
+        )
+        assert result.oscillation_count() <= lim["max_oscillations"], (
+            f"Oscillated {result.oscillation_count()} times "
+            f"(limit: {lim['max_oscillations']})"
+        )
+        # After the initial transient, the level should be stable.
+        late_cycles = result.cycles_after(60.0)
+        assert late_cycles, "Simulation too short for stabilization check"
+        late_levels = {c.level for c in late_cycles}
+        assert len(late_levels) == 1, (
+            f"Not stabilized: levels {sorted(late_levels)} observed "
+            f"in second half of simulation"
+        )
+        assert result.max_backlog_s >= lim["min_peak_backlog_s"], (
+            f"Precondition: meaningful backlog must build up "
+            f"(peak was {result.max_backlog_s:.2f}s)"
+        )
+        peak_idx = max(
+            range(len(result.cycles)),
+            key=lambda i: result.cycles[i].backlog_s,
+        )
+        assert peak_idx < len(result.cycles) - 1, (
+            "Backlog was still at peak at end of simulation"
+        )
+
     @pytest.mark.parametrize(
         ("overhead_s", "per_second_cost", "limits_key"),
         [
-            pytest.param(
-                0.6,
-                0.6,
-                "severity_moderate",
-                id="moderate: overhead=0.6 per_s=0.6",
-            ),
-            pytest.param(
-                0.8,
-                0.3,
-                "severity_overhead_dominated",
-                id="overhead-dominated: overhead=0.8 per_s=0.3",
-            ),
             pytest.param(
                 1.8,
                 0.2,
@@ -620,34 +643,6 @@ def test_reaches_appropriate_level_for_severity(
             f"Over-escalated to level {result.max_level} (limit: {lim['max_level']})"
         )
 
-    def test_stabilizes_after_escalation(self):
-        """After reaching the correct level, the batcher must not oscillate."""
-        lim = LIMITS["stabilization_after_step"]
-        batcher, clock = make_default_batcher()
-
-        # At 1s: 0.8+0.3=1.1 (overloaded). At 2s: 0.8+0.6=1.4 (OK).
-        cost = step_function_cost(
-            step_time_s=5.0,
-            before=idle_cost(),
-            after=constant_overhead_cost(overhead_s=0.8, per_second_cost=0.3),
-        )
-
-        result = run_scenario(batcher, 120.0, cost, clock)
-
-        assert result.max_level >= 1, "Precondition: must have escalated"
-        # After the initial transient, the level should be stable.
-        late_cycles = result.cycles_after(60.0)
-        assert late_cycles, "Simulation too short for stabilization check"
-        late_levels = {c.level for c in late_cycles}
-        assert len(late_levels) == 1, (
-            f"Not stabilized: levels {sorted(late_levels)} observed "
-            f"in second half of simulation"
-        )
-        assert result.oscillation_count() <= lim["max_oscillations"], (
-            f"Oscillated {result.oscillation_count()} times "
-            f"(limit: {lim['max_oscillations']})"
-        )
-
 
 class TestNonDefaultBaseBatchLength:
     """Verify scaling with a non-default base batch length."""
@@ -743,37 +738,56 @@ def test_no_escalation_with_gc_jitter(self, seed):
         )
 
 
-class TestNoOscillation:
-    """The batcher must not oscillate between levels."""
+class TestSteadyOverload:
+    """Constant overload from t=0: escalation, stabilization, backlog draining."""
 
-    def test_no_oscillation_at_steady_load(self):
-        """Constant overload that triggers escalation should stabilize without
-        oscillating.
+    def test_moderate_overload_stabilizes_and_drains(self):
+        """Constant 20% overload: must escalate, not oscillate, and drain backlog.
 
         Level 0 (1.0s): 0.6 + 0.6 = 1.2s (overloaded, escalates).
         Level 2 (2.0s): 0.6 + 1.2 = 1.8s (90%, dead zone — stable).
+        Surplus at level 2 drains the backlog accumulated during escalation.
         """
-        lim = LIMITS["steady_load_oscillation"]
+        lim = LIMITS["steady_moderate_overload"]
         batcher, clock = make_default_batcher()
 
         cost = constant_overhead_cost(overhead_s=0.6, per_second_cost=0.6)
 
         result = run_scenario(batcher, 120.0, cost, clock)
-        assert result.max_level >= 1, (
-            "Precondition: load must trigger escalation for oscillation test "
-            "to be meaningful"
+        assert result.max_level >= lim["min_level_reached"], (
+            f"Precondition: load must trigger escalation "
+            f"(reached level {result.max_level}, "
+            f"need >= {lim['min_level_reached']})"
         )
         assert result.oscillation_count() <= lim["max_oscillations"], (
             f"Oscillated {result.oscillation_count()} times "
             f"(limit: {lim['max_oscillations']})"
         )
+        assert result.max_backlog_s >= lim["min_peak_backlog_s"], (
+            f"Precondition: meaningful backlog must build up "
+            f"(peak was {result.max_backlog_s:.2f}s, "
+            f"need >= {lim['min_peak_backlog_s']}s)"
+        )
+        assert result.final_backlog_s < lim["max_final_backlog_s"], (
+            f"Backlog not drained: {result.final_backlog_s:.1f}s "
+            f"(limit: {lim['max_final_backlog_s']}s)"
+        )
+
+    def test_boundary_jitter_escalates_and_sticks(self):
+        """Mean processing = window with 10% jitter: bounded oscillation,
+        but escalation becomes permanent due to the dead zone.
+
+        At level 0 (1s window): 0.5 + 0.5 = 1.0s mean, jitter +/-10%.
+            ~50% of cycles are overloaded (processing > 1.0).
+            P(2 consecutive overloaded) ~ 25% per pair, so escalation is very likely.
 
-    def test_limited_oscillation_at_boundary(self):
-        """Processing right at the window with jitter: bounded oscillation."""
-        lim = LIMITS["boundary_oscillation"]
+        At level 2 (2s window): 0.5 + 1.0 = 1.5s mean (75% utilization).
+            At the dead-zone boundary (>= 75%), so de-escalation never triggers.
+            Documents limitation: once escalated, stays stuck due to dead zone.
+        """
+        lim = LIMITS["boundary_jitter"]
         batcher, clock = make_default_batcher()
 
-        # Mean processing = 1.0s = window, jitter +-10%
         cost = constant_overhead_cost(
             overhead_s=0.5,
             per_second_cost=0.5,
@@ -786,6 +800,14 @@ def test_limited_oscillation_at_boundary(self):
             f"Oscillated {result.oscillation_count()} times "
             f"(limit: {lim['max_oscillations']})"
         )
+        assert result.max_level >= lim["min_level"], (
+            f"Expected escalation from boundary jitter "
+            f"(reached level {result.max_level})"
+        )
+        assert result.final_level >= lim["min_final_level"], (
+            f"Expected to stay at level {lim['min_final_level']}+ "
+            f"(dead zone prevents de-escalation)"
+        )
 
 
 class TestCreepingOverload:
@@ -846,92 +868,70 @@ def test_mild_overload_does_not_over_escalate(self):
 class TestDeescalation:
     """The batcher must de-escalate when load subsides."""
 
-    def test_deescalates_after_load_drops_to_idle(self):
-        """After high load followed by idle, must return to level 0.
-
-        At 1s window: 0.8 + 0.3 = 1.1s (overloaded, triggers escalation).
-        """
-        lim = LIMITS["deescalation_to_idle"]
-        batcher, clock = make_default_batcher()
-
-        # 30s high load, then idle
-        cost = step_function_cost(
-            step_time_s=0.0,
-            before=idle_cost(),
-            after=step_function_cost(
-                step_time_s=30.0,
-                before=constant_overhead_cost(overhead_s=0.8, per_second_cost=0.3),
-                after=idle_cost(),
+    @pytest.mark.parametrize(
+        (
+            "heavy_duration_s",
+            "after_overhead",
+            "after_per_s",
+            "duration_s",
+            "limits_key",
+        ),
+        [
+            pytest.param(
+                30.0,
+                None,
+                None,
+                120.0,
+                "deescalation_to_idle",
+                id="heavy→idle",
             ),
-        )
-
-        result = run_scenario(batcher, 120.0, cost, clock)
-        assert result.max_level >= lim["min_level_during_load"], (
-            f"Precondition: batcher must have escalated during high-load phase "
-            f"(reached level {result.max_level}, "
-            f"need >= {lim['min_level_during_load']})"
-        )
-        assert result.final_level <= lim["max_final_level"], (
-            f"Final level {result.final_level} (limit: {lim['max_final_level']})"
-        )
-
-    def test_deescalates_after_step_down_to_light_load(self):
-        """When load decreases to light (but non-zero), must de-escalate.
-
-        This requires the batcher to de-escalate even when data is flowing
-        continuously, not just when the system goes fully idle.
-
-        Heavy phase at 1s window: 0.8 + 0.3 = 1.1s (overloaded).
-        Light phase at 1s window: 0.1 + 0.1 = 0.2s (well within budget).
-        """
-        lim = LIMITS["deescalation_to_light_load"]
-        batcher, clock = make_default_batcher()
-
-        # Heavy load for 40s, then light load
-        cost = step_function_cost(
-            step_time_s=0.0,
-            before=idle_cost(),
-            after=step_function_cost(
-                step_time_s=40.0,
-                before=constant_overhead_cost(overhead_s=0.8, per_second_cost=0.3),
-                after=constant_overhead_cost(overhead_s=0.1, per_second_cost=0.1),
+            pytest.param(
+                40.0,
+                0.1,
+                0.1,
+                180.0,
+                "deescalation_to_light_load",
+                id="heavy→light (0.2s at 1s window)",
             ),
-        )
-
-        result = run_scenario(batcher, 180.0, cost, clock)
-        assert result.max_level >= lim["min_level_during_load"], (
-            f"Precondition: batcher must have escalated during heavy-load "
-            f"phase (reached level {result.max_level}, "
-            f"need >= {lim['min_level_during_load']})"
-        )
-        assert result.final_level <= lim["max_final_level"], (
-            f"Final level {result.final_level} (limit: {lim['max_final_level']})"
-        )
-
-    def test_deescalates_under_moderate_continuous_load(self):
-        """De-escalation must work when processing fills a moderate fraction
-        of the escalated window, not just under near-idle conditions.
-
-        Heavy phase at level 0 (1.0s): 0.8 + 0.3 = 1.1s
-            (overloaded, escalates to level 2).
-        Moderate phase at level 2 (2.0s): 0.3 + 0.6 = 0.9s (45%, underloaded).
-        Moderate phase at level 0 (1.0s): 0.3 + 0.3 = 0.6s (fits at base
-            level).
+            pytest.param(
+                40.0,
+                0.3,
+                0.3,
+                180.0,
+                "deescalation_moderate_load",
+                id="heavy→moderate (0.6s at 1s window)",
+            ),
+        ],
+    )
+    def test_deescalates_when_load_drops(
+        self, heavy_duration_s, after_overhead, after_per_s, duration_s, limits_key
+    ):
+        """After overload (0.8 + 0.3 = 1.1s at 1s window), load drops.
+        The batcher must de-escalate back to level 0 regardless of whether
+        the lighter phase is idle, light, or moderate — as long as processing
+        fits within the base window.
         """
-        lim = LIMITS["deescalation_moderate_load"]
+        lim = LIMITS[limits_key]
         batcher, clock = make_default_batcher()
 
+        if after_overhead is None:
+            after = idle_cost()
+        else:
+            after = constant_overhead_cost(
+                overhead_s=after_overhead, per_second_cost=after_per_s
+            )
+
         cost = step_function_cost(
             step_time_s=0.0,
             before=idle_cost(),
             after=step_function_cost(
-                step_time_s=40.0,
+                step_time_s=heavy_duration_s,
                 before=constant_overhead_cost(overhead_s=0.8, per_second_cost=0.3),
-                after=constant_overhead_cost(overhead_s=0.3, per_second_cost=0.3),
+                after=after,
             ),
         )
 
-        result = run_scenario(batcher, 180.0, cost, clock)
+        result = run_scenario(batcher, duration_s, cost, clock)
         assert result.max_level >= lim["min_level_during_load"], (
             f"Precondition: batcher must have escalated during heavy-load "
             f"phase (reached level {result.max_level}, "
@@ -1148,67 +1148,6 @@ def test_severe_overload_to_cosmic_background(self):
         )
 
 
-class TestBacklogDraining:
-    """Once the batcher escalates, accumulated backlog should drain."""
-
-    def test_backlog_drains_after_escalation(self):
-        """Sustained load triggering escalation should drain the backlog.
-
-        At 1s: 0.6 + 0.6 = 1.2s (overloaded).
-        At 2s: 0.6 + 1.2 = 1.8s (OK, surplus drains backlog).
-        """
-        lim = LIMITS["backlog_drains"]
-        batcher, clock = make_default_batcher()
-
-        cost = constant_overhead_cost(overhead_s=0.6, per_second_cost=0.6)
-
-        result = run_scenario(batcher, 120.0, cost, clock)
-
-        assert result.max_level >= lim["min_level_reached"], (
-            f"Precondition: escalation must occur for backlog draining to be "
-            f"meaningful (reached level {result.max_level}, "
-            f"need >= {lim['min_level_reached']})"
-        )
-        assert result.max_backlog_s >= lim["min_peak_backlog_s"], (
-            f"Precondition: meaningful backlog must build up before draining "
-            f"(peak was {result.max_backlog_s:.2f}s, "
-            f"need >= {lim['min_peak_backlog_s']}s)"
-        )
-        assert result.final_backlog_s < lim["max_final_backlog_s"], (
-            f"Backlog not drained: {result.final_backlog_s:.1f}s "
-            f"(limit: {lim['max_final_backlog_s']}s)"
-        )
-
-    def test_backlog_does_not_grow_indefinitely(self):
-        """Even under sustained load, the backlog must peak and decrease.
-
-        At 1s: 0.8 + 0.3 = 1.1s (overloaded).
-        At 2s: 0.8 + 0.6 = 1.4s (OK).
-        """
-        lim = LIMITS["backlog_peaks_and_decreases"]
-        batcher, clock = make_default_batcher()
-
-        cost = constant_overhead_cost(overhead_s=0.8, per_second_cost=0.3)
-
-        result = run_scenario(batcher, 120.0, cost, clock)
-
-        assert result.max_level >= lim["min_level_reached"], (
-            "Precondition: escalation must occur to test backlog draining"
-        )
-        assert result.max_backlog_s >= lim["min_peak_backlog_s"], (
-            f"Precondition: meaningful backlog must build up "
-            f"(peak was {result.max_backlog_s:.2f}s)"
-        )
-
-        peak_idx = max(
-            range(len(result.cycles)),
-            key=lambda i: result.cycles[i].backlog_s,
-        )
-        assert peak_idx < len(result.cycles) - 1, (
-            "Backlog was still at peak at end of simulation"
-        )
-
-
 class TestProcessingTimeAwareness:
     """The batcher should use processing_time_s for faster decisions."""
 
@@ -1230,19 +1169,6 @@ def test_fast_escalation_on_clear_overload(self):
             f"(limit: {lim['max_time_to_first_escalation_s']}s)"
         )
 
-    def test_no_escalation_when_processing_fits(self):
-        """No escalation if processing completes within the window."""
-        lim = LIMITS["no_escalation_when_fits"]
-        batcher, clock = make_default_batcher()
-
-        cost = constant_overhead_cost(overhead_s=0.1, per_second_cost=0.3)
-
-        result = run_scenario(batcher, 60.0, cost, clock)
-        assert result.max_level <= lim["max_level"], (
-            f"Escalated to {result.max_level} despite fitting "
-            f"(limit: {lim['max_level']})"
-        )
-
 
 class TestDeescalationDeadZone:
     """The 75-100% utilization dead zone where de-escalation cannot trigger.
@@ -1298,51 +1224,6 @@ def test_stuck_in_dead_zone_after_load_drop(self):
         )
 
 
-class TestJitterInducedStickyEscalation:
-    """Jitter at the exact boundary can cause permanent escalation.
-
-    When mean processing equals the batch window, jitter causes roughly
-    half the batches to be overloaded.  Two consecutive overloaded batches
-    (~25% probability per pair) trigger escalation.  At the escalated level,
-    processing lands in the dead zone (75-100% utilization), preventing
-    de-escalation.  The batcher stays permanently escalated.
-    """
-
-    def test_jitter_at_boundary_causes_sticky_escalation(self):
-        """Mean processing = window with 10% jitter: escalates and stays.
-
-        At level 0 (1s window): 0.5 + 0.5 = 1.0s mean, jitter +/-10%.
-            ~50% of cycles are overloaded (processing > 1.0).
-            P(2 consecutive overloaded) ~ 25%, so escalation is very likely.
-
-        At level 2 (2s window): 0.5 + 1.0 = 1.5s mean (75% utilization).
-            At the dead-zone boundary (>= 75%), so de-escalation never triggers.
-        """
-        lim = LIMITS["jitter_sticky_escalation"]
-        batcher, clock = make_default_batcher()
-
-        cost = constant_overhead_cost(
-            overhead_s=0.5,
-            per_second_cost=0.5,
-            jitter_fraction=0.1,
-            rng=random.Random(42),
-        )
-
-        result = run_scenario(batcher, 180.0, cost, clock)
-
-        assert result.max_level >= lim["min_level"], (
-            f"Expected escalation from boundary jitter "
-            f"(reached level {result.max_level})"
-        )
-        # Documents the limitation: once escalated, stays at level 1 due to
-        # the dead zone.  If the strategy is improved to handle this case,
-        # the expected final level should be 0.
-        assert result.final_level >= lim["min_final_level"], (
-            f"Expected to stay at level {lim['min_final_level']}+ "
-            f"(dead zone prevents de-escalation)"
-        )
-
-
 class TestTimeGapBatches:
     """Time-gap batches (message_count=0) should not disrupt adaptive behavior.
 

From 2c74c832e310127753a375f23bdada431c5b353b Mon Sep 17 00:00:00 2001
From: Simon Heybrock <simon.heybrock@ess.eu>
Date: Mon, 23 Mar 2026 05:35:42 +0000
Subject: [PATCH 16/16] Trim simulation durations to reduce steady-state tails

Many scenario tests ran far longer than needed, spending 50%+ of their
simulation in a flat steady state after all interesting events completed.
Trimmed durations while keeping 20-30s of headroom after the last event.

Also lowered the cycles_after() threshold in the overhead-dominated
stabilization check from 60s to 30s to match the shorter simulation.
---
 .../core/adaptive_batching_scenarios_test.py  | 34 +++++++++----------
 1 file changed, 17 insertions(+), 17 deletions(-)

diff --git a/tests/core/adaptive_batching_scenarios_test.py b/tests/core/adaptive_batching_scenarios_test.py
index a6f1a412f..74680572a 100644
--- a/tests/core/adaptive_batching_scenarios_test.py
+++ b/tests/core/adaptive_batching_scenarios_test.py
@@ -510,7 +510,7 @@ def test_escalates_within_bounded_time(self):
             ),
         )
 
-        result = run_scenario(batcher, 120.0, cost, clock)
+        result = run_scenario(batcher, 60.0, cost, clock)
 
         first_esc = result.first_escalation_time_s()
         assert first_esc is not None, "Batcher never escalated"
@@ -535,7 +535,7 @@ def test_moderate_overload(self):
             after=constant_overhead_cost(overhead_s=0.6, per_second_cost=0.6),
         )
 
-        result = run_scenario(batcher, 120.0, cost, clock)
+        result = run_scenario(batcher, 60.0, cost, clock)
         assert result.max_level >= lim["min_level"], (
             f"Only reached level {result.max_level} (need >= {lim['min_level']})"
         )
@@ -567,7 +567,7 @@ def test_overhead_dominated_overload(self):
             after=constant_overhead_cost(overhead_s=0.8, per_second_cost=0.3),
         )
 
-        result = run_scenario(batcher, 120.0, cost, clock)
+        result = run_scenario(batcher, 60.0, cost, clock)
         assert result.max_level >= lim["min_level"], (
             f"Only reached level {result.max_level} (need >= {lim['min_level']})"
         )
@@ -579,7 +579,7 @@ def test_overhead_dominated_overload(self):
             f"(limit: {lim['max_oscillations']})"
         )
         # After the initial transient, the level should be stable.
-        late_cycles = result.cycles_after(60.0)
+        late_cycles = result.cycles_after(30.0)
         assert late_cycles, "Simulation too short for stabilization check"
         late_levels = {c.level for c in late_cycles}
         assert len(late_levels) == 1, (
@@ -635,7 +635,7 @@ def test_reaches_appropriate_level_for_severity(
             ),
         )
 
-        result = run_scenario(batcher, 120.0, cost, clock)
+        result = run_scenario(batcher, 60.0, cost, clock)
         assert result.max_level >= lim["min_level"], (
             f"Only reached level {result.max_level} (need >= {lim['min_level']})"
         )
@@ -665,7 +665,7 @@ def test_escalation_with_doubled_base(self):
             after=constant_overhead_cost(overhead_s=1.2, per_second_cost=0.6),
         )
 
-        result = run_scenario(batcher, 120.0, cost, clock)
+        result = run_scenario(batcher, 60.0, cost, clock)
         assert result.max_level >= lim["min_level"], (
             f"Only reached level {result.max_level} (need >= {lim['min_level']})"
         )
@@ -731,7 +731,7 @@ def test_no_escalation_with_gc_jitter(self, seed):
             rng=random.Random(seed),
         )
 
-        result = run_scenario(batcher, 120.0, cost, clock)
+        result = run_scenario(batcher, 90.0, cost, clock)
         assert result.max_level <= lim["max_level"], (
             f"Escalated to level {result.max_level} from jitter alone "
             f"(seed={seed}, limit: {lim['max_level']})"
@@ -753,7 +753,7 @@ def test_moderate_overload_stabilizes_and_drains(self):
 
         cost = constant_overhead_cost(overhead_s=0.6, per_second_cost=0.6)
 
-        result = run_scenario(batcher, 120.0, cost, clock)
+        result = run_scenario(batcher, 60.0, cost, clock)
         assert result.max_level >= lim["min_level_reached"], (
             f"Precondition: load must trigger escalation "
             f"(reached level {result.max_level}, "
@@ -795,7 +795,7 @@ def test_boundary_jitter_escalates_and_sticks(self):
             rng=random.Random(42),
         )
 
-        result = run_scenario(batcher, 180.0, cost, clock)
+        result = run_scenario(batcher, 90.0, cost, clock)
         assert result.oscillation_count() <= lim["max_oscillations"], (
             f"Oscillated {result.oscillation_count()} times "
             f"(limit: {lim['max_oscillations']})"
@@ -855,7 +855,7 @@ def test_mild_overload_does_not_over_escalate(self):
             ramp_duration_s=60.0,
         )
 
-        result = run_scenario(batcher, 180.0, cost, clock)
+        result = run_scenario(batcher, 100.0, cost, clock)
         assert result.max_level >= lim["min_level_reached"], (
             f"Only reached level {result.max_level} — mild overload should "
             f"still trigger escalation (need >= {lim['min_level_reached']})"
@@ -881,7 +881,7 @@ class TestDeescalation:
                 30.0,
                 None,
                 None,
-                120.0,
+                75.0,
                 "deescalation_to_idle",
                 id="heavy→idle",
             ),
@@ -889,7 +889,7 @@ class TestDeescalation:
                 40.0,
                 0.1,
                 0.1,
-                180.0,
+                100.0,
                 "deescalation_to_light_load",
                 id="heavy→light (0.2s at 1s window)",
             ),
@@ -897,7 +897,7 @@ class TestDeescalation:
                 40.0,
                 0.3,
                 0.3,
-                180.0,
+                100.0,
                 "deescalation_moderate_load",
                 id="heavy→moderate (0.6s at 1s window)",
             ),
@@ -964,7 +964,7 @@ def test_multi_level_deescalation(self):
             ),
         )
 
-        result = run_scenario(batcher, 240.0, cost, clock)
+        result = run_scenario(batcher, 180.0, cost, clock)
         assert result.max_level >= lim["min_level_during_load"], (
             f"Precondition: must reach level {lim['min_level_during_load']}+ "
             f"during heavy phase (reached {result.max_level})"
@@ -1001,7 +1001,7 @@ def test_partial_deescalation(self):
             ),
         )
 
-        result = run_scenario(batcher, 240.0, cost, clock)
+        result = run_scenario(batcher, 150.0, cost, clock)
         assert result.max_level >= lim["min_level_during_load"], (
             f"Precondition: must reach level {lim['min_level_during_load']}+ "
             f"during severe phase (reached {result.max_level})"
@@ -1137,7 +1137,7 @@ def test_severe_overload_to_cosmic_background(self):
             ),
         )
 
-        result = run_scenario(batcher, 240.0, cost, clock)
+        result = run_scenario(batcher, 180.0, cost, clock)
         assert result.max_level >= lim["min_level_during_load"], (
             f"Precondition: must reach level {lim['min_level_during_load']}+ "
             f"during severe phase (reached {result.max_level})"
@@ -1209,7 +1209,7 @@ def test_stuck_in_dead_zone_after_load_drop(self):
             ),
         )
 
-        result = run_scenario(batcher, 240.0, cost, clock)
+        result = run_scenario(batcher, 150.0, cost, clock)
 
         assert result.max_level >= lim["min_level_during_load"], (
             f"Precondition: must reach level {lim['min_level_during_load']}+ "