diff --git a/src/spikeinterface/core/__init__.py b/src/spikeinterface/core/__init__.py
index 1f4a3e4d2a..4dad00667f 100644
--- a/src/spikeinterface/core/__init__.py
+++ b/src/spikeinterface/core/__init__.py
@@ -100,6 +100,8 @@
     TimeSeriesChunkExecutor,
     split_job_kwargs,
     fix_job_kwargs,
+    get_inner_pool,
+    thread_budget,
 )
 from .recording_tools import (
     write_binary_recording,
diff --git a/src/spikeinterface/core/baserecording.py b/src/spikeinterface/core/baserecording.py
index 0a9b26931b..8598be7bb3 100644
--- a/src/spikeinterface/core/baserecording.py
+++ b/src/spikeinterface/core/baserecording.py
@@ -303,6 +303,92 @@ def get_traces(
                 traces = traces.astype("float32", copy=False) * gains + offsets
         return traces
 
+    def get_traces_multi_thread(
+        self,
+        segment_index: int | None = None,
+        start_frame: int | None = None,
+        end_frame: int | None = None,
+        channel_ids: list | np.ndarray | tuple | None = None,
+        order: Literal["C", "F"] | None = None,
+        return_in_uV: bool = False,
+        max_threads: int | None = None,
+    ) -> np.ndarray:
+        """Like ``get_traces``, but the segment kernel may use up to
+        ``max_threads`` threads internally to compute its output.
+
+        Most segments fall through to the serial ``get_traces`` path; only
+        segments whose kernels benefit from intra-call parallelism (e.g.
+        ``FilterRecordingSegment``, ``CommonReferenceRecordingSegment``)
+        override ``BaseRecordingSegment.get_traces_multi_thread`` to actually
+        use the budget.
+
+        Parameters
+        ----------
+        max_threads : int or None, default: None
+            Inner thread budget for this single call.  ``None`` means
+            "look up ``max_threads_per_worker`` from the global job_kwargs."
+            ``<= 1`` falls back to plain ``get_traces``.
+
+            .. note::
+               The implicit ``None`` lookup is only safe in the **parent
+               process**. Inside a ``TimeSeriesChunkExecutor`` worker
+               (especially with ``mp_context="spawn"`` / ``"forkserver"`` or on
+               macOS / Windows defaults), the worker's globals do not reflect
+               the parent's ``set_global_job_kwargs(...)``. Chunk callbacks
+               that want intra-call parallelism inside CRE must pass
+               ``max_threads`` explicitly.
+
+        See ``get_traces`` for the other parameters.
+        """
+        if max_threads is None:
+            from .globals import get_global_job_kwargs
+
+            max_threads = int(get_global_job_kwargs().get("max_threads_per_worker", 1) or 1)
+
+        if max_threads <= 1:
+            return self.get_traces(
+                segment_index=segment_index,
+                start_frame=start_frame,
+                end_frame=end_frame,
+                channel_ids=channel_ids,
+                order=order,
+                return_in_uV=return_in_uV,
+            )
+
+        segment_index = self._check_segment_index(segment_index)
+        channel_indices = self.ids_to_indices(channel_ids, prefer_slice=True)
+        rs = self.segments[segment_index]
+        start_frame = int(start_frame) if start_frame is not None else 0
+        num_samples = rs.get_num_samples()
+        end_frame = int(min(end_frame, num_samples)) if end_frame is not None else num_samples
+        traces = rs.get_traces_multi_thread(
+            start_frame=start_frame,
+            end_frame=end_frame,
+            channel_indices=channel_indices,
+            max_threads=max_threads,
+        )
+
+        if order is not None:
+            assert order in ["C", "F"]
+            traces = np.asanyarray(traces, order=order)
+
+        if return_in_uV:
+            if not self.has_scaleable_traces():
+                if self._dtype.kind == "f":
+                    pass
+                else:
+                    raise ValueError(
+                        "This recording does not support return_in_uV=True (need gain_to_uV and offset_"
+                        "to_uV properties)"
+                    )
+            else:
+                gains = self.get_property("gain_to_uV")
+                offsets = self.get_property("offset_to_uV")
+                gains = gains[channel_indices].astype("float32", copy=False)
+                offsets = offsets[channel_indices].astype("float32", copy=False)
+                traces = traces.astype("float32", copy=False) * gains + offsets
+        return traces
+
     def get_data(self, start_frame: int, end_frame: int, segment_index: int | None = None, **kwargs) -> np.ndarray:
         """
         General retrieval function for time_series objects
@@ -673,6 +759,26 @@ def get_traces(
         # must be implemented in subclass
         raise NotImplementedError
 
+    def get_traces_multi_thread(
+        self,
+        start_frame: int | None = None,
+        end_frame: int | None = None,
+        channel_indices: list | np.ndarray | tuple | None = None,
+        max_threads: int = 1,
+    ) -> np.ndarray:
+        """Default: serial fall-through to ``get_traces``.
+
+        Override on segments whose kernels benefit from intra-call
+        parallelism (channel-block fan-out, time-block fan-out, numba
+        prange).  See ``core/job_tools.py:get_inner_pool`` and
+        ``thread_budget`` for the building blocks.
+        """
+        return self.get_traces(
+            start_frame=start_frame,
+            end_frame=end_frame,
+            channel_indices=channel_indices,
+        )
+
     def get_data(
         self, start_frame: int, end_frame: int, indices: list | np.ndarray | tuple | None = None
     ) -> np.ndarray:
diff --git a/src/spikeinterface/core/job_tools.py b/src/spikeinterface/core/job_tools.py
index a335feddd3..8e06f5d66a 100644
--- a/src/spikeinterface/core/job_tools.py
+++ b/src/spikeinterface/core/job_tools.py
@@ -11,8 +11,10 @@
 from tqdm.auto import tqdm
 
 from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
+from contextlib import ExitStack, contextmanager
 import multiprocessing
 import threading
+import weakref
 from threadpoolctl import threadpool_limits
 
 from spikeinterface.core.core_tools import convert_string_to_bytes, convert_bytes_to_str, convert_seconds_to_str
@@ -759,3 +761,137 @@ def get_poolexecutor(n_jobs):
         return MockPoolExecutor
     else:
         return ProcessPoolExecutor
+
+
+# ---------------------------------------------------------------------------
+# Intra-call thread fan-out utilities (used by ``get_traces_multi_thread``)
+#
+# These let a single ``get_traces`` call internally spend a thread budget
+# (``max_threads_per_worker`` from job_kwargs) without exposing per-class
+# init kwargs.  Each segment that benefits from intra-call parallelism
+# overrides ``BaseRecordingSegment.get_traces_multi_thread`` and picks
+# the mechanism it actually needs:
+#
+#   - explicit Python-thread fan-out  → ``get_inner_pool``
+#   - BLAS / OpenMP cap (matmuls)     → ``thread_budget(blas=True)``
+#   - numba ``prange`` parallelism    → ``thread_budget(numba=True)``
+#
+# All three compose, but most segments use only one.
+
+# Module-global per-caller-thread pool registry.  Keyed by
+# ``Thread → {max_threads → ThreadPoolExecutor}`` so that the same calling
+# thread reusing the same budget gets the same pool across calls and across
+# segments (a chained pipeline reuses one pool per (Thread, max_threads)
+# pair, not one per segment).
+#
+# Identity-stable: never re-bound, only ``.clear()``ed in the post-fork
+# guard, so callers that imported ``_inner_pools`` keep a valid reference.
+_inner_pools: "weakref.WeakKeyDictionary[threading.Thread, dict]" = weakref.WeakKeyDictionary()
+_inner_pools_lock = threading.Lock()
+_inner_pools_pid: int = os.getpid()
+
+
+def _shutdown_inner_pools(sized_dict):
+    """Finalizer for a thread's pool dict: shut down all its pools.
+
+    ``wait=False`` to avoid blocking the finalizer thread.  In-flight tasks
+    would be cancelled, but the owning thread submits + joins synchronously,
+    so no such tasks exist when it actually exits.
+    """
+    for pool in sized_dict.values():
+        pool.shutdown(wait=False)
+
+
+def get_inner_pool(max_threads: int) -> ThreadPoolExecutor | None:
+    """Per-caller-thread ``ThreadPoolExecutor`` of size ``max_threads``.
+
+    Same calling thread + same ``max_threads`` returns the same pool —
+    across calls, across segments.  Different calling threads get distinct
+    pools so concurrent outer workers never queue on a shared inner pool
+    (the pathology that otherwise dominates when CRE ``n_jobs`` exceeds the
+    inner pool size).
+
+    Returns ``None`` for ``max_threads <= 1`` so callers can keep a single
+    serial-fallback branch.
+
+    Pools are owned by the calling ``Thread`` (via ``WeakKeyDictionary``),
+    so when the thread is garbage-collected its pools are shut down
+    automatically.
+
+    A pid guard clears the registry after ``os.fork()``: in a forked child
+    the parent's ``ThreadPoolExecutor``s reference Thread objects whose OS
+    threads were not copied across, so submitting to them would deadlock.
+    Pickled (spawn / forkserver) workers come up with their own module-load
+    state and never see this.
+    """
+    if max_threads <= 1:
+        return None
+
+    global _inner_pools_pid
+    pid = os.getpid()
+    if _inner_pools_pid != pid:
+        with _inner_pools_lock:
+            if _inner_pools_pid != pid:
+                _inner_pools.clear()
+                _inner_pools_pid = pid
+
+    thread = threading.current_thread()
+    sized = _inner_pools.get(thread)
+    if sized is None:
+        with _inner_pools_lock:
+            sized = _inner_pools.get(thread)
+            if sized is None:
+                sized = {}
+                _inner_pools[thread] = sized
+                weakref.finalize(thread, _shutdown_inner_pools, sized)
+    pool = sized.get(max_threads)
+    if pool is None:
+        with _inner_pools_lock:
+            pool = sized.get(max_threads)
+            if pool is None:
+                pool = ThreadPoolExecutor(max_workers=max_threads)
+                sized[max_threads] = pool
+    return pool
+
+
+@contextmanager
+def thread_budget(max_threads: int, *, blas: bool = False, numba: bool = False):
+    """Cap underlying thread runtimes for the duration of the context.
+
+    Caller picks which mechanisms apply — the rest are left alone.  Compose
+    with ``get_inner_pool`` for explicit Python-thread fan-out (a separate
+    mechanism that doesn't need a context manager).
+
+    Parameters
+    ----------
+    max_threads : int
+        Per-mechanism thread cap.  ``<= 1`` is a no-op (still enters the
+        context but caps to 1, which is what ``threadpool_limits`` /
+        ``numba.set_num_threads`` do anyway).
+    blas : bool, default False
+        Apply ``threadpool_limits(limits=max_threads)`` — caps the C-level
+        thread pools used by BLAS (OpenBLAS / MKL / BLIS) and OpenMP
+        (libgomp / libomp).
+    numba : bool, default False
+        Apply ``numba.set_num_threads(max_threads)`` for the duration of the
+        scope.  Restored on exit.  Only meaningful for ``@njit(parallel=True)``
+        kernels using ``prange``; harmless otherwise.
+
+    Notes
+    -----
+    threadpoolctl can sometimes reach numba's threading layer (when numba
+    is configured to use OpenMP), but this is unreliable across
+    ``NUMBA_THREADING_LAYER`` choices.  Use ``numba=True`` explicitly when
+    a segment actually contains a numba parallel kernel — don't rely on
+    ``blas=True`` to reach it.
+    """
+    with ExitStack() as stack:
+        if blas:
+            stack.enter_context(threadpool_limits(limits=max_threads))
+        if numba:
+            import numba as _nb
+
+            prev = _nb.get_num_threads()
+            _nb.set_num_threads(max(1, max_threads))
+            stack.callback(_nb.set_num_threads, prev)
+        yield
diff --git a/src/spikeinterface/core/time_series_tools.py b/src/spikeinterface/core/time_series_tools.py
index 1c15daed21..b840ac36db 100644
--- a/src/spikeinterface/core/time_series_tools.py
+++ b/src/spikeinterface/core/time_series_tools.py
@@ -577,6 +577,7 @@ def get_chunk_with_margin(
     add_reflect_padding=False,
     window_on_margin=False,
     dtype=None,
+    max_threads: int = 1,
 ):
     """
     Helper to get chunk with margin
@@ -586,12 +587,33 @@ def get_chunk_with_margin(
     of `add_zeros` or `add_reflect_padding` is True. In the first
     case zero padding is used, in the second case np.pad is called
     with mod="reflect".
+
+    When ``max_threads > 1`` and the segment is a recording segment with a
+    ``get_traces_multi_thread`` override, the upstream fetch goes through
+    that parallel kernel so a chained pipeline (e.g. Filter → CMR) gets
+    end-to-end parallelism per call.  Snippets and other generic
+    ``TimeSeriesSegment`` subtypes always use ``get_data`` (serial).
     """
     length = int(chunkable_segment.get_num_samples())
 
     if last_dimension_indices is None:
         last_dimension_indices = slice(None)
 
+    # Local fetcher: branch on max_threads + recording-segment capability.
+    # Keeps ``get_data`` as a clean generic-TimeSeries API and pushes the
+    # "parallel if K>1" decision to the one call site that cares.
+    use_multi = max_threads > 1 and hasattr(chunkable_segment, "get_traces_multi_thread")
+
+    def _fetch(s0, s1):
+        if use_multi:
+            return chunkable_segment.get_traces_multi_thread(
+                start_frame=s0,
+                end_frame=s1,
+                channel_indices=last_dimension_indices,
+                max_threads=max_threads,
+            )
+        return chunkable_segment.get_data(s0, s1, last_dimension_indices)
+
     if not (add_zeros or add_reflect_padding):
         if window_on_margin and not add_zeros:
             raise ValueError("window_on_margin requires add_zeros=True")
@@ -612,11 +634,7 @@ def get_chunk_with_margin(
         else:
             right_margin = margin
 
-        data_chunk = chunkable_segment.get_data(
-            start_frame - left_margin,
-            end_frame + right_margin,
-            last_dimension_indices,
-        )
+        data_chunk = _fetch(start_frame - left_margin, end_frame + right_margin)
 
     else:
         # either add_zeros or reflect_padding
@@ -642,7 +660,7 @@ def get_chunk_with_margin(
             end_frame2 = end_frame + margin
             right_pad = 0
 
-        data_chunk = chunkable_segment.get_data(start_frame2, end_frame2, last_dimension_indices)
+        data_chunk = _fetch(start_frame2, end_frame2)
 
         if dtype is not None or window_on_margin or left_pad > 0 or right_pad > 0:
             need_copy = True
diff --git a/src/spikeinterface/preprocessing/common_reference.py b/src/spikeinterface/preprocessing/common_reference.py
index 5a3a9b0043..f343ed2e56 100644
--- a/src/spikeinterface/preprocessing/common_reference.py
+++ b/src/spikeinterface/preprocessing/common_reference.py
@@ -5,7 +5,7 @@
 from spikeinterface.core.core_tools import define_function_handling_dict_from_class
 
 from .basepreprocessor import BasePreprocessor, BasePreprocessorSegment
-from spikeinterface.core import get_closest_channels
+from spikeinterface.core import get_closest_channels, get_inner_pool
 from spikeinterface.core.baserecording import BaseRecording
 
 from .filter import fix_dtype
@@ -201,15 +201,103 @@ def __init__(
         self.operator = operator
         self.operator_func = np.mean if self.operator == "average" else np.median
 
-    def get_traces(self, start_frame, end_frame, channel_indices):
+    def _parallel_reduce_axis1(self, traces, max_threads):
+        """Apply ``operator_func(..., axis=1)`` optionally split across time blocks.
+
+        numpy's partition-based median and BLAS-backed mean release the GIL
+        during per-row work, so Python-thread parallelism delivers real
+        speedup.
+
+        Block-sizing strategy
+        ---------------------
+
+        Aim for many small chunks (typically ~1.5 MB each, sized around L2
+        per worker) rather than one big chunk per worker.  With N small
+        chunks dispatched to a fixed-size pool, all workers tend to be
+        processing rows in the same time region at any moment (FIFO
+        queue), so shared L3 absorbs the input data once instead of
+        ``max_threads`` independent streams competing for DRAM.
+
+        Empirically (524k × 384 fp32, 16 workers) this scheme is ~1.4×
+        faster than "one chunk per worker": measured 121 ms at block=1024
+        vs 167 ms at block=32768.  Diminishing returns past 16 chunks per
+        worker as dispatch overhead starts to compete with the cache win.
+
+        Workers write directly into a pre-allocated output array — see
+        FilterRecordingSegment._apply_sos for the same pattern.
+        """
+        pool = get_inner_pool(max_threads)
+        if pool is None:
+            return self.operator_func(traces, axis=1)
+        T = traces.shape[0]
+        C = traces.shape[1] if traces.ndim == 2 else 1
+        itemsize = traces.dtype.itemsize
+
+        # Target each chunk at ~1.5 MB so it fits comfortably in L2 on a
+        # typical core, with max_threads chunks active at once fitting in L3.
+        # Floor at 1024 rows so per-chunk dispatch overhead (~few µs) stays
+        # well below per-chunk compute (~hundreds of µs at C=384).
+        target_chunk_bytes = 1_500_000
+        block = max(1024, target_chunk_bytes // max(1, C * itemsize))
+
+        # Don't make the chunk count exceed what's useful: at very small T
+        # we want at least one chunk per worker, but no more than 64
+        # chunks/worker (more would just amortize less work per dispatch).
+        n_chunks = max(max_threads, (T + block - 1) // block)
+        n_chunks = min(n_chunks, max_threads * 64)
+        block = max(1, (T + n_chunks - 1) // n_chunks)
+
+        # Floor: if T is so small that each chunk would be tiny, shrink the
+        # effective worker count instead of paying dispatch overhead.
+        if block < 256:
+            effective = max(1, T // 256)
+            if effective == 1:
+                return self.operator_func(traces, axis=1)
+            block = (T + effective - 1) // effective
+
+        bounds = [(t0, min(t0 + block, T)) for t0 in range(0, T, block)]
+
+        # Probe dtype: median/mean of a 1×C row gives the same dtype as the
+        # full reduction.
+        out_dtype = self.operator_func(traces[:1, :], axis=1).dtype
+        out = np.empty(T, dtype=out_dtype)
+
+        def _work(t0, t1):
+            out[t0:t1] = self.operator_func(traces[t0:t1, :], axis=1)
+
+        futures = [pool.submit(_work, t0, t1) for t0, t1 in bounds]
+        for fut in futures:
+            fut.result()
+        return out
+
+    def _fetch_parent(self, start_frame, end_frame, max_threads):
+        """Fetch upstream traces, propagating max_threads when > 1.
+
+        Explicit branch keeps the serial path strictly serial — calling
+        ``get_traces`` directly when ``max_threads <= 1`` avoids any
+        traversal through ``get_traces_multi_thread`` and its routing.
+        """
+        if max_threads > 1:
+            return self.parent_recording_segment.get_traces_multi_thread(
+                start_frame=start_frame,
+                end_frame=end_frame,
+                channel_indices=slice(None),
+                max_threads=max_threads,
+            )
+        return self.parent_recording_segment.get_traces(
+            start_frame=start_frame, end_frame=end_frame, channel_indices=slice(None)
+        )
+
+    def _get_traces_impl(self, start_frame, end_frame, channel_indices, max_threads):
         # Let's do the case with group_indices equal None as that is easy
         if self.group_indices is None:
-            # We need all the channels to calculate the reference
-            traces = self.parent_recording_segment.get_traces(start_frame, end_frame, slice(None))
+            # We need all the channels to calculate the reference.
+            traces = self._fetch_parent(start_frame, end_frame, max_threads)
 
             if self.reference == "global":
                 if self.ref_channel_indices is None:
-                    shift = self.operator_func(traces, axis=1, keepdims=True)
+                    # Hot path: parallelizable global median/mean across all channels.
+                    shift = self._parallel_reduce_axis1(traces, max_threads)[:, np.newaxis]
                 else:
                     shift = self.operator_func(traces[:, self.ref_channel_indices], axis=1, keepdims=True)
                 re_referenced_traces = traces[:, channel_indices] - shift
@@ -233,8 +321,7 @@ def get_traces(self, start_frame, end_frame, channel_indices):
 
         # Then the old implementation for backwards compatibility that supports grouping
         else:
-            # need input trace
-            traces = self.parent_recording_segment.get_traces(start_frame, end_frame, slice(None))
+            traces = self._fetch_parent(start_frame, end_frame, max_threads)
 
             sliced_channel_indices = np.arange(traces.shape[1])
             if channel_indices is not None:
@@ -257,6 +344,12 @@ def get_traces(self, start_frame, end_frame, channel_indices):
 
             return re_referenced_traces.astype(self.dtype, copy=False)
 
+    def get_traces(self, start_frame=None, end_frame=None, channel_indices=None):
+        return self._get_traces_impl(start_frame, end_frame, channel_indices, max_threads=1)
+
+    def get_traces_multi_thread(self, start_frame=None, end_frame=None, channel_indices=None, max_threads=1):
+        return self._get_traces_impl(start_frame, end_frame, channel_indices, max_threads=max_threads)
+
     def slice_groups(self, channel_indices):
         """
         Slice the channel indices into groups. This is used to apply the common reference to groups of channels.
diff --git a/src/spikeinterface/preprocessing/filter.py b/src/spikeinterface/preprocessing/filter.py
index b4ceed886e..5fe8a1979f 100644
--- a/src/spikeinterface/preprocessing/filter.py
+++ b/src/spikeinterface/preprocessing/filter.py
@@ -8,6 +8,7 @@
     ensure_chunk_size,
     get_global_job_kwargs,
     is_set_global_job_kwargs_set,
+    get_inner_pool,
 )
 
 from .basepreprocessor import BasePreprocessor, BasePreprocessorSegment
@@ -177,7 +178,76 @@ def __init__(
         self.add_reflect_padding = add_reflect_padding
         self.dtype = dtype
 
-    def get_traces(self, start_frame, end_frame, channel_indices):
+    def _apply_sos(self, fn, traces, max_threads, axis=0):
+        """Apply a scipy SOS function across channel blocks, optionally parallel.
+
+        Each channel is independent of every other, so splitting the channel
+        axis across threads is a safe parallelization.  scipy's C
+        implementations of ``sosfiltfilt`` / ``sosfilt`` release the GIL during
+        per-column work, so Python-thread parallelism delivers real speedup
+        (measured ~3× on 8 threads for a 1M × 384 float32 chunk).
+
+        ``max_threads <= 1`` or too few channels falls back to a single serial
+        call.  Workers write directly into a pre-allocated output to avoid the
+        per-block tuple return + post-loop copy.
+        """
+        pool = get_inner_pool(max_threads)
+        if pool is None:
+            return fn(self.coeff, traces, axis=axis)
+        C = traces.shape[1]
+        if C < 2 * max_threads:
+            return fn(self.coeff, traces, axis=axis)
+
+        block = (C + max_threads - 1) // max_threads
+        bounds = [(c0, min(c0 + block, C)) for c0 in range(0, C, block)]
+
+        # Probe the output dtype on a tiny slice (longer than scipy's internal
+        # padlen of 6 * len(sos)) so we can pre-allocate.  Cost: microseconds.
+        probe_len = max(64, 6 * self.coeff.shape[0] + 1)
+        out_dtype = fn(self.coeff, traces[:probe_len, :1], axis=axis).dtype
+        out = np.empty((traces.shape[0], C), dtype=out_dtype)
+
+        def _work(c0, c1):
+            out[:, c0:c1] = fn(self.coeff, traces[:, c0:c1], axis=axis)
+
+        futures = [pool.submit(_work, c0, c1) for c0, c1 in bounds]
+        for fut in futures:
+            fut.result()
+        return out
+
+    def _filter(self, traces_chunk, max_threads):
+        """Run the configured filter on a margin-included chunk.
+
+        Factored out so ``get_traces`` (serial) and ``get_traces_multi_thread``
+        share a single body and only differ by the ``max_threads`` argument.
+        """
+        import scipy.signal
+
+        if self.direction == "forward-backward":
+            if self.filter_mode == "sos":
+                return self._apply_sos(scipy.signal.sosfiltfilt, traces_chunk, max_threads, axis=0)
+            elif self.filter_mode == "ba":
+                b, a = self.coeff
+                return scipy.signal.filtfilt(b, a, traces_chunk, axis=0)
+
+        # forward / backward only
+        if self.direction == "backward":
+            traces_chunk = np.flip(traces_chunk, axis=0)
+
+        if self.filter_mode == "sos":
+            filtered = self._apply_sos(scipy.signal.sosfilt, traces_chunk, max_threads, axis=0)
+        elif self.filter_mode == "ba":
+            b, a = self.coeff
+            filtered = scipy.signal.lfilter(b, a, traces_chunk, axis=0)
+
+        if self.direction == "backward":
+            filtered = np.flip(filtered, axis=0)
+
+        return filtered
+
+    def _get_traces_impl(self, start_frame, end_frame, channel_indices, max_threads):
+        # Propagate max_threads upstream so a chained Filter→Filter (or any
+        # parallel-capable parent) fans out under the same thread budget.
         traces_chunk, left_margin, right_margin = get_chunk_with_margin(
             self.parent_recording_segment,
             start_frame,
@@ -185,33 +255,14 @@ def get_traces(self, start_frame, end_frame, channel_indices):
             channel_indices,
             self.margin,
             add_reflect_padding=self.add_reflect_padding,
+            max_threads=max_threads,
         )
 
-        traces_dtype = traces_chunk.dtype
         # if uint --> force int
-        if traces_dtype.kind == "u":
+        if traces_chunk.dtype.kind == "u":
             traces_chunk = traces_chunk.astype("float32")
 
-        import scipy.signal
-
-        if self.direction == "forward-backward":
-            if self.filter_mode == "sos":
-                filtered_traces = scipy.signal.sosfiltfilt(self.coeff, traces_chunk, axis=0)
-            elif self.filter_mode == "ba":
-                b, a = self.coeff
-                filtered_traces = scipy.signal.filtfilt(b, a, traces_chunk, axis=0)
-        else:
-            if self.direction == "backward":
-                traces_chunk = np.flip(traces_chunk, axis=0)
-
-            if self.filter_mode == "sos":
-                filtered_traces = scipy.signal.sosfilt(self.coeff, traces_chunk, axis=0)
-            elif self.filter_mode == "ba":
-                b, a = self.coeff
-                filtered_traces = scipy.signal.lfilter(b, a, traces_chunk, axis=0)
-
-            if self.direction == "backward":
-                filtered_traces = np.flip(filtered_traces, axis=0)
+        filtered_traces = self._filter(traces_chunk, max_threads)
 
         if right_margin > 0:
             filtered_traces = filtered_traces[left_margin:-right_margin, :]
@@ -223,6 +274,12 @@ def get_traces(self, start_frame, end_frame, channel_indices):
 
         return filtered_traces.astype(self.dtype)
 
+    def get_traces(self, start_frame=None, end_frame=None, channel_indices=None):
+        return self._get_traces_impl(start_frame, end_frame, channel_indices, max_threads=1)
+
+    def get_traces_multi_thread(self, start_frame=None, end_frame=None, channel_indices=None, max_threads=1):
+        return self._get_traces_impl(start_frame, end_frame, channel_indices, max_threads=max_threads)
+
 
 class BandpassFilterRecording(FilterRecording):
     """
diff --git a/src/spikeinterface/preprocessing/tests/test_common_reference.py b/src/spikeinterface/preprocessing/tests/test_common_reference.py
index e19cad59ba..dc500f5a47 100644
--- a/src/spikeinterface/preprocessing/tests/test_common_reference.py
+++ b/src/spikeinterface/preprocessing/tests/test_common_reference.py
@@ -209,5 +209,37 @@ def test_local_car_vs_cmr_performance():
     assert car_time < cmr_time
 
 
+def test_cmr_parallel_median_matches_stock():
+    """``get_traces_multi_thread`` must produce bit-identical median output."""
+    from spikeinterface import NumpyRecording
+
+    rng = np.random.default_rng(0)
+    T, C = 60_000, 64
+    traces = (rng.standard_normal((T, C)) * 100).astype("float32")
+    rec = NumpyRecording([traces], sampling_frequency=30_000.0)
+    cmr = common_reference(rec, reference="global", operator="median")
+    ref = cmr.get_traces(start_frame=5_000, end_frame=T - 5_000)
+    out = cmr.get_traces_multi_thread(start_frame=5_000, end_frame=T - 5_000, max_threads=8)
+    np.testing.assert_array_equal(out, ref)
+
+
+def test_cmr_parallel_average_matches_stock():
+    """Same invariant for the mean (CAR) operator; tolerate float rounding."""
+    from spikeinterface import NumpyRecording
+
+    rng = np.random.default_rng(0)
+    T, C = 60_000, 64
+    traces = (rng.standard_normal((T, C)) * 100).astype("float32")
+    rec = NumpyRecording([traces], sampling_frequency=30_000.0)
+    cmr = common_reference(rec, reference="global", operator="average")
+    ref = cmr.get_traces(start_frame=5_000, end_frame=T - 5_000)
+    out = cmr.get_traces_multi_thread(start_frame=5_000, end_frame=T - 5_000, max_threads=8)
+    # Mean across different block partitions can differ by 1 ULP due to
+    # non-associative float summation.
+    np.testing.assert_allclose(out, ref, rtol=1e-5, atol=1e-4)
+
+
 if __name__ == "__main__":
     test_local_car_vs_cmr_performance()
+    test_cmr_parallel_median_matches_stock()
+    test_cmr_parallel_average_matches_stock()
diff --git a/src/spikeinterface/preprocessing/tests/test_filter.py b/src/spikeinterface/preprocessing/tests/test_filter.py
index e95b456542..0e67a47897 100644
--- a/src/spikeinterface/preprocessing/tests/test_filter.py
+++ b/src/spikeinterface/preprocessing/tests/test_filter.py
@@ -220,5 +220,36 @@ def test_filter_opencl():
     # plt.show()
 
 
+def test_bandpass_parallel_matches_stock():
+    """``get_traces_multi_thread(max_threads=N)`` must match ``get_traces``.
+
+    Locks in the invariant that channel-axis parallelism is a pure perf
+    optimisation — scipy's sosfiltfilt is channel-independent so splitting
+    the channel axis across threads cannot change per-channel output.
+    """
+    rng = np.random.default_rng(0)
+    T, C = 60_000, 64
+    traces = (rng.standard_normal((T, C)) * 100).astype("float32")
+    rec = NumpyRecording([traces], sampling_frequency=30_000.0)
+    bp = bandpass_filter(rec, freq_min=300.0, freq_max=5000.0, dtype="float32")
+    ref = bp.get_traces(start_frame=5_000, end_frame=T - 5_000)
+    out = bp.get_traces_multi_thread(start_frame=5_000, end_frame=T - 5_000, max_threads=8)
+    np.testing.assert_allclose(out, ref, rtol=1e-5, atol=1e-4)
+
+
+def test_filter_parallel_fewer_channels_than_workers():
+    """``max_threads > C`` must still produce correct output (falls through to serial)."""
+    rng = np.random.default_rng(0)
+    T, C = 10_000, 4
+    traces = (rng.standard_normal((T, C)) * 100).astype("float32")
+    rec = NumpyRecording([traces], sampling_frequency=30_000.0)
+    bp = bandpass_filter(rec, freq_min=300.0, freq_max=5000.0, dtype="float32")
+    ref = bp.get_traces(start_frame=1000, end_frame=T - 1000)
+    out = bp.get_traces_multi_thread(start_frame=1000, end_frame=T - 1000, max_threads=16)
+    np.testing.assert_allclose(out, ref, rtol=1e-5, atol=1e-4)
+
+
 if __name__ == "__main__":
     test_filter()
+    test_bandpass_parallel_matches_stock()
+    test_filter_parallel_fewer_channels_than_workers()
diff --git a/src/spikeinterface/preprocessing/tests/test_parallel_pool_semantics.py b/src/spikeinterface/preprocessing/tests/test_parallel_pool_semantics.py
new file mode 100644
index 0000000000..ab02ab7899
--- /dev/null
+++ b/src/spikeinterface/preprocessing/tests/test_parallel_pool_semantics.py
@@ -0,0 +1,281 @@
+"""Tests for the per-caller-thread pool semantics used by
+``BaseRecording.get_traces_multi_thread`` (FilterRecording, CommonReferenceRecording).
+
+Contract: each outer thread that calls ``get_traces_multi_thread`` gets its own
+inner ``ThreadPoolExecutor`` (keyed in a module-global registry by
+``(Thread, max_threads)``).  Keying by Thread avoids the shared-pool queueing
+pathology that arises when many outer workers submit concurrently into a
+single inner pool with fewer max_workers than outer callers.
+
+The pool registry lives in ``core/job_tools._inner_pools`` rather than on each
+segment, so a chained pipeline reuses one pool per ``(Thread, max_threads)``
+pair across segments.
+"""
+
+from __future__ import annotations
+
+import multiprocessing as mp
+import os
+import sys
+import threading
+
+import numpy as np
+import pytest
+
+from spikeinterface import NumpyRecording
+from spikeinterface.preprocessing import (
+    BandpassFilterRecording,
+    CommonReferenceRecording,
+)
+from spikeinterface.core.job_tools import _inner_pools, get_inner_pool
+
+
+def _make_recording(T: int = 50_000, C: int = 64, fs: float = 30_000.0):
+    rng = np.random.default_rng(0)
+    traces = rng.standard_normal((T, C)).astype(np.float32) * 100.0
+    return NumpyRecording([traces], sampling_frequency=fs)
+
+
+@pytest.fixture
+def filter_rec():
+    return BandpassFilterRecording(_make_recording(), freq_min=300.0, freq_max=6000.0)
+
+
+@pytest.fixture
+def cmr_rec():
+    return CommonReferenceRecording(_make_recording(), operator="median", reference="global")
+
+
+def _pool_for_current_thread(max_threads: int):
+    sized = _inner_pools.get(threading.current_thread())
+    if sized is None:
+        return None
+    return sized.get(max_threads)
+
+
+class TestPerCallerThreadPool:
+    """Verify each calling thread gets its own inner pool, keyed by max_threads."""
+
+    @pytest.mark.parametrize("rec_fixture", ["filter_rec", "cmr_rec"])
+    def test_single_caller_reuses_pool(self, rec_fixture, request):
+        """Repeated calls from the same thread reuse the same inner pool."""
+        rec = request.getfixturevalue(rec_fixture)
+        rec.get_traces_multi_thread(start_frame=0, end_frame=50_000, max_threads=4)
+        pool_a = _pool_for_current_thread(4)
+        rec.get_traces_multi_thread(start_frame=0, end_frame=50_000, max_threads=4)
+        pool_b = _pool_for_current_thread(4)
+        assert pool_a is not None
+        assert pool_a is pool_b, "expected the same inner pool to be reused across calls from the same thread"
+
+    @pytest.mark.parametrize("rec_fixture", ["filter_rec", "cmr_rec"])
+    def test_concurrent_callers_get_distinct_pools(self, rec_fixture, request):
+        """Two outer threads calling get_traces_multi_thread concurrently must
+        receive different inner pools — not a shared one that would queue their
+        tasks through a single bottleneck.
+        """
+        rec = request.getfixturevalue(rec_fixture)
+
+        ready = threading.Barrier(2)
+        captured = {}
+
+        def worker(name):
+            ready.wait()
+            rec.get_traces_multi_thread(start_frame=0, end_frame=50_000, max_threads=4)
+            captured[name] = _pool_for_current_thread(4)
+
+        t1 = threading.Thread(target=worker, args=("t1",))
+        t2 = threading.Thread(target=worker, args=("t2",))
+        t1.start()
+        t2.start()
+        t1.join()
+        t2.join()
+
+        assert captured["t1"] is not None
+        assert captured["t2"] is not None
+        assert captured["t1"] is not captured["t2"], (
+            "expected distinct inner pools for concurrent callers; a shared "
+            "single-pool design would cause queueing pathology"
+        )
+
+    def test_distinct_max_threads_get_distinct_pools(self):
+        """Same caller, different max_threads => different pools.
+
+        get_inner_pool is keyed by (Thread, max_threads) so a viewer that
+        flips between budgets gets a fresh pool of the right size each time
+        rather than sharing one undersized pool.
+        """
+        pool_a = get_inner_pool(2)
+        pool_b = get_inner_pool(8)
+        assert pool_a is not None
+        assert pool_b is not None
+        assert pool_a is not pool_b
+        # repeated lookups of the same size return the same pool
+        assert get_inner_pool(2) is pool_a
+        assert get_inner_pool(8) is pool_b
+
+    def test_single_thread_max_threads_is_passthrough(self):
+        """max_threads <= 1 returns None — no pool is ever created."""
+        assert get_inner_pool(1) is None
+        assert get_inner_pool(0) is None
+
+
+class TestChainPropagation:
+    """Verify max_threads propagates through chained preprocessor segments.
+
+    The contract: calling ``cmr.get_traces_multi_thread(max_threads=K)`` on
+    a ``BP → CMR`` chain must invoke BP's parallel kernel with ``K`` threads
+    too — not just CMR's.  Inside one such call the chain runs sequentially
+    (BP completes before CMR starts), so peak in-flight is K threads, but
+    each stage gets the budget when it's its turn.
+    """
+
+    def test_chain_bp_cmr_matches_serial(self):
+        """Bit-equivalence (within float tolerance) of serial vs parallel chain."""
+        rec = _make_recording(T=60_000, C=64)
+        bp = BandpassFilterRecording(rec, freq_min=300.0, freq_max=6000.0, dtype="float32")
+        cmr = CommonReferenceRecording(bp, reference="global", operator="median")
+        ref = cmr.get_traces(start_frame=5_000, end_frame=55_000)
+        out = cmr.get_traces_multi_thread(start_frame=5_000, end_frame=55_000, max_threads=8)
+        # CMR median is bit-identical regardless of block partition; BP SOS
+        # split is also bit-identical per channel.  Both stages parallel ⇒
+        # bit-identical to fully-serial chain.
+        np.testing.assert_array_equal(out, ref)
+
+    def test_chain_bp_car_within_tolerance(self):
+        """CAR (mean) is non-associative across blocks ⇒ tolerance-equivalent."""
+        rec = _make_recording(T=60_000, C=64)
+        bp = BandpassFilterRecording(rec, freq_min=300.0, freq_max=6000.0, dtype="float32")
+        car = CommonReferenceRecording(bp, reference="global", operator="average")
+        ref = car.get_traces(start_frame=5_000, end_frame=55_000)
+        out = car.get_traces_multi_thread(start_frame=5_000, end_frame=55_000, max_threads=8)
+        # Mean across block partitions can differ by ~1 ULP from single-pass.
+        np.testing.assert_allclose(out, ref, rtol=1e-5, atol=1e-4)
+
+    def test_chain_bp_invokes_parallel_kernel(self):
+        """The upstream BP segment's get_traces_multi_thread must actually fire.
+
+        We monkey-patch the BP segment to count get_traces vs
+        get_traces_multi_thread invocations during a chained call.
+        """
+        rec = _make_recording(T=60_000, C=64)
+        bp = BandpassFilterRecording(rec, freq_min=300.0, freq_max=6000.0, dtype="float32")
+        cmr = CommonReferenceRecording(bp, reference="global", operator="median")
+
+        bp_seg = bp._recording_segments[0]
+        counts = {"get_traces": 0, "get_traces_multi_thread": 0}
+        original_get_traces = bp_seg.get_traces
+        original_multi = bp_seg.get_traces_multi_thread
+
+        def counting_get_traces(*args, **kwargs):
+            counts["get_traces"] += 1
+            return original_get_traces(*args, **kwargs)
+
+        def counting_multi(*args, **kwargs):
+            counts["get_traces_multi_thread"] += 1
+            return original_multi(*args, **kwargs)
+
+        bp_seg.get_traces = counting_get_traces
+        bp_seg.get_traces_multi_thread = counting_multi
+
+        cmr.get_traces_multi_thread(start_frame=5_000, end_frame=55_000, max_threads=8)
+        # Chain propagation must route upstream via get_traces_multi_thread.
+        assert (
+            counts["get_traces_multi_thread"] >= 1
+        ), f"expected BP.get_traces_multi_thread to fire under chain propagation; counts={counts}"
+
+    def test_chain_serial_path_bypasses_multi(self):
+        """``cmr.get_traces()`` (not multi_thread) must NOT fire the parallel kernel.
+
+        Symmetric guard: the serial path stays serial all the way down.
+        """
+        rec = _make_recording(T=60_000, C=64)
+        bp = BandpassFilterRecording(rec, freq_min=300.0, freq_max=6000.0, dtype="float32")
+        cmr = CommonReferenceRecording(bp, reference="global", operator="median")
+
+        bp_seg = bp._recording_segments[0]
+        counts = {"get_traces": 0, "get_traces_multi_thread": 0}
+        original_get_traces = bp_seg.get_traces
+        original_multi = bp_seg.get_traces_multi_thread
+
+        def counting_get_traces(*args, **kwargs):
+            counts["get_traces"] += 1
+            return original_get_traces(*args, **kwargs)
+
+        def counting_multi(*args, **kwargs):
+            counts["get_traces_multi_thread"] += 1
+            return original_multi(*args, **kwargs)
+
+        bp_seg.get_traces = counting_get_traces
+        bp_seg.get_traces_multi_thread = counting_multi
+
+        cmr.get_traces(start_frame=5_000, end_frame=55_000)
+        assert counts["get_traces_multi_thread"] == 0, f"serial path leaked into multi_thread; counts={counts}"
+        assert counts["get_traces"] >= 1, f"BP.get_traces should have fired; counts={counts}"
+
+
+# --- Post-fork pid-guard regression test --------------------------------------
+#
+# The pid guard in get_inner_pool detects when the calling process has
+# changed (i.e. after os.fork()) and rebuilds the registry so we don't
+# inherit the parent's ThreadPoolExecutors — whose worker OS threads were not
+# copied across fork() and would deadlock on the child's first submit().
+
+
+def _child_uses_inherited_recording(rec, queue):
+    """Child entry point: exercise the parent-inherited recording.
+
+    Under fork, the parent's ``_inner_pools`` registry is copied via fork's
+    COW.  Without the pid guard in ``get_inner_pool``, the child's first
+    ``submit()`` blocks because the worker threads of the inherited
+    ``ThreadPoolExecutor`` don't exist in this process.
+    """
+    try:
+        rec.get_traces_multi_thread(start_frame=0, end_frame=50_000, max_threads=4)
+        queue.put("ok")
+    except Exception as e:  # pragma: no cover — failure path
+        queue.put(f"error: {type(e).__name__}: {e}")
+
+
+@pytest.mark.skipif(sys.platform == "win32", reason="fork is POSIX-only")
+@pytest.mark.parametrize(
+    "builder",
+    [
+        lambda: BandpassFilterRecording(_make_recording(), freq_min=300.0, freq_max=6000.0),
+        lambda: CommonReferenceRecording(_make_recording(), operator="median", reference="global"),
+    ],
+    ids=["filter", "cmr"],
+)
+def test_pool_recovers_after_fork(builder):
+    """After fork, the child must rebuild its inner pool rather than reuse the
+    parent's stale one — so ``get_traces_multi_thread`` completes promptly.
+
+    Trigger: the parent pre-warms the pool *before* fork.  Without the pid
+    guard in ``get_inner_pool``, the child's first ``submit()`` deadlocks on
+    the inherited pool's queue because the parent's worker OS threads were
+    not copied across ``fork()``.
+    """
+    rec = builder()
+    rec.get_traces_multi_thread(start_frame=0, end_frame=50_000, max_threads=4)
+    parent_pid = os.getpid()
+    parent_pool = _pool_for_current_thread(4)
+    assert parent_pool is not None, "fixture failed to pre-warm the parent pool"
+
+    ctx = mp.get_context("fork")
+    queue = ctx.Queue()
+    proc = ctx.Process(target=_child_uses_inherited_recording, args=(rec, queue))
+    proc.start()
+    proc.join(timeout=30)
+    if proc.is_alive():
+        proc.terminate()
+        proc.join()
+        pytest.fail(
+            "child get_traces_multi_thread() deadlocked after fork: pid guard "
+            "in get_inner_pool is missing or broken (parent pre-warmed the pool before fork)"
+        )
+    result = queue.get_nowait()
+    assert result == "ok", f"child failed: {result}"
+    assert proc.exitcode == 0, f"child exited non-zero: {proc.exitcode}"
+
+    # Parent's pool is unchanged after the child runs.
+    assert os.getpid() == parent_pid
+    assert _pool_for_current_thread(4) is parent_pool