From ce396ec1fdba7f93cf725c36c1f8dd99de7cfb11 Mon Sep 17 00:00:00 2001
From: Daniel Kantor <git@daniel-kantor.com>
Date: Sun, 1 Feb 2026 22:58:35 +0100
Subject: [PATCH 1/6] perf: optimize sampling for grep mode

---
 src/ingest/formats/json/mod.rs |  11 ++-
 src/ingest/formats/text/mod.rs |   4 +-
 src/ingest/formats/yaml/mod.rs |   2 +-
 src/ingest/mod.rs              |  45 ++++++++-
 src/ingest/sampling/mod.rs     | 164 ++++++++++++++++++++++++++++++---
 src/lib.rs                     |   6 +-
 6 files changed, 204 insertions(+), 28 deletions(-)
diff --git a/src/ingest/formats/json/mod.rs b/src/ingest/formats/json/mod.rs
index 70b4fbaa..90bfbb90 100644
--- a/src/ingest/formats/json/mod.rs
+++ b/src/ingest/formats/json/mod.rs
@@ -67,7 +67,7 @@ pub(crate) fn build_json_tree_arena_from_many(
 }
 
 /// Collect (byte_start, 1-based line number) for every non-empty line.
-fn jsonl_line_offsets(text: &str) -> Vec<(usize, usize)> {
+pub fn jsonl_line_offsets(text: &str) -> Vec<(usize, usize)> {
     let mut offsets = Vec::new();
     let mut pos = 0usize;
     for (line_idx, raw_line) in text.split('\n').enumerate() {
@@ -88,10 +88,13 @@ fn jsonl_line_offsets(text: &str) -> Vec<(usize, usize)> {
 ///
 /// Lines are sampled using the same strategy as JSON arrays (controlled by
 /// `PriorityConfig::array_max_items` and `array_sampler`), so only a subset
-/// of lines is actually parsed for large inputs.
+/// of lines is actually parsed for large inputs. When a `must_include`
+/// predicate is provided, matching lines are always kept regardless of the
+/// sampling cap.
 pub fn parse_jsonl_one(
     bytes: &[u8],
     cfg: &PriorityConfig,
+    must_include: impl Fn(usize) -> bool,
 ) -> Result<TreeArena> {
     use crate::ingest::sampling::{ArraySamplerKind, choose_indices};
 
@@ -102,7 +105,7 @@ pub fn parse_jsonl_one(
     let total = line_offsets.len();
     let sampler_kind: ArraySamplerKind = cfg.array_sampler.into();
     let kept_indices =
-        choose_indices(sampler_kind, total, cfg.array_max_items);
+        choose_indices(sampler_kind, total, cfg.array_max_items, must_include);
 
     let builder = JsonTreeBuilder::new(cfg.array_max_items, sampler_kind);
     let root_id = builder.push_default();
@@ -143,7 +146,7 @@ pub(crate) fn build_jsonl_tree_arena_from_slice(
     bytes: &[u8],
     cfg: &PriorityConfig,
 ) -> Result<TreeArena> {
-    parse_jsonl_one(bytes, cfg)
+    parse_jsonl_one(bytes, cfg, |_| false)
 }
 
 /// Convenience functions for the JSON ingest path.
diff --git a/src/ingest/formats/text/mod.rs b/src/ingest/formats/text/mod.rs
index 175000c8..aa76897b 100644
--- a/src/ingest/formats/text/mod.rs
+++ b/src/ingest/formats/text/mod.rs
@@ -129,7 +129,7 @@ impl TextArenaBuilder {
             n.arr_indices_len = 0;
             return id;
         }
-        let idxs = choose_indices(self.sampler, total, self.array_cap);
+        let idxs = choose_indices(self.sampler, total, self.array_cap, |_| false);
         let kept = idxs.len().min(self.array_cap);
         let children_start = self.arena.children.len();
         for &orig_index in idxs.iter().take(kept) {
@@ -192,7 +192,7 @@ impl TextArenaBuilder {
         lines: &[String],
         total: usize,
     ) {
-        let idxs = choose_indices(self.sampler, total, self.array_cap);
+        let idxs = choose_indices(self.sampler, total, self.array_cap, |_| false);
         let kept = idxs.len().min(self.array_cap);
         let children_start = self.arena.children.len();
         let mut pushed = 0usize;
diff --git a/src/ingest/formats/yaml/mod.rs b/src/ingest/formats/yaml/mod.rs
index a6bbbb9e..d8b9d936 100644
--- a/src/ingest/formats/yaml/mod.rs
+++ b/src/ingest/formats/yaml/mod.rs
@@ -159,7 +159,7 @@ impl YamlArenaBuilder {
         match y {
             Yaml::Array(v) => {
                 let total = v.len();
-                let idxs = choose_indices(self.sampler, total, self.array_cap);
+                let idxs = choose_indices(self.sampler, total, self.array_cap, |_| false);
                 let mut child_ids = Vec::with_capacity(idxs.len());
                 for i in &idxs {
                     if let Some(item) = v.get(*i) {
diff --git a/src/ingest/mod.rs b/src/ingest/mod.rs
index fb9fec26..c36f67c1 100644
--- a/src/ingest/mod.rs
+++ b/src/ingest/mod.rs
@@ -3,6 +3,7 @@ use anyhow::Result;
 use crate::order::PriorityConfig;
 use crate::utils::tree_arena::JsonTreeArena as TreeArena;
 
+use crate::grep::GrepConfig;
 use crate::InputKind;
 
 pub mod fileset;
@@ -25,10 +26,43 @@ pub(crate) struct IngestOutput {
     pub warnings: Vec<String>,
 }
 
+/// Build a predicate that returns true for JSONL line indices matching the
+/// strong grep pattern. When no grep is active, returns a no-op.
+///
+/// Uses a single regex scan over the entire text and maps match positions
+/// back to line indices, avoiding per-line regex overhead.
+fn jsonl_grep_predicate(
+    bytes: &[u8],
+    grep: &GrepConfig,
+) -> Box<dyn Fn(usize) -> bool> {
+    let Some(re) = grep.patterns.strong() else {
+        return Box::new(|_| false);
+    };
+    let Ok(text) = std::str::from_utf8(bytes) else {
+        return Box::new(|_| false);
+    };
+    let offsets = formats::json::jsonl_line_offsets(text);
+    if offsets.is_empty() {
+        return Box::new(|_| false);
+    }
+    // Single regex pass: find all match positions and map to line indices.
+    let mut matching = vec![false; offsets.len()];
+    for m in re.find_iter(text) {
+        let pos = m.start();
+        // Binary search for the line containing this byte position.
+        let idx = offsets.partition_point(|&(start, _)| start <= pos);
+        if idx > 0 {
+            matching[idx - 1] = true;
+        }
+    }
+    Box::new(move |i: usize| matching.get(i).copied().unwrap_or(false))
+}
+
 /// Dispatch the appropriate ingest path for any supported input kind.
 pub(crate) fn ingest_into_arena(
     input: InputKind,
     priority_cfg: &PriorityConfig,
+    grep: &GrepConfig,
 ) -> Result<IngestOutput> {
     match input {
         InputKind::Json(bytes) => {
@@ -38,10 +72,12 @@ pub(crate) fn ingest_into_arena(
             })
         }
         InputKind::Jsonl(bytes) => {
-            parse_jsonl_one(&bytes, priority_cfg).map(|arena| IngestOutput {
-                arena,
-                warnings: Vec::new(),
-            })
+            let must_include = jsonl_grep_predicate(&bytes, grep);
+            parse_jsonl_one(&bytes, priority_cfg, |i| must_include(i))
+                .map(|arena| IngestOutput {
+                    arena,
+                    warnings: Vec::new(),
+                })
         }
         InputKind::Yaml(bytes) => {
             parse_yaml_one(&bytes, priority_cfg).map(|arena| IngestOutput {
@@ -118,6 +154,7 @@ mod tests {
         let IngestOutput { arena, warnings } = ingest_into_arena(
             InputKind::Fileset(inputs),
             &PriorityConfig::new(usize::MAX, usize::MAX),
+            &GrepConfig::default(),
         )
         .unwrap();
         assert!(arena.is_fileset, "fileset input should mark arena");
diff --git a/src/ingest/sampling/mod.rs b/src/ingest/sampling/mod.rs
index 8e22e07c..8cdb0411 100644
--- a/src/ingest/sampling/mod.rs
+++ b/src/ingest/sampling/mod.rs
@@ -47,13 +47,18 @@ fn accept_index(i: u64) -> bool {
 }
 
 /// Choose indices using the default policy (keep-first, greedy, random accept).
+/// Items for which `must_include(i)` returns true are always kept.
 #[allow(
     clippy::cognitive_complexity,
     reason = "Single function mirrors JSON streaming sampler phases"
 )]
-pub fn choose_indices_default(total: usize, cap: usize) -> Vec<usize> {
+pub fn choose_indices_default(
+    total: usize,
+    cap: usize,
+    must_include: impl Fn(usize) -> bool,
+) -> Vec<usize> {
     if cap == 0 || total == 0 {
-        return Vec::new();
+        return collect_required(total, cap, &must_include);
     }
     if cap >= total {
         return (0..total).collect();
@@ -66,7 +71,7 @@ pub fn choose_indices_default(total: usize, cap: usize) -> Vec<usize> {
     }
     if out.len() >= cap || out.len() >= total {
         out.truncate(cap.min(total));
-        return out;
+        return merge_required(out, total, cap, &must_include);
     }
     // Greedy phase: take a portion of remaining capacity linearly
     let mut idx = keep_first;
@@ -79,7 +84,7 @@ pub fn choose_indices_default(total: usize, cap: usize) -> Vec<usize> {
         g += 1;
     }
     if out.len() >= cap || idx >= total {
-        return out;
+        return merge_required(out, total, cap, &must_include);
     }
     // Random phase: use accept_index on logical index to thin remaining
     while out.len() < cap && idx < total {
@@ -88,38 +93,123 @@ pub fn choose_indices_default(total: usize, cap: usize) -> Vec<usize> {
         }
         idx += 1;
     }
-    out
+    merge_required(out, total, cap, &must_include)
 }
 
 /// Choose head prefix indices.
-pub fn choose_indices_head(total: usize, cap: usize) -> Vec<usize> {
+/// Items for which `must_include(i)` returns true are always kept.
+pub fn choose_indices_head(
+    total: usize,
+    cap: usize,
+    must_include: impl Fn(usize) -> bool,
+) -> Vec<usize> {
     let kept = total.min(cap);
-    (0..kept).collect()
+    let out: Vec<usize> = (0..kept).collect();
+    merge_required(out, total, cap, &must_include)
 }
 
 /// Choose tail suffix indices.
-pub fn choose_indices_tail(total: usize, cap: usize) -> Vec<usize> {
+/// Items for which `must_include(i)` returns true are always kept.
+pub fn choose_indices_tail(
+    total: usize,
+    cap: usize,
+    must_include: impl Fn(usize) -> bool,
+) -> Vec<usize> {
     if cap == 0 || total == 0 {
-        return Vec::new();
+        return collect_required(total, cap, &must_include);
     }
     let kept = total.min(cap);
     let start = total.saturating_sub(kept);
-    (start..total).collect()
+    let out: Vec<usize> = (start..total).collect();
+    merge_required(out, total, cap, &must_include)
 }
 
 /// Dispatcher: choose indices for a given sampler kind.
+/// Items for which `must_include(i)` returns true are always kept,
+/// regardless of the sampling strategy or cap.
 pub fn choose_indices(
     kind: ArraySamplerKind,
     total: usize,
     cap: usize,
+    must_include: impl Fn(usize) -> bool,
 ) -> Vec<usize> {
     match kind {
-        ArraySamplerKind::Default => choose_indices_default(total, cap),
-        ArraySamplerKind::Head => choose_indices_head(total, cap),
-        ArraySamplerKind::Tail => choose_indices_tail(total, cap),
+        ArraySamplerKind::Default => {
+            choose_indices_default(total, cap, must_include)
+        }
+        ArraySamplerKind::Head => {
+            choose_indices_head(total, cap, must_include)
+        }
+        ArraySamplerKind::Tail => {
+            choose_indices_tail(total, cap, must_include)
+        }
     }
 }
 
+/// Merge required indices into an already-chosen set, preserving sorted order.
+/// At most `cap` extra required indices are added (sampled from the required
+/// set using the same head/mid/tail distribution) to avoid blowing up when
+/// most items match.
+fn merge_required(
+    sampled: Vec<usize>,
+    total: usize,
+    cap: usize,
+    must_include: &impl Fn(usize) -> bool,
+) -> Vec<usize> {
+    let mut seen = vec![false; total];
+    for &i in &sampled {
+        seen[i] = true;
+    }
+    let mut extra: Vec<usize> = Vec::new();
+    for i in 0..total {
+        if !seen[i] && must_include(i) {
+            extra.push(i);
+        }
+    }
+    if extra.is_empty() {
+        return sampled;
+    }
+    // Sub-sample the extras so we don't blow past the cap.
+    if extra.len() > cap {
+        let sub = subsample_indices(extra.len(), cap);
+        extra = sub.into_iter().map(|i| extra[i]).collect();
+    }
+    // Merge both sorted sequences
+    let mut result = Vec::with_capacity(sampled.len() + extra.len());
+    let (mut si, mut ei) = (0, 0);
+    while si < sampled.len() && ei < extra.len() {
+        if sampled[si] <= extra[ei] {
+            result.push(sampled[si]);
+            si += 1;
+        } else {
+            result.push(extra[ei]);
+            ei += 1;
+        }
+    }
+    result.extend_from_slice(&sampled[si..]);
+    result.extend_from_slice(&extra[ei..]);
+    result
+}
+
+/// Collect only the required indices (used when cap is 0).
+fn collect_required(
+    total: usize,
+    cap: usize,
+    must_include: &impl Fn(usize) -> bool,
+) -> Vec<usize> {
+    let all: Vec<usize> = (0..total).filter(|&i| must_include(i)).collect();
+    if all.len() <= cap || cap == 0 {
+        return all;
+    }
+    let sub = subsample_indices(all.len(), cap);
+    sub.into_iter().map(|i| all[i]).collect()
+}
+
+/// Pure default-policy sub-sampling with no `must_include` (breaks recursion).
+fn subsample_indices(total: usize, cap: usize) -> Vec<usize> {
+    choose_indices_default(total, cap, |_| false)
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
@@ -128,7 +218,7 @@ mod tests {
     fn default_sampler_returns_all_when_cap_not_binding() {
         let total = 10usize;
         let cap = total + 5;
-        let indices = choose_indices_default(total, cap);
+        let indices = choose_indices_default(total, cap, |_| false);
         assert_eq!(indices, (0..total).collect::<Vec<_>>());
     }
 
@@ -136,7 +226,51 @@ mod tests {
     fn default_sampler_respects_cap_when_smaller() {
         let total = 10usize;
         let cap = 3usize;
-        let indices = choose_indices_default(total, cap);
+        let indices = choose_indices_default(total, cap, |_| false);
         assert!(indices.len() <= cap);
     }
+
+    #[test]
+    fn must_include_adds_missing_indices() {
+        let total = 20usize;
+        let cap = 3usize;
+        // Force index 15 to be included even though cap is 3
+        let indices = choose_indices_default(total, cap, |i| i == 15);
+        assert!(
+            indices.contains(&15),
+            "must_include index should be present: {indices:?}"
+        );
+        // Original sampled indices should still be present
+        assert!(indices.contains(&0), "head items should be present");
+    }
+
+    #[test]
+    fn must_include_preserves_sorted_order() {
+        let total = 100usize;
+        let cap = 5usize;
+        let indices =
+            choose_indices_default(total, cap, |i| i == 50 || i == 90);
+        for w in indices.windows(2) {
+            assert!(w[0] < w[1], "indices should be sorted: {indices:?}");
+        }
+        assert!(indices.contains(&50));
+        assert!(indices.contains(&90));
+    }
+
+    #[test]
+    fn must_include_with_zero_cap() {
+        let total = 10usize;
+        let indices =
+            choose_indices_default(total, 0, |i| i == 3 || i == 7);
+        assert_eq!(indices, vec![3, 7]);
+    }
+
+    #[test]
+    fn must_include_no_duplicates_when_already_sampled() {
+        let total = 10usize;
+        let cap = 10usize;
+        // All indices already sampled; must_include shouldn't duplicate
+        let indices = choose_indices_default(total, cap, |i| i == 0);
+        assert_eq!(indices, (0..total).collect::<Vec<_>>());
+    }
 }
diff --git a/src/lib.rs b/src/lib.rs
index d275174e..d75a76ab 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -76,12 +76,14 @@ pub fn headson(
     budgets: Budgets,
 ) -> Result<RenderOutput> {
     let mut prio = *priority_cfg;
-    if grep.has_strong() {
+    let is_jsonl = matches!(input, InputKind::Jsonl(_));
+    if grep.has_strong() && !is_jsonl {
         // Avoid sampling away potential matches in strong grep mode.
+        // JSONL handles this via must_include in the sampler instead.
         prio.array_max_items = usize::MAX;
     }
     let crate::ingest::IngestOutput { arena, warnings } =
-        crate::ingest::ingest_into_arena(input, &prio)?;
+        crate::ingest::ingest_into_arena(input, &prio, grep)?;
     let mut order_build = order::build_order(&arena, &prio)?;
     let out = find_largest_render_under_budgets(
         &mut order_build,

From b19489c8632ba72dcbd145f4f43f514f373f35ae Mon Sep 17 00:00:00 2001
From: Daniel Kantor <git@daniel-kantor.com>
Date: Sun, 1 Feb 2026 22:58:50 +0100
Subject: [PATCH 2/6] fixes to grep mode sampling

---
 src/ingest/formats/text/mod.rs |  6 ++++--
 src/ingest/formats/yaml/mod.rs |  7 ++++++-
 src/ingest/mod.rs              |  9 +++++----
 src/ingest/sampling/mod.rs     | 11 +++++++----
 4 files changed, 22 insertions(+), 11 deletions(-)

diff --git a/src/ingest/formats/text/mod.rs b/src/ingest/formats/text/mod.rs
index aa76897b..b3574509 100644
--- a/src/ingest/formats/text/mod.rs
+++ b/src/ingest/formats/text/mod.rs
@@ -129,7 +129,8 @@ impl TextArenaBuilder {
             n.arr_indices_len = 0;
             return id;
         }
-        let idxs = choose_indices(self.sampler, total, self.array_cap, |_| false);
+        let idxs =
+            choose_indices(self.sampler, total, self.array_cap, |_| false);
         let kept = idxs.len().min(self.array_cap);
         let children_start = self.arena.children.len();
         for &orig_index in idxs.iter().take(kept) {
@@ -192,7 +193,8 @@ impl TextArenaBuilder {
         lines: &[String],
         total: usize,
     ) {
-        let idxs = choose_indices(self.sampler, total, self.array_cap, |_| false);
+        let idxs =
+            choose_indices(self.sampler, total, self.array_cap, |_| false);
         let kept = idxs.len().min(self.array_cap);
         let children_start = self.arena.children.len();
         let mut pushed = 0usize;
diff --git a/src/ingest/formats/yaml/mod.rs b/src/ingest/formats/yaml/mod.rs
index d8b9d936..66651835 100644
--- a/src/ingest/formats/yaml/mod.rs
+++ b/src/ingest/formats/yaml/mod.rs
@@ -159,7 +159,12 @@ impl YamlArenaBuilder {
         match y {
             Yaml::Array(v) => {
                 let total = v.len();
-                let idxs = choose_indices(self.sampler, total, self.array_cap, |_| false);
+                let idxs = choose_indices(
+                    self.sampler,
+                    total,
+                    self.array_cap,
+                    |_| false,
+                );
                 let mut child_ids = Vec::with_capacity(idxs.len());
                 for i in &idxs {
                     if let Some(item) = v.get(*i) {
diff --git a/src/ingest/mod.rs b/src/ingest/mod.rs
index c36f67c1..dd48a47e 100644
--- a/src/ingest/mod.rs
+++ b/src/ingest/mod.rs
@@ -3,8 +3,8 @@ use anyhow::Result;
 use crate::order::PriorityConfig;
 use crate::utils::tree_arena::JsonTreeArena as TreeArena;
 
-use crate::grep::GrepConfig;
 use crate::InputKind;
+use crate::grep::GrepConfig;
 
 pub mod fileset;
 pub mod format;
@@ -73,11 +73,12 @@ pub(crate) fn ingest_into_arena(
         }
         InputKind::Jsonl(bytes) => {
             let must_include = jsonl_grep_predicate(&bytes, grep);
-            parse_jsonl_one(&bytes, priority_cfg, |i| must_include(i))
-                .map(|arena| IngestOutput {
+            parse_jsonl_one(&bytes, priority_cfg, &*must_include).map(
+                |arena| IngestOutput {
                     arena,
                     warnings: Vec::new(),
-                })
+                },
+            )
         }
         InputKind::Yaml(bytes) => {
             parse_yaml_one(&bytes, priority_cfg).map(|arena| IngestOutput {
diff --git a/src/ingest/sampling/mod.rs b/src/ingest/sampling/mod.rs
index 8cdb0411..b7f57b79 100644
--- a/src/ingest/sampling/mod.rs
+++ b/src/ingest/sampling/mod.rs
@@ -150,6 +150,10 @@ pub fn choose_indices(
 /// At most `cap` extra required indices are added (sampled from the required
 /// set using the same head/mid/tail distribution) to avoid blowing up when
 /// most items match.
+#[allow(
+    clippy::cognitive_complexity,
+    reason = "Linear collect-and-merge logic reads clearest as a single function"
+)]
 fn merge_required(
     sampled: Vec<usize>,
     total: usize,
@@ -161,8 +165,8 @@ fn merge_required(
         seen[i] = true;
     }
     let mut extra: Vec<usize> = Vec::new();
-    for i in 0..total {
-        if !seen[i] && must_include(i) {
+    for (i, &already) in seen.iter().enumerate() {
+        if !already && must_include(i) {
             extra.push(i);
         }
     }
@@ -260,8 +264,7 @@ mod tests {
     #[test]
     fn must_include_with_zero_cap() {
         let total = 10usize;
-        let indices =
-            choose_indices_default(total, 0, |i| i == 3 || i == 7);
+        let indices = choose_indices_default(total, 0, |i| i == 3 || i == 7);
         assert_eq!(indices, vec![3, 7]);
     }
 

From dce04173b01f7fd26901eff163fb2d1f14f8bb75 Mon Sep 17 00:00:00 2001
From: Daniel Kantor <git@daniel-kantor.com>
Date: Sun, 1 Feb 2026 23:33:42 +0100
Subject: [PATCH 3/6] fix undersampling bug

---
 src/ingest/sampling/mod.rs | 38 ++++++++++----------------------------
 1 file changed, 10 insertions(+), 28 deletions(-)

diff --git a/src/ingest/sampling/mod.rs b/src/ingest/sampling/mod.rs
index b7f57b79..10a16a3b 100644
--- a/src/ingest/sampling/mod.rs
+++ b/src/ingest/sampling/mod.rs
@@ -58,7 +58,7 @@ pub fn choose_indices_default(
     must_include: impl Fn(usize) -> bool,
 ) -> Vec<usize> {
     if cap == 0 || total == 0 {
-        return collect_required(total, cap, &must_include);
+        return collect_required(total, &must_include);
     }
     if cap >= total {
         return (0..total).collect();
@@ -71,7 +71,7 @@ pub fn choose_indices_default(
     }
     if out.len() >= cap || out.len() >= total {
         out.truncate(cap.min(total));
-        return merge_required(out, total, cap, &must_include);
+        return merge_required(out, total, &must_include);
     }
     // Greedy phase: take a portion of remaining capacity linearly
     let mut idx = keep_first;
@@ -84,7 +84,7 @@ pub fn choose_indices_default(
         g += 1;
     }
     if out.len() >= cap || idx >= total {
-        return merge_required(out, total, cap, &must_include);
+        return merge_required(out, total, &must_include);
     }
     // Random phase: use accept_index on logical index to thin remaining
     while out.len() < cap && idx < total {
@@ -93,7 +93,7 @@ pub fn choose_indices_default(
         }
         idx += 1;
     }
-    merge_required(out, total, cap, &must_include)
+    merge_required(out, total, &must_include)
 }
 
 /// Choose head prefix indices.
@@ -105,7 +105,7 @@ pub fn choose_indices_head(
 ) -> Vec<usize> {
     let kept = total.min(cap);
     let out: Vec<usize> = (0..kept).collect();
-    merge_required(out, total, cap, &must_include)
+    merge_required(out, total, &must_include)
 }
 
 /// Choose tail suffix indices.
@@ -116,12 +116,12 @@ pub fn choose_indices_tail(
     must_include: impl Fn(usize) -> bool,
 ) -> Vec<usize> {
     if cap == 0 || total == 0 {
-        return collect_required(total, cap, &must_include);
+        return collect_required(total, &must_include);
     }
     let kept = total.min(cap);
     let start = total.saturating_sub(kept);
     let out: Vec<usize> = (start..total).collect();
-    merge_required(out, total, cap, &must_include)
+    merge_required(out, total, &must_include)
 }
 
 /// Dispatcher: choose indices for a given sampler kind.
@@ -147,9 +147,8 @@ pub fn choose_indices(
 }
 
 /// Merge required indices into an already-chosen set, preserving sorted order.
-/// At most `cap` extra required indices are added (sampled from the required
-/// set using the same head/mid/tail distribution) to avoid blowing up when
-/// most items match.
+/// All required indices are unconditionally kept — correctness demands that
+/// `must_include` items are never silently dropped.
 #[allow(
     clippy::cognitive_complexity,
     reason = "Linear collect-and-merge logic reads clearest as a single function"
@@ -157,7 +156,6 @@ pub fn choose_indices(
 fn merge_required(
     sampled: Vec<usize>,
     total: usize,
-    cap: usize,
     must_include: &impl Fn(usize) -> bool,
 ) -> Vec<usize> {
     let mut seen = vec![false; total];
@@ -173,11 +171,6 @@ fn merge_required(
     if extra.is_empty() {
         return sampled;
     }
-    // Sub-sample the extras so we don't blow past the cap.
-    if extra.len() > cap {
-        let sub = subsample_indices(extra.len(), cap);
-        extra = sub.into_iter().map(|i| extra[i]).collect();
-    }
     // Merge both sorted sequences
     let mut result = Vec::with_capacity(sampled.len() + extra.len());
     let (mut si, mut ei) = (0, 0);
@@ -198,20 +191,9 @@ fn merge_required(
 /// Collect only the required indices (used when cap is 0).
 fn collect_required(
     total: usize,
-    cap: usize,
     must_include: &impl Fn(usize) -> bool,
 ) -> Vec<usize> {
-    let all: Vec<usize> = (0..total).filter(|&i| must_include(i)).collect();
-    if all.len() <= cap || cap == 0 {
-        return all;
-    }
-    let sub = subsample_indices(all.len(), cap);
-    sub.into_iter().map(|i| all[i]).collect()
-}
-
-/// Pure default-policy sub-sampling with no `must_include` (breaks recursion).
-fn subsample_indices(total: usize, cap: usize) -> Vec<usize> {
-    choose_indices_default(total, cap, |_| false)
+    (0..total).filter(|&i| must_include(i)).collect()
 }
 
 #[cfg(test)]

From 0fb4143bbf206a7267aa9d2b15c8a980683bbc00 Mon Sep 17 00:00:00 2001
From: Daniel Kantor <git@daniel-kantor.com>
Date: Mon, 2 Feb 2026 09:07:40 +0100
Subject: [PATCH 4/6] test: cover jsonl grep predicate and must_include samplers

---
 src/ingest/formats/json/mod.rs |  2 +-
 src/ingest/mod.rs              | 80 ++++++++++++++++++++++++++++++++++
 src/ingest/sampling/mod.rs     | 59 +++++++++++++++++++++++++
 3 files changed, 140 insertions(+), 1 deletion(-)

diff --git a/src/ingest/formats/json/mod.rs b/src/ingest/formats/json/mod.rs
index 90bfbb90..e38507c9 100644
--- a/src/ingest/formats/json/mod.rs
+++ b/src/ingest/formats/json/mod.rs
@@ -67,7 +67,7 @@ pub(crate) fn build_json_tree_arena_from_many(
 }
 
 /// Collect (byte_start, 1-based line number) for every non-empty line.
-pub fn jsonl_line_offsets(text: &str) -> Vec<(usize, usize)> {
+pub(crate) fn jsonl_line_offsets(text: &str) -> Vec<(usize, usize)> {
     let mut offsets = Vec::new();
     let mut pos = 0usize;
     for (line_idx, raw_line) in text.split('\n').enumerate() {
diff --git a/src/ingest/mod.rs b/src/ingest/mod.rs
index dd48a47e..7f454093 100644
--- a/src/ingest/mod.rs
+++ b/src/ingest/mod.rs
@@ -104,6 +104,7 @@ pub(crate) fn ingest_into_arena(
 #[cfg(test)]
 mod tests {
     use super::*;
+    use crate::grep::{GrepConfig, GrepPatterns, GrepShow};
     use crate::order::NodeKind;
 
     #[test]
@@ -145,6 +146,85 @@ mod tests {
         assert_eq!(arena.nodes[root].object_len.unwrap_or(0), 2);
     }
 
+    fn grep_with_strong(pattern: &str) -> GrepConfig {
+        GrepConfig {
+            patterns: GrepPatterns::StrongOnly(
+                regex::Regex::new(pattern).unwrap(),
+            ),
+            show: GrepShow::Matching,
+        }
+    }
+
+    #[test]
+    fn jsonl_grep_predicate_marks_matching_lines() {
+        let input = b"{\"a\":1}\n{\"b\":2}\n{\"c\":3}\n";
+        let grep = grep_with_strong("b");
+        let pred = jsonl_grep_predicate(input, &grep);
+        assert!(!pred(0), "line 0 should not match");
+        assert!(pred(1), "line 1 should match 'b'");
+        assert!(!pred(2), "line 2 should not match");
+    }
+
+    #[test]
+    fn jsonl_grep_predicate_multiple_matches() {
+        let input = b"{\"x\":1}\n{\"x\":2}\n{\"y\":3}\n{\"x\":4}\n";
+        let grep = grep_with_strong("x");
+        let pred = jsonl_grep_predicate(input, &grep);
+        assert!(pred(0));
+        assert!(pred(1));
+        assert!(!pred(2));
+        assert!(pred(3));
+    }
+
+    #[test]
+    fn jsonl_grep_predicate_no_strong_pattern_returns_noop() {
+        let input = b"{\"a\":1}\n{\"b\":2}\n";
+        let grep = GrepConfig::default(); // no patterns
+        let pred = jsonl_grep_predicate(input, &grep);
+        assert!(!pred(0));
+        assert!(!pred(1));
+    }
+
+    #[test]
+    fn jsonl_grep_predicate_skips_empty_lines() {
+        // Empty lines are excluded from offsets, so indices are dense
+        let input = b"{\"a\":1}\n\n{\"b\":2}\n";
+        let grep = grep_with_strong("b");
+        let pred = jsonl_grep_predicate(input, &grep);
+        // Only 2 non-empty lines: index 0 = {"a":1}, index 1 = {"b":2}
+        assert!(!pred(0));
+        assert!(pred(1));
+    }
+
+    #[test]
+    fn jsonl_grep_predicate_match_on_first_line() {
+        let input = b"{\"needle\":true}\n{\"other\":false}\n";
+        let grep = grep_with_strong("needle");
+        let pred = jsonl_grep_predicate(input, &grep);
+        assert!(pred(0), "match on first line should work");
+        assert!(!pred(1));
+    }
+
+    #[test]
+    fn jsonl_grep_predicate_match_on_last_line() {
+        let input = b"{\"a\":1}\n{\"needle\":true}";
+        let grep = grep_with_strong("needle");
+        let pred = jsonl_grep_predicate(input, &grep);
+        assert!(!pred(0));
+        assert!(
+            pred(1),
+            "match on last line (no trailing newline) should work"
+        );
+    }
+
+    #[test]
+    fn jsonl_grep_predicate_out_of_bounds_returns_false() {
+        let input = b"{\"a\":1}\n{\"b\":2}\n";
+        let grep = grep_with_strong("a");
+        let pred = jsonl_grep_predicate(input, &grep);
+        assert!(!pred(99), "out of bounds index should return false");
+    }
+
     #[test]
     fn fileset_ingest_surfaces_parse_warnings() {
         let inputs = vec![fileset::FilesetInput {
diff --git a/src/ingest/sampling/mod.rs b/src/ingest/sampling/mod.rs
index 10a16a3b..185663c7 100644
--- a/src/ingest/sampling/mod.rs
+++ b/src/ingest/sampling/mod.rs
@@ -258,4 +258,63 @@ mod tests {
         let indices = choose_indices_default(total, cap, |i| i == 0);
         assert_eq!(indices, (0..total).collect::<Vec<_>>());
     }
+
+    #[test]
+    fn head_sampler_includes_required_beyond_cap() {
+        let total = 20usize;
+        let cap = 3usize;
+        // Head keeps 0,1,2 — force index 17 to also be included
+        let indices = choose_indices_head(total, cap, |i| i == 17);
+        assert_eq!(&indices[..3], &[0, 1, 2]);
+        assert!(
+            indices.contains(&17),
+            "must_include index should be present: {indices:?}"
+        );
+        for w in indices.windows(2) {
+            assert!(w[0] < w[1], "indices should be sorted: {indices:?}");
+        }
+    }
+
+    #[test]
+    fn head_sampler_no_duplicates_when_required_already_sampled() {
+        let total = 10usize;
+        let cap = 5usize;
+        // Index 2 is already in head range 0..5
+        let indices = choose_indices_head(total, cap, |i| i == 2);
+        assert_eq!(indices, (0..5).collect::<Vec<_>>());
+    }
+
+    #[test]
+    fn tail_sampler_includes_required_beyond_cap() {
+        let total = 20usize;
+        let cap = 3usize;
+        // Tail keeps 17,18,19 — force index 2 to also be included
+        let indices = choose_indices_tail(total, cap, |i| i == 2);
+        assert!(indices.contains(&2), "must_include index should be present");
+        assert!(indices.contains(&17));
+        assert_eq!(indices, vec![2, 17, 18, 19]);
+    }
+
+    #[test]
+    fn tail_sampler_no_duplicates_when_required_already_sampled() {
+        let total = 10usize;
+        let cap = 5usize;
+        // Index 7 is already in tail range 5..10
+        let indices = choose_indices_tail(total, cap, |i| i == 7);
+        assert_eq!(indices, (5..10).collect::<Vec<_>>());
+    }
+
+    #[test]
+    fn tail_sampler_with_zero_cap_returns_only_required() {
+        let total = 10usize;
+        let indices = choose_indices_tail(total, 0, |i| i == 4 || i == 8);
+        assert_eq!(indices, vec![4, 8]);
+    }
+
+    #[test]
+    fn head_sampler_with_zero_cap_returns_only_required() {
+        let total = 10usize;
+        let indices = choose_indices_head(total, 0, |i| i == 4 || i == 8);
+        assert_eq!(indices, vec![4, 8]);
+    }
 }

From 3702eee2c27a31a3fafb1ede5d33372c6f574da7 Mon Sep 17 00:00:00 2001
From: Daniel Kantor <git@daniel-kantor.com>
Date: Mon, 2 Feb 2026 09:13:01 +0100
Subject: [PATCH 5/6] fix: thread grep config through fileset ingest

---
 src/ingest/fileset.rs          | 51 ++++++++++++++++++++++++----------
 src/ingest/formats/json/mod.rs |  8 ------
 src/ingest/mod.rs              |  4 +--
 src/lib.rs                     |  3 +-
 src/serialization/tests.rs     |  4 +++
 5 files changed, 44 insertions(+), 26 deletions(-)

diff --git a/src/ingest/fileset.rs b/src/ingest/fileset.rs
index fe821ac2..9dd0329d 100644
--- a/src/ingest/fileset.rs
+++ b/src/ingest/fileset.rs
@@ -1,17 +1,17 @@
+use crate::grep::GrepConfig;
 use crate::order::NodeKind;
 use crate::utils::tree_arena::{JsonTreeArena, JsonTreeNode};
 
 use super::IngestOutput;
 use super::formats::{
-    json::{
-        build_json_tree_arena_from_slice, build_jsonl_tree_arena_from_slice,
-    },
+    json::build_json_tree_arena_from_slice,
     text::{
         build_text_tree_arena_from_bytes,
         build_text_tree_arena_from_bytes_with_mode,
     },
     yaml::build_yaml_tree_arena_from_bytes,
 };
+use super::jsonl_grep_predicate;
 use crate::PriorityConfig;
 
 /// Input descriptor for a single file in a multi-format fileset ingest.
@@ -41,7 +41,19 @@ pub enum FilesetInputKind {
 pub fn parse_fileset_multi(
     inputs: Vec<FilesetInput>,
     cfg: &PriorityConfig,
+    grep: &GrepConfig,
 ) -> IngestOutput {
+    let has_strong_grep = grep.has_strong();
+    // For non-JSONL formats under strong grep, disable array sampling so
+    // we don't accidentally sample away matching lines.
+    let non_jsonl_cfg = if has_strong_grep {
+        let mut c = *cfg;
+        c.array_max_items = usize::MAX;
+        c
+    } else {
+        *cfg
+    };
+
     let mut entries: Vec<FilesetEntry> = Vec::with_capacity(inputs.len());
     let mut warnings: Vec<String> = Vec::new();
     for FilesetInput {
@@ -54,26 +66,35 @@ pub fn parse_fileset_multi(
             FilesetInputKind::Json => parse_or_empty(
                 &name,
                 &mut bytes,
-                cfg,
+                &non_jsonl_cfg,
                 &mut warnings,
                 "JSON",
-                |bytes, cfg| build_json_tree_arena_from_slice(bytes, cfg),
-            ),
-            FilesetInputKind::Jsonl => parse_or_empty(
-                &name,
-                &bytes,
-                cfg,
-                &mut warnings,
-                "JSONL",
-                |bytes, cfg| build_jsonl_tree_arena_from_slice(bytes, cfg),
+                |bytes, c| build_json_tree_arena_from_slice(bytes, c),
             ),
+            FilesetInputKind::Jsonl => {
+                let must_include = jsonl_grep_predicate(&bytes, grep);
+                parse_or_empty(
+                    &name,
+                    &bytes,
+                    cfg,
+                    &mut warnings,
+                    "JSONL",
+                    |bytes, c| {
+                        crate::ingest::formats::json::parse_jsonl_one(
+                            bytes,
+                            c,
+                            &*must_include,
+                        )
+                    },
+                )
+            }
             FilesetInputKind::Yaml => parse_or_empty(
                 &name,
                 &bytes,
-                cfg,
+                &non_jsonl_cfg,
                 &mut warnings,
                 "YAML",
-                |bytes, cfg| build_yaml_tree_arena_from_bytes(bytes, cfg),
+                |bytes, c| build_yaml_tree_arena_from_bytes(bytes, c),
             ),
             FilesetInputKind::Text { atomic_lines } => {
                 (parse_text_bytes(&bytes, cfg, atomic_lines), false)
diff --git a/src/ingest/formats/json/mod.rs b/src/ingest/formats/json/mod.rs
index e38507c9..69bd28fb 100644
--- a/src/ingest/formats/json/mod.rs
+++ b/src/ingest/formats/json/mod.rs
@@ -141,14 +141,6 @@ pub fn parse_jsonl_one(
     Ok(arena)
 }
 
-/// Parse JSONL from a byte slice (for fileset use).
-pub(crate) fn build_jsonl_tree_arena_from_slice(
-    bytes: &[u8],
-    cfg: &PriorityConfig,
-) -> Result<TreeArena> {
-    parse_jsonl_one(bytes, cfg, |_| false)
-}
-
 /// Convenience functions for the JSON ingest path.
 pub fn parse_json_one(
     bytes: Vec<u8>,
diff --git a/src/ingest/mod.rs b/src/ingest/mod.rs
index 7f454093..bfcdf7d6 100644
--- a/src/ingest/mod.rs
+++ b/src/ingest/mod.rs
@@ -31,7 +31,7 @@ pub(crate) struct IngestOutput {
 ///
 /// Uses a single regex scan over the entire text and maps match positions
 /// back to line indices, avoiding per-line regex overhead.
-fn jsonl_grep_predicate(
+pub(crate) fn jsonl_grep_predicate(
     bytes: &[u8],
     grep: &GrepConfig,
 ) -> Box<dyn Fn(usize) -> bool> {
@@ -96,7 +96,7 @@ pub(crate) fn ingest_into_arena(
             )
         }
         InputKind::Fileset(inputs) => {
-            Ok(fileset::parse_fileset_multi(inputs, priority_cfg))
+            Ok(fileset::parse_fileset_multi(inputs, priority_cfg, grep))
         }
     }
 }
diff --git a/src/lib.rs b/src/lib.rs
index d75a76ab..0abcf1ea 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -77,7 +77,8 @@ pub fn headson(
 ) -> Result<RenderOutput> {
     let mut prio = *priority_cfg;
     let is_jsonl = matches!(input, InputKind::Jsonl(_));
-    if grep.has_strong() && !is_jsonl {
+    let is_fileset = matches!(input, InputKind::Fileset(_));
+    if grep.has_strong() && !is_jsonl && !is_fileset {
         // Avoid sampling away potential matches in strong grep mode.
         // JSONL handles this via must_include in the sampler instead.
         prio.array_max_items = usize::MAX;
diff --git a/src/serialization/tests.rs b/src/serialization/tests.rs
index 8ec08964..5238cdea 100644
--- a/src/serialization/tests.rs
+++ b/src/serialization/tests.rs
@@ -560,6 +560,7 @@ fn fileset_tree_headers_free_keep_slot_stats_on_body_only() {
             },
         ],
         &cfg_prio,
+        &crate::GrepConfig::default(),
     )
     .arena;
     let order = build_order(&arena, &cfg_prio).unwrap();
@@ -644,6 +645,7 @@ fn fileset_tree_headers_free_scaffold_does_not_change_slot_stats() {
             },
         ],
         &cfg_prio,
+        &crate::GrepConfig::default(),
     )
     .arena;
     let order = build_order(&arena, &cfg_prio).unwrap();
@@ -734,6 +736,7 @@ fn fileset_sections_slot_stats_respect_header_budgeting() {
             },
         ],
         &cfg_prio,
+        &crate::GrepConfig::default(),
     )
     .arena;
     let order = build_order(&arena, &cfg_prio).unwrap();
@@ -826,6 +829,7 @@ fn slot_stats_match_render_for_code_and_text() {
             },
         }],
         &cfg_prio,
+        &crate::GrepConfig::default(),
     )
     .arena;
     let order = build_order(&arena, &cfg_prio).unwrap();

From 9c992f33e0601eadee2418268d249b349bd22c60 Mon Sep 17 00:00:00 2001
From: Daniel Kantor <git@daniel-kantor.com>
Date: Mon, 2 Feb 2026 09:26:12 +0100
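Subject: [PATCH 6/6] refactor: make merge_required a separate post-sampling step

Remove the must_include parameter from choose_indices and its
default/head/tail variants; parse_jsonl_one now calls the public
merge_required on the sampled indices instead. Introduce
grep_adjusted_cfg and use it in ingest_into_arena and
parse_fileset_multi, replacing the array cap override in headson().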

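---
NOTE(review): headson() now hands the unadjusted `priority_cfg` to
order::build_order, where it previously passed the copy whose
array_max_items was raised under strong grep for JSON, YAML and text
inputs. If build_order reads array_max_items, grep output may change;
please confirm this is intended.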
 src/ingest/fileset.rs          |  13 +---
 src/ingest/formats/json/mod.rs |   8 ++-
 src/ingest/formats/text/mod.rs |   6 +-
 src/ingest/formats/yaml/mod.rs |   7 +-
 src/ingest/mod.rs              |  31 +++++++--
 src/ingest/sampling/mod.rs     | 122 +++++++++++++--------------------
 src/lib.rs                     |  12 +---
 7 files changed, 86 insertions(+), 113 deletions(-)

diff --git a/src/ingest/fileset.rs b/src/ingest/fileset.rs
index 9dd0329d..caffcc39 100644
--- a/src/ingest/fileset.rs
+++ b/src/ingest/fileset.rs
@@ -11,7 +11,7 @@ use super::formats::{
     },
     yaml::build_yaml_tree_arena_from_bytes,
 };
-use super::jsonl_grep_predicate;
+use super::{grep_adjusted_cfg, jsonl_grep_predicate};
 use crate::PriorityConfig;
 
 /// Input descriptor for a single file in a multi-format fileset ingest.
@@ -43,16 +43,7 @@ pub fn parse_fileset_multi(
     cfg: &PriorityConfig,
     grep: &GrepConfig,
 ) -> IngestOutput {
-    let has_strong_grep = grep.has_strong();
-    // For non-JSONL formats under strong grep, disable array sampling so
-    // we don't accidentally sample away matching lines.
-    let non_jsonl_cfg = if has_strong_grep {
-        let mut c = *cfg;
-        c.array_max_items = usize::MAX;
-        c
-    } else {
-        *cfg
-    };
+    let non_jsonl_cfg = grep_adjusted_cfg(cfg, grep);
 
     let mut entries: Vec<FilesetEntry> = Vec::with_capacity(inputs.len());
     let mut warnings: Vec<String> = Vec::new();
diff --git a/src/ingest/formats/json/mod.rs b/src/ingest/formats/json/mod.rs
index 69bd28fb..974975d1 100644
--- a/src/ingest/formats/json/mod.rs
+++ b/src/ingest/formats/json/mod.rs
@@ -96,7 +96,9 @@ pub fn parse_jsonl_one(
     cfg: &PriorityConfig,
     must_include: impl Fn(usize) -> bool,
 ) -> Result<TreeArena> {
-    use crate::ingest::sampling::{ArraySamplerKind, choose_indices};
+    use crate::ingest::sampling::{
+        ArraySamplerKind, choose_indices, merge_required,
+    };
 
     let text = std::str::from_utf8(bytes)
         .map_err(|e| anyhow::anyhow!("JSONL input is not valid UTF-8: {e}"))?;
@@ -104,8 +106,8 @@ pub fn parse_jsonl_one(
     let line_offsets = jsonl_line_offsets(text);
     let total = line_offsets.len();
     let sampler_kind: ArraySamplerKind = cfg.array_sampler.into();
-    let kept_indices =
-        choose_indices(sampler_kind, total, cfg.array_max_items, must_include);
+    let sampled = choose_indices(sampler_kind, total, cfg.array_max_items);
+    let kept_indices = merge_required(sampled, total, &must_include);
 
     let builder = JsonTreeBuilder::new(cfg.array_max_items, sampler_kind);
     let root_id = builder.push_default();
diff --git a/src/ingest/formats/text/mod.rs b/src/ingest/formats/text/mod.rs
index b3574509..175000c8 100644
--- a/src/ingest/formats/text/mod.rs
+++ b/src/ingest/formats/text/mod.rs
@@ -129,8 +129,7 @@ impl TextArenaBuilder {
             n.arr_indices_len = 0;
             return id;
         }
-        let idxs =
-            choose_indices(self.sampler, total, self.array_cap, |_| false);
+        let idxs = choose_indices(self.sampler, total, self.array_cap);
         let kept = idxs.len().min(self.array_cap);
         let children_start = self.arena.children.len();
         for &orig_index in idxs.iter().take(kept) {
@@ -193,8 +192,7 @@ impl TextArenaBuilder {
         lines: &[String],
         total: usize,
     ) {
-        let idxs =
-            choose_indices(self.sampler, total, self.array_cap, |_| false);
+        let idxs = choose_indices(self.sampler, total, self.array_cap);
         let kept = idxs.len().min(self.array_cap);
         let children_start = self.arena.children.len();
         let mut pushed = 0usize;
diff --git a/src/ingest/formats/yaml/mod.rs b/src/ingest/formats/yaml/mod.rs
index 66651835..a6bbbb9e 100644
--- a/src/ingest/formats/yaml/mod.rs
+++ b/src/ingest/formats/yaml/mod.rs
@@ -159,12 +159,7 @@ impl YamlArenaBuilder {
         match y {
             Yaml::Array(v) => {
                 let total = v.len();
-                let idxs = choose_indices(
-                    self.sampler,
-                    total,
-                    self.array_cap,
-                    |_| false,
-                );
+                let idxs = choose_indices(self.sampler, total, self.array_cap);
                 let mut child_ids = Vec::with_capacity(idxs.len());
                 for i in &idxs {
                     if let Some(item) = v.get(*i) {
diff --git a/src/ingest/mod.rs b/src/ingest/mod.rs
index bfcdf7d6..03e27105 100644
--- a/src/ingest/mod.rs
+++ b/src/ingest/mod.rs
@@ -26,6 +26,22 @@ pub(crate) struct IngestOutput {
     pub warnings: Vec<String>,
 }
 
+/// Return a copy of `cfg` with array sampling disabled when strong grep is
+/// active. Non-JSONL formats need this to avoid sampling away matches;
+/// JSONL handles it via `merge_required` in the sampler instead.
+pub(crate) fn grep_adjusted_cfg(
+    cfg: &PriorityConfig,
+    grep: &GrepConfig,
+) -> PriorityConfig {
+    if grep.has_strong() {
+        let mut c = *cfg;
+        c.array_max_items = usize::MAX;
+        c
+    } else {
+        *cfg
+    }
+}
+
 /// Build a predicate that returns true for JSONL line indices matching the
 /// strong grep pattern. When no grep is active, returns a no-op.
 ///
@@ -66,7 +82,8 @@ pub(crate) fn ingest_into_arena(
 ) -> Result<IngestOutput> {
     match input {
         InputKind::Json(bytes) => {
-            parse_json_one(bytes, priority_cfg).map(|arena| IngestOutput {
+            let cfg = grep_adjusted_cfg(priority_cfg, grep);
+            parse_json_one(bytes, &cfg).map(|arena| IngestOutput {
                 arena,
                 warnings: Vec::new(),
             })
@@ -81,19 +98,21 @@ pub(crate) fn ingest_into_arena(
             )
         }
         InputKind::Yaml(bytes) => {
-            parse_yaml_one(&bytes, priority_cfg).map(|arena| IngestOutput {
+            let cfg = grep_adjusted_cfg(priority_cfg, grep);
+            parse_yaml_one(&bytes, &cfg).map(|arena| IngestOutput {
                 arena,
                 warnings: Vec::new(),
             })
         }
         InputKind::Text { bytes, mode } => {
+            let cfg = grep_adjusted_cfg(priority_cfg, grep);
             let atomic = matches!(mode, crate::TextMode::CodeLike);
-            parse_text_one_with_mode(bytes, priority_cfg, atomic).map(
-                |arena| IngestOutput {
+            parse_text_one_with_mode(bytes, &cfg, atomic).map(|arena| {
+                IngestOutput {
                     arena,
                     warnings: Vec::new(),
-                },
-            )
+                }
+            })
         }
         InputKind::Fileset(inputs) => {
             Ok(fileset::parse_fileset_multi(inputs, priority_cfg, grep))
diff --git a/src/ingest/sampling/mod.rs b/src/ingest/sampling/mod.rs
index 185663c7..621c039d 100644
--- a/src/ingest/sampling/mod.rs
+++ b/src/ingest/sampling/mod.rs
@@ -47,18 +47,13 @@ fn accept_index(i: u64) -> bool {
 }
 
 /// Choose indices using the default policy (keep-first, greedy, random accept).
-/// Items for which `must_include(i)` returns true are always kept.
 #[allow(
     clippy::cognitive_complexity,
     reason = "Single function mirrors JSON streaming sampler phases"
 )]
-pub fn choose_indices_default(
-    total: usize,
-    cap: usize,
-    must_include: impl Fn(usize) -> bool,
-) -> Vec<usize> {
+pub fn choose_indices_default(total: usize, cap: usize) -> Vec<usize> {
     if cap == 0 || total == 0 {
-        return collect_required(total, &must_include);
+        return Vec::new();
     }
     if cap >= total {
         return (0..total).collect();
@@ -71,7 +66,7 @@ pub fn choose_indices_default(
     }
     if out.len() >= cap || out.len() >= total {
         out.truncate(cap.min(total));
-        return merge_required(out, total, &must_include);
+        return out;
     }
     // Greedy phase: take a portion of remaining capacity linearly
     let mut idx = keep_first;
@@ -84,7 +79,7 @@ pub fn choose_indices_default(
         g += 1;
     }
     if out.len() >= cap || idx >= total {
-        return merge_required(out, total, &must_include);
+        return out;
     }
     // Random phase: use accept_index on logical index to thin remaining
     while out.len() < cap && idx < total {
@@ -93,67 +88,47 @@ pub fn choose_indices_default(
         }
         idx += 1;
     }
-    merge_required(out, total, &must_include)
+    out
 }
 
 /// Choose head prefix indices.
-/// Items for which `must_include(i)` returns true are always kept.
-pub fn choose_indices_head(
-    total: usize,
-    cap: usize,
-    must_include: impl Fn(usize) -> bool,
-) -> Vec<usize> {
+pub fn choose_indices_head(total: usize, cap: usize) -> Vec<usize> {
     let kept = total.min(cap);
-    let out: Vec<usize> = (0..kept).collect();
-    merge_required(out, total, &must_include)
+    (0..kept).collect()
 }
 
 /// Choose tail suffix indices.
-/// Items for which `must_include(i)` returns true are always kept.
-pub fn choose_indices_tail(
-    total: usize,
-    cap: usize,
-    must_include: impl Fn(usize) -> bool,
-) -> Vec<usize> {
+pub fn choose_indices_tail(total: usize, cap: usize) -> Vec<usize> {
     if cap == 0 || total == 0 {
-        return collect_required(total, &must_include);
+        return Vec::new();
     }
     let kept = total.min(cap);
     let start = total.saturating_sub(kept);
-    let out: Vec<usize> = (start..total).collect();
-    merge_required(out, total, &must_include)
+    (start..total).collect()
 }
 
 /// Dispatcher: choose indices for a given sampler kind.
-/// Items for which `must_include(i)` returns true are always kept,
-/// regardless of the sampling strategy or cap.
 pub fn choose_indices(
     kind: ArraySamplerKind,
     total: usize,
     cap: usize,
-    must_include: impl Fn(usize) -> bool,
 ) -> Vec<usize> {
     match kind {
-        ArraySamplerKind::Default => {
-            choose_indices_default(total, cap, must_include)
-        }
-        ArraySamplerKind::Head => {
-            choose_indices_head(total, cap, must_include)
-        }
-        ArraySamplerKind::Tail => {
-            choose_indices_tail(total, cap, must_include)
-        }
+        ArraySamplerKind::Default => choose_indices_default(total, cap),
+        ArraySamplerKind::Head => choose_indices_head(total, cap),
+        ArraySamplerKind::Tail => choose_indices_tail(total, cap),
     }
 }
 
 /// Merge required indices into an already-chosen set, preserving sorted order.
-/// All required indices are unconditionally kept — correctness demands that
-/// `must_include` items are never silently dropped.
+///
+/// Use this as a post-step after `choose_indices` when certain indices must
+/// be unconditionally kept (e.g., JSONL lines matching a grep pattern).
 #[allow(
     clippy::cognitive_complexity,
     reason = "Linear collect-and-merge logic reads clearest as a single function"
 )]
-fn merge_required(
+pub fn merge_required(
     sampled: Vec<usize>,
     total: usize,
     must_include: &impl Fn(usize) -> bool,
@@ -188,14 +163,6 @@ fn merge_required(
     result
 }
 
-/// Collect only the required indices (used when cap is 0).
-fn collect_required(
-    total: usize,
-    must_include: &impl Fn(usize) -> bool,
-) -> Vec<usize> {
-    (0..total).filter(|&i| must_include(i)).collect()
-}
-
 #[cfg(test)]
 mod tests {
     use super::*;
@@ -204,7 +171,7 @@ mod tests {
     fn default_sampler_returns_all_when_cap_not_binding() {
         let total = 10usize;
         let cap = total + 5;
-        let indices = choose_indices_default(total, cap, |_| false);
+        let indices = choose_indices_default(total, cap);
         assert_eq!(indices, (0..total).collect::<Vec<_>>());
     }
 
@@ -212,16 +179,17 @@ mod tests {
     fn default_sampler_respects_cap_when_smaller() {
         let total = 10usize;
         let cap = 3usize;
-        let indices = choose_indices_default(total, cap, |_| false);
+        let indices = choose_indices_default(total, cap);
         assert!(indices.len() <= cap);
     }
 
     #[test]
-    fn must_include_adds_missing_indices() {
+    fn merge_required_adds_missing_indices() {
         let total = 20usize;
         let cap = 3usize;
+        let sampled = choose_indices_default(total, cap);
         // Force index 15 to be included even though cap is 3
-        let indices = choose_indices_default(total, cap, |i| i == 15);
+        let indices = merge_required(sampled, total, &|i| i == 15);
         assert!(
             indices.contains(&15),
             "must_include index should be present: {indices:?}"
@@ -231,11 +199,11 @@ mod tests {
     }
 
     #[test]
-    fn must_include_preserves_sorted_order() {
+    fn merge_required_preserves_sorted_order() {
         let total = 100usize;
         let cap = 5usize;
-        let indices =
-            choose_indices_default(total, cap, |i| i == 50 || i == 90);
+        let sampled = choose_indices_default(total, cap);
+        let indices = merge_required(sampled, total, &|i| i == 50 || i == 90);
         for w in indices.windows(2) {
             assert!(w[0] < w[1], "indices should be sorted: {indices:?}");
         }
@@ -244,27 +212,30 @@ mod tests {
     }
 
     #[test]
-    fn must_include_with_zero_cap() {
+    fn merge_required_with_zero_cap() {
         let total = 10usize;
-        let indices = choose_indices_default(total, 0, |i| i == 3 || i == 7);
+        let sampled = choose_indices_default(total, 0);
+        let indices = merge_required(sampled, total, &|i| i == 3 || i == 7);
         assert_eq!(indices, vec![3, 7]);
     }
 
     #[test]
-    fn must_include_no_duplicates_when_already_sampled() {
+    fn merge_required_no_duplicates_when_already_sampled() {
         let total = 10usize;
         let cap = 10usize;
+        let sampled = choose_indices_default(total, cap);
         // All indices already sampled; must_include shouldn't duplicate
-        let indices = choose_indices_default(total, cap, |i| i == 0);
+        let indices = merge_required(sampled, total, &|i| i == 0);
         assert_eq!(indices, (0..total).collect::<Vec<_>>());
     }
 
     #[test]
-    fn head_sampler_includes_required_beyond_cap() {
+    fn head_sampler_merge_includes_required_beyond_cap() {
         let total = 20usize;
         let cap = 3usize;
+        let sampled = choose_indices_head(total, cap);
         // Head keeps 0,1,2 — force index 17 to also be included
-        let indices = choose_indices_head(total, cap, |i| i == 17);
+        let indices = merge_required(sampled, total, &|i| i == 17);
         assert_eq!(&indices[..3], &[0, 1, 2]);
         assert!(
             indices.contains(&17),
@@ -276,45 +247,50 @@ mod tests {
     }
 
     #[test]
-    fn head_sampler_no_duplicates_when_required_already_sampled() {
+    fn head_sampler_merge_no_duplicates_when_already_sampled() {
         let total = 10usize;
         let cap = 5usize;
+        let sampled = choose_indices_head(total, cap);
         // Index 2 is already in head range 0..5
-        let indices = choose_indices_head(total, cap, |i| i == 2);
+        let indices = merge_required(sampled, total, &|i| i == 2);
         assert_eq!(indices, (0..5).collect::<Vec<_>>());
     }
 
     #[test]
-    fn tail_sampler_includes_required_beyond_cap() {
+    fn tail_sampler_merge_includes_required_beyond_cap() {
         let total = 20usize;
         let cap = 3usize;
+        let sampled = choose_indices_tail(total, cap);
         // Tail keeps 17,18,19 — force index 2 to also be included
-        let indices = choose_indices_tail(total, cap, |i| i == 2);
+        let indices = merge_required(sampled, total, &|i| i == 2);
         assert!(indices.contains(&2), "must_include index should be present");
         assert!(indices.contains(&17));
         assert_eq!(indices, vec![2, 17, 18, 19]);
     }
 
     #[test]
-    fn tail_sampler_no_duplicates_when_required_already_sampled() {
+    fn tail_sampler_merge_no_duplicates_when_already_sampled() {
         let total = 10usize;
         let cap = 5usize;
+        let sampled = choose_indices_tail(total, cap);
         // Index 7 is already in tail range 5..10
-        let indices = choose_indices_tail(total, cap, |i| i == 7);
+        let indices = merge_required(sampled, total, &|i| i == 7);
         assert_eq!(indices, (5..10).collect::<Vec<_>>());
     }
 
     #[test]
-    fn tail_sampler_with_zero_cap_returns_only_required() {
+    fn tail_sampler_merge_with_zero_cap_returns_only_required() {
         let total = 10usize;
-        let indices = choose_indices_tail(total, 0, |i| i == 4 || i == 8);
+        let sampled = choose_indices_tail(total, 0);
+        let indices = merge_required(sampled, total, &|i| i == 4 || i == 8);
         assert_eq!(indices, vec![4, 8]);
     }
 
     #[test]
-    fn head_sampler_with_zero_cap_returns_only_required() {
+    fn head_sampler_merge_with_zero_cap_returns_only_required() {
         let total = 10usize;
-        let indices = choose_indices_head(total, 0, |i| i == 4 || i == 8);
+        let sampled = choose_indices_head(total, 0);
+        let indices = merge_required(sampled, total, &|i| i == 4 || i == 8);
         assert_eq!(indices, vec![4, 8]);
     }
 }
diff --git a/src/lib.rs b/src/lib.rs
index 0abcf1ea..d087aa0b 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -75,17 +75,9 @@ pub fn headson(
     grep: &GrepConfig,
     budgets: Budgets,
 ) -> Result<RenderOutput> {
-    let mut prio = *priority_cfg;
-    let is_jsonl = matches!(input, InputKind::Jsonl(_));
-    let is_fileset = matches!(input, InputKind::Fileset(_));
-    if grep.has_strong() && !is_jsonl && !is_fileset {
-        // Avoid sampling away potential matches in strong grep mode.
-        // JSONL handles this via must_include in the sampler instead.
-        prio.array_max_items = usize::MAX;
-    }
     let crate::ingest::IngestOutput { arena, warnings } =
-        crate::ingest::ingest_into_arena(input, &prio, grep)?;
-    let mut order_build = order::build_order(&arena, &prio)?;
+        crate::ingest::ingest_into_arena(input, priority_cfg, grep)?;
+    let mut order_build = order::build_order(&arena, priority_cfg)?;
     let out = find_largest_render_under_budgets(
         &mut order_build,
         config,