From ce396ec1fdba7f93cf725c36c1f8dd99de7cfb11 Mon Sep 17 00:00:00 2001 From: Daniel Kantor Date: Sun, 1 Feb 2026 22:58:35 +0100 Subject: [PATCH 1/6] perf: optimize sampling for grep mode --- src/ingest/formats/json/mod.rs | 11 ++- src/ingest/formats/text/mod.rs | 4 +- src/ingest/formats/yaml/mod.rs | 2 +- src/ingest/mod.rs | 45 ++++++++- src/ingest/sampling/mod.rs | 164 ++++++++++++++++++++++++++++++--- src/lib.rs | 6 +- 6 files changed, 204 insertions(+), 28 deletions(-) diff --git a/src/ingest/formats/json/mod.rs b/src/ingest/formats/json/mod.rs index 70b4fbaa..90bfbb90 100644 --- a/src/ingest/formats/json/mod.rs +++ b/src/ingest/formats/json/mod.rs @@ -67,7 +67,7 @@ pub(crate) fn build_json_tree_arena_from_many( } /// Collect (byte_start, 1-based line number) for every non-empty line. -fn jsonl_line_offsets(text: &str) -> Vec<(usize, usize)> { +pub fn jsonl_line_offsets(text: &str) -> Vec<(usize, usize)> { let mut offsets = Vec::new(); let mut pos = 0usize; for (line_idx, raw_line) in text.split('\n').enumerate() { @@ -88,10 +88,13 @@ fn jsonl_line_offsets(text: &str) -> Vec<(usize, usize)> { /// /// Lines are sampled using the same strategy as JSON arrays (controlled by /// `PriorityConfig::array_max_items` and `array_sampler`), so only a subset -/// of lines is actually parsed for large inputs. +/// of lines is actually parsed for large inputs. When a `must_include` +/// predicate is provided, matching lines are always kept regardless of the +/// sampling cap. pub fn parse_jsonl_one( bytes: &[u8], cfg: &PriorityConfig, + must_include: impl Fn(usize) -> bool, ) -> Result { use crate::ingest::sampling::{ArraySamplerKind, choose_indices}; @@ -102,7 +105,7 @@ pub fn parse_jsonl_one( let total = line_offsets.len(); let sampler_kind: ArraySamplerKind = cfg.array_sampler.into(); let kept_indices = - choose_indices(sampler_kind, total, cfg.array_max_items); + choose_indices(sampler_kind, total, cfg.array_max_items, must_include); let builder = JsonTreeBuilder::new(cfg.array_max_items, sampler_kind); let root_id = builder.push_default(); @@ -143,7 +146,7 @@ pub(crate) fn build_jsonl_tree_arena_from_slice( bytes: &[u8], cfg: &PriorityConfig, ) -> Result { - parse_jsonl_one(bytes, cfg) + parse_jsonl_one(bytes, cfg, |_| false) } /// Convenience functions for the JSON ingest path. diff --git a/src/ingest/formats/text/mod.rs b/src/ingest/formats/text/mod.rs index 175000c8..aa76897b 100644 --- a/src/ingest/formats/text/mod.rs +++ b/src/ingest/formats/text/mod.rs @@ -129,7 +129,7 @@ impl TextArenaBuilder { n.arr_indices_len = 0; return id; } - let idxs = choose_indices(self.sampler, total, self.array_cap); + let idxs = choose_indices(self.sampler, total, self.array_cap, |_| false); let kept = idxs.len().min(self.array_cap); let children_start = self.arena.children.len(); for &orig_index in idxs.iter().take(kept) { @@ -192,7 +192,7 @@ impl TextArenaBuilder { lines: &[String], total: usize, ) { - let idxs = choose_indices(self.sampler, total, self.array_cap); + let idxs = choose_indices(self.sampler, total, self.array_cap, |_| false); let kept = idxs.len().min(self.array_cap); let children_start = self.arena.children.len(); let mut pushed = 0usize; diff --git a/src/ingest/formats/yaml/mod.rs b/src/ingest/formats/yaml/mod.rs index a6bbbb9e..d8b9d936 100644 --- a/src/ingest/formats/yaml/mod.rs +++ b/src/ingest/formats/yaml/mod.rs @@ -159,7 +159,7 @@ impl YamlArenaBuilder { match y { Yaml::Array(v) => { let total = v.len(); - let idxs = choose_indices(self.sampler, total, self.array_cap); + let idxs = choose_indices(self.sampler, total, self.array_cap, |_| false); let mut child_ids = Vec::with_capacity(idxs.len()); for i in &idxs { if let Some(item) = v.get(*i) { diff --git a/src/ingest/mod.rs b/src/ingest/mod.rs index fb9fec26..c36f67c1 100644 --- a/src/ingest/mod.rs +++ b/src/ingest/mod.rs @@ -3,6 +3,7 @@ use anyhow::Result; use crate::order::PriorityConfig; use crate::utils::tree_arena::JsonTreeArena as TreeArena; +use crate::grep::GrepConfig; use crate::InputKind; pub mod fileset; @@ -25,10 +26,43 @@ pub(crate) struct IngestOutput { pub warnings: Vec, } +/// Build a predicate that returns true for JSONL line indices matching the +/// strong grep pattern. When no grep is active, returns a no-op. +/// +/// Uses a single regex scan over the entire text and maps match positions +/// back to line indices, avoiding per-line regex overhead. +fn jsonl_grep_predicate( + bytes: &[u8], + grep: &GrepConfig, +) -> Box bool> { + let Some(re) = grep.patterns.strong() else { + return Box::new(|_| false); + }; + let Ok(text) = std::str::from_utf8(bytes) else { + return Box::new(|_| false); + }; + let offsets = formats::json::jsonl_line_offsets(text); + if offsets.is_empty() { + return Box::new(|_| false); + } + // Single regex pass: find all match positions and map to line indices. + let mut matching = vec![false; offsets.len()]; + for m in re.find_iter(text) { + let pos = m.start(); + // Binary search for the line containing this byte position. + let idx = offsets.partition_point(|&(start, _)| start <= pos); + if idx > 0 { + matching[idx - 1] = true; + } + } + Box::new(move |i: usize| matching.get(i).copied().unwrap_or(false)) +} + /// Dispatch the appropriate ingest path for any supported input kind. pub(crate) fn ingest_into_arena( input: InputKind, priority_cfg: &PriorityConfig, + grep: &GrepConfig, ) -> Result { match input { InputKind::Json(bytes) => { @@ -38,10 +72,12 @@ pub(crate) fn ingest_into_arena( }) } InputKind::Jsonl(bytes) => { - parse_jsonl_one(&bytes, priority_cfg).map(|arena| IngestOutput { - arena, - warnings: Vec::new(), - }) + let must_include = jsonl_grep_predicate(&bytes, grep); + parse_jsonl_one(&bytes, priority_cfg, |i| must_include(i)) + .map(|arena| IngestOutput { + arena, + warnings: Vec::new(), + }) } InputKind::Yaml(bytes) => { parse_yaml_one(&bytes, priority_cfg).map(|arena| IngestOutput { @@ -118,6 +154,7 @@ mod tests { let IngestOutput { arena, warnings } = ingest_into_arena( InputKind::Fileset(inputs), &PriorityConfig::new(usize::MAX, usize::MAX), + &GrepConfig::default(), ) .unwrap(); assert!(arena.is_fileset, "fileset input should mark arena"); diff --git a/src/ingest/sampling/mod.rs b/src/ingest/sampling/mod.rs index 8e22e07c..8cdb0411 100644 --- a/src/ingest/sampling/mod.rs +++ b/src/ingest/sampling/mod.rs @@ -47,13 +47,18 @@ fn accept_index(i: u64) -> bool { } /// Choose indices using the default policy (keep-first, greedy, random accept). +/// Items for which `must_include(i)` returns true are always kept. #[allow( clippy::cognitive_complexity, reason = "Single function mirrors JSON streaming sampler phases" )] -pub fn choose_indices_default(total: usize, cap: usize) -> Vec { +pub fn choose_indices_default( + total: usize, + cap: usize, + must_include: impl Fn(usize) -> bool, +) -> Vec { if cap == 0 || total == 0 { - return Vec::new(); + return collect_required(total, cap, &must_include); } if cap >= total { return (0..total).collect(); @@ -66,7 +71,7 @@ pub fn choose_indices_default(total: usize, cap: usize) -> Vec { } if out.len() >= cap || out.len() >= total { out.truncate(cap.min(total)); - return out; + return merge_required(out, total, cap, &must_include); } // Greedy phase: take a portion of remaining capacity linearly let mut idx = keep_first; @@ -79,7 +84,7 @@ pub fn choose_indices_default(total: usize, cap: usize) -> Vec { g += 1; } if out.len() >= cap || idx >= total { - return out; + return merge_required(out, total, cap, &must_include); } // Random phase: use accept_index on logical index to thin remaining while out.len() < cap && idx < total { @@ -88,38 +93,123 @@ pub fn choose_indices_default(total: usize, cap: usize) -> Vec { } idx += 1; } - out + merge_required(out, total, cap, &must_include) } /// Choose head prefix indices. -pub fn choose_indices_head(total: usize, cap: usize) -> Vec { +/// Items for which `must_include(i)` returns true are always kept. +pub fn choose_indices_head( + total: usize, + cap: usize, + must_include: impl Fn(usize) -> bool, +) -> Vec { let kept = total.min(cap); - (0..kept).collect() + let out: Vec = (0..kept).collect(); + merge_required(out, total, cap, &must_include) } /// Choose tail suffix indices. -pub fn choose_indices_tail(total: usize, cap: usize) -> Vec { +/// Items for which `must_include(i)` returns true are always kept. +pub fn choose_indices_tail( + total: usize, + cap: usize, + must_include: impl Fn(usize) -> bool, +) -> Vec { if cap == 0 || total == 0 { - return Vec::new(); + return collect_required(total, cap, &must_include); } let kept = total.min(cap); let start = total.saturating_sub(kept); - (start..total).collect() + let out: Vec = (start..total).collect(); + merge_required(out, total, cap, &must_include) } /// Dispatcher: choose indices for a given sampler kind. +/// Items for which `must_include(i)` returns true are always kept, +/// regardless of the sampling strategy or cap. pub fn choose_indices( kind: ArraySamplerKind, total: usize, cap: usize, + must_include: impl Fn(usize) -> bool, ) -> Vec { match kind { - ArraySamplerKind::Default => choose_indices_default(total, cap), - ArraySamplerKind::Head => choose_indices_head(total, cap), - ArraySamplerKind::Tail => choose_indices_tail(total, cap), + ArraySamplerKind::Default => { + choose_indices_default(total, cap, must_include) + } + ArraySamplerKind::Head => { + choose_indices_head(total, cap, must_include) + } + ArraySamplerKind::Tail => { + choose_indices_tail(total, cap, must_include) + } } } +/// Merge required indices into an already-chosen set, preserving sorted order. +/// At most `cap` extra required indices are added (sampled from the required +/// set using the same head/mid/tail distribution) to avoid blowing up when +/// most items match. +fn merge_required( + sampled: Vec, + total: usize, + cap: usize, + must_include: &impl Fn(usize) -> bool, +) -> Vec { + let mut seen = vec![false; total]; + for &i in &sampled { + seen[i] = true; + } + let mut extra: Vec = Vec::new(); + for i in 0..total { + if !seen[i] && must_include(i) { + extra.push(i); + } + } + if extra.is_empty() { + return sampled; + } + // Sub-sample the extras so we don't blow past the cap. + if extra.len() > cap { + let sub = subsample_indices(extra.len(), cap); + extra = sub.into_iter().map(|i| extra[i]).collect(); + } + // Merge both sorted sequences + let mut result = Vec::with_capacity(sampled.len() + extra.len()); + let (mut si, mut ei) = (0, 0); + while si < sampled.len() && ei < extra.len() { + if sampled[si] <= extra[ei] { + result.push(sampled[si]); + si += 1; + } else { + result.push(extra[ei]); + ei += 1; + } + } + result.extend_from_slice(&sampled[si..]); + result.extend_from_slice(&extra[ei..]); + result +} + +/// Collect only the required indices (used when cap is 0). +fn collect_required( + total: usize, + cap: usize, + must_include: &impl Fn(usize) -> bool, +) -> Vec { + let all: Vec = (0..total).filter(|&i| must_include(i)).collect(); + if all.len() <= cap || cap == 0 { + return all; + } + let sub = subsample_indices(all.len(), cap); + sub.into_iter().map(|i| all[i]).collect() +} + +/// Pure default-policy sub-sampling with no `must_include` (breaks recursion). +fn subsample_indices(total: usize, cap: usize) -> Vec { + choose_indices_default(total, cap, |_| false) +} + #[cfg(test)] mod tests { use super::*; @@ -128,7 +218,7 @@ mod tests { fn default_sampler_returns_all_when_cap_not_binding() { let total = 10usize; let cap = total + 5; - let indices = choose_indices_default(total, cap); + let indices = choose_indices_default(total, cap, |_| false); assert_eq!(indices, (0..total).collect::>()); } @@ -136,7 +226,51 @@ mod tests { fn default_sampler_respects_cap_when_smaller() { let total = 10usize; let cap = 3usize; - let indices = choose_indices_default(total, cap); + let indices = choose_indices_default(total, cap, |_| false); assert!(indices.len() <= cap); } + + #[test] + fn must_include_adds_missing_indices() { + let total = 20usize; + let cap = 3usize; + // Force index 15 to be included even though cap is 3 + let indices = choose_indices_default(total, cap, |i| i == 15); + assert!( + indices.contains(&15), + "must_include index should be present: {indices:?}" + ); + // Original sampled indices should still be present + assert!(indices.contains(&0), "head items should be present"); + } + + #[test] + fn must_include_preserves_sorted_order() { + let total = 100usize; + let cap = 5usize; + let indices = + choose_indices_default(total, cap, |i| i == 50 || i == 90); + for w in indices.windows(2) { + assert!(w[0] < w[1], "indices should be sorted: {indices:?}"); + } + assert!(indices.contains(&50)); + assert!(indices.contains(&90)); + } + + #[test] + fn must_include_with_zero_cap() { + let total = 10usize; + let indices = + choose_indices_default(total, 0, |i| i == 3 || i == 7); + assert_eq!(indices, vec![3, 7]); + } + + #[test] + fn must_include_no_duplicates_when_already_sampled() { + let total = 10usize; + let cap = 10usize; + // All indices already sampled; must_include shouldn't duplicate + let indices = choose_indices_default(total, cap, |i| i == 0); + assert_eq!(indices, (0..total).collect::>()); + } } diff --git a/src/lib.rs b/src/lib.rs index d275174e..d75a76ab 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -76,12 +76,14 @@ pub fn headson( budgets: Budgets, ) -> Result { let mut prio = *priority_cfg; - if grep.has_strong() { + let is_jsonl = matches!(input, InputKind::Jsonl(_)); + if grep.has_strong() && !is_jsonl { // Avoid sampling away potential matches in strong grep mode. + // JSONL handles this via must_include in the sampler instead. prio.array_max_items = usize::MAX; } let crate::ingest::IngestOutput { arena, warnings } = - crate::ingest::ingest_into_arena(input, &prio)?; + crate::ingest::ingest_into_arena(input, &prio, grep)?; let mut order_build = order::build_order(&arena, &prio)?; let out = find_largest_render_under_budgets( &mut order_build, From b19489c8632ba72dcbd145f4f43f514f373f35ae Mon Sep 17 00:00:00 2001 From: Daniel Kantor Date: Sun, 1 Feb 2026 22:58:50 +0100 Subject: [PATCH 2/6] fixes to grep mode sampling --- src/ingest/formats/text/mod.rs | 6 ++++-- src/ingest/formats/yaml/mod.rs | 7 ++++++- src/ingest/mod.rs | 9 +++++---- src/ingest/sampling/mod.rs | 11 +++++++---- 4 files changed, 22 insertions(+), 11 deletions(-) diff --git a/src/ingest/formats/text/mod.rs b/src/ingest/formats/text/mod.rs index aa76897b..b3574509 100644 --- a/src/ingest/formats/text/mod.rs +++ b/src/ingest/formats/text/mod.rs @@ -129,7 +129,8 @@ impl TextArenaBuilder { n.arr_indices_len = 0; return id; } - let idxs = choose_indices(self.sampler, total, self.array_cap, |_| false); + let idxs = + choose_indices(self.sampler, total, self.array_cap, |_| false); let kept = idxs.len().min(self.array_cap); let children_start = self.arena.children.len(); for &orig_index in idxs.iter().take(kept) { @@ -192,7 +193,8 @@ impl TextArenaBuilder { lines: &[String], total: usize, ) { - let idxs = choose_indices(self.sampler, total, self.array_cap, |_| false); + let idxs = + choose_indices(self.sampler, total, self.array_cap, |_| false); let kept = idxs.len().min(self.array_cap); let children_start = self.arena.children.len(); let mut pushed = 0usize; diff --git a/src/ingest/formats/yaml/mod.rs b/src/ingest/formats/yaml/mod.rs index d8b9d936..66651835 100644 --- a/src/ingest/formats/yaml/mod.rs +++ b/src/ingest/formats/yaml/mod.rs @@ -159,7 +159,12 @@ impl YamlArenaBuilder { match y { Yaml::Array(v) => { let total = v.len(); - let idxs = choose_indices(self.sampler, total, self.array_cap, |_| false); + let idxs = choose_indices( + self.sampler, + total, + self.array_cap, + |_| false, + ); let mut child_ids = Vec::with_capacity(idxs.len()); for i in &idxs { if let Some(item) = v.get(*i) { diff --git a/src/ingest/mod.rs b/src/ingest/mod.rs index c36f67c1..dd48a47e 100644 --- a/src/ingest/mod.rs +++ b/src/ingest/mod.rs @@ -3,8 +3,8 @@ use anyhow::Result; use crate::order::PriorityConfig; use crate::utils::tree_arena::JsonTreeArena as TreeArena; -use crate::grep::GrepConfig; use crate::InputKind; +use crate::grep::GrepConfig; pub mod fileset; pub mod format; @@ -73,11 +73,12 @@ pub(crate) fn ingest_into_arena( } InputKind::Jsonl(bytes) => { let must_include = jsonl_grep_predicate(&bytes, grep); - parse_jsonl_one(&bytes, priority_cfg, |i| must_include(i)) - .map(|arena| IngestOutput { + parse_jsonl_one(&bytes, priority_cfg, &*must_include).map( + |arena| IngestOutput { arena, warnings: Vec::new(), - }) + }, + ) } InputKind::Yaml(bytes) => { parse_yaml_one(&bytes, priority_cfg).map(|arena| IngestOutput { diff --git a/src/ingest/sampling/mod.rs b/src/ingest/sampling/mod.rs index 8cdb0411..b7f57b79 100644 --- a/src/ingest/sampling/mod.rs +++ b/src/ingest/sampling/mod.rs @@ -150,6 +150,10 @@ pub fn choose_indices( /// At most `cap` extra required indices are added (sampled from the required /// set using the same head/mid/tail distribution) to avoid blowing up when /// most items match. +#[allow( + clippy::cognitive_complexity, + reason = "Linear collect-and-merge logic reads clearest as a single function" +)] fn merge_required( sampled: Vec, total: usize, @@ -161,8 +165,8 @@ fn merge_required( seen[i] = true; } let mut extra: Vec = Vec::new(); - for i in 0..total { - if !seen[i] && must_include(i) { + for (i, &already) in seen.iter().enumerate() { + if !already && must_include(i) { extra.push(i); } } @@ -260,8 +264,7 @@ mod tests { #[test] fn must_include_with_zero_cap() { let total = 10usize; - let indices = - choose_indices_default(total, 0, |i| i == 3 || i == 7); + let indices = choose_indices_default(total, 0, |i| i == 3 || i == 7); assert_eq!(indices, vec![3, 7]); } From dce04173b01f7fd26901eff163fb2d1f14f8bb75 Mon Sep 17 00:00:00 2001 From: Daniel Kantor Date: Sun, 1 Feb 2026 23:33:42 +0100 Subject: [PATCH 3/6] fix undersampling bug --- src/ingest/sampling/mod.rs | 38 ++++++++++---------------------------- 1 file changed, 10 insertions(+), 28 deletions(-) diff --git a/src/ingest/sampling/mod.rs b/src/ingest/sampling/mod.rs index b7f57b79..10a16a3b 100644 --- a/src/ingest/sampling/mod.rs +++ b/src/ingest/sampling/mod.rs @@ -58,7 +58,7 @@ pub fn choose_indices_default( must_include: impl Fn(usize) -> bool, ) -> Vec { if cap == 0 || total == 0 { - return collect_required(total, cap, &must_include); + return collect_required(total, &must_include); } if cap >= total { return (0..total).collect(); @@ -71,7 +71,7 @@ pub fn choose_indices_default( } if out.len() >= cap || out.len() >= total { out.truncate(cap.min(total)); - return merge_required(out, total, cap, &must_include); + return merge_required(out, total, &must_include); } // Greedy phase: take a portion of remaining capacity linearly let mut idx = keep_first; @@ -84,7 +84,7 @@ pub fn choose_indices_default( g += 1; } if out.len() >= cap || idx >= total { - return merge_required(out, total, cap, &must_include); + return merge_required(out, total, &must_include); } // Random phase: use accept_index on logical index to thin remaining while out.len() < cap && idx < total { @@ -93,7 +93,7 @@ pub fn choose_indices_default( } idx += 1; } - merge_required(out, total, cap, &must_include) + merge_required(out, total, &must_include) } /// Choose head prefix indices. @@ -105,7 +105,7 @@ pub fn choose_indices_head( ) -> Vec { let kept = total.min(cap); let out: Vec = (0..kept).collect(); - merge_required(out, total, cap, &must_include) + merge_required(out, total, &must_include) } /// Choose tail suffix indices. @@ -116,12 +116,12 @@ pub fn choose_indices_tail( must_include: impl Fn(usize) -> bool, ) -> Vec { if cap == 0 || total == 0 { - return collect_required(total, cap, &must_include); + return collect_required(total, &must_include); } let kept = total.min(cap); let start = total.saturating_sub(kept); let out: Vec = (start..total).collect(); - merge_required(out, total, cap, &must_include) + merge_required(out, total, &must_include) } /// Dispatcher: choose indices for a given sampler kind. @@ -147,9 +147,8 @@ pub fn choose_indices( } /// Merge required indices into an already-chosen set, preserving sorted order. -/// At most `cap` extra required indices are added (sampled from the required -/// set using the same head/mid/tail distribution) to avoid blowing up when -/// most items match. +/// All required indices are unconditionally kept — correctness demands that +/// `must_include` items are never silently dropped. #[allow( clippy::cognitive_complexity, reason = "Linear collect-and-merge logic reads clearest as a single function" @@ -157,7 +156,6 @@ pub fn choose_indices( fn merge_required( sampled: Vec, total: usize, - cap: usize, must_include: &impl Fn(usize) -> bool, ) -> Vec { let mut seen = vec![false; total]; @@ -173,11 +171,6 @@ fn merge_required( if extra.is_empty() { return sampled; } - // Sub-sample the extras so we don't blow past the cap. - if extra.len() > cap { - let sub = subsample_indices(extra.len(), cap); - extra = sub.into_iter().map(|i| extra[i]).collect(); - } // Merge both sorted sequences let mut result = Vec::with_capacity(sampled.len() + extra.len()); let (mut si, mut ei) = (0, 0); @@ -198,20 +191,9 @@ fn merge_required( /// Collect only the required indices (used when cap is 0). fn collect_required( total: usize, - cap: usize, must_include: &impl Fn(usize) -> bool, ) -> Vec { - let all: Vec = (0..total).filter(|&i| must_include(i)).collect(); - if all.len() <= cap || cap == 0 { - return all; - } - let sub = subsample_indices(all.len(), cap); - sub.into_iter().map(|i| all[i]).collect() -} - -/// Pure default-policy sub-sampling with no `must_include` (breaks recursion). -fn subsample_indices(total: usize, cap: usize) -> Vec { - choose_indices_default(total, cap, |_| false) + (0..total).filter(|&i| must_include(i)).collect() } #[cfg(test)] From 0fb4143bbf206a7267aa9d2b15c8a980683bbc00 Mon Sep 17 00:00:00 2001 From: Daniel Kantor Date: Mon, 2 Feb 2026 09:07:40 +0100 Subject: [PATCH 4/6] . --- src/ingest/formats/json/mod.rs | 2 +- src/ingest/mod.rs | 80 ++++++++++++++++++++++++++++++++++ src/ingest/sampling/mod.rs | 59 +++++++++++++++++++++++++ 3 files changed, 140 insertions(+), 1 deletion(-) diff --git a/src/ingest/formats/json/mod.rs b/src/ingest/formats/json/mod.rs index 90bfbb90..e38507c9 100644 --- a/src/ingest/formats/json/mod.rs +++ b/src/ingest/formats/json/mod.rs @@ -67,7 +67,7 @@ pub(crate) fn build_json_tree_arena_from_many( } /// Collect (byte_start, 1-based line number) for every non-empty line. -pub fn jsonl_line_offsets(text: &str) -> Vec<(usize, usize)> { +pub(crate) fn jsonl_line_offsets(text: &str) -> Vec<(usize, usize)> { let mut offsets = Vec::new(); let mut pos = 0usize; for (line_idx, raw_line) in text.split('\n').enumerate() { diff --git a/src/ingest/mod.rs b/src/ingest/mod.rs index dd48a47e..7f454093 100644 --- a/src/ingest/mod.rs +++ b/src/ingest/mod.rs @@ -104,6 +104,7 @@ pub(crate) fn ingest_into_arena( #[cfg(test)] mod tests { use super::*; + use crate::grep::{GrepConfig, GrepPatterns, GrepShow}; use crate::order::NodeKind; #[test] @@ -145,6 +146,85 @@ mod tests { assert_eq!(arena.nodes[root].object_len.unwrap_or(0), 2); } + fn grep_with_strong(pattern: &str) -> GrepConfig { + GrepConfig { + patterns: GrepPatterns::StrongOnly( + regex::Regex::new(pattern).unwrap(), + ), + show: GrepShow::Matching, + } + } + + #[test] + fn jsonl_grep_predicate_marks_matching_lines() { + let input = b"{\"a\":1}\n{\"b\":2}\n{\"c\":3}\n"; + let grep = grep_with_strong("b"); + let pred = jsonl_grep_predicate(input, &grep); + assert!(!pred(0), "line 0 should not match"); + assert!(pred(1), "line 1 should match 'b'"); + assert!(!pred(2), "line 2 should not match"); + } + + #[test] + fn jsonl_grep_predicate_multiple_matches() { + let input = b"{\"x\":1}\n{\"x\":2}\n{\"y\":3}\n{\"x\":4}\n"; + let grep = grep_with_strong("x"); + let pred = jsonl_grep_predicate(input, &grep); + assert!(pred(0)); + assert!(pred(1)); + assert!(!pred(2)); + assert!(pred(3)); + } + + #[test] + fn jsonl_grep_predicate_no_strong_pattern_returns_noop() { + let input = b"{\"a\":1}\n{\"b\":2}\n"; + let grep = GrepConfig::default(); // no patterns + let pred = jsonl_grep_predicate(input, &grep); + assert!(!pred(0)); + assert!(!pred(1)); + } + + #[test] + fn jsonl_grep_predicate_skips_empty_lines() { + // Empty lines are excluded from offsets, so indices are dense + let input = b"{\"a\":1}\n\n{\"b\":2}\n"; + let grep = grep_with_strong("b"); + let pred = jsonl_grep_predicate(input, &grep); + // Only 2 non-empty lines: index 0 = {"a":1}, index 1 = {"b":2} + assert!(!pred(0)); + assert!(pred(1)); + } + + #[test] + fn jsonl_grep_predicate_match_on_first_line() { + let input = b"{\"needle\":true}\n{\"other\":false}\n"; + let grep = grep_with_strong("needle"); + let pred = jsonl_grep_predicate(input, &grep); + assert!(pred(0), "match on first line should work"); + assert!(!pred(1)); + } + + #[test] + fn jsonl_grep_predicate_match_on_last_line() { + let input = b"{\"a\":1}\n{\"needle\":true}"; + let grep = grep_with_strong("needle"); + let pred = jsonl_grep_predicate(input, &grep); + assert!(!pred(0)); + assert!( + pred(1), + "match on last line (no trailing newline) should work" + ); + } + + #[test] + fn jsonl_grep_predicate_out_of_bounds_returns_false() { + let input = b"{\"a\":1}\n{\"b\":2}\n"; + let grep = grep_with_strong("a"); + let pred = jsonl_grep_predicate(input, &grep); + assert!(!pred(99), "out of bounds index should return false"); + } + #[test] fn fileset_ingest_surfaces_parse_warnings() { let inputs = vec![fileset::FilesetInput { diff --git a/src/ingest/sampling/mod.rs b/src/ingest/sampling/mod.rs index 10a16a3b..185663c7 100644 --- a/src/ingest/sampling/mod.rs +++ b/src/ingest/sampling/mod.rs @@ -258,4 +258,63 @@ mod tests { let indices = choose_indices_default(total, cap, |i| i == 0); assert_eq!(indices, (0..total).collect::>()); } + + #[test] + fn head_sampler_includes_required_beyond_cap() { + let total = 20usize; + let cap = 3usize; + // Head keeps 0,1,2 — force index 17 to also be included + let indices = choose_indices_head(total, cap, |i| i == 17); + assert_eq!(&indices[..3], &[0, 1, 2]); + assert!( + indices.contains(&17), + "must_include index should be present: {indices:?}" + ); + for w in indices.windows(2) { + assert!(w[0] < w[1], "indices should be sorted: {indices:?}"); + } + } + + #[test] + fn head_sampler_no_duplicates_when_required_already_sampled() { + let total = 10usize; + let cap = 5usize; + // Index 2 is already in head range 0..5 + let indices = choose_indices_head(total, cap, |i| i == 2); + assert_eq!(indices, (0..5).collect::>()); + } + + #[test] + fn tail_sampler_includes_required_beyond_cap() { + let total = 20usize; + let cap = 3usize; + // Tail keeps 17,18,19 — force index 2 to also be included + let indices = choose_indices_tail(total, cap, |i| i == 2); + assert!(indices.contains(&2), "must_include index should be present"); + assert!(indices.contains(&17)); + assert_eq!(indices, vec![2, 17, 18, 19]); + } + + #[test] + fn tail_sampler_no_duplicates_when_required_already_sampled() { + let total = 10usize; + let cap = 5usize; + // Index 7 is already in tail range 5..10 + let indices = choose_indices_tail(total, cap, |i| i == 7); + assert_eq!(indices, (5..10).collect::>()); + } + + #[test] + fn tail_sampler_with_zero_cap_returns_only_required() { + let total = 10usize; + let indices = choose_indices_tail(total, 0, |i| i == 4 || i == 8); + assert_eq!(indices, vec![4, 8]); + } + + #[test] + fn head_sampler_with_zero_cap_returns_only_required() { + let total = 10usize; + let indices = choose_indices_head(total, 0, |i| i == 4 || i == 8); + assert_eq!(indices, vec![4, 8]); + } } From 3702eee2c27a31a3fafb1ede5d33372c6f574da7 Mon Sep 17 00:00:00 2001 From: Daniel Kantor Date: Mon, 2 Feb 2026 09:13:01 +0100 Subject: [PATCH 5/6] . --- src/ingest/fileset.rs | 51 ++++++++++++++++++++++++---------- src/ingest/formats/json/mod.rs | 8 ------ src/ingest/mod.rs | 4 +-- src/lib.rs | 3 +- src/serialization/tests.rs | 4 +++ 5 files changed, 44 insertions(+), 26 deletions(-) diff --git a/src/ingest/fileset.rs b/src/ingest/fileset.rs index fe821ac2..9dd0329d 100644 --- a/src/ingest/fileset.rs +++ b/src/ingest/fileset.rs @@ -1,17 +1,17 @@ +use crate::grep::GrepConfig; use crate::order::NodeKind; use crate::utils::tree_arena::{JsonTreeArena, JsonTreeNode}; use super::IngestOutput; use super::formats::{ - json::{ - build_json_tree_arena_from_slice, build_jsonl_tree_arena_from_slice, - }, + json::build_json_tree_arena_from_slice, text::{ build_text_tree_arena_from_bytes, build_text_tree_arena_from_bytes_with_mode, }, yaml::build_yaml_tree_arena_from_bytes, }; +use super::jsonl_grep_predicate; use crate::PriorityConfig; /// Input descriptor for a single file in a multi-format fileset ingest. @@ -41,7 +41,19 @@ pub enum FilesetInputKind { pub fn parse_fileset_multi( inputs: Vec, cfg: &PriorityConfig, + grep: &GrepConfig, ) -> IngestOutput { + let has_strong_grep = grep.has_strong(); + // For non-JSONL formats under strong grep, disable array sampling so + // we don't accidentally sample away matching lines. + let non_jsonl_cfg = if has_strong_grep { + let mut c = *cfg; + c.array_max_items = usize::MAX; + c + } else { + *cfg + }; + let mut entries: Vec = Vec::with_capacity(inputs.len()); let mut warnings: Vec = Vec::new(); for FilesetInput { @@ -54,26 +66,35 @@ pub fn parse_fileset_multi( FilesetInputKind::Json => parse_or_empty( &name, &mut bytes, - cfg, + &non_jsonl_cfg, &mut warnings, "JSON", - |bytes, cfg| build_json_tree_arena_from_slice(bytes, cfg), - ), - FilesetInputKind::Jsonl => parse_or_empty( - &name, - &bytes, - cfg, - &mut warnings, - "JSONL", - |bytes, cfg| build_jsonl_tree_arena_from_slice(bytes, cfg), + |bytes, c| build_json_tree_arena_from_slice(bytes, c), ), + FilesetInputKind::Jsonl => { + let must_include = jsonl_grep_predicate(&bytes, grep); + parse_or_empty( + &name, + &bytes, + cfg, + &mut warnings, + "JSONL", + |bytes, c| { + crate::ingest::formats::json::parse_jsonl_one( + bytes, + c, + &*must_include, + ) + }, + ) + } FilesetInputKind::Yaml => parse_or_empty( &name, &bytes, - cfg, + &non_jsonl_cfg, &mut warnings, "YAML", - |bytes, cfg| build_yaml_tree_arena_from_bytes(bytes, cfg), + |bytes, c| build_yaml_tree_arena_from_bytes(bytes, c), ), FilesetInputKind::Text { atomic_lines } => { (parse_text_bytes(&bytes, cfg, atomic_lines), false) diff --git a/src/ingest/formats/json/mod.rs b/src/ingest/formats/json/mod.rs index e38507c9..69bd28fb 100644 --- a/src/ingest/formats/json/mod.rs +++ b/src/ingest/formats/json/mod.rs @@ -141,14 +141,6 @@ pub fn parse_jsonl_one( Ok(arena) } -/// Parse JSONL from a byte slice (for fileset use). -pub(crate) fn build_jsonl_tree_arena_from_slice( - bytes: &[u8], - cfg: &PriorityConfig, -) -> Result { - parse_jsonl_one(bytes, cfg, |_| false) -} - /// Convenience functions for the JSON ingest path. pub fn parse_json_one( bytes: Vec, diff --git a/src/ingest/mod.rs b/src/ingest/mod.rs index 7f454093..bfcdf7d6 100644 --- a/src/ingest/mod.rs +++ b/src/ingest/mod.rs @@ -31,7 +31,7 @@ pub(crate) struct IngestOutput { /// /// Uses a single regex scan over the entire text and maps match positions /// back to line indices, avoiding per-line regex overhead. -fn jsonl_grep_predicate( +pub(crate) fn jsonl_grep_predicate( bytes: &[u8], grep: &GrepConfig, ) -> Box bool> { @@ -96,7 +96,7 @@ pub(crate) fn ingest_into_arena( ) } InputKind::Fileset(inputs) => { - Ok(fileset::parse_fileset_multi(inputs, priority_cfg)) + Ok(fileset::parse_fileset_multi(inputs, priority_cfg, grep)) } } } diff --git a/src/lib.rs b/src/lib.rs index d75a76ab..0abcf1ea 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -77,7 +77,8 @@ pub fn headson( ) -> Result { let mut prio = *priority_cfg; let is_jsonl = matches!(input, InputKind::Jsonl(_)); - if grep.has_strong() && !is_jsonl { + let is_fileset = matches!(input, InputKind::Fileset(_)); + if grep.has_strong() && !is_jsonl && !is_fileset { // Avoid sampling away potential matches in strong grep mode. // JSONL handles this via must_include in the sampler instead. prio.array_max_items = usize::MAX; diff --git a/src/serialization/tests.rs b/src/serialization/tests.rs index 8ec08964..5238cdea 100644 --- a/src/serialization/tests.rs +++ b/src/serialization/tests.rs @@ -560,6 +560,7 @@ fn fileset_tree_headers_free_keep_slot_stats_on_body_only() { }, ], &cfg_prio, + &crate::GrepConfig::default(), ) .arena; let order = build_order(&arena, &cfg_prio).unwrap(); @@ -644,6 +645,7 @@ fn fileset_tree_headers_free_scaffold_does_not_change_slot_stats() { }, ], &cfg_prio, + &crate::GrepConfig::default(), ) .arena; let order = build_order(&arena, &cfg_prio).unwrap(); @@ -734,6 +736,7 @@ fn fileset_sections_slot_stats_respect_header_budgeting() { }, ], &cfg_prio, + &crate::GrepConfig::default(), ) .arena; let order = build_order(&arena, &cfg_prio).unwrap(); @@ -826,6 +829,7 @@ fn slot_stats_match_render_for_code_and_text() { }, }], &cfg_prio, + &crate::GrepConfig::default(), ) .arena; let order = build_order(&arena, &cfg_prio).unwrap(); From 9c992f33e0601eadee2418268d249b349bd22c60 Mon Sep 17 00:00:00 2001 From: Daniel Kantor Date: Mon, 2 Feb 2026 09:26:12 +0100 Subject: [PATCH 6/6] . --- src/ingest/fileset.rs | 13 +--- src/ingest/formats/json/mod.rs | 8 ++- src/ingest/formats/text/mod.rs | 6 +- src/ingest/formats/yaml/mod.rs | 7 +- src/ingest/mod.rs | 31 +++++++-- src/ingest/sampling/mod.rs | 122 +++++++++++++-------------------- src/lib.rs | 12 +--- 7 files changed, 86 insertions(+), 113 deletions(-) diff --git a/src/ingest/fileset.rs b/src/ingest/fileset.rs index 9dd0329d..caffcc39 100644 --- a/src/ingest/fileset.rs +++ b/src/ingest/fileset.rs @@ -11,7 +11,7 @@ use super::formats::{ }, yaml::build_yaml_tree_arena_from_bytes, }; -use super::jsonl_grep_predicate; +use super::{grep_adjusted_cfg, jsonl_grep_predicate}; use crate::PriorityConfig; /// Input descriptor for a single file in a multi-format fileset ingest. @@ -43,16 +43,7 @@ pub fn parse_fileset_multi( cfg: &PriorityConfig, grep: &GrepConfig, ) -> IngestOutput { - let has_strong_grep = grep.has_strong(); - // For non-JSONL formats under strong grep, disable array sampling so - // we don't accidentally sample away matching lines. - let non_jsonl_cfg = if has_strong_grep { - let mut c = *cfg; - c.array_max_items = usize::MAX; - c - } else { - *cfg - }; + let non_jsonl_cfg = grep_adjusted_cfg(cfg, grep); let mut entries: Vec = Vec::with_capacity(inputs.len()); let mut warnings: Vec = Vec::new(); diff --git a/src/ingest/formats/json/mod.rs b/src/ingest/formats/json/mod.rs index 69bd28fb..974975d1 100644 --- a/src/ingest/formats/json/mod.rs +++ b/src/ingest/formats/json/mod.rs @@ -96,7 +96,9 @@ pub fn parse_jsonl_one( cfg: &PriorityConfig, must_include: impl Fn(usize) -> bool, ) -> Result { - use crate::ingest::sampling::{ArraySamplerKind, choose_indices}; + use crate::ingest::sampling::{ + ArraySamplerKind, choose_indices, merge_required, + }; let text = std::str::from_utf8(bytes) .map_err(|e| anyhow::anyhow!("JSONL input is not valid UTF-8: {e}"))?; @@ -104,8 +106,8 @@ pub fn parse_jsonl_one( let line_offsets = jsonl_line_offsets(text); let total = line_offsets.len(); let sampler_kind: ArraySamplerKind = cfg.array_sampler.into(); - let kept_indices = - choose_indices(sampler_kind, total, cfg.array_max_items, must_include); + let sampled = choose_indices(sampler_kind, total, cfg.array_max_items); + let kept_indices = merge_required(sampled, total, &must_include); let builder = JsonTreeBuilder::new(cfg.array_max_items, sampler_kind); let root_id = builder.push_default(); diff --git a/src/ingest/formats/text/mod.rs b/src/ingest/formats/text/mod.rs index b3574509..175000c8 100644 --- a/src/ingest/formats/text/mod.rs +++ b/src/ingest/formats/text/mod.rs @@ -129,8 +129,7 @@ impl TextArenaBuilder { n.arr_indices_len = 0; return id; } - let idxs = - choose_indices(self.sampler, total, self.array_cap, |_| false); + let idxs = choose_indices(self.sampler, total, self.array_cap); let kept = idxs.len().min(self.array_cap); let children_start = self.arena.children.len(); for &orig_index in idxs.iter().take(kept) { @@ -193,8 +192,7 @@ impl TextArenaBuilder { lines: &[String], total: usize, ) { - let idxs = - choose_indices(self.sampler, total, self.array_cap, |_| false); + let idxs = choose_indices(self.sampler, total, self.array_cap); let kept = idxs.len().min(self.array_cap); let children_start = self.arena.children.len(); let mut pushed = 0usize; diff --git a/src/ingest/formats/yaml/mod.rs b/src/ingest/formats/yaml/mod.rs index 66651835..a6bbbb9e 100644 --- a/src/ingest/formats/yaml/mod.rs +++ b/src/ingest/formats/yaml/mod.rs @@ -159,12 +159,7 @@ impl YamlArenaBuilder { match y { Yaml::Array(v) => { let total = v.len(); - let idxs = choose_indices( - self.sampler, - total, - self.array_cap, - |_| false, - ); + let idxs = choose_indices(self.sampler, total, self.array_cap); let mut child_ids = Vec::with_capacity(idxs.len()); for i in &idxs { if let Some(item) = v.get(*i) { diff --git a/src/ingest/mod.rs b/src/ingest/mod.rs index bfcdf7d6..03e27105 100644 --- a/src/ingest/mod.rs +++ b/src/ingest/mod.rs @@ -26,6 +26,22 @@ pub(crate) struct IngestOutput { pub warnings: Vec, } +/// Return a copy of `cfg` with array sampling disabled when strong grep is +/// active. Non-JSONL formats need this to avoid sampling away matches; +/// JSONL handles it via `merge_required` in the sampler instead. +pub(crate) fn grep_adjusted_cfg( + cfg: &PriorityConfig, + grep: &GrepConfig, +) -> PriorityConfig { + if grep.has_strong() { + let mut c = *cfg; + c.array_max_items = usize::MAX; + c + } else { + *cfg + } +} + /// Build a predicate that returns true for JSONL line indices matching the /// strong grep pattern. When no grep is active, returns a no-op. /// @@ -66,7 +82,8 @@ pub(crate) fn ingest_into_arena( ) -> Result { match input { InputKind::Json(bytes) => { - parse_json_one(bytes, priority_cfg).map(|arena| IngestOutput { + let cfg = grep_adjusted_cfg(priority_cfg, grep); + parse_json_one(bytes, &cfg).map(|arena| IngestOutput { arena, warnings: Vec::new(), }) @@ -81,19 +98,21 @@ pub(crate) fn ingest_into_arena( ) } InputKind::Yaml(bytes) => { - parse_yaml_one(&bytes, priority_cfg).map(|arena| IngestOutput { + let cfg = grep_adjusted_cfg(priority_cfg, grep); + parse_yaml_one(&bytes, &cfg).map(|arena| IngestOutput { arena, warnings: Vec::new(), }) } InputKind::Text { bytes, mode } => { + let cfg = grep_adjusted_cfg(priority_cfg, grep); let atomic = matches!(mode, crate::TextMode::CodeLike); - parse_text_one_with_mode(bytes, priority_cfg, atomic).map( - |arena| IngestOutput { + parse_text_one_with_mode(bytes, &cfg, atomic).map(|arena| { + IngestOutput { arena, warnings: Vec::new(), - }, - ) + } + }) } InputKind::Fileset(inputs) => { Ok(fileset::parse_fileset_multi(inputs, priority_cfg, grep)) diff --git a/src/ingest/sampling/mod.rs b/src/ingest/sampling/mod.rs index 185663c7..621c039d 100644 --- a/src/ingest/sampling/mod.rs +++ b/src/ingest/sampling/mod.rs @@ -47,18 +47,13 @@ fn accept_index(i: u64) -> bool { } /// Choose indices using the default policy (keep-first, greedy, random accept). -/// Items for which `must_include(i)` returns true are always kept. #[allow( clippy::cognitive_complexity, reason = "Single function mirrors JSON streaming sampler phases" )] -pub fn choose_indices_default( - total: usize, - cap: usize, - must_include: impl Fn(usize) -> bool, -) -> Vec { +pub fn choose_indices_default(total: usize, cap: usize) -> Vec { if cap == 0 || total == 0 { - return collect_required(total, &must_include); + return Vec::new(); } if cap >= total { return (0..total).collect(); @@ -71,7 +66,7 @@ pub fn choose_indices_default( } if out.len() >= cap || out.len() >= total { out.truncate(cap.min(total)); - return merge_required(out, total, &must_include); + return out; } // Greedy phase: take a portion of remaining capacity linearly let mut idx = keep_first; @@ -84,7 +79,7 @@ pub fn choose_indices_default( g += 1; } if out.len() >= cap || idx >= total { - return merge_required(out, total, &must_include); + return out; } // Random phase: use accept_index on logical index to thin remaining while out.len() < cap && idx < total { @@ -93,67 +88,47 @@ pub fn choose_indices_default( } idx += 1; } - merge_required(out, total, &must_include) + out } /// Choose head prefix indices. -/// Items for which `must_include(i)` returns true are always kept. -pub fn choose_indices_head( - total: usize, - cap: usize, - must_include: impl Fn(usize) -> bool, -) -> Vec { +pub fn choose_indices_head(total: usize, cap: usize) -> Vec { let kept = total.min(cap); - let out: Vec = (0..kept).collect(); - merge_required(out, total, &must_include) + (0..kept).collect() } /// Choose tail suffix indices. -/// Items for which `must_include(i)` returns true are always kept. -pub fn choose_indices_tail( - total: usize, - cap: usize, - must_include: impl Fn(usize) -> bool, -) -> Vec { +pub fn choose_indices_tail(total: usize, cap: usize) -> Vec { if cap == 0 || total == 0 { - return collect_required(total, &must_include); + return Vec::new(); } let kept = total.min(cap); let start = total.saturating_sub(kept); - let out: Vec = (start..total).collect(); - merge_required(out, total, &must_include) + (start..total).collect() } /// Dispatcher: choose indices for a given sampler kind. -/// Items for which `must_include(i)` returns true are always kept, -/// regardless of the sampling strategy or cap. pub fn choose_indices( kind: ArraySamplerKind, total: usize, cap: usize, - must_include: impl Fn(usize) -> bool, ) -> Vec { match kind { - ArraySamplerKind::Default => { - choose_indices_default(total, cap, must_include) - } - ArraySamplerKind::Head => { - choose_indices_head(total, cap, must_include) - } - ArraySamplerKind::Tail => { - choose_indices_tail(total, cap, must_include) - } + ArraySamplerKind::Default => choose_indices_default(total, cap), + ArraySamplerKind::Head => choose_indices_head(total, cap), + ArraySamplerKind::Tail => choose_indices_tail(total, cap), } } /// Merge required indices into an already-chosen set, preserving sorted order. -/// All required indices are unconditionally kept — correctness demands that -/// `must_include` items are never silently dropped. +/// +/// Use this as a post-step after `choose_indices` when certain indices must +/// be unconditionally kept (e.g., JSONL lines matching a grep pattern). #[allow( clippy::cognitive_complexity, reason = "Linear collect-and-merge logic reads clearest as a single function" )] -fn merge_required( +pub fn merge_required( sampled: Vec, total: usize, must_include: &impl Fn(usize) -> bool, @@ -188,14 +163,6 @@ fn merge_required( result } -/// Collect only the required indices (used when cap is 0). -fn collect_required( - total: usize, - must_include: &impl Fn(usize) -> bool, -) -> Vec { - (0..total).filter(|&i| must_include(i)).collect() -} - #[cfg(test)] mod tests { use super::*; @@ -204,7 +171,7 @@ mod tests { fn default_sampler_returns_all_when_cap_not_binding() { let total = 10usize; let cap = total + 5; - let indices = choose_indices_default(total, cap, |_| false); + let indices = choose_indices_default(total, cap); assert_eq!(indices, (0..total).collect::>()); } @@ -212,16 +179,17 @@ mod tests { fn default_sampler_respects_cap_when_smaller() { let total = 10usize; let cap = 3usize; - let indices = choose_indices_default(total, cap, |_| false); + let indices = choose_indices_default(total, cap); assert!(indices.len() <= cap); } #[test] - fn must_include_adds_missing_indices() { + fn merge_required_adds_missing_indices() { let total = 20usize; let cap = 3usize; + let sampled = choose_indices_default(total, cap); // Force index 15 to be included even though cap is 3 - let indices = choose_indices_default(total, cap, |i| i == 15); + let indices = merge_required(sampled, total, &|i| i == 15); assert!( indices.contains(&15), "must_include index should be present: {indices:?}" @@ -231,11 +199,11 @@ mod tests { } #[test] - fn must_include_preserves_sorted_order() { + fn merge_required_preserves_sorted_order() { let total = 100usize; let cap = 5usize; - let indices = - choose_indices_default(total, cap, |i| i == 50 || i == 90); + let sampled = choose_indices_default(total, cap); + let indices = merge_required(sampled, total, &|i| i == 50 || i == 90); for w in indices.windows(2) { assert!(w[0] < w[1], "indices should be sorted: {indices:?}"); } @@ -244,27 +212,30 @@ mod tests { } #[test] - fn must_include_with_zero_cap() { + fn merge_required_with_zero_cap() { let total = 10usize; - let indices = choose_indices_default(total, 0, |i| i == 3 || i == 7); + let sampled = choose_indices_default(total, 0); + let indices = merge_required(sampled, total, &|i| i == 3 || i == 7); assert_eq!(indices, vec![3, 7]); } #[test] - fn must_include_no_duplicates_when_already_sampled() { + fn merge_required_no_duplicates_when_already_sampled() { let total = 10usize; let cap = 10usize; + let sampled = choose_indices_default(total, cap); // All indices already sampled; must_include shouldn't duplicate - let indices = choose_indices_default(total, cap, |i| i == 0); + let indices = merge_required(sampled, total, &|i| i == 0); assert_eq!(indices, (0..total).collect::>()); } #[test] - fn head_sampler_includes_required_beyond_cap() { + fn head_sampler_merge_includes_required_beyond_cap() { let total = 20usize; let cap = 3usize; + let sampled = choose_indices_head(total, cap); // Head keeps 0,1,2 — force index 17 to also be included - let indices = choose_indices_head(total, cap, |i| i == 17); + let indices = merge_required(sampled, total, &|i| i == 17); assert_eq!(&indices[..3], &[0, 1, 2]); assert!( indices.contains(&17), @@ -276,45 +247,50 @@ mod tests { } #[test] - fn head_sampler_no_duplicates_when_required_already_sampled() { + fn head_sampler_merge_no_duplicates_when_already_sampled() { let total = 10usize; let cap = 5usize; + let sampled = choose_indices_head(total, cap); // Index 2 is already in head range 0..5 - let indices = choose_indices_head(total, cap, |i| i == 2); + let indices = merge_required(sampled, total, &|i| i == 2); assert_eq!(indices, (0..5).collect::>()); } #[test] - fn tail_sampler_includes_required_beyond_cap() { + fn tail_sampler_merge_includes_required_beyond_cap() { let total = 20usize; let cap = 3usize; + let sampled = choose_indices_tail(total, cap); // Tail keeps 17,18,19 — force index 2 to also be included - let indices = choose_indices_tail(total, cap, |i| i == 2); + let indices = merge_required(sampled, total, &|i| i == 2); assert!(indices.contains(&2), "must_include index should be present"); assert!(indices.contains(&17)); assert_eq!(indices, vec![2, 17, 18, 19]); } #[test] - fn tail_sampler_no_duplicates_when_required_already_sampled() { + fn tail_sampler_merge_no_duplicates_when_already_sampled() { let total = 10usize; let cap = 5usize; + let sampled = choose_indices_tail(total, cap); // Index 7 is already in tail range 5..10 - let indices = choose_indices_tail(total, cap, |i| i == 7); + let indices = merge_required(sampled, total, &|i| i == 7); assert_eq!(indices, (5..10).collect::>()); } #[test] - fn tail_sampler_with_zero_cap_returns_only_required() { + fn tail_sampler_merge_with_zero_cap_returns_only_required() { let total = 10usize; - let indices = choose_indices_tail(total, 0, |i| i == 4 || i == 8); + let sampled = choose_indices_tail(total, 0); + let indices = merge_required(sampled, total, &|i| i == 4 || i == 8); assert_eq!(indices, vec![4, 8]); } #[test] - fn head_sampler_with_zero_cap_returns_only_required() { + fn head_sampler_merge_with_zero_cap_returns_only_required() { let total = 10usize; - let indices = choose_indices_head(total, 0, |i| i == 4 || i == 8); + let sampled = choose_indices_head(total, 0); + let indices = merge_required(sampled, total, &|i| i == 4 || i == 8); assert_eq!(indices, vec![4, 8]); } } diff --git a/src/lib.rs b/src/lib.rs index 0abcf1ea..d087aa0b 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -75,17 +75,9 @@ pub fn headson( grep: &GrepConfig, budgets: Budgets, ) -> Result { - let mut prio = *priority_cfg; - let is_jsonl = matches!(input, InputKind::Jsonl(_)); - let is_fileset = matches!(input, InputKind::Fileset(_)); - if grep.has_strong() && !is_jsonl && !is_fileset { - // Avoid sampling away potential matches in strong grep mode. - // JSONL handles this via must_include in the sampler instead. - prio.array_max_items = usize::MAX; - } let crate::ingest::IngestOutput { arena, warnings } = - crate::ingest::ingest_into_arena(input, &prio, grep)?; - let mut order_build = order::build_order(&arena, &prio)?; + crate::ingest::ingest_into_arena(input, priority_cfg, grep)?; + let mut order_build = order::build_order(&arena, priority_cfg)?; let out = find_largest_render_under_budgets( &mut order_build, config,