diff --git a/src/ingest/fileset.rs b/src/ingest/fileset.rs index fe821ac2..caffcc39 100644 --- a/src/ingest/fileset.rs +++ b/src/ingest/fileset.rs @@ -1,17 +1,17 @@ +use crate::grep::GrepConfig; use crate::order::NodeKind; use crate::utils::tree_arena::{JsonTreeArena, JsonTreeNode}; use super::IngestOutput; use super::formats::{ - json::{ - build_json_tree_arena_from_slice, build_jsonl_tree_arena_from_slice, - }, + json::build_json_tree_arena_from_slice, text::{ build_text_tree_arena_from_bytes, build_text_tree_arena_from_bytes_with_mode, }, yaml::build_yaml_tree_arena_from_bytes, }; +use super::{grep_adjusted_cfg, jsonl_grep_predicate}; use crate::PriorityConfig; /// Input descriptor for a single file in a multi-format fileset ingest. @@ -41,7 +41,10 @@ pub enum FilesetInputKind { pub fn parse_fileset_multi( inputs: Vec, cfg: &PriorityConfig, + grep: &GrepConfig, ) -> IngestOutput { + let non_jsonl_cfg = grep_adjusted_cfg(cfg, grep); + let mut entries: Vec = Vec::with_capacity(inputs.len()); let mut warnings: Vec = Vec::new(); for FilesetInput { @@ -54,26 +57,35 @@ pub fn parse_fileset_multi( FilesetInputKind::Json => parse_or_empty( &name, &mut bytes, - cfg, + &non_jsonl_cfg, &mut warnings, "JSON", - |bytes, cfg| build_json_tree_arena_from_slice(bytes, cfg), - ), - FilesetInputKind::Jsonl => parse_or_empty( - &name, - &bytes, - cfg, - &mut warnings, - "JSONL", - |bytes, cfg| build_jsonl_tree_arena_from_slice(bytes, cfg), + |bytes, c| build_json_tree_arena_from_slice(bytes, c), ), + FilesetInputKind::Jsonl => { + let must_include = jsonl_grep_predicate(&bytes, grep); + parse_or_empty( + &name, + &bytes, + cfg, + &mut warnings, + "JSONL", + |bytes, c| { + crate::ingest::formats::json::parse_jsonl_one( + bytes, + c, + &*must_include, + ) + }, + ) + } FilesetInputKind::Yaml => parse_or_empty( &name, &bytes, - cfg, + &non_jsonl_cfg, &mut warnings, "YAML", - |bytes, cfg| build_yaml_tree_arena_from_bytes(bytes, cfg), + |bytes, c| build_yaml_tree_arena_from_bytes(bytes, c), ), FilesetInputKind::Text { atomic_lines } => { (parse_text_bytes(&bytes, cfg, atomic_lines), false) diff --git a/src/ingest/formats/json/mod.rs b/src/ingest/formats/json/mod.rs index 70b4fbaa..974975d1 100644 --- a/src/ingest/formats/json/mod.rs +++ b/src/ingest/formats/json/mod.rs @@ -67,7 +67,7 @@ pub(crate) fn build_json_tree_arena_from_many( } /// Collect (byte_start, 1-based line number) for every non-empty line. -fn jsonl_line_offsets(text: &str) -> Vec<(usize, usize)> { +pub(crate) fn jsonl_line_offsets(text: &str) -> Vec<(usize, usize)> { let mut offsets = Vec::new(); let mut pos = 0usize; for (line_idx, raw_line) in text.split('\n').enumerate() { @@ -88,12 +88,17 @@ fn jsonl_line_offsets(text: &str) -> Vec<(usize, usize)> { /// /// Lines are sampled using the same strategy as JSON arrays (controlled by /// `PriorityConfig::array_max_items` and `array_sampler`), so only a subset -/// of lines is actually parsed for large inputs. +/// of lines is actually parsed for large inputs. When a `must_include` +/// predicate is provided, matching lines are always kept regardless of the +/// sampling cap. pub fn parse_jsonl_one( bytes: &[u8], cfg: &PriorityConfig, + must_include: impl Fn(usize) -> bool, ) -> Result { - use crate::ingest::sampling::{ArraySamplerKind, choose_indices}; + use crate::ingest::sampling::{ + ArraySamplerKind, choose_indices, merge_required, + }; let text = std::str::from_utf8(bytes) .map_err(|e| anyhow::anyhow!("JSONL input is not valid UTF-8: {e}"))?; @@ -101,8 +106,8 @@ pub fn parse_jsonl_one( let line_offsets = jsonl_line_offsets(text); let total = line_offsets.len(); let sampler_kind: ArraySamplerKind = cfg.array_sampler.into(); - let kept_indices = - choose_indices(sampler_kind, total, cfg.array_max_items); + let sampled = choose_indices(sampler_kind, total, cfg.array_max_items); + let kept_indices = merge_required(sampled, total, &must_include); let builder = JsonTreeBuilder::new(cfg.array_max_items, sampler_kind); let root_id = builder.push_default(); @@ -138,14 +143,6 @@ pub fn parse_jsonl_one( Ok(arena) } -/// Parse JSONL from a byte slice (for fileset use). -pub(crate) fn build_jsonl_tree_arena_from_slice( - bytes: &[u8], - cfg: &PriorityConfig, -) -> Result { - parse_jsonl_one(bytes, cfg) -} - /// Convenience functions for the JSON ingest path. pub fn parse_json_one( bytes: Vec, diff --git a/src/ingest/mod.rs b/src/ingest/mod.rs index fb9fec26..03e27105 100644 --- a/src/ingest/mod.rs +++ b/src/ingest/mod.rs @@ -4,6 +4,7 @@ use crate::order::PriorityConfig; use crate::utils::tree_arena::JsonTreeArena as TreeArena; use crate::InputKind; +use crate::grep::GrepConfig; pub mod fileset; pub mod format; @@ -25,41 +26,96 @@ pub(crate) struct IngestOutput { pub warnings: Vec, } +/// Return a copy of `cfg` with array sampling disabled when strong grep is +/// active. Non-JSONL formats need this to avoid sampling away matches; +/// JSONL handles it via `merge_required` in the sampler instead. +pub(crate) fn grep_adjusted_cfg( + cfg: &PriorityConfig, + grep: &GrepConfig, +) -> PriorityConfig { + if grep.has_strong() { + let mut c = *cfg; + c.array_max_items = usize::MAX; + c + } else { + *cfg + } +} + +/// Build a predicate that returns true for JSONL line indices matching the +/// strong grep pattern. When no grep is active, returns a no-op. +/// +/// Uses a single regex scan over the entire text and maps match positions +/// back to line indices, avoiding per-line regex overhead. +pub(crate) fn jsonl_grep_predicate( + bytes: &[u8], + grep: &GrepConfig, +) -> Box bool> { + let Some(re) = grep.patterns.strong() else { + return Box::new(|_| false); + }; + let Ok(text) = std::str::from_utf8(bytes) else { + return Box::new(|_| false); + }; + let offsets = formats::json::jsonl_line_offsets(text); + if offsets.is_empty() { + return Box::new(|_| false); + } + // Single regex pass: find all match positions and map to line indices. + let mut matching = vec![false; offsets.len()]; + for m in re.find_iter(text) { + let pos = m.start(); + // Binary search for the line containing this byte position. + let idx = offsets.partition_point(|&(start, _)| start <= pos); + if idx > 0 { + matching[idx - 1] = true; + } + } + Box::new(move |i: usize| matching.get(i).copied().unwrap_or(false)) +} + /// Dispatch the appropriate ingest path for any supported input kind. pub(crate) fn ingest_into_arena( input: InputKind, priority_cfg: &PriorityConfig, + grep: &GrepConfig, ) -> Result { match input { InputKind::Json(bytes) => { - parse_json_one(bytes, priority_cfg).map(|arena| IngestOutput { + let cfg = grep_adjusted_cfg(priority_cfg, grep); + parse_json_one(bytes, &cfg).map(|arena| IngestOutput { arena, warnings: Vec::new(), }) } InputKind::Jsonl(bytes) => { - parse_jsonl_one(&bytes, priority_cfg).map(|arena| IngestOutput { - arena, - warnings: Vec::new(), - }) + let must_include = jsonl_grep_predicate(&bytes, grep); + parse_jsonl_one(&bytes, priority_cfg, &*must_include).map( + |arena| IngestOutput { + arena, + warnings: Vec::new(), + }, + ) } InputKind::Yaml(bytes) => { - parse_yaml_one(&bytes, priority_cfg).map(|arena| IngestOutput { + let cfg = grep_adjusted_cfg(priority_cfg, grep); + parse_yaml_one(&bytes, &cfg).map(|arena| IngestOutput { arena, warnings: Vec::new(), }) } InputKind::Text { bytes, mode } => { + let cfg = grep_adjusted_cfg(priority_cfg, grep); let atomic = matches!(mode, crate::TextMode::CodeLike); - parse_text_one_with_mode(bytes, priority_cfg, atomic).map( - |arena| IngestOutput { + parse_text_one_with_mode(bytes, &cfg, atomic).map(|arena| { + IngestOutput { arena, warnings: Vec::new(), - }, - ) + } + }) } InputKind::Fileset(inputs) => { - Ok(fileset::parse_fileset_multi(inputs, priority_cfg)) + Ok(fileset::parse_fileset_multi(inputs, priority_cfg, grep)) } } } @@ -67,6 +123,7 @@ pub(crate) fn ingest_into_arena( #[cfg(test)] mod tests { use super::*; + use crate::grep::{GrepConfig, GrepPatterns, GrepShow}; use crate::order::NodeKind; #[test] @@ -108,6 +165,85 @@ mod tests { assert_eq!(arena.nodes[root].object_len.unwrap_or(0), 2); } + fn grep_with_strong(pattern: &str) -> GrepConfig { + GrepConfig { + patterns: GrepPatterns::StrongOnly( + regex::Regex::new(pattern).unwrap(), + ), + show: GrepShow::Matching, + } + } + + #[test] + fn jsonl_grep_predicate_marks_matching_lines() { + let input = b"{\"a\":1}\n{\"b\":2}\n{\"c\":3}\n"; + let grep = grep_with_strong("b"); + let pred = jsonl_grep_predicate(input, &grep); + assert!(!pred(0), "line 0 should not match"); + assert!(pred(1), "line 1 should match 'b'"); + assert!(!pred(2), "line 2 should not match"); + } + + #[test] + fn jsonl_grep_predicate_multiple_matches() { + let input = b"{\"x\":1}\n{\"x\":2}\n{\"y\":3}\n{\"x\":4}\n"; + let grep = grep_with_strong("x"); + let pred = jsonl_grep_predicate(input, &grep); + assert!(pred(0)); + assert!(pred(1)); + assert!(!pred(2)); + assert!(pred(3)); + } + + #[test] + fn jsonl_grep_predicate_no_strong_pattern_returns_noop() { + let input = b"{\"a\":1}\n{\"b\":2}\n"; + let grep = GrepConfig::default(); // no patterns + let pred = jsonl_grep_predicate(input, &grep); + assert!(!pred(0)); + assert!(!pred(1)); + } + + #[test] + fn jsonl_grep_predicate_skips_empty_lines() { + // Empty lines are excluded from offsets, so indices are dense + let input = b"{\"a\":1}\n\n{\"b\":2}\n"; + let grep = grep_with_strong("b"); + let pred = jsonl_grep_predicate(input, &grep); + // Only 2 non-empty lines: index 0 = {"a":1}, index 1 = {"b":2} + assert!(!pred(0)); + assert!(pred(1)); + } + + #[test] + fn jsonl_grep_predicate_match_on_first_line() { + let input = b"{\"needle\":true}\n{\"other\":false}\n"; + let grep = grep_with_strong("needle"); + let pred = jsonl_grep_predicate(input, &grep); + assert!(pred(0), "match on first line should work"); + assert!(!pred(1)); + } + + #[test] + fn jsonl_grep_predicate_match_on_last_line() { + let input = b"{\"a\":1}\n{\"needle\":true}"; + let grep = grep_with_strong("needle"); + let pred = jsonl_grep_predicate(input, &grep); + assert!(!pred(0)); + assert!( + pred(1), + "match on last line (no trailing newline) should work" + ); + } + + #[test] + fn jsonl_grep_predicate_out_of_bounds_returns_false() { + let input = b"{\"a\":1}\n{\"b\":2}\n"; + let grep = grep_with_strong("a"); + let pred = jsonl_grep_predicate(input, &grep); + assert!(!pred(99), "out of bounds index should return false"); + } + #[test] fn fileset_ingest_surfaces_parse_warnings() { let inputs = vec![fileset::FilesetInput { @@ -118,6 +254,7 @@ mod tests { let IngestOutput { arena, warnings } = ingest_into_arena( InputKind::Fileset(inputs), &PriorityConfig::new(usize::MAX, usize::MAX), + &GrepConfig::default(), ) .unwrap(); assert!(arena.is_fileset, "fileset input should mark arena"); diff --git a/src/ingest/sampling/mod.rs b/src/ingest/sampling/mod.rs index 8e22e07c..621c039d 100644 --- a/src/ingest/sampling/mod.rs +++ b/src/ingest/sampling/mod.rs @@ -120,6 +120,49 @@ pub fn choose_indices( } } +/// Merge required indices into an already-chosen set, preserving sorted order. +/// +/// Use this as a post-step after `choose_indices` when certain indices must +/// be unconditionally kept (e.g., JSONL lines matching a grep pattern). +#[allow( + clippy::cognitive_complexity, + reason = "Linear collect-and-merge logic reads clearest as a single function" +)] +pub fn merge_required( + sampled: Vec, + total: usize, + must_include: &impl Fn(usize) -> bool, +) -> Vec { + let mut seen = vec![false; total]; + for &i in &sampled { + seen[i] = true; + } + let mut extra: Vec = Vec::new(); + for (i, &already) in seen.iter().enumerate() { + if !already && must_include(i) { + extra.push(i); + } + } + if extra.is_empty() { + return sampled; + } + // Merge both sorted sequences + let mut result = Vec::with_capacity(sampled.len() + extra.len()); + let (mut si, mut ei) = (0, 0); + while si < sampled.len() && ei < extra.len() { + if sampled[si] <= extra[ei] { + result.push(sampled[si]); + si += 1; + } else { + result.push(extra[ei]); + ei += 1; + } + } + result.extend_from_slice(&sampled[si..]); + result.extend_from_slice(&extra[ei..]); + result +} + #[cfg(test)] mod tests { use super::*; @@ -139,4 +182,115 @@ mod tests { let indices = choose_indices_default(total, cap); assert!(indices.len() <= cap); } + + #[test] + fn merge_required_adds_missing_indices() { + let total = 20usize; + let cap = 3usize; + let sampled = choose_indices_default(total, cap); + // Force index 15 to be included even though cap is 3 + let indices = merge_required(sampled, total, &|i| i == 15); + assert!( + indices.contains(&15), + "must_include index should be present: {indices:?}" + ); + // Original sampled indices should still be present + assert!(indices.contains(&0), "head items should be present"); + } + + #[test] + fn merge_required_preserves_sorted_order() { + let total = 100usize; + let cap = 5usize; + let sampled = choose_indices_default(total, cap); + let indices = merge_required(sampled, total, &|i| i == 50 || i == 90); + for w in indices.windows(2) { + assert!(w[0] < w[1], "indices should be sorted: {indices:?}"); + } + assert!(indices.contains(&50)); + assert!(indices.contains(&90)); + } + + #[test] + fn merge_required_with_zero_cap() { + let total = 10usize; + let sampled = choose_indices_default(total, 0); + let indices = merge_required(sampled, total, &|i| i == 3 || i == 7); + assert_eq!(indices, vec![3, 7]); + } + + #[test] + fn merge_required_no_duplicates_when_already_sampled() { + let total = 10usize; + let cap = 10usize; + let sampled = choose_indices_default(total, cap); + // All indices already sampled; must_include shouldn't duplicate + let indices = merge_required(sampled, total, &|i| i == 0); + assert_eq!(indices, (0..total).collect::>()); + } + + #[test] + fn head_sampler_merge_includes_required_beyond_cap() { + let total = 20usize; + let cap = 3usize; + let sampled = choose_indices_head(total, cap); + // Head keeps 0,1,2 — force index 17 to also be included + let indices = merge_required(sampled, total, &|i| i == 17); + assert_eq!(&indices[..3], &[0, 1, 2]); + assert!( + indices.contains(&17), + "must_include index should be present: {indices:?}" + ); + for w in indices.windows(2) { + assert!(w[0] < w[1], "indices should be sorted: {indices:?}"); + } + } + + #[test] + fn head_sampler_merge_no_duplicates_when_already_sampled() { + let total = 10usize; + let cap = 5usize; + let sampled = choose_indices_head(total, cap); + // Index 2 is already in head range 0..5 + let indices = merge_required(sampled, total, &|i| i == 2); + assert_eq!(indices, (0..5).collect::>()); + } + + #[test] + fn tail_sampler_merge_includes_required_beyond_cap() { + let total = 20usize; + let cap = 3usize; + let sampled = choose_indices_tail(total, cap); + // Tail keeps 17,18,19 — force index 2 to also be included + let indices = merge_required(sampled, total, &|i| i == 2); + assert!(indices.contains(&2), "must_include index should be present"); + assert!(indices.contains(&17)); + assert_eq!(indices, vec![2, 17, 18, 19]); + } + + #[test] + fn tail_sampler_merge_no_duplicates_when_already_sampled() { + let total = 10usize; + let cap = 5usize; + let sampled = choose_indices_tail(total, cap); + // Index 7 is already in tail range 5..10 + let indices = merge_required(sampled, total, &|i| i == 7); + assert_eq!(indices, (5..10).collect::>()); + } + + #[test] + fn tail_sampler_merge_with_zero_cap_returns_only_required() { + let total = 10usize; + let sampled = choose_indices_tail(total, 0); + let indices = merge_required(sampled, total, &|i| i == 4 || i == 8); + assert_eq!(indices, vec![4, 8]); + } + + #[test] + fn head_sampler_merge_with_zero_cap_returns_only_required() { + let total = 10usize; + let sampled = choose_indices_head(total, 0); + let indices = merge_required(sampled, total, &|i| i == 4 || i == 8); + assert_eq!(indices, vec![4, 8]); + } } diff --git a/src/lib.rs b/src/lib.rs index d275174e..d087aa0b 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -75,14 +75,9 @@ pub fn headson( grep: &GrepConfig, budgets: Budgets, ) -> Result { - let mut prio = *priority_cfg; - if grep.has_strong() { - // Avoid sampling away potential matches in strong grep mode. - prio.array_max_items = usize::MAX; - } let crate::ingest::IngestOutput { arena, warnings } = - crate::ingest::ingest_into_arena(input, &prio)?; - let mut order_build = order::build_order(&arena, &prio)?; + crate::ingest::ingest_into_arena(input, priority_cfg, grep)?; + let mut order_build = order::build_order(&arena, priority_cfg)?; let out = find_largest_render_under_budgets( &mut order_build, config, diff --git a/src/serialization/tests.rs b/src/serialization/tests.rs index 8ec08964..5238cdea 100644 --- a/src/serialization/tests.rs +++ b/src/serialization/tests.rs @@ -560,6 +560,7 @@ fn fileset_tree_headers_free_keep_slot_stats_on_body_only() { }, ], &cfg_prio, + &crate::GrepConfig::default(), ) .arena; let order = build_order(&arena, &cfg_prio).unwrap(); @@ -644,6 +645,7 @@ fn fileset_tree_headers_free_scaffold_does_not_change_slot_stats() { }, ], &cfg_prio, + &crate::GrepConfig::default(), ) .arena; let order = build_order(&arena, &cfg_prio).unwrap(); @@ -734,6 +736,7 @@ fn fileset_sections_slot_stats_respect_header_budgeting() { }, ], &cfg_prio, + &crate::GrepConfig::default(), ) .arena; let order = build_order(&arena, &cfg_prio).unwrap(); @@ -826,6 +829,7 @@ fn slot_stats_match_render_for_code_and_text() { }, }], &cfg_prio, + &crate::GrepConfig::default(), ) .arena; let order = build_order(&arena, &cfg_prio).unwrap();