Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 27 additions & 15 deletions src/ingest/fileset.rs
Original file line number Diff line number Diff line change
@@ -1,17 +1,17 @@
use crate::grep::GrepConfig;
use crate::order::NodeKind;
use crate::utils::tree_arena::{JsonTreeArena, JsonTreeNode};

use super::IngestOutput;
use super::formats::{
json::{
build_json_tree_arena_from_slice, build_jsonl_tree_arena_from_slice,
},
json::build_json_tree_arena_from_slice,
text::{
build_text_tree_arena_from_bytes,
build_text_tree_arena_from_bytes_with_mode,
},
yaml::build_yaml_tree_arena_from_bytes,
};
use super::{grep_adjusted_cfg, jsonl_grep_predicate};
use crate::PriorityConfig;

/// Input descriptor for a single file in a multi-format fileset ingest.
Expand Down Expand Up @@ -41,7 +41,10 @@ pub enum FilesetInputKind {
pub fn parse_fileset_multi(
inputs: Vec<FilesetInput>,
cfg: &PriorityConfig,
grep: &GrepConfig,
) -> IngestOutput {
let non_jsonl_cfg = grep_adjusted_cfg(cfg, grep);

let mut entries: Vec<FilesetEntry> = Vec::with_capacity(inputs.len());
let mut warnings: Vec<String> = Vec::new();
for FilesetInput {
Expand All @@ -54,26 +57,35 @@ pub fn parse_fileset_multi(
FilesetInputKind::Json => parse_or_empty(
&name,
&mut bytes,
cfg,
&non_jsonl_cfg,
&mut warnings,
"JSON",
|bytes, cfg| build_json_tree_arena_from_slice(bytes, cfg),
),
FilesetInputKind::Jsonl => parse_or_empty(
&name,
&bytes,
cfg,
&mut warnings,
"JSONL",
|bytes, cfg| build_jsonl_tree_arena_from_slice(bytes, cfg),
|bytes, c| build_json_tree_arena_from_slice(bytes, c),
),
FilesetInputKind::Jsonl => {
let must_include = jsonl_grep_predicate(&bytes, grep);
parse_or_empty(
&name,
&bytes,
cfg,
&mut warnings,
"JSONL",
|bytes, c| {
crate::ingest::formats::json::parse_jsonl_one(
bytes,
c,
&*must_include,
)
},
)
}
FilesetInputKind::Yaml => parse_or_empty(
&name,
&bytes,
cfg,
&non_jsonl_cfg,
&mut warnings,
"YAML",
|bytes, cfg| build_yaml_tree_arena_from_bytes(bytes, cfg),
|bytes, c| build_yaml_tree_arena_from_bytes(bytes, c),
),
FilesetInputKind::Text { atomic_lines } => {
(parse_text_bytes(&bytes, cfg, atomic_lines), false)
Expand Down
23 changes: 10 additions & 13 deletions src/ingest/formats/json/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ pub(crate) fn build_json_tree_arena_from_many(
}

/// Collect (byte_start, 1-based line number) for every non-empty line.
fn jsonl_line_offsets(text: &str) -> Vec<(usize, usize)> {
pub(crate) fn jsonl_line_offsets(text: &str) -> Vec<(usize, usize)> {
let mut offsets = Vec::new();
let mut pos = 0usize;
for (line_idx, raw_line) in text.split('\n').enumerate() {
Expand All @@ -88,21 +88,26 @@ fn jsonl_line_offsets(text: &str) -> Vec<(usize, usize)> {
///
/// Lines are sampled using the same strategy as JSON arrays (controlled by
/// `PriorityConfig::array_max_items` and `array_sampler`), so only a subset
/// of lines is actually parsed for large inputs.
/// of lines is actually parsed for large inputs. When a `must_include`
/// predicate is provided, matching lines are always kept regardless of the
/// sampling cap.
pub fn parse_jsonl_one(
bytes: &[u8],
cfg: &PriorityConfig,
must_include: impl Fn(usize) -> bool,
) -> Result<TreeArena> {
use crate::ingest::sampling::{ArraySamplerKind, choose_indices};
use crate::ingest::sampling::{
ArraySamplerKind, choose_indices, merge_required,
};

let text = std::str::from_utf8(bytes)
.map_err(|e| anyhow::anyhow!("JSONL input is not valid UTF-8: {e}"))?;

let line_offsets = jsonl_line_offsets(text);
let total = line_offsets.len();
let sampler_kind: ArraySamplerKind = cfg.array_sampler.into();
let kept_indices =
choose_indices(sampler_kind, total, cfg.array_max_items);
let sampled = choose_indices(sampler_kind, total, cfg.array_max_items);
let kept_indices = merge_required(sampled, total, &must_include);

let builder = JsonTreeBuilder::new(cfg.array_max_items, sampler_kind);
let root_id = builder.push_default();
Expand Down Expand Up @@ -138,14 +143,6 @@ pub fn parse_jsonl_one(
Ok(arena)
}

/// Parse JSONL from a byte slice (for fileset use).
pub(crate) fn build_jsonl_tree_arena_from_slice(
bytes: &[u8],
cfg: &PriorityConfig,
) -> Result<TreeArena> {
parse_jsonl_one(bytes, cfg)
}

/// Convenience functions for the JSON ingest path.
pub fn parse_json_one(
bytes: Vec<u8>,
Expand Down
159 changes: 148 additions & 11 deletions src/ingest/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ use crate::order::PriorityConfig;
use crate::utils::tree_arena::JsonTreeArena as TreeArena;

use crate::InputKind;
use crate::grep::GrepConfig;

pub mod fileset;
pub mod format;
Expand All @@ -25,48 +26,104 @@ pub(crate) struct IngestOutput {
pub warnings: Vec<String>,
}

/// Produce the `PriorityConfig` to use for ingest given the grep settings.
///
/// The result is always an independent copy of `cfg`. When
/// `grep.has_strong()` reports true, the copy's `array_max_items` is raised
/// to `usize::MAX` so that array sampling cannot drop matching items;
/// otherwise the copy is returned unchanged.
///
/// This is meant for the non-JSONL formats. JSONL keeps its sampling cap and
/// relies on `merge_required` in the sampler to retain matches instead.
pub(crate) fn grep_adjusted_cfg(
    cfg: &PriorityConfig,
    grep: &GrepConfig,
) -> PriorityConfig {
    let mut adjusted = *cfg;
    if grep.has_strong() {
        // Lift the cap entirely rather than trying to guess a "big enough"
        // value.
        adjusted.array_max_items = usize::MAX;
    }
    adjusted
}

/// Build a predicate that returns true for JSONL line indices matching the
/// strong grep pattern.
///
/// Indices are positions in the list produced by
/// `formats::json::jsonl_line_offsets`, i.e. they count non-empty lines only.
/// The returned closure owns its lookup table, so it does not borrow from
/// `bytes` or `grep`.
///
/// A predicate that is false for every index is returned when:
/// - `grep` has no strong pattern,
/// - `bytes` is not valid UTF-8, or
/// - the input contains no non-empty lines.
///
/// Indices past the last non-empty line also yield false.
///
/// Uses a single regex scan over the entire text and maps match positions
/// back to line indices, avoiding per-line regex overhead. Because the scan
/// is over the whole text, one match may run across newlines (for example
/// with a negated character class); every non-empty line touched by the match
/// span is marked, not just the line where the match starts. Marking a line
/// too many is harmless for a "must keep" predicate, whereas missing one
/// would let a matching line be sampled away.
pub(crate) fn jsonl_grep_predicate(
    bytes: &[u8],
    grep: &GrepConfig,
) -> Box<dyn Fn(usize) -> bool> {
    let Some(re) = grep.patterns.strong() else {
        return Box::new(|_| false);
    };
    let Ok(text) = std::str::from_utf8(bytes) else {
        return Box::new(|_| false);
    };
    let offsets = formats::json::jsonl_line_offsets(text);
    if offsets.is_empty() {
        return Box::new(|_| false);
    }

    // Number of non-empty lines that start at or before `pos`. The line
    // containing `pos` is therefore at index `count - 1`; a count of zero
    // means `pos` lies before the first non-empty line.
    let lines_started_by = |pos: usize| {
        offsets.partition_point(|&(start, _)| start <= pos)
    };

    // Single regex pass: find all match spans and map them to line indices.
    let mut matching = vec![false; offsets.len()];
    for m in re.find_iter(text) {
        let first_byte = m.start();
        // Last byte covered by the match. An empty match covers nothing, so
        // fall back to its start position.
        let last_byte = m.end().saturating_sub(1).max(first_byte);

        let lo = lines_started_by(first_byte).saturating_sub(1);
        let hi = lines_started_by(last_byte);
        // `lo..hi` is empty when the whole match sits before the first
        // non-empty line, and a single index for a one-line match.
        if lo < hi {
            matching[lo..hi].fill(true);
        }
    }
    Box::new(move |i: usize| matching.get(i).copied().unwrap_or(false))
}

/// Dispatch the appropriate ingest path for any supported input kind.
pub(crate) fn ingest_into_arena(
input: InputKind,
priority_cfg: &PriorityConfig,
grep: &GrepConfig,
) -> Result<IngestOutput> {
match input {
InputKind::Json(bytes) => {
parse_json_one(bytes, priority_cfg).map(|arena| IngestOutput {
let cfg = grep_adjusted_cfg(priority_cfg, grep);
parse_json_one(bytes, &cfg).map(|arena| IngestOutput {
arena,
warnings: Vec::new(),
})
}
InputKind::Jsonl(bytes) => {
parse_jsonl_one(&bytes, priority_cfg).map(|arena| IngestOutput {
arena,
warnings: Vec::new(),
})
let must_include = jsonl_grep_predicate(&bytes, grep);
parse_jsonl_one(&bytes, priority_cfg, &*must_include).map(
|arena| IngestOutput {
arena,
warnings: Vec::new(),
},
)
}
InputKind::Yaml(bytes) => {
parse_yaml_one(&bytes, priority_cfg).map(|arena| IngestOutput {
let cfg = grep_adjusted_cfg(priority_cfg, grep);
parse_yaml_one(&bytes, &cfg).map(|arena| IngestOutput {
arena,
warnings: Vec::new(),
})
}
InputKind::Text { bytes, mode } => {
let cfg = grep_adjusted_cfg(priority_cfg, grep);
let atomic = matches!(mode, crate::TextMode::CodeLike);
parse_text_one_with_mode(bytes, priority_cfg, atomic).map(
|arena| IngestOutput {
parse_text_one_with_mode(bytes, &cfg, atomic).map(|arena| {
IngestOutput {
arena,
warnings: Vec::new(),
},
)
}
})
}
InputKind::Fileset(inputs) => {
Ok(fileset::parse_fileset_multi(inputs, priority_cfg))
Ok(fileset::parse_fileset_multi(inputs, priority_cfg, grep))
}
}
}

#[cfg(test)]
mod tests {
use super::*;
use crate::grep::{GrepConfig, GrepPatterns, GrepShow};
use crate::order::NodeKind;

#[test]
Expand Down Expand Up @@ -108,6 +165,85 @@ mod tests {
assert_eq!(arena.nodes[root].object_len.unwrap_or(0), 2);
}

/// Test helper: a `GrepConfig` whose patterns are `StrongOnly(pattern)` and
/// whose `show` mode is `Matching`.
///
/// Panics if `pattern` is not a valid regex.
fn grep_with_strong(pattern: &str) -> GrepConfig {
    let strong = regex::Regex::new(pattern).unwrap();
    GrepConfig {
        patterns: GrepPatterns::StrongOnly(strong),
        show: GrepShow::Matching,
    }
}

/// Only the line that contains the pattern is flagged; its neighbours are
/// not.
#[test]
fn jsonl_grep_predicate_marks_matching_lines() {
    let data = b"{\"a\":1}\n{\"b\":2}\n{\"c\":3}\n";
    let is_match = jsonl_grep_predicate(data, &grep_with_strong("b"));

    // (line index, expected result, failure message)
    let expectations = [
        (0, false, "line 0 should not match"),
        (1, true, "line 1 should match 'b'"),
        (2, false, "line 2 should not match"),
    ];
    for (line, expected, why) in expectations {
        assert!(is_match(line) == expected, "{why}");
    }
}

/// Several lines can match at once; each is flagged independently.
#[test]
fn jsonl_grep_predicate_multiple_matches() {
    // Lines 0, 1 and 3 contain "x"; line 2 only contains "y".
    let data = b"{\"x\":1}\n{\"x\":2}\n{\"y\":3}\n{\"x\":4}\n";
    let is_match = jsonl_grep_predicate(data, &grep_with_strong("x"));

    let flags = [is_match(0), is_match(1), is_match(2), is_match(3)];
    assert_eq!(flags, [true, true, false, true]);
}

/// Without a strong pattern the predicate never selects any line.
#[test]
fn jsonl_grep_predicate_no_strong_pattern_returns_noop() {
    // The default config is used here as the "no patterns" case.
    let no_patterns = GrepConfig::default();
    let data = b"{\"a\":1}\n{\"b\":2}\n";
    let is_match = jsonl_grep_predicate(data, &no_patterns);

    assert_eq!([is_match(0), is_match(1)], [false, false]);
}

/// Predicate indices count non-empty lines only, so a blank line in the
/// input does not shift the index of the lines after it.
#[test]
fn jsonl_grep_predicate_skips_empty_lines() {
    // Three physical lines, the middle one empty. That leaves two dense
    // indices: 0 = {"a":1}, 1 = {"b":2}.
    let data = b"{\"a\":1}\n\n{\"b\":2}\n";
    let is_match = jsonl_grep_predicate(data, &grep_with_strong("b"));

    assert_eq!([is_match(0), is_match(1)], [false, true]);
}

/// Boundary case: a match located on the very first line maps to index 0.
#[test]
fn jsonl_grep_predicate_match_on_first_line() {
    let data = b"{\"needle\":true}\n{\"other\":false}\n";
    let is_match = jsonl_grep_predicate(data, &grep_with_strong("needle"));

    let first = is_match(0);
    let second = is_match(1);
    assert!(first, "match on first line should work");
    assert!(!second);
}

/// Boundary case: the final line is still found when the input does not end
/// with a newline.
#[test]
fn jsonl_grep_predicate_match_on_last_line() {
    // Deliberately no trailing '\n' after the second record.
    let data = b"{\"a\":1}\n{\"needle\":true}";
    let is_match = jsonl_grep_predicate(data, &grep_with_strong("needle"));

    let first = is_match(0);
    let last = is_match(1);
    assert!(!first);
    assert!(last, "match on last line (no trailing newline) should work");
}

/// Asking about an index beyond the last line yields false instead of
/// panicking.
#[test]
fn jsonl_grep_predicate_out_of_bounds_returns_false() {
    let data = b"{\"a\":1}\n{\"b\":2}\n";
    let is_match = jsonl_grep_predicate(data, &grep_with_strong("a"));

    // Only two lines exist, so 99 is far past the end.
    let beyond_end = is_match(99);
    assert!(!beyond_end, "out of bounds index should return false");
}

#[test]
fn fileset_ingest_surfaces_parse_warnings() {
let inputs = vec![fileset::FilesetInput {
Expand All @@ -118,6 +254,7 @@ mod tests {
let IngestOutput { arena, warnings } = ingest_into_arena(
InputKind::Fileset(inputs),
&PriorityConfig::new(usize::MAX, usize::MAX),
&GrepConfig::default(),
)
.unwrap();
assert!(arena.is_fileset, "fileset input should mark arena");
Expand Down
Loading
Loading