Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 10 additions & 1 deletion python/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -179,8 +179,17 @@ fn summarize(
)
.map(|out| out.text)
.map_err(to_pyerr),
"jsonl" | "ndjson" => headson_core::headson(
InputKind::Jsonl(input),
&cfg,
&prio,
&grep_cfg,
budgets,
)
.map(|out| out.text)
.map_err(to_pyerr),
other => Err(to_pyerr(anyhow::anyhow!(
"unknown input_format: {} (expected 'json' | 'yaml' | 'text')",
"unknown input_format: {} (expected 'json' | 'jsonl' | 'yaml' | 'text')",
other
))),
})
Expand Down
1 change: 1 addition & 0 deletions src/cli/args.rs
Original file line number Diff line number Diff line change
Expand Up @@ -269,6 +269,7 @@ pub enum StyleArg {
#[derive(Copy, Clone, Debug, ValueEnum)]
pub enum InputFormat {
// NOTE: variants carry plain `//` comments on purpose. clap's `ValueEnum`
// derive turns `///` doc comments on variants into CLI help text, which
// would change user-visible output.
// JSON input.
Json,
// Newline-delimited JSON; single inputs are routed to
// `headson::InputKind::Jsonl`.
Jsonl,
// YAML input.
Yaml,
// Plain text; also what extension-based detection falls back to when the
// file extension is not recognized.
Text,
}
Expand Down
55 changes: 30 additions & 25 deletions src/cli/run.rs
Original file line number Diff line number Diff line change
Expand Up @@ -72,13 +72,13 @@ pub(crate) fn run(cli: &Cli) -> Result<(String, CliWarnings)> {
}

fn detect_fileset_input_kind(name: &str) -> headson::FilesetInputKind {
let lower = name.to_ascii_lowercase();
if lower.ends_with(".yaml") || lower.ends_with(".yml") {
headson::FilesetInputKind::Yaml
} else if lower.ends_with(".json") {
headson::FilesetInputKind::Json
} else {
fileset_text_kind(&lower)
match headson::Format::from_filename(name) {
headson::Format::Json => headson::FilesetInputKind::Json,
headson::Format::Jsonl => headson::FilesetInputKind::Jsonl,
headson::Format::Yaml => headson::FilesetInputKind::Yaml,
headson::Format::Unknown => {
fileset_text_kind(&name.to_ascii_lowercase())
}
}
}

Expand Down Expand Up @@ -484,6 +484,13 @@ fn render_single_input(
grep_cfg,
budgets,
),
InputFormat::Jsonl => headson::headson(
headson::InputKind::Jsonl(bytes),
cfg,
prio,
grep_cfg,
budgets,
),
InputFormat::Text => headson::headson(
headson::InputKind::Text {
bytes,
Expand Down Expand Up @@ -523,7 +530,10 @@ fn resolve_effective_template_for_single(
OutputFormat::Auto => {
if lower_name.ends_with(".yaml") || lower_name.ends_with(".yml") {
headson::OutputTemplate::Yaml
} else if lower_name.ends_with(".json") {
} else if lower_name.ends_with(".jsonl")
|| lower_name.ends_with(".ndjson")
|| lower_name.ends_with(".json")
{
headson::map_json_template_for_style(style)
} else {
// Unknown extension: prefer text template.
Expand Down Expand Up @@ -639,25 +649,20 @@ fn build_single_render_config(
cfg
}

/// Map a file name to the CLI-level input format using its extension.
///
/// Whatever `headson::Format::from_filename` reports as `Unknown` is treated
/// as plain text.
fn detect_input_format_from_ext(name: &str) -> InputFormat {
    let detected = headson::Format::from_filename(name);
    match detected {
        headson::Format::Unknown => InputFormat::Text,
        headson::Format::Yaml => InputFormat::Yaml,
        headson::Format::Jsonl => InputFormat::Jsonl,
        headson::Format::Json => InputFormat::Json,
    }
}

fn select_input_format(cli: &Cli, lower_name: &str) -> InputFormat {
let is_yaml_ext =
lower_name.ends_with(".yaml") || lower_name.ends_with(".yml");
match cli.format {
OutputFormat::Auto => {
if let Some(fmt) = cli.input_format {
fmt
} else if is_yaml_ext {
InputFormat::Yaml
} else if lower_name.ends_with(".json") {
InputFormat::Json
} else {
InputFormat::Text
}
}
OutputFormat::Json => cli.input_format.unwrap_or(InputFormat::Json),
OutputFormat::Yaml => cli.input_format.unwrap_or(InputFormat::Yaml),
OutputFormat::Text => cli.input_format.unwrap_or(InputFormat::Text),
if let Some(fmt) = cli.input_format {
return fmt;
}
detect_input_format_from_ext(lower_name)
}

#[cfg(test)]
Expand Down
13 changes: 12 additions & 1 deletion src/ingest/fileset.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,9 @@ use crate::utils::tree_arena::{JsonTreeArena, JsonTreeNode};

use super::IngestOutput;
use super::formats::{
json::build_json_tree_arena_from_slice,
json::{
build_json_tree_arena_from_slice, build_jsonl_tree_arena_from_slice,
},
text::{
build_text_tree_arena_from_bytes,
build_text_tree_arena_from_bytes_with_mode,
Expand All @@ -30,6 +32,7 @@ pub(crate) struct FilesetEntry {
#[derive(Debug, Copy, Clone, Eq, PartialEq)]
pub enum FilesetInputKind {
/// JSON entry; parsed with `build_json_tree_arena_from_slice`.
Json,
/// Newline-delimited JSON entry; parsed with
/// `build_jsonl_tree_arena_from_slice`.
Jsonl,
/// YAML entry.
Yaml,
/// Text entry.
/// NOTE(review): the exact effect of `atomic_lines` is not visible in this
/// hunk; confirm against the text ingest path.
Text { atomic_lines: bool },
}
Expand All @@ -56,6 +59,14 @@ pub fn parse_fileset_multi(
"JSON",
|bytes, cfg| build_json_tree_arena_from_slice(bytes, cfg),
),
FilesetInputKind::Jsonl => parse_or_empty(
&name,
&bytes,
cfg,
&mut warnings,
"JSONL",
|bytes, cfg| build_jsonl_tree_arena_from_slice(bytes, cfg),
),
FilesetInputKind::Yaml => parse_or_empty(
&name,
&bytes,
Expand Down
6 changes: 6 additions & 0 deletions src/ingest/format.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
pub enum Format {
/// `.json` files (extension matched case-insensitively, e.g. `d.JSON`).
Json,
/// Newline-delimited JSON: `.jsonl` and `.ndjson` files.
Jsonl,
/// `.yaml` and `.yml` files.
Yaml,
/// No recognized extension (e.g. `noext`, `weird.tar.gz`).
Unknown,
}
Expand All @@ -17,6 +18,8 @@ impl Format {
use std::path::Path;
const EXT_FORMATS: &[(&str, Format)] = &[
("json", Format::Json),
("jsonl", Format::Jsonl),
("ndjson", Format::Jsonl),
("yaml", Format::Yaml),
("yml", Format::Yaml),
];
Expand Down Expand Up @@ -47,6 +50,9 @@ mod tests {
assert_eq!(Format::from_filename("c.yml"), Format::Yaml);
assert_eq!(Format::from_filename("d.JSON"), Format::Json);
assert_eq!(Format::from_filename("e.YmL"), Format::Yaml);
assert_eq!(Format::from_filename("f.jsonl"), Format::Jsonl);
assert_eq!(Format::from_filename("g.ndjson"), Format::Jsonl);
assert_eq!(Format::from_filename("h.JSONL"), Format::Jsonl);
assert_eq!(Format::from_filename("noext"), Format::Unknown);
assert_eq!(Format::from_filename("weird.tar.gz"), Format::Unknown);
}
Expand Down
4 changes: 2 additions & 2 deletions src/ingest/formats/json/builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ impl JsonTreeBuilder {
id
}

fn push_default(&self) -> usize {
pub(crate) fn push_default(&self) -> usize {
let mut a = self.arena.borrow_mut();
let id = a.nodes.len();
a.nodes.push(JsonTreeNode::default());
Expand Down Expand Up @@ -90,7 +90,7 @@ impl JsonTreeBuilder {
})
}

fn finish_array(
pub(crate) fn finish_array(
&self,
id: usize,
kept: usize,
Expand Down
56 changes: 56 additions & 0 deletions src/ingest/formats/json/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,62 @@ pub(crate) fn build_json_tree_arena_from_many(
Ok(arena)
}

/// Build a tree arena from JSONL (newline-delimited JSON) input.
///
/// Every line that is not blank is parsed as a standalone JSON value and
/// becomes one child of a synthetic root array. Each child is paired with the
/// 1-based number of the source line it came from, so skipped blank lines
/// leave gaps in that numbering. The root node is flagged with
/// `is_jsonl_root = true`.
///
/// # Errors
///
/// Fails if `bytes` is not valid UTF-8, or if any non-blank line cannot be
/// parsed as JSON; in the latter case the message names the 1-based line.
pub fn parse_jsonl_one(
    bytes: &[u8],
    cfg: &PriorityConfig,
) -> Result<TreeArena> {
    let source = std::str::from_utf8(bytes)
        .map_err(|e| anyhow::anyhow!("JSONL input is not valid UTF-8: {e}"))?;
    let builder =
        JsonTreeBuilder::new(cfg.array_max_items, cfg.array_sampler.into());
    // The root slot is allocated before any line is parsed.
    let root_id = builder.push_default();

    let mut records: Vec<usize> = Vec::new();
    let mut source_lines: Vec<usize> = Vec::new();
    // Number lines from 1 over *all* lines, so blank ones still count.
    for (line_no, raw) in (1usize..).zip(source.lines()) {
        let is_blank = raw.trim().is_empty();
        if !is_blank {
            // simd-json parses in place and therefore needs a mutable copy.
            let mut scratch = raw.as_bytes().to_vec();
            let mut de = simd_json::Deserializer::from_slice(&mut scratch)
                .map_err(|e| {
                    anyhow::anyhow!("JSONL line {}: {}", line_no, e)
                })?;
            let seed = builder.seed();
            let record_id: usize =
                seed.deserialize(&mut de).map_err(|e| {
                    anyhow::anyhow!("JSONL line {}: {}", line_no, e)
                })?;
            records.push(record_id);
            source_lines.push(line_no);
        }
    }

    // Every parsed line is kept, so the same count is passed twice.
    let kept = records.len();
    builder.finish_array(root_id, kept, kept, records, source_lines);

    let mut arena = builder.finish();
    arena.root_id = root_id;
    if let Some(root) = arena.nodes.get_mut(root_id) {
        root.array_len = Some(kept);
        root.is_jsonl_root = true;
    }

    Ok(arena)
}

/// Fileset-facing entry point for JSONL input.
///
/// Forwards `bytes` and `cfg` to `parse_jsonl_one` unchanged and returns its
/// result.
pub(crate) fn build_jsonl_tree_arena_from_slice(
    bytes: &[u8],
    cfg: &PriorityConfig,
) -> Result<TreeArena> {
    let arena = parse_jsonl_one(bytes, cfg)?;
    Ok(arena)
}

/// Convenience functions for the JSON ingest path.
pub fn parse_json_one(
bytes: Vec<u8>,
Expand Down
1 change: 1 addition & 0 deletions src/ingest/formats/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,5 +5,6 @@ pub mod yaml;

// Re-export commonly used helpers for convenience
pub use json::parse_json_one;
pub use json::parse_jsonl_one;
pub use text::parse_text_one_with_mode;
pub use yaml::parse_yaml_one;
10 changes: 9 additions & 1 deletion src/ingest/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,9 @@ pub mod sampling;
unused_imports,
reason = "Re-exported helpers need to stay public even when unused internally"
)]
pub use formats::{parse_json_one, parse_text_one_with_mode, parse_yaml_one};
pub use formats::{
parse_json_one, parse_jsonl_one, parse_text_one_with_mode, parse_yaml_one,
};

#[derive(Debug)]
pub(crate) struct IngestOutput {
Expand All @@ -35,6 +37,12 @@ pub(crate) fn ingest_into_arena(
warnings: Vec::new(),
})
}
InputKind::Jsonl(bytes) => {
parse_jsonl_one(&bytes, priority_cfg).map(|arena| IngestOutput {
arena,
warnings: Vec::new(),
})
}
InputKind::Yaml(bytes) => {
parse_yaml_one(&bytes, priority_cfg).map(|arena| IngestOutput {
arena,
Expand Down
2 changes: 2 additions & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ pub use grep::{
build_grep_config_from_patterns, combine_patterns,
};
pub use ingest::fileset::{FilesetInput, FilesetInputKind};
pub use ingest::format::Format;
pub use order::types::{ArrayBias, ArraySamplerStrategy};
pub use order::{
NodeId, NodeKind, PriorityConfig, PriorityOrder, RankedNode, build_order,
Expand Down Expand Up @@ -61,6 +62,7 @@ pub enum TextMode {

pub enum InputKind {
Json(Vec<u8>),
Jsonl(Vec<u8>),
Yaml(Vec<u8>),
Text { bytes: Vec<u8>, mode: TextMode },
Fileset(Vec<FilesetInput>),
Expand Down
7 changes: 7 additions & 0 deletions src/order/build.rs
Original file line number Diff line number Diff line change
Expand Up @@ -844,6 +844,13 @@ pub fn build_order(
code_lines.insert(*pq_id, Arc::clone(lines));
}
}
for (arena_idx, node) in arena.nodes.iter().enumerate() {
if node.is_jsonl_root {
if let Some(Some(pq_id)) = arena_to_pq.get(arena_idx) {
object_type[*pq_id] = ObjectType::JsonlRoot;
}
}
}
Ok(PriorityOrder {
metrics,
nodes,
Expand Down
1 change: 1 addition & 0 deletions src/order/types.rs
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ pub enum NodeKind {
pub enum ObjectType {
/// NOTE(review): presumably an ordinary object node; confirm against
/// `build_order`, which is only partly visible here.
Object,
/// NOTE(review): presumably the synthetic node grouping fileset entries;
/// confirm against the fileset ingest/render code.
Fileset,
/// Root array of a JSONL input. `build_order` assigns this to nodes whose
/// arena node has `is_jsonl_root` set, and the render engine then renders
/// the children at the root's own depth instead of one level deeper.
JsonlRoot,
}

#[derive(Copy, Clone, Debug, Eq, PartialEq, Hash)]
Expand Down
19 changes: 16 additions & 3 deletions src/serialization/engine.rs
Original file line number Diff line number Diff line change
Expand Up @@ -85,10 +85,13 @@ impl<'a> RenderEngine<'a> {
out: &mut Out<'_>,
) {
let config = self.config;
let is_jsonl_root = self.order.object_type.get(id)
== Some(&crate::order::types::ObjectType::JsonlRoot);
let (children_pairs, kept) = self.gather_array_children_with_template(
id,
depth,
config.template,
is_jsonl_root,
);
let omitted = self.leaf.omitted_for(id, kept).unwrap_or(0);
let ctx = ArrayCtx {
Expand All @@ -100,6 +103,7 @@ impl<'a> RenderEngine<'a> {
omitted_at_start: config.prefer_tail_arrays,
source_hint: self.leaf.source_hint(id),
code_highlight: self.leaf.code_highlights_for(id, config.template),
is_jsonl_root,
};
render_array(config.template, &ctx, out)
}
Expand Down Expand Up @@ -193,7 +197,9 @@ impl<'a> RenderEngine<'a> {
id: usize,
depth: usize,
template: crate::serialization::types::OutputTemplate,
is_jsonl_root: bool,
) -> (Vec<ArrayChildPair>, usize) {
let child_depth = if is_jsonl_root { depth } else { depth + 1 };
let Some(children_ids) = self.order.children.get(id) else {
return (Vec::new(), 0);
};
Expand All @@ -207,7 +213,7 @@ impl<'a> RenderEngine<'a> {
let child_kind = self.order.nodes[child_id.0].display_kind();
let rendered = self.render_node_to_string_with_template(
child_id.0,
depth + 1,
child_depth,
false,
template,
);
Expand Down Expand Up @@ -267,8 +273,14 @@ impl<'a> RenderEngine<'a> {
template: crate::serialization::types::OutputTemplate,
) {
let config = self.config;
let (children_pairs, kept) =
self.gather_array_children_with_template(id, depth, template);
let is_jsonl_root = self.order.object_type.get(id)
== Some(&crate::order::types::ObjectType::JsonlRoot);
let (children_pairs, kept) = self.gather_array_children_with_template(
id,
depth,
template,
is_jsonl_root,
);
let omitted = self.leaf.omitted_for(id, kept).unwrap_or(0);
let ctx = ArrayCtx {
children: children_pairs,
Expand All @@ -279,6 +291,7 @@ impl<'a> RenderEngine<'a> {
omitted_at_start: config.prefer_tail_arrays,
source_hint: self.leaf.source_hint(id),
code_highlight: self.leaf.code_highlights_for(id, template),
is_jsonl_root,
};
render_array(template, &ctx, out)
}
Expand Down
2 changes: 1 addition & 1 deletion src/serialization/fileset.rs
Original file line number Diff line number Diff line change
Expand Up @@ -264,7 +264,7 @@ impl<'a> RenderEngine<'a> {
fn fileset_template_for(&self, raw_key: &str) -> OutputTemplate {
match Format::from_filename(raw_key) {
Format::Yaml => OutputTemplate::Yaml,
Format::Json => match self.config.style {
Format::Json | Format::Jsonl => match self.config.style {
crate::serialization::types::Style::Strict => {
OutputTemplate::Json
}
Expand Down
Loading
Loading