diff --git a/CHANGELOG.md b/CHANGELOG.md index 44555fb5a..8515a9271 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,7 @@ ## Features +- Add `--json` flag for JSONL format output. ## Bugfixes diff --git a/Cargo.lock b/Cargo.lock index 6b2060293..46b6e3b3c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -78,6 +78,12 @@ dependencies = [ "once_cell", ] +[[package]] +name = "base64" +version = "0.22.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" + [[package]] name = "bitflags" version = "1.3.2" @@ -294,6 +300,7 @@ dependencies = [ "aho-corasick", "anyhow", "argmax", + "base64", "clap", "clap_complete", "crossbeam-channel", @@ -312,6 +319,7 @@ dependencies = [ "nu-ansi-term", "regex", "regex-syntax", + "serde_json", "tempfile", "test-case", "tikv-jemallocator", @@ -388,6 +396,12 @@ version = "1.70.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf" +[[package]] +name = "itoa" +version = "1.0.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c" + [[package]] name = "jiff" version = "0.2.16" @@ -621,6 +635,12 @@ dependencies = [ "windows-sys 0.61.2", ] +[[package]] +name = "ryu" +version = "1.0.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "28d3b2b1366ec20994f1fd18c3c594f05c5dd4bc44d8bb0c1c632c8d6829481f" + [[package]] name = "same-file" version = "1.0.6" @@ -659,6 +679,19 @@ dependencies = [ "syn", ] +[[package]] +name = "serde_json" +version = "1.0.145" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "402a6f66d8c709116cf22f558eab210f5a50187f702eb4d7e5ef38d9a7f1c79c" +dependencies = [ + "itoa", + "memchr", + "ryu", + "serde", + "serde_core", +] + [[package]] name = "shlex" version = "1.3.0" diff --git a/Cargo.toml 
b/Cargo.toml index 6d2631e8b..790da9f8d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -5,19 +5,13 @@ description = "fd is a simple, fast and user-friendly alternative to find." exclude = ["/benchmarks/*"] homepage = "https://github.com/sharkdp/fd" documentation = "https://docs.rs/fd-find" -keywords = [ - "search", - "find", - "file", - "filesystem", - "tool", -] +keywords = ["search", "find", "file", "filesystem", "tool"] license = "MIT OR Apache-2.0" name = "fd-find" readme = "README.md" repository = "https://github.com/sharkdp/fd" version = "10.3.0" -edition= "2024" +edition = "2024" rust-version = "1.90.0" [badges.appveyor] @@ -46,6 +40,7 @@ crossbeam-channel = "0.5.15" clap_complete = {version = "4.5.61", optional = true} faccess = "0.2.4" jiff = "0.2.16" +base64 = "0.22.1" [dependencies.clap] version = "4.5.53" @@ -57,7 +52,11 @@ default-features = false features = ["nu-ansi-term"] [target.'cfg(unix)'.dependencies] -nix = { version = "0.30.1", default-features = false, features = ["signal", "user", "hostname"] } +nix = { version = "0.30.1", default-features = false, features = [ + "signal", + "user", + "hostname", +] } [target.'cfg(all(unix, not(target_os = "redox")))'.dependencies] libc = "0.2" @@ -68,13 +67,14 @@ libc = "0.2" # This has to be kept in sync with src/main.rs where the allocator for # the program is set. 
[target.'cfg(all(not(windows), not(target_os = "android"), not(target_os = "macos"), not(target_os = "freebsd"), not(target_os = "openbsd"), not(target_os = "illumos"), not(all(target_env = "musl", target_pointer_width = "32")), not(target_arch = "riscv64")))'.dependencies] -tikv-jemallocator = {version = "0.6.0", optional = true} +tikv-jemallocator = { version = "0.6.0", optional = true } [dev-dependencies] diff = "0.1" tempfile = "3.23" filetime = "0.2" test-case = "3.3" +serde_json = "1.0.145" [profile.release] lto = true diff --git a/doc/fd.1 b/doc/fd.1 index df42b1724..cd72d5f16 100644 --- a/doc/fd.1 +++ b/doc/fd.1 @@ -510,6 +510,34 @@ Maximum number of arguments to pass to the command given with -X. If the number greater than the given size, the command given with -X is run again with remaining arguments. A batch size of zero means there is no limit (default), but note that batching might still happen due to OS restrictions on the maximum length of command lines. +.TP +.BI "\-\-json " +.RS +Specify JSONL (also known as NDJSON) format to use for the output. + +Output fields: + + - "path": An object containing the path of the file. When the path is valid UTF-8, it contains a single "text" field + containing the path as a string. Otherwise it contains a single "bytes" field containing the base64 encoded bytes of the + path. + + On Windows, this may use a lossy UTF-8 encoding, since there isn't an obvious way to encode the pathname. + + If a custom path separator is given, it is used in the "text" field, but not in the "bytes" field. + + - "type": The file type (e.g., "file", "directory", "symlink"). + + - "size_bytes": The file size in bytes. + + - "mode": The file permissions in octal (e.g., 644). + + - "modified": The last modification time in RFC3339 (ISO 8601) format (e.g., 2000-01-01T12:00:00Z). + + - "accessed": The last access time in RFC3339 format. + + - "created": The creation time in RFC3339 format. 
+.RE +.TP .SH PATTERN SYNTAX The regular expression syntax used by fd is documented here: diff --git a/src/cli.rs b/src/cli.rs index d5174689d..2cfb5e8a2 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -648,6 +648,16 @@ pub struct Opts { )] search_path: Vec, + /// Print results in JSONL format. + #[arg( + long, + value_name = "json", + help = "Print results in JSONL format so you can pipe it to tools.", + conflicts_with_all(&["format", "list_details"]), + long_help + )] + pub json: bool, + /// By default, relative paths are prefixed with './' when -x/--exec, /// -X/--exec-batch, or -0/--print0 are given, to reduce the risk of a /// path starting with '-' being treated as a command line option. Use diff --git a/src/config.rs b/src/config.rs index 9e18120c4..57a3fa792 100644 --- a/src/config.rs +++ b/src/config.rs @@ -1,6 +1,5 @@ use std::{path::PathBuf, sync::Arc, time::Duration}; -use lscolors::LsColors; use regex::bytes::RegexSet; use crate::exec::CommandSet; @@ -8,7 +7,7 @@ use crate::filetypes::FileTypes; #[cfg(unix)] use crate::filter::OwnerFilter; use crate::filter::{SizeFilter, TimeFilter}; -use crate::fmt::FormatTemplate; +use crate::fmt::OutputFormat; /// Configuration options for *fd*. pub struct Config { @@ -70,10 +69,6 @@ pub struct Config { /// `max_buffer_time`. pub max_buffer_time: Option, - /// `None` if the output should not be colorized. Otherwise, a `LsColors` instance that defines - /// how to style different filetypes. - pub ls_colors: Option, - /// Whether or not we are writing to an interactive terminal #[cfg_attr(not(unix), allow(unused))] pub interactive_terminal: bool, @@ -87,8 +82,10 @@ pub struct Config { /// The value (if present) will be a lowercase string without leading dots. 
pub extensions: Option, - /// A format string to use to format results, similarly to exec - pub format: Option, + /// The format to use for the output + /// + /// determined by multiple options + pub format: OutputFormat, /// If a value is supplied, each item found will be used to generate and execute commands. pub command: Option>,
diff --git a/src/fmt/json.rs b/src/fmt/json.rs
new file mode 100644
index 000000000..e51597005
--- /dev/null
+++ b/src/fmt/json.rs
@@ -0,0 +1,150 @@
+use std::borrow::Cow;
+use std::fs::{FileType, Metadata};
+use std::io::Write;
+#[cfg(unix)]
+use std::os::unix::{ffi::OsStrExt, fs::MetadataExt};
+use std::path::{MAIN_SEPARATOR, Path};
+use std::time::SystemTime;
+
+use base64::{Engine as _, prelude::BASE64_STANDARD};
+use jiff::Timestamp;
+
+/// Write a single search result to `out` as one JSON object (no trailing newline).
+///
+/// The object always contains `"path"` and `"type"`. When `metadata` is given it also
+/// contains `"mode"` (unix only), `"size_bytes"` and those of `"modified"`, `"accessed"`
+/// and `"created"` that can be read from it.
+///
+/// `"path"` is an object with either a `"text"` field (the path as a string, with the
+/// platform separator replaced by `path_separator` if one is given) or, on unix when the
+/// path is not valid UTF-8, a `"bytes"` field holding the base64 encoded bytes of the path.
+///
+/// # Errors
+///
+/// Returns any error reported while writing to `out`.
+pub fn output_json<W: Write>(
+    out: &mut W,
+    path: &Path,
+    filetype: Option<FileType>,
+    metadata: Option<&Metadata>,
+    path_separator: &Option<String>,
+) -> std::io::Result<()> {
+    out.write_all(b"{")?;
+
+    // Print the path as an object that either has a "text" key containing the
+    // utf8 path, or a "bytes" key with the base64 encoded bytes of the path
+    #[cfg(unix)]
+    match path.to_str() {
+        Some(text) => write_text_path(out, text, path_separator)?,
+        None => {
+            let encoded_bytes = BASE64_STANDARD.encode(path.as_os_str().as_bytes());
+            write!(out, r#""path":{{"bytes":"{}"}}"#, encoded_bytes)?;
+        }
+    };
+    // On non-unix platforms, if the path isn't valid utf-8,
+    // we don't know what kind of encoding was used, and
+    // as_encoded_bytes() isn't necessarily stable between rust versions
+    // so the best we can really do is a lossy string
+    #[cfg(not(unix))]
+    write_text_path(out, &path.to_string_lossy(), path_separator)?;
+
+    // print the type of file
+    let ft = match filetype {
+        Some(ft) if ft.is_dir() => "directory",
+        Some(ft) if ft.is_file() => "file",
+        Some(ft) if ft.is_symlink() => "symlink",
+        _ => "unknown",
+    };
+    write!(out, r#","type":"{}""#, ft)?;
+
+    if let Some(meta) = metadata {
+        // Output the mode as octal
+        // We also need to mask it to just include the permission
+        // bits and not the file type bits (that is handled by "type" above).
+        // The mask has to be the octal literal 0o7777: a hex 0x7777 would clear
+        // permission bits and let file type bits through.
+        #[cfg(unix)]
+        write!(out, r#","mode":"{:o}""#, meta.mode() & 0o7777)?;
+
+        write!(out, r#","size_bytes":{}"#, meta.len())?;
+
+        // A timestamp the platform cannot provide is left out of the object
+        write_timestamp(out, "modified", meta.modified())?;
+        write_timestamp(out, "accessed", meta.accessed())?;
+        write_timestamp(out, "created", meta.created())?;
+    }
+
+    out.write_all(b"}")
+}
+
+/// Write the `"path":{"text":...}` member, using the custom path separator if one is set.
+fn write_text_path<W: Write>(
+    out: &mut W,
+    text: &str,
+    path_separator: &Option<String>,
+) -> std::io::Result<()> {
+    let final_path: Cow<str> = match path_separator {
+        Some(sep) => text.replace(MAIN_SEPARATOR, sep).into(),
+        None => text.into(),
+    };
+    out.write_all(br#""path":{"text":"#)?;
+    write_json_string(out, &final_path)?;
+    out.write_all(b"}")
+}
+
+/// Write `text` as a JSON string literal, including the surrounding quotes.
+///
+/// Follows RFC 8259: `"`, `\` and the control characters U+0000 to U+001F are escaped,
+/// everything else is passed through as UTF-8. Rust's `Debug` output is not a substitute,
+/// since it produces escapes such as `\0` and `\u{1b}` that are not valid JSON.
+fn write_json_string<W: Write>(out: &mut W, text: &str) -> std::io::Result<()> {
+    let bytes = text.as_bytes();
+    // Start of the run of bytes that need no escaping and have not been written yet
+    let mut pending = 0;
+
+    out.write_all(b"\"")?;
+    for (i, &byte) in bytes.iter().enumerate() {
+        // Every byte that needs escaping is ASCII, so cutting the input at `i`
+        // never splits a multi-byte UTF-8 sequence.
+        let short_escape: Option<&[u8]> = match byte {
+            b'"' => Some(b"\\\""),
+            b'\\' => Some(b"\\\\"),
+            0x08 => Some(b"\\b"),
+            b'\t' => Some(b"\\t"),
+            b'\n' => Some(b"\\n"),
+            0x0c => Some(b"\\f"),
+            b'\r' => Some(b"\\r"),
+            0x00..=0x1f => None,
+            _ => continue,
+        };
+        out.write_all(&bytes[pending..i])?;
+        match short_escape {
+            Some(escape) => out.write_all(escape)?,
+            None => write!(out, "\\u{:04x}", byte)?,
+        }
+        pending = i + 1;
+    }
+    out.write_all(&bytes[pending..])?;
+    out.write_all(b"\"")
+}
+
+/// Write a `,"<key>":"<timestamp>"` member in RFC 3339 format, or nothing at all if
+/// `time` is an error or cannot be represented.
+fn write_timestamp<W: Write>(
+    out: &mut W,
+    key: &str,
+    time: std::io::Result<SystemTime>,
+) -> std::io::Result<()> {
+    if let Some(timestamp) = time.ok().and_then(json_timestamp) {
+        write!(out, r#","{}":"{}""#, key, timestamp)?;
+    }
+    Ok(())
+}
+
+/// Convert a `SystemTime` to a jiff `Timestamp`, or `None` if it is out of range.
+///
+/// The times come straight from the filesystem, so an unrepresentable value drops the
+/// field instead of panicking.
+fn json_timestamp(time: SystemTime) -> Option<Timestamp> {
+    Timestamp::try_from(time).ok()
+}
diff --git a/src/fmt/mod.rs b/src/fmt/mod.rs index 87ee41923..12ce3207c 100644 --- a/src/fmt/mod.rs +++ b/src/fmt/mod.rs @@ -1,281 +1,27 @@ mod input; - -use std::borrow::Cow; -use std::ffi::{OsStr, OsString}; -use std::fmt::{self, Display, Formatter}; -use std::path::{Component, Path, Prefix}; -use std::sync::OnceLock; - -use aho_corasick::AhoCorasick; - -use self::input::{basename, dirname, remove_extension}; - -/// Designates what should be written to a buffer -/// -/// Each `Token` contains either text, or a placeholder variant, which will be used to generate -/// commands after all tokens for a given command template have been collected. -#[derive(Clone, Debug, PartialEq, Eq)] -pub enum Token { - Placeholder, - Basename, - Parent, - NoExt, - BasenameNoExt, - Text(String), -} - -impl Display for Token { - fn fmt(&self, f: &mut Formatter) -> fmt::Result { - match *self { - Token::Placeholder => f.write_str("{}")?, - Token::Basename => f.write_str("{/}")?, - Token::Parent => f.write_str("{//}")?, - Token::NoExt => f.write_str("{.}")?, - Token::BasenameNoExt => f.write_str("{/.}")?, - Token::Text(ref string) => f.write_str(string)?, - } - Ok(()) - } -} - -/// A parsed format string -/// -/// This is either a collection of `Token`s including at least one placeholder variant, -/// or a fixed text. 
-#[derive(Clone, Debug, PartialEq)] -pub enum FormatTemplate { - Tokens(Vec), - Text(String), -} - -static PLACEHOLDERS: OnceLock = OnceLock::new(); - -impl FormatTemplate { - pub fn has_tokens(&self) -> bool { - matches!(self, FormatTemplate::Tokens(_)) - } - - pub fn parse(fmt: &str) -> Self { - // NOTE: we assume that { and } have the same length - const BRACE_LEN: usize = '{'.len_utf8(); - let mut tokens = Vec::new(); - let mut remaining = fmt; - let mut buf = String::new(); - let placeholders = PLACEHOLDERS.get_or_init(|| { - AhoCorasick::new(["{{", "}}", "{}", "{/}", "{//}", "{.}", "{/.}"]).unwrap() - }); - while let Some(m) = placeholders.find(remaining) { - match m.pattern().as_u32() { - 0 | 1 => { - // we found an escaped {{ or }}, so add - // everything up to the first char to the buffer - // then skip the second one. - buf += &remaining[..m.start() + BRACE_LEN]; - remaining = &remaining[m.end()..]; - } - id if !remaining[m.end()..].starts_with('}') => { - buf += &remaining[..m.start()]; - if !buf.is_empty() { - tokens.push(Token::Text(std::mem::take(&mut buf))); - } - tokens.push(token_from_pattern_id(id)); - remaining = &remaining[m.end()..]; - } - _ => { - // We got a normal pattern, but the final "}" - // is escaped, so add up to that to the buffer, then - // skip the final } - buf += &remaining[..m.end()]; - remaining = &remaining[m.end() + BRACE_LEN..]; - } - } - } - // Add the rest of the string to the buffer, and add the final buffer to the tokens - if !remaining.is_empty() { - buf += remaining; - } - if tokens.is_empty() { - // No placeholders were found, so just return the text - return FormatTemplate::Text(buf); - } - // Add final text segment - if !buf.is_empty() { - tokens.push(Token::Text(buf)); - } - debug_assert!(!tokens.is_empty()); - FormatTemplate::Tokens(tokens) - } - - /// Generate a result string from this template. If path_separator is Some, then it will replace - /// the path separator in all placeholder tokens. 
Fixed text and tokens are not affected by - /// path separator substitution. - pub fn generate(&self, path: impl AsRef, path_separator: Option<&str>) -> OsString { - use Token::*; - let path = path.as_ref(); - - match *self { - Self::Tokens(ref tokens) => { - let mut s = OsString::new(); - for token in tokens { - match token { - Basename => s.push(Self::replace_separator(basename(path), path_separator)), - BasenameNoExt => s.push(Self::replace_separator( - &remove_extension(basename(path).as_ref()), - path_separator, - )), - NoExt => s.push(Self::replace_separator( - &remove_extension(path), - path_separator, - )), - Parent => s.push(Self::replace_separator(&dirname(path), path_separator)), - Placeholder => { - s.push(Self::replace_separator(path.as_ref(), path_separator)) - } - Text(string) => s.push(string), - } - } - s - } - Self::Text(ref text) => OsString::from(text), - } - } - - /// Replace the path separator in the input with the custom separator string. If path_separator - /// is None, simply return a borrowed Cow of the input. Otherwise, the input is - /// interpreted as a Path and its components are iterated through and re-joined into a new - /// OsString. - fn replace_separator<'a>(path: &'a OsStr, path_separator: Option<&str>) -> Cow<'a, OsStr> { - // fast-path - no replacement necessary - if path_separator.is_none() { - return Cow::Borrowed(path); - } - - let path_separator = path_separator.unwrap(); - let mut out = OsString::with_capacity(path.len()); - let mut components = Path::new(path).components().peekable(); - - while let Some(comp) = components.next() { - match comp { - // Absolute paths on Windows are tricky. A Prefix component is usually a drive - // letter or UNC path, and is usually followed by RootDir. There are also - // "verbatim" prefixes beginning with "\\?\" that skip normalization. 
We choose to - // ignore verbatim path prefixes here because they're very rare, might be - // impossible to reach here, and there's no good way to deal with them. If users - // are doing something advanced involving verbatim windows paths, they can do their - // own output filtering with a tool like sed. - Component::Prefix(prefix) => { - if let Prefix::UNC(server, share) = prefix.kind() { - // Prefix::UNC is a parsed version of '\\server\share' - out.push(path_separator); - out.push(path_separator); - out.push(server); - out.push(path_separator); - out.push(share); - } else { - // All other Windows prefix types are rendered as-is. This results in e.g. "C:" for - // drive letters. DeviceNS and Verbatim* prefixes won't have backslashes converted, - // but they're not returned by directories fd can search anyway so we don't worry - // about them. - out.push(comp.as_os_str()); - } - } - - // Root directory is always replaced with the custom separator. - Component::RootDir => out.push(path_separator), - - // Everything else is joined normally, with a trailing separator if we're not last - _ => { - out.push(comp.as_os_str()); - if components.peek().is_some() { - out.push(path_separator); - } - } - } - } - Cow::Owned(out) - } -} - -// Convert the id from an aho-corasick match to the -// appropriate token -fn token_from_pattern_id(id: u32) -> Token { - use Token::*; - match id { - 2 => Placeholder, - 3 => Basename, - 4 => Parent, - 5 => NoExt, - 6 => BasenameNoExt, - _ => unreachable!(), - } +pub mod json; +mod template; + +use lscolors::LsColors; + +pub use self::template::{FormatTemplate, Token}; + +/// Description of how the results should be formatted in the output +pub enum OutputFormat { + /// Default. 
+ /// Output as a plain path + Plain, + /// Output the path with color highlighting + Color(LsColors), + /// Use a custom template to format the results + Template(FormatTemplate), + /// Output in the json lines (jsonl, newline separated values) format + Jsonl, } -#[cfg(test)] -mod fmt_tests { - use super::*; - use std::path::PathBuf; - - #[test] - fn parse_no_placeholders() { - let templ = FormatTemplate::parse("This string has no placeholders"); - assert_eq!( - templ, - FormatTemplate::Text("This string has no placeholders".into()) - ); - } - - #[test] - fn parse_only_brace_escapes() { - let templ = FormatTemplate::parse("This string only has escapes like {{ and }}"); - assert_eq!( - templ, - FormatTemplate::Text("This string only has escapes like { and }".into()) - ); - } - - #[test] - fn all_placeholders() { - use Token::*; - - let templ = FormatTemplate::parse( - "{{path={} \ - basename={/} \ - parent={//} \ - noExt={.} \ - basenameNoExt={/.} \ - }}", - ); - assert_eq!( - templ, - FormatTemplate::Tokens(vec![ - Text("{path=".into()), - Placeholder, - Text(" basename=".into()), - Basename, - Text(" parent=".into()), - Parent, - Text(" noExt=".into()), - NoExt, - Text(" basenameNoExt=".into()), - BasenameNoExt, - Text(" }".into()), - ]) - ); - - let mut path = PathBuf::new(); - path.push("a"); - path.push("folder"); - path.push("file.txt"); - - let expanded = templ.generate(&path, Some("/")).into_string().unwrap(); - - assert_eq!( - expanded, - "{path=a/folder/file.txt \ - basename=file.txt \ - parent=a/folder \ - noExt=a/folder/file \ - basenameNoExt=file }" - ); +impl OutputFormat { + /// Return true if the output format uses ANSI colors + pub fn uses_color(&self) -> bool { + matches!(self, OutputFormat::Color(_)) } } diff --git a/src/fmt/template.rs b/src/fmt/template.rs new file mode 100644 index 000000000..7394f27e0 --- /dev/null +++ b/src/fmt/template.rs @@ -0,0 +1,279 @@ +use std::borrow::Cow; +use std::ffi::{OsStr, OsString}; +use std::fmt::{self, 
Display, Formatter}; +use std::path::{Component, Path, Prefix}; +use std::sync::OnceLock; + +use aho_corasick::AhoCorasick; + +use super::input::{basename, dirname, remove_extension}; + +/// Designates what should be written to a buffer +/// +/// Each `Token` contains either text, or a placeholder variant, which will be used to generate +/// commands after all tokens for a given command template have been collected. +#[derive(Clone, Debug, PartialEq, Eq)] +pub enum Token { + Placeholder, + Basename, + Parent, + NoExt, + BasenameNoExt, + Text(String), +} + +impl Display for Token { + fn fmt(&self, f: &mut Formatter) -> fmt::Result { + match *self { + Token::Placeholder => f.write_str("{}")?, + Token::Basename => f.write_str("{/}")?, + Token::Parent => f.write_str("{//}")?, + Token::NoExt => f.write_str("{.}")?, + Token::BasenameNoExt => f.write_str("{/.}")?, + Token::Text(ref string) => f.write_str(string)?, + } + Ok(()) + } +} + +/// A parsed format string +/// +/// This is either a collection of `Token`s including at least one placeholder variant, +/// or a fixed text. +#[derive(Clone, Debug, PartialEq)] +pub enum FormatTemplate { + Tokens(Vec), + Text(String), +} + +static PLACEHOLDERS: OnceLock = OnceLock::new(); + +impl FormatTemplate { + pub fn has_tokens(&self) -> bool { + matches!(self, FormatTemplate::Tokens(_)) + } + + pub fn parse(fmt: &str) -> Self { + // NOTE: we assume that { and } have the same length + const BRACE_LEN: usize = '{'.len_utf8(); + let mut tokens = Vec::new(); + let mut remaining = fmt; + let mut buf = String::new(); + let placeholders = PLACEHOLDERS.get_or_init(|| { + AhoCorasick::new(["{{", "}}", "{}", "{/}", "{//}", "{.}", "{/.}"]).unwrap() + }); + while let Some(m) = placeholders.find(remaining) { + match m.pattern().as_u32() { + 0 | 1 => { + // we found an escaped {{ or }}, so add + // everything up to the first char to the buffer + // then skip the second one. 
+ buf += &remaining[..m.start() + BRACE_LEN]; + remaining = &remaining[m.end()..]; + } + id if !remaining[m.end()..].starts_with('}') => { + buf += &remaining[..m.start()]; + if !buf.is_empty() { + tokens.push(Token::Text(std::mem::take(&mut buf))); + } + tokens.push(token_from_pattern_id(id)); + remaining = &remaining[m.end()..]; + } + _ => { + // We got a normal pattern, but the final "}" + // is escaped, so add up to that to the buffer, then + // skip the final } + buf += &remaining[..m.end()]; + remaining = &remaining[m.end() + BRACE_LEN..]; + } + } + } + // Add the rest of the string to the buffer, and add the final buffer to the tokens + if !remaining.is_empty() { + buf += remaining; + } + if tokens.is_empty() { + // No placeholders were found, so just return the text + return FormatTemplate::Text(buf); + } + // Add final text segment + if !buf.is_empty() { + tokens.push(Token::Text(buf)); + } + debug_assert!(!tokens.is_empty()); + FormatTemplate::Tokens(tokens) + } + + /// Generate a result string from this template. If path_separator is Some, then it will replace + /// the path separator in all placeholder tokens. Fixed text and tokens are not affected by + /// path separator substitution. 
+ pub fn generate(&self, path: impl AsRef, path_separator: Option<&str>) -> OsString { + use Token::*; + let path = path.as_ref(); + + match *self { + Self::Tokens(ref tokens) => { + let mut s = OsString::new(); + for token in tokens { + match token { + Basename => s.push(Self::replace_separator(basename(path), path_separator)), + BasenameNoExt => s.push(Self::replace_separator( + &remove_extension(basename(path).as_ref()), + path_separator, + )), + NoExt => s.push(Self::replace_separator( + &remove_extension(path), + path_separator, + )), + Parent => s.push(Self::replace_separator(&dirname(path), path_separator)), + Placeholder => { + s.push(Self::replace_separator(path.as_ref(), path_separator)) + } + Text(string) => s.push(string), + } + } + s + } + Self::Text(ref text) => OsString::from(text), + } + } + + /// Replace the path separator in the input with the custom separator string. If path_separator + /// is None, simply return a borrowed Cow of the input. Otherwise, the input is + /// interpreted as a Path and its components are iterated through and re-joined into a new + /// OsString. + fn replace_separator<'a>(path: &'a OsStr, path_separator: Option<&str>) -> Cow<'a, OsStr> { + // fast-path - no replacement necessary + if path_separator.is_none() { + return Cow::Borrowed(path); + } + + let path_separator = path_separator.unwrap(); + let mut out = OsString::with_capacity(path.len()); + let mut components = Path::new(path).components().peekable(); + + while let Some(comp) = components.next() { + match comp { + // Absolute paths on Windows are tricky. A Prefix component is usually a drive + // letter or UNC path, and is usually followed by RootDir. There are also + // "verbatim" prefixes beginning with "\\?\" that skip normalization. We choose to + // ignore verbatim path prefixes here because they're very rare, might be + // impossible to reach here, and there's no good way to deal with them. 
If users + // are doing something advanced involving verbatim windows paths, they can do their + // own output filtering with a tool like sed. + Component::Prefix(prefix) => { + if let Prefix::UNC(server, share) = prefix.kind() { + // Prefix::UNC is a parsed version of '\\server\share' + out.push(path_separator); + out.push(path_separator); + out.push(server); + out.push(path_separator); + out.push(share); + } else { + // All other Windows prefix types are rendered as-is. This results in e.g. "C:" for + // drive letters. DeviceNS and Verbatim* prefixes won't have backslashes converted, + // but they're not returned by directories fd can search anyway so we don't worry + // about them. + out.push(comp.as_os_str()); + } + } + + // Root directory is always replaced with the custom separator. + Component::RootDir => out.push(path_separator), + + // Everything else is joined normally, with a trailing separator if we're not last + _ => { + out.push(comp.as_os_str()); + if components.peek().is_some() { + out.push(path_separator); + } + } + } + } + Cow::Owned(out) + } +} + +// Convert the id from an aho-corasick match to the +// appropriate token +fn token_from_pattern_id(id: u32) -> Token { + use Token::*; + match id { + 2 => Placeholder, + 3 => Basename, + 4 => Parent, + 5 => NoExt, + 6 => BasenameNoExt, + _ => unreachable!(), + } +} + +#[cfg(test)] +mod fmt_tests { + use super::*; + use std::path::PathBuf; + + #[test] + fn parse_no_placeholders() { + let templ = FormatTemplate::parse("This string has no placeholders"); + assert_eq!( + templ, + FormatTemplate::Text("This string has no placeholders".into()) + ); + } + + #[test] + fn parse_only_brace_escapes() { + let templ = FormatTemplate::parse("This string only has escapes like {{ and }}"); + assert_eq!( + templ, + FormatTemplate::Text("This string only has escapes like { and }".into()) + ); + } + + #[test] + fn all_placeholders() { + use Token::*; + + let templ = FormatTemplate::parse( + "{{path={} \ + basename={/} \ 
+ parent={//} \ + noExt={.} \ + basenameNoExt={/.} \ + }}", + ); + assert_eq!( + templ, + FormatTemplate::Tokens(vec![ + Text("{path=".into()), + Placeholder, + Text(" basename=".into()), + Basename, + Text(" parent=".into()), + Parent, + Text(" noExt=".into()), + NoExt, + Text(" basenameNoExt=".into()), + BasenameNoExt, + Text(" }".into()), + ]) + ); + + let mut path = PathBuf::new(); + path.push("a"); + path.push("folder"); + path.push("file.txt"); + + let expanded = templ.generate(&path, Some("/")).into_string().unwrap(); + + assert_eq!( + expanded, + "{path=a/folder/file.txt \ + basename=file.txt \ + parent=a/folder \ + noExt=a/folder/file \ + basenameNoExt=file }" + ); + } +} diff --git a/src/main.rs b/src/main.rs index fafb3b900..9e161cae5 100644 --- a/src/main.rs +++ b/src/main.rs @@ -32,6 +32,7 @@ use crate::filetypes::FileTypes; #[cfg(unix)] use crate::filter::OwnerFilter; use crate::filter::TimeFilter; +use crate::fmt::{FormatTemplate, OutputFormat}; use crate::regex_helper::{pattern_has_uppercase_char, pattern_matches_strings_with_leading_dot}; // We use jemalloc for performance reasons, see https://github.com/sharkdp/fd/pull/481 @@ -232,11 +233,18 @@ fn construct_config(mut opts: Opts, pattern_regexps: &[String]) -> Result true, HyperlinkWhen::Never => false, @@ -265,7 +273,6 @@ fn construct_config(mut opts: Opts, pattern_regexps: &[String]) -> Result Result String { path.replace(std::path::MAIN_SEPARATOR, new_path_separator) } -// TODO: this function is performance critical and can probably be optimized -pub fn print_entry(stdout: &mut W, entry: &DirEntry, config: &Config) -> io::Result<()> { - let mut has_hyperlink = false; - if config.hyperlink - && let Some(url) = PathUrl::new(entry.path()) - { - write!(stdout, "\x1B]8;;{url}\x1B\\")?; - has_hyperlink = true; - } +pub struct Printer<'a, W> { + config: &'a Config, + stdout: W, +} - if let Some(ref format) = config.format { - print_entry_format(stdout, entry, config, format)?; - } else if let Some(ref 
ls_colors) = config.ls_colors { - print_entry_colorized(stdout, entry, config, ls_colors)?; - } else { - print_entry_uncolorized(stdout, entry, config)?; - }; +impl<'a, W: Write> Printer<'a, W> { + pub fn new(config: &'a Config, stdout: W) -> Self { + Self { config, stdout } + } - if has_hyperlink { - write!(stdout, "\x1B]8;;\x1B\\")?; + pub fn flush(&mut self) -> io::Result<()> { + self.stdout.flush() } - if config.null_separator { - write!(stdout, "\0") - } else { - writeln!(stdout) + // TODO: this function is performance critical and can probably be optimized + pub fn print_entry(&mut self, entry: &DirEntry) -> io::Result<()> { + let mut has_hyperlink = false; + if self.config.hyperlink + && let Some(url) = PathUrl::new(entry.path()) + { + write!(self.stdout, "\x1B]8;;{url}\x1B\\")?; + has_hyperlink = true; + } + match &self.config.format { + Plain => self.print_entry_uncolorized(entry)?, + Color(colors) => self.print_entry_colorized(entry, colors)?, + Template(template) => self.print_entry_format(entry, template)?, + Jsonl => self.print_entry_json(entry)?, + }; + + if has_hyperlink { + write!(self.stdout, "\x1B]8;;\x1B\\")?; + } + + if self.config.null_separator { + write!(self.stdout, "\0") + } else { + writeln!(self.stdout) + } } -} -// Display a trailing slash if the path is a directory and the config option is enabled. -// If the path_separator option is set, display that instead. -// The trailing slash will not be colored. -#[inline] -fn print_trailing_slash( - stdout: &mut W, - entry: &DirEntry, - config: &Config, - style: Option<&Style>, -) -> io::Result<()> { - if entry.file_type().is_some_and(|ft| ft.is_dir()) { - write!( - stdout, - "{}", - style - .map(Style::to_nu_ansi_term_style) - .unwrap_or_default() - .paint(&config.actual_path_separator) - )?; + // Display a trailing slash if the path is a directory and the config option is enabled. + // If the path_separator option is set, display that instead. + // The trailing slash will not be colored. 
+ #[inline] + fn print_trailing_slash(&mut self, entry: &DirEntry, style: Option<&Style>) -> io::Result<()> { + if entry.file_type().is_some_and(|ft| ft.is_dir()) { + write!( + self.stdout, + "{}", + style + .map(Style::to_nu_ansi_term_style) + .unwrap_or_default() + .paint(&self.config.actual_path_separator) + )?; + } + Ok(()) } - Ok(()) -} -// TODO: this function is performance critical and can probably be optimized -fn print_entry_format( - stdout: &mut W, - entry: &DirEntry, - config: &Config, - format: &FormatTemplate, -) -> io::Result<()> { - let output = format.generate( - entry.stripped_path(config), - config.path_separator.as_deref(), - ); - // TODO: support writing raw bytes on unix? - write!(stdout, "{}", output.to_string_lossy()) -} + // TODO: this function is performance critical and can probably be optimized + fn print_entry_format(&mut self, entry: &DirEntry, format: &FormatTemplate) -> io::Result<()> { + let output = format.generate( + entry.stripped_path(self.config), + self.config.path_separator.as_deref(), + ); + // TODO: support writing raw bytes on unix? 
+ write!(self.stdout, "{}", output.to_string_lossy()) + } -// TODO: this function is performance critical and can probably be optimized -fn print_entry_colorized( - stdout: &mut W, - entry: &DirEntry, - config: &Config, - ls_colors: &LsColors, -) -> io::Result<()> { - // Split the path between the parent and the last component - let mut offset = 0; - let path = entry.stripped_path(config); - let path_str = path.to_string_lossy(); - - if let Some(parent) = path.parent() { - offset = parent.to_string_lossy().len(); - for c in path_str[offset..].chars() { - if std::path::is_separator(c) { - offset += c.len_utf8(); - } else { - break; + // TODO: this function is performance critical and can probably be optimized + fn print_entry_colorized(&mut self, entry: &DirEntry, ls_colors: &LsColors) -> io::Result<()> { + // Split the path between the parent and the last component + let mut offset = 0; + let path = entry.stripped_path(self.config); + let path_str = path.to_string_lossy(); + + if let Some(parent) = path.parent() { + offset = parent.to_string_lossy().len(); + for c in path_str[offset..].chars() { + if std::path::is_separator(c) { + offset += c.len_utf8(); + } else { + break; + } } } - } - if offset > 0 { - let mut parent_str = Cow::from(&path_str[..offset]); - if let Some(ref separator) = config.path_separator { - *parent_str.to_mut() = replace_path_separator(&parent_str, separator); + if offset > 0 { + let mut parent_str = Cow::from(&path_str[..offset]); + if let Some(ref separator) = self.config.path_separator { + *parent_str.to_mut() = replace_path_separator(&parent_str, separator); + } + + let style = ls_colors + .style_for_indicator(Indicator::Directory) + .map(Style::to_nu_ansi_term_style) + .unwrap_or_default(); + write!(self.stdout, "{}", style.paint(parent_str))?; } - let style = ls_colors - .style_for_indicator(Indicator::Directory) + let style = entry + .style(ls_colors) .map(Style::to_nu_ansi_term_style) .unwrap_or_default(); - write!(stdout, "{}", 
style.paint(parent_str))?; - } + write!(self.stdout, "{}", style.paint(&path_str[offset..]))?; - let style = entry - .style(ls_colors) - .map(Style::to_nu_ansi_term_style) - .unwrap_or_default(); - write!(stdout, "{}", style.paint(&path_str[offset..]))?; + self.print_trailing_slash(entry, ls_colors.style_for_indicator(Indicator::Directory))?; - print_trailing_slash( - stdout, - entry, - config, - ls_colors.style_for_indicator(Indicator::Directory), - )?; + Ok(()) + } - Ok(()) -} + // TODO: this function is performance critical and can probably be optimized + fn print_entry_uncolorized_base(&mut self, entry: &DirEntry) -> io::Result<()> { + let path = entry.stripped_path(self.config); + + let mut path_string = path.to_string_lossy(); + if let Some(ref separator) = self.config.path_separator { + *path_string.to_mut() = replace_path_separator(&path_string, separator); + } + write!(self.stdout, "{path_string}")?; + self.print_trailing_slash(entry, None) + } -// TODO: this function is performance critical and can probably be optimized -fn print_entry_uncolorized_base( - stdout: &mut W, - entry: &DirEntry, - config: &Config, -) -> io::Result<()> { - let path = entry.stripped_path(config); - - let mut path_string = path.to_string_lossy(); - if let Some(ref separator) = config.path_separator { - *path_string.to_mut() = replace_path_separator(&path_string, separator); + #[cfg(not(unix))] + fn print_entry_uncolorized(&mut self, entry: &DirEntry) -> io::Result<()> { + self.print_entry_uncolorized_base(entry) } - write!(stdout, "{path_string}")?; - print_trailing_slash(stdout, entry, config, None) -} -#[cfg(not(unix))] -fn print_entry_uncolorized( - stdout: &mut W, - entry: &DirEntry, - config: &Config, -) -> io::Result<()> { - print_entry_uncolorized_base(stdout, entry, config) -} + #[cfg(unix)] + fn print_entry_uncolorized(&mut self, entry: &DirEntry) -> io::Result<()> { + use std::os::unix::ffi::OsStrExt; + + if self.config.interactive_terminal || 
self.config.path_separator.is_some() { + // Fall back to the base implementation + self.print_entry_uncolorized_base(entry) + } else { + // Print path as raw bytes, allowing invalid UTF-8 filenames to be passed to other processes + self.stdout + .write_all(entry.stripped_path(self.config).as_os_str().as_bytes())?; + self.print_trailing_slash(entry, None) + } + } -#[cfg(unix)] -fn print_entry_uncolorized( - stdout: &mut W, - entry: &DirEntry, - config: &Config, -) -> io::Result<()> { - use std::os::unix::ffi::OsStrExt; - - if config.interactive_terminal || config.path_separator.is_some() { - // Fall back to the base implementation - print_entry_uncolorized_base(stdout, entry, config) - } else { - // Print path as raw bytes, allowing invalid UTF-8 filenames to be passed to other processes - stdout.write_all(entry.stripped_path(config).as_os_str().as_bytes())?; - print_trailing_slash(stdout, entry, config, None) + /// Print the entry as a jsonl line + fn print_entry_json(&mut self, entry: &DirEntry) -> io::Result<()> { + let path = entry.stripped_path(self.config); + // Should we have an option to avoid doing a stat call? + // Is it worth doing json output if all you have is the path and file type? + let metadata = entry.metadata(); + + crate::fmt::json::output_json( + &mut self.stdout, + path, + entry.file_type(), + metadata, + &self.config.path_separator, + )?; + Ok(()) } } diff --git a/src/walk.rs b/src/walk.rs index 27f295db5..587671362 100644 --- a/src/walk.rs +++ b/src/walk.rs @@ -21,6 +21,7 @@ use crate::error::print_error; use crate::exec; use crate::exit_codes::{ExitCode, merge_exitcodes}; use crate::filesystem; +use crate::fmt::OutputFormat; use crate::output; /// The receiver thread can either be buffering results or directly streaming to the console. @@ -136,8 +137,6 @@ struct ReceiverBuffer<'a, W> { interrupt_flag: &'a AtomicBool, /// Receiver for worker results. rx: Receiver, - /// Standard output. - stdout: W, /// The current buffer mode. 
mode: ReceiverMode, /// The deadline to switch to streaming mode. @@ -146,9 +145,11 @@ struct ReceiverBuffer<'a, W> { buffer: Vec, /// Result count. num_results: usize, + /// The stdout printer instance. + printer: output::Printer<'a, W>, } -impl<'a, W: Write> ReceiverBuffer<'a, W> { +impl<'a, W: Write + 'a> ReceiverBuffer<'a, W> { /// Create a new receiver buffer. fn new(state: &'a WorkerState, rx: Receiver, stdout: W) -> Self { let config = &state.config; @@ -162,20 +163,20 @@ impl<'a, W: Write> ReceiverBuffer<'a, W> { quit_flag, interrupt_flag, rx, - stdout, mode: ReceiverMode::Buffering, deadline, buffer: Vec::with_capacity(MAX_BUFFER_LENGTH), num_results: 0, + printer: output::Printer::new(config, stdout), } } /// Process results until finished. fn process(&mut self) -> ExitCode { loop { - if let Err(ec) = self.poll() { + if let Err(err) = self.poll() { self.quit_flag.store(true, Ordering::Relaxed); - return ec; + return err; } } } @@ -250,7 +251,7 @@ impl<'a, W: Write> ReceiverBuffer<'a, W> { /// Output a path. fn print(&mut self, entry: &DirEntry) -> Result<(), ExitCode> { - if let Err(e) = output::print_entry(&mut self.stdout, entry, self.config) + if let Err(e) = self.printer.print_entry(entry) && e.kind() != ::std::io::ErrorKind::BrokenPipe { print_error(format!("Could not write to output: {e}")); @@ -294,7 +295,7 @@ impl<'a, W: Write> ReceiverBuffer<'a, W> { /// Flush stdout if necessary. fn flush(&mut self) -> Result<(), ExitCode> { - if self.stdout.flush().is_err() { + if self.printer.flush().is_err() { // Probably a broken pipe. Exit gracefully. 
return Err(ExitCode::GeneralError); } @@ -597,7 +598,7 @@ impl WorkerState { } if config.is_printing() - && let Some(ls_colors) = &config.ls_colors + && let OutputFormat::Color(ls_colors) = &config.format { // Compute colors in parallel entry.style(ls_colors); @@ -624,7 +625,7 @@ impl WorkerState { let config = &self.config; let walker = self.build_walker(paths)?; - if config.ls_colors.is_some() && config.is_printing() { + if config.format.uses_color() && config.is_printing() { let quit_flag = Arc::clone(&self.quit_flag); let interrupt_flag = Arc::clone(&self.interrupt_flag); diff --git a/tests/tests.rs b/tests/tests.rs index e3d43b74b..bb342f9eb 100644 --- a/tests/tests.rs +++ b/tests/tests.rs @@ -2707,3 +2707,71 @@ fn test_hyperlink() { te.assert_output(&["--hyperlink=always", "a.foo"], &expected); } + +/// Test json output +#[test] +fn test_json() { + let te = TestEnv::new(DEFAULT_DIRS, DEFAULT_FILES); + + // We use path-separator=/ so that the paths are the same on windows as on + // unix + let re = te.assert_success_and_get_output(".", &["--json", "--path-separator=/", "foo"]); + let stdout = String::from_utf8_lossy(&re.stdout); + let found_files: std::collections::HashSet<_> = stdout + .split("\n") + .flat_map(|line| { + if line.is_empty() { + return None; + } + let file: serde_json::Value = serde_json::from_str(line).unwrap(); + assert!(file.is_object(), "Match is not object"); + assert!(file["path"].is_object(), "Path is not an object"); + Some( + file["path"]["text"] + .as_str() + .expect("path.text is not a string") + .to_owned(), + ) + }) + .collect(); + + let expected = [ + "a.foo", + "one/b.foo", + "one/two/c.foo", + "one/two/C.Foo2", + "one/two/three/directory_foo", + "one/two/three/d.foo", + ]; + + assert_eq!(found_files.len(), expected.len()); + for f in expected { + assert!(found_files.contains(f), "didn't find {f}"); + } +} + +/// Filenames with invalid UTF-8 sequences +#[cfg(target_os = "linux")] +#[test] +fn test_json_invalid_utf8() { + use 
std::ffi::OsStr; + use std::os::unix::ffi::OsStrExt; + + let dirs = &["test1"]; + let files = &[]; + let te = TestEnv::new(dirs, files); + + fs::File::create( + te.test_root() + .join(OsStr::from_bytes(b"test1/test_\xFEinvalid.txt")), + ) + .unwrap(); + + let re = te.assert_success_and_get_output(".", &["", "--json", "test1/"]); + let stdout = String::from_utf8_lossy(&re.stdout); + let files: serde_json::Value = serde_json::from_str(&stdout).unwrap(); + assert!(files.is_object()); + assert_eq!(files["path"]["bytes"], "dGVzdDEvdGVzdF/+aW52YWxpZC50eHQ="); + + te.assert_output(&["invalid", "test1/"], "test1/test_�invalid.txt"); +}