diff --git a/crates/tokf-cli/src/install_cmd.rs b/crates/tokf-cli/src/install_cmd.rs
index 8ccee572..16130e88 100644
--- a/crates/tokf-cli/src/install_cmd.rs
+++ b/crates/tokf-cli/src/install_cmd.rs
@@ -40,14 +40,23 @@ fn install(
 ) -> anyhow::Result<()> {
     let client = Client::authed()?;
 
-    let (hash, author) = resolve_hash(&client, filter)?;
-    let downloaded = filter_client::download_filter(&client, &hash)?;
+    let (url_hash, author) = resolve_hash(&client, filter)?;
+    let downloaded = filter_client::download_filter(&client, &url_hash)?;
 
     // Parse TOML once; derive command pattern and detect Lua in a single pass.
     let (command_pattern, config) = parse_filter_toml(&downloaded.filter_toml)?;
 
-    // Verify the downloaded content matches the requested hash (tamper detection).
-    verify_content_hash(&hash, &config)?;
+    // The hash actually used for attribution. Preference order:
+    // 1. Server-provided `v1_hash` (schema-independent; ADR-0002).
+    // 2. Server-provided `content_hash` (schema-tied recompute; #351).
+    // 3. URL hash (legacy fallback for old servers; #350 scaffolding).
+    let hash = verify_and_resolve_hash(
+        &url_hash,
+        downloaded.v1_hash.as_deref(),
+        downloaded.content_hash.as_deref(),
+        &config,
+        &downloaded.filter_toml,
+    )?;
 
     let install_base = resolve_install_base(local)?;
     let rel_path = command_pattern_to_path(&command_pattern);
@@ -117,21 +126,87 @@ fn parse_filter_toml(toml_str: &str) -> anyhow::Result<(String, FilterConfig)> {
     Ok((pattern, config))
 }
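The resolution order is easiest to see stripped of its verification side effects. A minimal, dependency-free sketch of the trust ladder (illustrative only; `preferred` is a hypothetical name, not part of this diff):

```rust
// Which hash wins, by server generation. The recompute-and-compare
// verification that the real function performs is omitted here; only the
// preference order is modelled.
fn preferred<'a>(v1: Option<&'a str>, content: Option<&'a str>, url: &'a str) -> &'a str {
    v1.or(content).unwrap_or(url)
}

fn main() {
    assert_eq!(preferred(Some("v1:aa"), Some("bb"), "cc"), "v1:aa"); // ADR-0002 server
    assert_eq!(preferred(None, Some("bb"), "cc"), "bb"); // #351 server
    assert_eq!(preferred(None, None, "cc"), "cc"); // pre-#351 server
}
```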
-/// Verify the downloaded filter matches the expected content hash (tamper detection).
+/// Verify the downloaded filter and resolve the authoritative content hash.
+///
+/// Preference order, highest to lowest trust:
+///
+/// 1. **`server_v1_hash`** — schema-independent canonical TOML hash
+///    (ADR-0002). When both client and server speak v1, this is the
+///    long-term identity. Verified by recomputing v1 locally and
+///    matching.
+/// 2. **`server_content_hash`** — server's recomputed `canonical_hash`
+///    (#351). Schema-tied; trusted because the server is the wire-tamper
+///    boundary, but susceptible to drift across `FilterConfig` schema
+///    changes. Verified by recomputing locally.
+/// 3. **URL hash** — legacy fallback for servers that emit neither
+///    `v1_hash` nor `content_hash`. Equivalent to the pre-#351 behaviour;
+///    fails for filters whose URL hash is from a different schema epoch
+///    than the client knows about.
+///
+/// Returns the hash to use for attribution and display, preferring v1.
 ///
 /// # Errors
 ///
-/// Returns an error if the computed hash differs from the expected hash.
-fn verify_content_hash(expected_hash: &str, config: &FilterConfig) -> anyhow::Result<()> {
-    let computed = tokf_common::hash::canonical_hash(config)
+/// - Any server-provided hash disagrees with the locally-recomputed value
+///   for that same form (wire tamper between server and client).
+/// - All server hashes are absent and the URL hash doesn't match the
+///   client's `canonical_hash`.
+fn verify_and_resolve_hash(
+    url_hash: &str,
+    server_v1_hash: Option<&str>,
+    server_content_hash: Option<&str>,
+    config: &FilterConfig,
+    filter_toml: &str,
+) -> anyhow::Result<String> {
+    if let Some(server_v1) = server_v1_hash {
+        let client_v1 = tokf_common::canonical_v1::hash(filter_toml)
+            .map_err(|e| anyhow::anyhow!("could not compute v1 hash: {e}"))?;
+        if client_v1 != server_v1 {
+            anyhow::bail!(
+                "v1 hash mismatch with server-provided value: \
+                 server={server_v1}, computed={client_v1} — \
+                 the connection may have been tampered with"
+            );
+        }
+        if url_hash != server_v1 {
+            eprintln!(
+                "[tokf] note: filter URL hash {url_hash} differs from current \
+                 v1 content hash {server_v1}; the filter is now identified \
+                 under v1."
+            );
+        }
+        return Ok(server_v1.to_string());
+    }
+
+    let client_hash = tokf_common::hash::canonical_hash(config)
         .map_err(|e| anyhow::anyhow!("could not compute filter hash: {e}"))?;
-    if computed != expected_hash {
+    if let Some(server_hash) = server_content_hash {
+        if client_hash != server_hash {
+            anyhow::bail!(
+                "filter content hash mismatch with server-provided value: \
+                 server={server_hash}, computed={client_hash} — \
+                 the connection may have been tampered with"
+            );
+        }
+        if url_hash != server_hash {
+            eprintln!(
+                "[tokf] note: filter URL hash {url_hash} differs from current \
+                 content hash {server_hash}; the filter was published under an \
+                 older schema and re-identifies under the current one."
+            );
+        }
+        return Ok(server_hash.to_string());
+    }
+
+    // Pre-#351 server: the URL hash is the only identity we have.
+    if client_hash != url_hash {
         anyhow::bail!(
-            "filter hash mismatch: expected {expected_hash}, got {computed} — \
-            the server may have returned tampered content"
+            "filter hash mismatch: expected {url_hash}, computed {client_hash} — \
+             the server may have returned tampered content, or the filter may \
+             have been published under a `FilterConfig` schema this client \
+             cannot reproduce (see issue #350)"
         );
     }
-    Ok(())
+    Ok(url_hash.to_string())
 }
 
 /// Show the filter TOML and prompt for installation consent.
@@ -348,6 +423,90 @@ fn write_filter(
 mod tests {
     use super::*;
 
+    /// Sample TOML, parsed config, and the three relevant hashes for
+    /// driving `verify_and_resolve_hash` deterministically.
+    fn sample_filter() -> (String, FilterConfig, String, String) {
+        let toml = r#"command = "git push""#.to_string();
+        let cfg: FilterConfig = toml::from_str(&toml).unwrap();
+        let canonical = tokf_common::hash::canonical_hash(&cfg).unwrap();
+        let v1 = tokf_common::canonical_v1::hash(&toml).unwrap();
+        (toml, cfg, canonical, v1)
+    }
+
+    #[test]
+    fn verify_and_resolve_prefers_v1_when_provided() {
+        let (toml, cfg, _canonical, v1) = sample_filter();
+        // URL hash is stale; server provides v1. Client picks v1.
+        let url = "0".repeat(64);
+        let resolved = verify_and_resolve_hash(&url, Some(&v1), None, &cfg, &toml).unwrap();
+        assert_eq!(resolved, v1);
+    }
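The mismatch tests below match on `format!("{err:#}")` rather than `{err}`. A standalone sketch of why that matters (assumes the `anyhow` crate; the context string is invented for illustration):

```rust
fn main() {
    let inner = anyhow::anyhow!("v1 hash mismatch with server-provided value");
    let outer = inner.context("failed to install filter");
    // anyhow's alternate Display renders every layer of the chain,
    // ": "-separated, so substring checks still see messages that were
    // wrapped by higher-level context.
    assert_eq!(
        format!("{outer:#}"),
        "failed to install filter: v1 hash mismatch with server-provided value"
    );
}
```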
+    #[test]
+    fn verify_and_resolve_v1_takes_precedence_over_content_hash() {
+        let (toml, cfg, canonical, v1) = sample_filter();
+        // Both v1 and content_hash present: v1 wins.
+        let url = "0".repeat(64);
+        let resolved =
+            verify_and_resolve_hash(&url, Some(&v1), Some(&canonical), &cfg, &toml).unwrap();
+        assert_eq!(resolved, v1);
+        assert_ne!(resolved, canonical);
+    }
+
+    #[test]
+    fn verify_and_resolve_errors_when_v1_disagrees() {
+        let (toml, cfg, _canonical, _v1) = sample_filter();
+        let url = "0".repeat(64);
+        let bogus_v1 = "v1:".to_string() + &"1".repeat(64);
+        let err = verify_and_resolve_hash(&url, Some(&bogus_v1), None, &cfg, &toml).unwrap_err();
+        let msg = format!("{err:#}");
+        assert!(
+            msg.contains("v1 hash mismatch with server-provided value"),
+            "wrong error: {msg}"
+        );
+    }
+
+    #[test]
+    fn verify_and_resolve_falls_back_to_content_hash_when_v1_absent() {
+        let (toml, cfg, canonical, _v1) = sample_filter();
+        // Pre-v1 server: provides content_hash but no v1.
+        let url = "0".repeat(64);
+        let resolved = verify_and_resolve_hash(&url, None, Some(&canonical), &cfg, &toml).unwrap();
+        assert_eq!(resolved, canonical);
+    }
+
+    #[test]
+    fn verify_and_resolve_errors_when_content_hash_disagrees() {
+        let (toml, cfg, _canonical, _v1) = sample_filter();
+        let url = "0".repeat(64);
+        let bogus = "1".repeat(64);
+        let err = verify_and_resolve_hash(&url, None, Some(&bogus), &cfg, &toml).unwrap_err();
+        let msg = format!("{err:#}");
+        assert!(
+            msg.contains("filter content hash mismatch with server-provided value"),
+            "wrong error: {msg}"
+        );
+    }
+
+    #[test]
+    fn verify_and_resolve_falls_back_to_url_hash_on_pre_351_server() {
+        let (toml, cfg, canonical, _v1) = sample_filter();
+        // Pre-#351 server: no v1_hash, no content_hash; URL hash equals
+        // the client's canonical_hash.
+        let resolved = verify_and_resolve_hash(&canonical, None, None, &cfg, &toml).unwrap();
+        assert_eq!(resolved, canonical);
+    }
+
+    #[test]
+    fn verify_and_resolve_errors_on_pre_351_server_with_url_mismatch() {
+        let (toml, cfg, _canonical, _v1) = sample_filter();
+        let url = "0".repeat(64);
+        let err = verify_and_resolve_hash(&url, None, None, &cfg, &toml).unwrap_err();
+        let msg = format!("{err:#}");
+        assert!(msg.contains("hash mismatch"), "wrong error: {msg}");
+        assert!(msg.contains("issue #350"), "should reference issue: {msg}");
+    }
+
     #[test]
     fn command_pattern_to_install_path_single_word() {
         let path = command_pattern_to_path("git");
diff --git a/crates/tokf-cli/src/remote/filter_client.rs b/crates/tokf-cli/src/remote/filter_client.rs
index edc817bf..175fa0a1 100644
--- a/crates/tokf-cli/src/remote/filter_client.rs
+++ b/crates/tokf-cli/src/remote/filter_client.rs
@@ -50,6 +50,18 @@ pub struct TestFilePayload {
 pub struct DownloadedFilter {
     pub filter_toml: String,
     pub test_files: Vec<TestFilePayload>,
+    /// Server-recomputed canonical hash under the current `FilterConfig`
+    /// schema. `None` when the server is older than this client (graceful
+    /// degradation). When present, it's the authoritative identity used by
+    /// the install command — see issue #350.
+    #[serde(default)]
+    pub content_hash: Option<String>,
+    /// Server-computed canonical v1 hash (ADR-0002). Schema-independent;
+    /// the long-term identity for every filter. `None` against pre-v1
+    /// servers; on those, the client falls back to `content_hash` and
+    /// then to the URL hash.
+    #[serde(default)]
+    pub v1_hash: Option<String>,
 }
 
 /// Search the community filter registry.
@@ -153,6 +165,27 @@ mod tests {
         assert!(dl.test_files.is_empty());
     }
 
+    /// Old-server compatibility: response without `content_hash` deserialises
+    /// fine and the field is `None`.
+    #[test]
+    fn deserialize_downloaded_filter_without_content_hash() {
+        let json = r#"{"filter_toml": "command = \"x\"\n", "test_files": []}"#;
+        let dl: DownloadedFilter = serde_json::from_str(json).unwrap();
+        assert!(dl.content_hash.is_none());
+    }
+
+    /// New-server response includes `content_hash`.
+    #[test]
+    fn deserialize_downloaded_filter_with_content_hash() {
+        let json = r#"{
+            "filter_toml": "command = \"x\"\n",
+            "test_files": [],
+            "content_hash": "abc123"
+        }"#;
+        let dl: DownloadedFilter = serde_json::from_str(json).unwrap();
+        assert_eq!(dl.content_hash.as_deref(), Some("abc123"));
+    }
+
     #[test]
     fn deserialize_filter_summary_with_is_stdlib() {
         let json = r#"{
diff --git a/crates/tokf-common/Cargo.toml b/crates/tokf-common/Cargo.toml
index 81f0a501..2660e90c 100644
--- a/crates/tokf-common/Cargo.toml
+++ b/crates/tokf-common/Cargo.toml
@@ -13,17 +13,21 @@ categories = ["command-line-utilities", "development-tools"]
 serde = { version = "1", features = ["derive"] }
 serde_json = "1"
 sha2 = "0.10"
-toml = { version = "1.0", optional = true }
+# Required by `canonical_v1`. Pinned exactly because v1's hash output depends
+# on the toml crate's emission for canonical bytes; any change to its
+# emission (escape choices, float formatting, key ordering) silently changes
+# every published hash. Per ADR-0002, version bumps must be gated by the
+# frozen v1 corpus; if the corpus drifts, either stay pinned or ship v2.
+toml = "=1.0.3"
 regex = { version = "1", optional = true }
 unicode-normalization = "0.1"
 
 [features]
 default = []
-validation = ["toml", "regex"]
+validation = ["regex"]
 
 [lints]
 workspace = true
 
 [dev-dependencies]
-toml = "1.0"
 ts-rs = "12"
diff --git a/crates/tokf-common/src/canonical_v1.rs b/crates/tokf-common/src/canonical_v1.rs
new file mode 100644
index 00000000..f1bf87b3
--- /dev/null
+++ b/crates/tokf-common/src/canonical_v1.rs
@@ -0,0 +1,499 @@
+//! Canonical v1 filter hash.
+//!
+//! Implements the algorithm specified in `docs/adr/0002-canonical-v1-hash.md`:
+//!
+//! 1. Parse TOML 1.0 to a [`toml::Value`] tree.
+//! 2. Walk the tree applying three normalisation passes:
+//!    - sort arrays whose paths appear in [`UNORDERED_PATHS`];
+//!    - collapse `command = ["x"]` to `command = "x"` (single-entry only);
+//!    - prune entries equal to `false`, `[]`, or `{}` (TOML-level defaults).
+//! 3. Re-emit via [`toml::to_string`].
+//! 4. SHA-256 the bytes; format as `v1:<hex>`.
+//!
+//! v1 is FROZEN. Modifying this module's behaviour for an existing input
+//! is a v2 trigger, not a fix. The frozen corpus recorded in
+//! `crates/tokf-common/tests/canonical_v1_stdlib.txt` is the contract.
+
+use std::fmt::Write as _;
+
+use sha2::{Digest, Sha256};
+use toml::Value;
+
+const VERSION: &str = "v1";
+
+/// Paths whose values are unordered arrays (sets) under v1's policy table.
+///
+/// Matched structurally against the TOML AST. A path like
+/// `"on_success.skip"` matches the `skip` key inside the `on_success`
+/// table; `"skip"` matches the top-level `skip` key. New paths added here
+/// must use the conservative semantics — once shipped under a v1 hash,
+/// the policy is frozen for that path. See ADR-0002 §"Adding to the
+/// table".
+const UNORDERED_PATHS: &[&str] = &["skip", "keep", "on_success.skip", "on_failure.skip"];
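Before the passes themselves, a worked end-to-end example: all three normalisations compose on one messy input. A standalone sketch, not part of the diff's own test suite; it assumes `tokf-common` as a dependency and, like the `parse_group_labels_btreemap_invariant` test further down, relies on the toml crate's sorted (BTreeMap-backed) table keys:

```rust
#[test]
fn v1_normalisation_demo() {
    // Comments, set order, single-element `command`, and default-valued
    // fields all normalise away...
    let messy = r#"
# a comment
skip = ["bbb", "aaa"]    # unordered path: sorted
command = ["git push"]   # single-entry array: collapsed to a string
dedup = false            # equal to the default: pruned
"#;
    // ...leaving the same canonical bytes as this tidy equivalent.
    let tidy = "command = \"git push\"\nskip = [\"aaa\", \"bbb\"]\n";
    let a = tokf_common::canonical_v1::hash(messy).unwrap();
    let b = tokf_common::canonical_v1::hash(tidy).unwrap();
    assert_eq!(a, b);
    assert!(a.starts_with("v1:"));
}
```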
+
+/// Compute the canonical v1 hash of a filter TOML.
+///
+/// # Errors
+///
+/// Returns a [`HashError`] if the input is not valid TOML 1.0, the root
+/// is not a table, or any float is non-finite.
+pub fn hash(toml_str: &str) -> Result<String, HashError> {
+    let mut value: Value = toml::from_str(toml_str)?;
+    let table = value.as_table_mut().ok_or(HashError::RootNotTable)?;
+    reject_non_finite_floats_in_table(table)?;
+
+    sort_unordered_arrays(table, "");
+    collapse_command_single_form(table);
+    prune_defaults_in_table(table);
+
+    let canonical = toml::to_string(&Value::Table(std::mem::take(table)))
+        .map_err(|e| HashError::Emit(e.to_string()))?;
+    let digest = Sha256::digest(canonical.as_bytes());
+    let mut out = String::with_capacity(VERSION.len() + 1 + 64);
+    let _ = write!(out, "{VERSION}:");
+    for b in &digest {
+        let _ = write!(out, "{b:02x}");
+    }
+    Ok(out)
+}
+
+#[derive(Debug)]
+pub enum HashError {
+    /// Input was not valid TOML 1.0.
+    Parse(String),
+    /// Input parsed but the root was not a table.
+    RootNotTable,
+    /// Input contained `inf`, `-inf`, or `nan`.
+    NonFiniteFloat,
+    /// `toml::to_string` failed on the canonical value (should not happen
+    /// for any well-formed parse; defensive).
+    Emit(String),
+}
+
+impl std::fmt::Display for HashError {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            Self::Parse(m) => write!(f, "parse: {m}"),
+            Self::RootNotTable => write!(f, "filter root must be a TOML table"),
+            Self::NonFiniteFloat => write!(f, "filter contains a non-finite float (inf or nan)"),
+            Self::Emit(m) => write!(f, "canonical TOML emission failed: {m}"),
+        }
+    }
+}
+
+impl std::error::Error for HashError {}
+
+impl From<toml::de::Error> for HashError {
+    fn from(e: toml::de::Error) -> Self {
+        Self::Parse(e.to_string())
+    }
+}
+
+// ── Normalisation passes ──────────────────────────────────────────────────
+
+fn reject_non_finite_floats_in_table(t: &toml::map::Map<String, Value>) -> Result<(), HashError> {
+    for v in t.values() {
+        reject_non_finite_floats_in_value(v)?;
+    }
+    Ok(())
+}
+
+fn reject_non_finite_floats_in_value(v: &Value) -> Result<(), HashError> {
+    match v {
+        Value::Float(f) if !f.is_finite() => Err(HashError::NonFiniteFloat),
+        Value::Array(a) => a.iter().try_for_each(reject_non_finite_floats_in_value),
+        Value::Table(t) => reject_non_finite_floats_in_table(t),
+        _ => Ok(()),
+    }
+}
+
+/// Sort arrays whose path appears in [`UNORDERED_PATHS`]. `path_prefix` is
+/// the dotted path to `t` from the document root (`""` at root).
+fn sort_unordered_arrays(t: &mut toml::map::Map<String, Value>, path_prefix: &str) {
+    for (key, value) in t.iter_mut() {
+        let here = if path_prefix.is_empty() {
+            key.clone()
+        } else {
+            format!("{path_prefix}.{key}")
+        };
+        match value {
+            Value::Array(arr) if UNORDERED_PATHS.contains(&here.as_str()) => {
+                sort_array_canonically(arr);
+            }
+            Value::Array(arr) => {
+                // Walk array-of-tables to find unordered paths nested inside.
+                for item in arr.iter_mut() {
+                    if let Value::Table(inner) = item {
+                        sort_unordered_arrays(inner, &here);
+                    }
+                }
+            }
+            Value::Table(inner) => sort_unordered_arrays(inner, &here),
+            _ => {}
+        }
+    }
+}
+
+/// Sort array elements by their canonical TOML byte representation,
+/// byte-wise. Stable: equal elements preserve their relative order.
+fn sort_array_canonically(arr: &mut [Value]) {
+    arr.sort_by_cached_key(|v| {
+        // Wrapping in a single-key table makes any value emittable, including
+        // bare scalars.
+        let mut wrapper = toml::map::Map::new();
+        wrapper.insert("v".to_string(), v.clone());
+        toml::to_string(&Value::Table(wrapper)).unwrap_or_default()
+    });
+}
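The wrapper trick above exists because TOML serialisation is table-rooted. A standalone sketch of the failure it avoids (my understanding of the toml crate's behaviour; the emitted `v = …` line is what the sort key actually compares):

```rust
fn main() {
    // A bare scalar is not a TOML document, so emitting it directly fails...
    let bare = toml::Value::String("aaa".into());
    assert!(toml::to_string(&bare).is_err());

    // ...while the single-key wrapper makes every element emittable, giving
    // each array entry a comparable canonical byte string.
    let mut wrapper = toml::map::Map::new();
    wrapper.insert("v".to_string(), bare);
    let key = toml::to_string(&toml::Value::Table(wrapper)).unwrap();
    assert_eq!(key, "v = \"aaa\"\n");
}
```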
+
+/// Replace `command = ["x"]` with `command = "x"` at the document root.
+/// Per ADR-0002 §"Special form: command".
+fn collapse_command_single_form(t: &mut toml::map::Map<String, Value>) {
+    if let Some(Value::Array(arr)) = t.get("command")
+        && arr.len() == 1
+        && let Some(Value::String(s)) = arr.first()
+    {
+        let single = Value::String(s.clone());
+        t.insert("command".to_string(), single);
+    }
+}
+
+/// Recursively drop entries equal to `false`, `[]`, or `{}`. Returns
+/// whether the surrounding container itself becomes empty after pruning.
+fn prune_defaults_in_table(t: &mut toml::map::Map<String, Value>) -> bool {
+    let keys: Vec<String> = t.keys().cloned().collect();
+    for key in keys {
+        let drop = match t.get_mut(&key) {
+            Some(Value::Boolean(false)) => true,
+            Some(Value::Array(arr)) => {
+                for item in arr.iter_mut() {
+                    if let Value::Table(inner) = item {
+                        // Don't prune array-of-tables entries: their position
+                        // is semantically meaningful.
+                        prune_defaults_in_table(inner);
+                    }
+                }
+                arr.is_empty()
+            }
+            Some(Value::Table(inner)) => prune_defaults_in_table(inner),
+            _ => false,
+        };
+        if drop {
+            t.remove(&key);
+        }
+    }
+    t.is_empty()
+}
+
+#[cfg(test)]
+#[allow(clippy::unwrap_used)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn output_format_is_v1_prefix_plus_64_hex() {
+        let h = hash(r#"command = "git push""#).unwrap();
+        assert!(h.starts_with("v1:"));
+        assert_eq!(h.len(), 3 + 64);
+        assert!(
+            h[3..]
+                .chars()
+                .all(|c| c.is_ascii_hexdigit() && !c.is_uppercase())
+        );
+    }
+
+    #[test]
+    fn deterministic_across_calls() {
+        let toml = r#"command = "git push""#;
+        assert_eq!(hash(toml).unwrap(), hash(toml).unwrap());
+    }
+
+    #[test]
+    fn whitespace_and_comments_invariant() {
+        let a = hash(r#"command = "git push""#).unwrap();
+        let b = hash("# leading comment\ncommand = \"git push\"\n\n").unwrap();
+        assert_eq!(a, b);
+    }
+
+    #[test]
+    fn skip_is_unordered() {
+        let a = hash(
+            r#"command = "x"
+skip = ["aaa", "bbb"]"#,
+        )
+        .unwrap();
+        let b = hash(
+            r#"command = "x"
+skip = ["bbb", "aaa"]"#,
+        )
+        .unwrap();
+        assert_eq!(a, b);
+    }
+
+    #[test]
+    fn keep_is_unordered() {
+        let a = hash(
+            r#"command = "x"
+keep = ["aaa", "bbb"]"#,
+        )
+        .unwrap();
+        let b = hash(
+            r#"command = "x"
+keep = ["bbb", "aaa"]"#,
+        )
+        .unwrap();
+        assert_eq!(a, b);
+    }
+
+    #[test]
+    fn nested_skip_in_on_success_is_unordered() {
+        let a = hash(
+            r#"command = "x"
+[on_success]
+skip = ["aaa", "bbb"]"#,
+        )
+        .unwrap();
+        let b = hash(
+            r#"command = "x"
+[on_success]
+skip = ["bbb", "aaa"]"#,
+        )
+        .unwrap();
+        assert_eq!(a, b);
+    }
+
+    #[test]
+    fn step_is_ordered() {
+        let a = hash(
+            r#"command = "x"
+[[step]]
+run = "a"
+[[step]]
+run = "b""#,
+        )
+        .unwrap();
+        let b = hash(
+            r#"command = "x"
+[[step]]
+run = "b"
+[[step]]
+run = "a""#,
+        )
+        .unwrap();
+        assert_ne!(a, b, "[[step]] must preserve source order");
+    }
+
+    #[test]
+    fn replace_is_ordered() {
+        let a = hash(
+            r#"command = "x"
+[[replace]]
+pattern = "a"
+output = "1"
+[[replace]]
+pattern = "b"
+output = "2""#,
+        )
+        .unwrap();
+        let b = hash(
+            r#"command = "x"
+[[replace]]
+pattern = "b"
+output = "2"
+[[replace]]
+pattern = "a"
+output = "1""#,
+        )
+        .unwrap();
+        assert_ne!(a, b);
+    }
+
+    #[test]
+    fn command_single_and_array_of_one_collapse() {
+        let a = hash(r#"command = "git push""#).unwrap();
+        let b = hash(r#"command = ["git push"]"#).unwrap();
+        assert_eq!(a, b);
+    }
+
+    #[test]
+    fn command_array_of_two_does_not_collapse() {
+        let single = hash(r#"command = "git""#).unwrap();
+        let multi = hash(r#"command = ["git", "git push"]"#).unwrap();
+        assert_ne!(single, multi);
+    }
+
+    #[test]
+    fn command_array_order_matters_when_multiple() {
+        let a = hash(r#"command = 
["aa", "bb"]"#).unwrap(); + let b = hash(r#"command = ["bb", "aa"]"#).unwrap(); + assert_ne!(a, b); + } + + #[test] + fn dedup_false_omitted() { + let a = hash( + r#"command = "x" +dedup = false"#, + ) + .unwrap(); + let b = hash(r#"command = "x""#).unwrap(); + assert_eq!(a, b); + } + + #[test] + fn empty_array_omitted() { + let a = hash( + r#"command = "x" +skip = []"#, + ) + .unwrap(); + let b = hash(r#"command = "x""#).unwrap(); + assert_eq!(a, b); + } + + #[test] + fn empty_table_omitted() { + let a = hash( + r#"command = "x" +[on_success]"#, + ) + .unwrap(); + let b = hash(r#"command = "x""#).unwrap(); + assert_eq!(a, b); + } + + #[test] + fn nested_table_emptied_after_pruning_is_omitted() { + // on_success contains only default-valued fields → after pruning + // it's an empty table → it should itself be pruned. + let a = hash( + r#"command = "x" +[on_success] +skip = [] +"#, + ) + .unwrap(); + let b = hash(r#"command = "x""#).unwrap(); + assert_eq!(a, b); + } + + #[test] + fn dedup_true_preserved() { + let a = hash( + r#"command = "x" +dedup = true"#, + ) + .unwrap(); + let b = hash(r#"command = "x""#).unwrap(); + assert_ne!(a, b); + } + + #[test] + fn zero_integer_preserved() { + let a = hash( + r#"command = "x" +dedup_window = 0"#, + ) + .unwrap(); + let b = hash(r#"command = "x""#).unwrap(); + assert_ne!(a, b, "0 is a meaningful value, distinct from absent"); + } + + #[test] + fn empty_string_preserved() { + let a = hash( + r#"command = "x" +run = """#, + ) + .unwrap(); + let b = hash(r#"command = "x""#).unwrap(); + assert_ne!(a, b, "empty string is a meaningful value"); + } + + #[test] + fn parse_group_labels_btreemap_invariant() { + let a = hash( + r#"command = "git status" +[parse.group.key] +pattern = "^(.{2}) " +output = "{1}" +[parse.group.labels] +M = "modified" +A = "added" +"#, + ) + .unwrap(); + let b = hash( + r#"command = "git status" +[parse.group.key] +pattern = "^(.{2}) " +output = "{1}" +[parse.group.labels] +A = "added" +M = "modified" +"#, + ) + .unwrap(); + assert_eq!(a, b); + } + + #[test] + fn schema_independent_unknown_field_changes_hash_only_when_used() { + // A field tokf-common doesn't know about: appearing in TOML changes + // the hash; absent has no effect. + let with_field = hash( + r#"command = "x" +some_future_field = "configured""#, + ) + .unwrap(); + let without = hash(r#"command = "x""#).unwrap(); + assert_ne!(with_field, without); + } + + #[test] + fn alias_renames_change_hash() { + // Document the canonical-form-not-semantic-form rule: serde aliases + // produce different canonical bytes, hence different hashes. + let canonical_name = hash( + r#"command = "x" +skip = ["foo"]"#, + ) + .unwrap(); + let alias_name = hash( + r#"command = "x" +strip_lines_matching = ["foo"]"#, + ) + .unwrap(); + assert_ne!(canonical_name, alias_name); + } + + #[test] + fn rejects_non_finite_float() { + let result = hash( + r#"command = "x" +ratio = nan"#, + ); + assert!( + matches!(result, Err(HashError::NonFiniteFloat)), + "got {result:?}" + ); + } + + #[test] + fn rejects_invalid_toml() { + let result = hash("not = valid = toml = at = all"); + assert!(matches!(result, Err(HashError::Parse(_))), "got {result:?}"); + } + + /// Frozen reference: the v1 hash for the smallest valid filter must + /// equal this value, forever. If this test fails, v1's behaviour has + /// drifted — investigate before changing the expected value. 
+ #[test] + fn frozen_reference_minimal_filter() { + let h = hash(r#"command = "git push""#).unwrap(); + assert_eq!( + h, "v1:ca4d87a2bc16ccee27b7007660b810ac02abeb0c917f99ab26b30497d6e52164", + "v1 schema has drifted — this is the canary, do not just update the expected value" + ); + } +} diff --git a/crates/tokf-common/src/lib.rs b/crates/tokf-common/src/lib.rs index abc62f5e..1bb75744 100644 --- a/crates/tokf-common/src/lib.rs +++ b/crates/tokf-common/src/lib.rs @@ -1,3 +1,4 @@ +pub mod canonical_v1; pub mod config; pub mod examples; pub mod hash; diff --git a/crates/tokf-common/tests/canonical_v1.rs b/crates/tokf-common/tests/canonical_v1.rs new file mode 100644 index 00000000..7f0cd7ef --- /dev/null +++ b/crates/tokf-common/tests/canonical_v1.rs @@ -0,0 +1,311 @@ +//! Frozen snapshot + property tests for `canonical_v1::hash`. +//! +//! ## Snapshot corpus +//! +//! Every `.toml` under `crates/tokf-cli/filters/` (excluding `_test/` dirs) +//! is hashed; the result is checked against the recorded value in +//! `tests/canonical_v1_stdlib.txt`. **Do not modify recorded values** — +//! a change in any expected hash means v1's behaviour drifted, which is +//! either a bug to fix or a v2 trigger, never a "just update the +//! expected" case. +//! +//! Authoring: when adding new stdlib filters or building this file for the +//! first time, run: +//! +//! ```sh +//! cargo test -p tokf-common --test canonical_v1 -- dump_stdlib_hashes \ +//! --ignored --nocapture > crates/tokf-common/tests/canonical_v1_stdlib.txt +//! ``` +//! +//! ## Property tests +//! +//! For each of a representative subset of stdlib filters, apply +//! transformations that the spec says are invariant (whitespace, comments, +//! reordered unordered arrays, default omission, `command` collapse) and +//! assert the hash is unchanged. Transformations that the spec says are +//! distinguishing (reordered `[[step]]`, changed values) must change the +//! hash. + +#![allow(clippy::unwrap_used, clippy::expect_used, clippy::panic)] + +use std::collections::BTreeMap; +use std::path::{Path, PathBuf}; + +use tokf_common::canonical_v1; + +// ── Discovery ───────────────────────────────────────────────────────────── + +fn stdlib_root() -> PathBuf { + Path::new(env!("CARGO_MANIFEST_DIR")).join("../tokf-cli/filters") +} + +/// Walk `stdlib_root()`, return `(relative_path_without_ext, content)` for +/// every filter `.toml`. Excludes `_test/` directories. 
+fn collect_stdlib_filters() -> Vec<(String, String)> {
+    let root = stdlib_root();
+    let mut out = Vec::new();
+    walk(&root, &root, &mut out);
+    out.sort_by(|a, b| a.0.cmp(&b.0));
+    out
+}
+
+fn walk(root: &Path, dir: &Path, out: &mut Vec<(String, String)>) {
+    for entry in std::fs::read_dir(dir).unwrap() {
+        let entry = entry.unwrap();
+        let path = entry.path();
+        let name = entry.file_name();
+        let name_str = name.to_string_lossy();
+        if path.is_dir() {
+            if name_str.ends_with("_test") {
+                continue;
+            }
+            walk(root, &path, out);
+        } else if path.extension().is_some_and(|e| e == "toml") {
+            let rel = path
+                .strip_prefix(root)
+                .unwrap()
+                .with_extension("")
+                .to_string_lossy()
+                .to_string();
+            let content = std::fs::read_to_string(&path).unwrap();
+            out.push((rel, content));
+        }
+    }
+}
+
+// ── Snapshot corpus ──────────────────────────────────────────────────────
+
+fn recorded_hashes() -> BTreeMap<String, String> {
+    let path = Path::new(env!("CARGO_MANIFEST_DIR")).join("tests/canonical_v1_stdlib.txt");
+    let content = std::fs::read_to_string(&path)
+        .unwrap_or_else(|e| panic!("missing {}: {e}", path.display()));
+    content
+        .lines()
+        .filter(|l| !l.trim().is_empty() && !l.starts_with('#'))
+        .map(|l| {
+            let (k, v) = l
+                .split_once(": ")
+                .unwrap_or_else(|| panic!("malformed line: {l:?}"));
+            (k.to_string(), v.to_string())
+        })
+        .collect()
+}
+
+#[test]
+fn stdlib_corpus_round_trip() {
+    let filters = collect_stdlib_filters();
+    assert!(
+        !filters.is_empty(),
+        "no stdlib filters discovered under {}",
+        stdlib_root().display()
+    );
+    let recorded = recorded_hashes();
+
+    let mut missing = Vec::new();
+    let mut drifted = Vec::new();
+    for (rel, content) in &filters {
+        let actual =
+            canonical_v1::hash(content).unwrap_or_else(|e| panic!("{rel}: hash failed: {e}"));
+        match recorded.get(rel) {
+            None => missing.push(format!("{rel}: {actual}")),
+            Some(expected) if expected != &actual => {
+                drifted.push(format!(
+                    "{rel}\n expected {expected}\n actual {actual}"
+                ));
+            }
+            _ => {}
+        }
+    }
+
+    let stdlib_paths: std::collections::BTreeSet<&str> =
+        filters.iter().map(|(p, _)| p.as_str()).collect();
+    let orphans: Vec<&str> = recorded
+        .keys()
+        .filter(|k| !stdlib_paths.contains(k.as_str()))
+        .map(String::as_str)
+        .collect();
+
+    let mut errors = Vec::new();
+    if !drifted.is_empty() {
+        errors.push(format!(
+            "hash drift in {} filter(s):\n{}",
+            drifted.len(),
+            drifted.join("\n")
+        ));
+    }
+    if !missing.is_empty() {
+        errors.push(format!(
+            "{} filter(s) missing from canonical_v1_stdlib.txt:\n{}\n\
+             (run dump_stdlib_hashes to refresh — see test file header)",
+            missing.len(),
+            missing.join("\n")
+        ));
+    }
+    if !orphans.is_empty() {
+        errors.push(format!(
+            "{} orphan entries in canonical_v1_stdlib.txt (filter no longer exists):\n {}",
+            orphans.len(),
+            orphans.join("\n ")
+        ));
+    }
+    assert!(errors.is_empty(), "{}", errors.join("\n\n"));
+}
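One subtlety in the recorded format parsed above: the value itself contains a colon (`v1:<hex>`), so `recorded_hashes` splits on colon-plus-space, which can never occur inside a hash. A standalone sketch (the entry is shortened for readability):

```rust
fn main() {
    let line = "git/push: v1:85834aa380ae4000";
    // split_once takes the FIRST ": " only; the bare colon in "v1:" has no
    // trailing space, so the hash survives intact.
    let (k, v) = line.split_once(": ").unwrap();
    assert_eq!(k, "git/push");
    assert_eq!(v, "v1:85834aa380ae4000");
}
```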
+/// Authoring helper. Print the canonical-v1 hash of every stdlib filter in
+/// the recorded-file format, sorted. Pipe to `tests/canonical_v1_stdlib.txt`
+/// when adding new filters or rebuilding the corpus.
+#[test]
+#[ignore = "authoring helper; run with --ignored to capture expected values"]
+fn dump_stdlib_hashes() {
+    println!("# canonical_v1 hashes for all stdlib filters under crates/tokf-cli/filters/");
+    println!(
+        "# generated by `cargo test -p tokf-common --test canonical_v1 -- dump_stdlib_hashes --ignored --nocapture`"
+    );
+    println!("# DO NOT edit this file by hand.");
+    println!();
+    for (rel, content) in collect_stdlib_filters() {
+        let h = canonical_v1::hash(&content).unwrap();
+        println!("{rel}: {h}");
+    }
+}
+
+// ── Property tests ────────────────────────────────────────────────────────
+
+/// A small representative subset of the stdlib for property tests. We don't
+/// run them across all 51 — that's the corpus's job — but we want enough
+/// shape diversity that the invariants are exercised on real-world inputs.
+const PROP_TEST_FILTERS: &[&str] = &[
+    "git/push",
+    "git/commit",
+    "git/status",
+    "git/log",
+    "cargo/test",
+    "cargo/clippy",
+    "playwright",
+    "kubectl/get/pods",
+];
+
+fn load(rel: &str) -> String {
+    let path = stdlib_root().join(format!("{rel}.toml"));
+    std::fs::read_to_string(&path).unwrap_or_else(|e| panic!("{}: {e}", path.display()))
+}
+
+fn assert_invariant<F: Fn(&str) -> String>(label: &str, transform: F) {
+    for rel in PROP_TEST_FILTERS {
+        let original = load(rel);
+        let transformed = transform(&original);
+        let h_original = canonical_v1::hash(&original).unwrap();
+        let h_transformed = canonical_v1::hash(&transformed).unwrap_or_else(|e| {
+            panic!("{rel} after {label}: hash failed: {e}\n--- transformed ---\n{transformed}")
+        });
+        assert_eq!(
+            h_original, h_transformed,
+            "{label} should not change the v1 hash for {rel}\n\
+             original ─────\n{original}\ntransformed ──\n{transformed}\n"
+        );
+    }
+}
+
+/// Idempotence: running the filter through a `toml` round-trip
+/// (`from_str` → `to_string`) before hashing must produce the same v1
+/// hash. This proves the canonicaliser is stable against arbitrary
+/// reformatting that the toml crate itself produces.
+#[test]
+fn invariant_toml_roundtrip_idempotent() {
+    for rel in PROP_TEST_FILTERS {
+        let original = load(rel);
+        let parsed: toml::Value = toml::from_str(&original).unwrap();
+        let roundtripped = toml::to_string(&parsed).unwrap();
+        let h_original = canonical_v1::hash(&original).unwrap();
+        let h_round = canonical_v1::hash(&roundtripped).unwrap();
+        assert_eq!(
+            h_original, h_round,
+            "toml round-trip changed v1 hash for {rel}"
+        );
+    }
+}
+
+/// Adding leading blank lines and a leading comment must not change the
+/// hash. Both are pure file-level affordances stripped at parse time.
+/// Note: only the prefix of the file is transformed, so we never modify
+/// anything inside a multi-line string body.
+#[test]
+fn invariant_leading_comments_and_blanks() {
+    assert_invariant("leading comments + blank lines", |s| {
+        format!("# leading comment\n#\n# more comment\n\n\n{s}")
+    });
+}
+
+/// Reorder unordered-array entries via the AST and assert hash unchanged.
+/// We work at the AST layer rather than text so we don't accidentally
+/// touch multi-line string content (some filters embed Lua scripts).
+#[test] +fn invariant_skip_keep_reversed_via_ast() { + for rel in PROP_TEST_FILTERS { + let original = load(rel); + let mut value: toml::Value = toml::from_str(&original).unwrap(); + let mut mutated = false; + for field in ["skip", "keep"] { + if let Some(toml::Value::Array(arr)) = + value.as_table_mut().and_then(|t| t.get_mut(field)) + && arr.len() >= 2 + { + arr.reverse(); + mutated = true; + } + } + if !mutated { + continue; + } + let transformed = toml::to_string(&value).unwrap(); + let h_original = canonical_v1::hash(&original).unwrap(); + let h_transformed = canonical_v1::hash(&transformed).unwrap(); + assert_eq!( + h_original, h_transformed, + "reversing skip/keep should not change the v1 hash for {rel}" + ); + } +} + +#[test] +fn invariant_default_false_added() { + // Adding `dedup = false` (or another false-bool) to any filter that + // doesn't already use that key must not change the hash. Pick a key the + // filter doesn't mention. + for rel in PROP_TEST_FILTERS { + let original = load(rel); + if original.contains("dedup =") { + continue; + } + let transformed = format!("dedup = false\n{original}"); + let h_original = canonical_v1::hash(&original).unwrap(); + let h_transformed = canonical_v1::hash(&transformed).unwrap(); + assert_eq!( + h_original, h_transformed, + "adding `dedup = false` must not change the v1 hash for {rel}" + ); + } +} + +#[test] +fn distinguishing_value_change() { + // Sanity check the property tests aren't trivially passing: a real + // value change must change the hash. Modify the `command` field. + for rel in PROP_TEST_FILTERS { + let original = load(rel); + let transformed = original.replace("command =", "command = \"this-changes-it\" #"); + if transformed == original { + continue; + } + // The transformation may not always result in valid TOML; only + // assert when both parse successfully. + let h_original = canonical_v1::hash(&original).unwrap(); + let Ok(h_transformed) = canonical_v1::hash(&transformed) else { + continue; + }; + assert_ne!( + h_original, h_transformed, + "changing `command` must change the v1 hash for {rel}" + ); + } +} diff --git a/crates/tokf-common/tests/canonical_v1_stdlib.txt b/crates/tokf-common/tests/canonical_v1_stdlib.txt new file mode 100644 index 00000000..1f7031d6 --- /dev/null +++ b/crates/tokf-common/tests/canonical_v1_stdlib.txt @@ -0,0 +1,55 @@ +# canonical_v1 hashes for all stdlib filters under crates/tokf-cli/filters/ +# generated by `cargo test -p tokf-common --test canonical_v1 -- dump_stdlib_hashes --ignored --nocapture` +# DO NOT edit this file by hand. 
+ +cargo/build: v1:1c61547d1e2486dce90ccd0db36b9bda666b9c96a9604af917d86b6394ec84f8 +cargo/check: v1:2eb2b385dd07f785a9337cd4f3b222d730de40fb185d76ab07a12d72f0174667 +cargo/clippy: v1:11f9cba91585caf27d7771fb5447b99838710510c5a566ca02169e0cd4918f1a +cargo/fmt: v1:33066edb19d76a1893b4f10ac753c4625e212e6cdc1df164acb2d963f35f0e2f +cargo/install: v1:09379b00251b14479d1269659c4e97be11c6bcfb92d917f7444ce0f816408a1d +cargo/nextest: v1:4bef45df79f34b332b96c6084c8bcd3383c19ca2aba6c40265a1e7814b29feb1 +cargo/test: v1:961ebabbae75ae7f4a6d55aeec92761042330aa1d9bd3d6329b9de25fc64d241 +docker/build: v1:9d812bf0eb9a913a7a17973255ecdf897f754d8490db59578670c399580ec423 +docker/compose: v1:3db2a2498d3119a5151c21ff84e030f04da1d965cbccaaf262ea759d84395bd0 +docker/images: v1:2dfce37a9e6293efabee7a0c9a191e8db8150bb6892fec053b2f183df2e2d826 +docker/ps: v1:f43577b02f895600f7de6d223096603f6d5bdbc1b23f590fe71e854114297098 +eslint/check: v1:94da7bba0a388b78d78619fb405a1e3447635feafc8a8e765239729d1510be7b +firebase/deploy: v1:bb365ad470cb2706b43883c38fb7d8ce6de3d8af80fec2f7535647c6b2930f31 +gh/issue/list: v1:92cef87f7b68310a84cbd13294428ced06fb8090d84072e4d49aadbfeeecc537 +gh/issue/view: v1:ee767ea1be51912b94089e1c1a8a7e78c46da48a614c9b283e419d33a54a3578 +gh/pr/checks: v1:0570260baed70d9404364aa7435ed1d0d7c3ee3f051ab8f12f85b089530a6896 +gh/pr/list: v1:a8b9c85561238ee6d87c877e46442f0ab163bb6439426597a120cb73f3de8ff4 +gh/pr/view: v1:9c41d9926e0dbae0194c94b7d15f3e8d57df7ea3510ef0dbaef0763d72ca2aed +git/add: v1:68484dae0f0229ad14f6a93d4bc26c1fd1e909e2623c48f646cbf264fc274498 +git/commit: v1:6bb9a79b66d9512d6fd172bdf1998e581400d5995b3dfc7819248500c53ad891 +git/diff: v1:cc5d7a455440386da2542f58a5eca861e01d935e281570315294a37370483c82 +git/diff-name-list: v1:31364422227b2309bc68472729cb145e93f6dd52185d8d459622556f94ba75f0 +git/log: v1:bbdbb096ba96493e3b7ea790738ebfed618344e85095a1ef8f854f17b69b5c6a +git/log-name-list: v1:c1d6b17aa6a2f7ed5980ececfd436784f00add56e9e54d0fcbef03855ac5cc1f +git/push: v1:85834aa380ae40004a3b66bb154fb3a043b5f5ad64d11f2bc1d31d662c2a6960 +git/show: v1:edb59157e427b429c86ea9fdef4507978413583a1a15afac3935787e8dd7fdb7 +git/status: v1:386caa5858431184ee0e61b8a31202bcda6892cc2d55e4a68261a444de626ca4 +go/build: v1:aff1f672e07d55e637b4bb4d9b8ca43072c9aa7dc405d87da7f54fe42e134c8a +go/test: v1:c43739c961b9fec7ae0c1765a5ce738ffeedf2224ed3e557f4803559e1af43a8 +go/vet: v1:8790c1174ee8cc14f547bb6cd740c5036e99f3623c6646ecfbd62bff319b794d +gradle/build: v1:c063448977fc5c0c876915abc57a3d048f55c2798bd13166074836082600baee +gradle/dependencies: v1:b9700e599ebcd68ec566085471474f2694ffb87ef89ffc36a5196769b0aba923 +gradle/test: v1:ab6822ed1c37aea27a7c776f972b3ec5ce0875b107191aae6ecb8a3b6f35b85c +kubectl/get/pods: v1:a9d39c9df3a81986feecc51fadbce8d7b5f19e30ac65c98d4baf2d3a8540a3f2 +ls: v1:54d7d0cb8d17eed7451a5e1b2d28a2fbedb6206edf363ebcc01656708b4d4ea0 +next/build: v1:c83d04ead0971bf51e4ec692019cee8f8eee2ca3854716617d5c9287668ce5c0 +npm/run: v1:ef62ce6c4896fd539128f6e69647c381a3417af2d5df63207e2817800b7941f4 +npm/test: v1:6dc3737fd5a7e3240ead6aa947af582607a741c9ad862f4159e66d96f3880e38 +npm/test-jest: v1:88d4c96af0c4010d3ca811b5af57767bce4d3ba7f9beccadfbbe4d0ece6b60fe +npm/test-vitest: v1:0a8eadbe1ad2fcd3ed592bb008891726bdb933a8aedf10c1b950d71a202c3a0d +playwright: v1:2c46bddfd8770856fc699e13b2c7d977ad94c663e01cedd54f8dd2effa5f1267 +pnpm/add: v1:6c833af163080a4150031d14748a0d2c7718917633388812f4ae6fd9c29d99ff +pnpm/install: v1:ed0baf9a07a3c8aec86730aded93ced842149a90d7e5c699bca9aa932f8a62e2 +prettier/check: 
v1:83221e871deb4f5d2a56135e5e67ed21db851a5dac0281f1365050f35b47539b
+prisma/generate: v1:003474fd3d9912120ae45e4b3eb3953d14ca1dbde161add29281dbfaf62d6238
+pytest: v1:2492294e44b15d7ba838e3c5c034eb0f500fd59a68928090693e4d69288aef5f
+ruff/check: v1:e76b510bd2cb7ddc7fe50153cf8dcac80b1e65e9d705d6802c0fffdae8d4a590
+ruff/format: v1:1351f167ce0c117bc0735077f3d52ae8a314eb9e77c82b928930011a8beb0261
+tsc: v1:e5e608ade50007d1f33e51c5613db81e3274e6bb94cbddd424f89096d73c76b1
+vite/build: v1:4dca59a8bab5af60debcfa3c9734d7ec8b531abed5e4aaa0a617e4b20cc35f06
+vue-tsc: v1:3e3f3708a4db2999d745fdfc3e96438758946eba157adb85fe60da50d03e1428
diff --git a/crates/tokf-server/migrations/20260428000000_add_v1_hash.sql b/crates/tokf-server/migrations/20260428000000_add_v1_hash.sql
new file mode 100644
index 00000000..8d76d50e
--- /dev/null
+++ b/crates/tokf-server/migrations/20260428000000_add_v1_hash.sql
@@ -0,0 +1,14 @@
+-- Schema-independent canonical TOML hash (ADR-0002).
+--
+-- Nullable on purpose: existing rows are NULL until the backfill endpoint
+-- (`POST /api/filters/backfill-v1-hashes`) has populated them by re-reading
+-- each row's TOML from R2. NOT NULL or UNIQUE here would either fail the
+-- migration or break the backfill mid-run on the first historical duplicate;
+-- both belong to the dedup migration (separate PR) which also collapses
+-- duplicate v1 rows.
+--
+-- Non-partial index because CockroachDB rejects a partial index on a column
+-- added in the same migration ("column ... is not public", error 0A000).
+-- B-tree NULL handling makes the space cost negligible.
+ALTER TABLE filters ADD COLUMN v1_hash TEXT;
+CREATE INDEX filters_v1_hash_idx ON filters(v1_hash);
diff --git a/crates/tokf-server/src/routes/filters/backfill.rs b/crates/tokf-server/src/routes/filters/backfill.rs
index 4307e578..057341a8 100644
--- a/crates/tokf-server/src/routes/filters/backfill.rs
+++ b/crates/tokf-server/src/routes/filters/backfill.rs
@@ -27,8 +27,9 @@ pub struct BackfillVersionsResponse {
     pub skipped: usize,
 }
 
-/// Maximum number of entries in a single backfill request.
-const MAX_BACKFILL_ENTRIES: usize = 500;
+/// Maximum batch size for either backfill endpoint. Bounds operator load
+/// against the DB / R2.
+const MAX_BACKFILL_BATCH: usize = 500;
 
 /// `POST /api/filters/backfill-versions` — Backfill version data for filters.
 ///
@@ -39,9 +40,9 @@ pub async fn backfill_versions(
     State(state): State<AppState>,
     Json(req): Json<BackfillVersionsRequest>,
 ) -> Result<(StatusCode, Json<BackfillVersionsResponse>), AppError> {
-    if req.entries.len() > MAX_BACKFILL_ENTRIES {
+    if req.entries.len() > MAX_BACKFILL_BATCH {
         return Err(AppError::BadRequest(format!(
-            "batch size {} exceeds maximum of {MAX_BACKFILL_ENTRIES}",
+            "batch size {} exceeds maximum of {MAX_BACKFILL_BATCH}",
             req.entries.len()
         )));
     }
@@ -99,3 +100,114 @@ pub async fn backfill_versions(
         Json(BackfillVersionsResponse { updated, skipped }),
     ))
 }
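Between the migration's nullable default and the batch cap below, a full backfill is a loop, not a single call. A hypothetical operator script, not part of the diff; it assumes `tokio`, `serde_json`, and `reqwest` with its `json` feature, and the `drain_backfill` name is invented:

```rust
async fn drain_backfill(base: &str, token: &str) -> anyhow::Result<()> {
    let client = reqwest::Client::new();
    loop {
        let resp: serde_json::Value = client
            .post(format!("{base}/api/filters/backfill-v1-hashes"))
            .bearer_auth(token)
            .json(&serde_json::json!({ "limit": 500 }))
            .send()
            .await?
            .error_for_status()?
            .json()
            .await?;
        let processed = resp["processed"].as_u64().unwrap_or(0);
        let updated = resp["updated"].as_u64().unwrap_or(0);
        eprintln!("processed={processed} updated={updated} failed={}", resp["failed"]);
        // `processed == 0` means done. `updated == 0` with rows still
        // processed means everything left is a reported failure: stop and
        // triage `failed[]` by hand rather than re-selecting the same rows.
        if processed == 0 || updated == 0 {
            break;
        }
    }
    Ok(())
}
```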
+
+// ── v1_hash backfill ──────────────────────────────────────────────────────
+
+#[derive(Debug, Deserialize)]
+pub struct BackfillV1Request {
+    /// Maximum rows to process in this call. Operator iterates until
+    /// `processed == 0`. Defaults to 100; capped at `MAX_BACKFILL_BATCH`.
+    #[serde(default = "default_v1_batch_size")]
+    pub limit: usize,
+}
+
+const fn default_v1_batch_size() -> usize {
+    100
+}
+
+#[derive(Debug, Serialize)]
+pub struct BackfillV1Response {
+    pub processed: usize,
+    pub updated: usize,
+    pub failed: Vec<BackfillV1Failure>,
+}
+
+#[derive(Debug, Serialize)]
+pub struct BackfillV1Failure {
+    pub content_hash: String,
+    pub error: String,
+}
+
+/// `POST /api/filters/backfill-v1-hashes` — populate `v1_hash` for legacy rows.
+///
+/// Service-token auth. Idempotent. Picks up to `limit` rows where
+/// `v1_hash IS NULL`, fetches the TOML from R2, computes the v1 hash
+/// (ADR-0002), writes it back. Rows whose R2 object is missing or whose
+/// TOML fails to canonicalise are reported in `failed` and skipped — the
+/// operator inspects the response and triages those manually.
+///
+/// Operator runbook: invoke repeatedly until `processed == 0`. Failures
+/// don't stop the batch.
+pub async fn backfill_v1_hashes(
+    _auth: ServiceAuth,
+    State(state): State<AppState>,
+    Json(req): Json<BackfillV1Request>,
+) -> Result<(StatusCode, Json<BackfillV1Response>), AppError> {
+    let limit = req.limit.clamp(1, MAX_BACKFILL_BATCH);
+    // Safe: `limit` is clamped to [1, MAX_BACKFILL_BATCH] (= 500) which fits in i64.
+    #[allow(clippy::cast_possible_wrap)]
+    let limit_i64 = limit as i64;
+
+    let rows: Vec<(String, String)> = sqlx::query_as(
+        "SELECT content_hash, r2_key FROM filters
+         WHERE v1_hash IS NULL
+         ORDER BY created_at ASC
+         LIMIT $1",
+    )
+    .bind(limit_i64)
+    .fetch_all(&state.db)
+    .await?;
+
+    tracing::info!(
+        candidates = rows.len(),
+        limit,
+        "backfill-v1-hashes request received"
+    );
+
+    let mut updated = 0usize;
+    let mut failed = Vec::new();
+
+    for (content_hash, r2_key) in &rows {
+        match compute_and_store_v1(&state, content_hash, r2_key).await {
+            Ok(()) => updated += 1,
+            Err(e) => {
+                tracing::warn!(hash = %content_hash, "backfill-v1 failed: {e}");
+                failed.push(BackfillV1Failure {
+                    content_hash: content_hash.clone(),
+                    error: e.to_string(),
+                });
+            }
+        }
+    }
+
+    tracing::info!(
+        processed = rows.len(),
+        updated,
+        failed = failed.len(),
+        "backfill-v1-hashes complete"
+    );
+
+    Ok((
+        StatusCode::OK,
+        Json(BackfillV1Response {
+            processed: rows.len(),
+            updated,
+            failed,
+        }),
+    ))
+}
+
+async fn compute_and_store_v1(
+    state: &AppState,
+    content_hash: &str,
+    r2_key: &str,
+) -> anyhow::Result<()> {
+    let toml_str = crate::storage::get_utf8(&*state.storage, r2_key).await?;
+    let v1 = tokf_common::canonical_v1::hash(&toml_str)?;
+    sqlx::query("UPDATE filters SET v1_hash = $1 WHERE content_hash = $2")
+        .bind(&v1)
+        .bind(content_hash)
+        .execute(&state.db)
+        .await?;
+    Ok(())
+}
diff --git a/crates/tokf-server/src/routes/filters/backfill_tests.rs b/crates/tokf-server/src/routes/filters/backfill_tests.rs
new file mode 100644
index 00000000..bdebfad5
--- /dev/null
+++ b/crates/tokf-server/src/routes/filters/backfill_tests.rs
@@ -0,0 +1,306 @@
+use std::sync::Arc;
+
+use axum::{
+    body::Body,
+    http::{Request, StatusCode},
+};
+use http_body_util::BodyExt;
+use sqlx::PgPool;
+use tower::ServiceExt;
+
+use crate::routes::test_helpers::insert_service_token;
+use crate::storage::StorageClient as _;
+use crate::storage::mock::InMemoryStorageClient;
+
+use super::test_helpers::{
+    expected_v1, insert_test_user, make_state_with_storage, post_json, publish_filter_helper,
+};
+
+const VALID_FILTER_TOML: &[u8] = b"command = \"my-tool\"\n";
+
+const URI: &str = "/api/filters/backfill-v1-hashes";
+
+/// Publish a filter, then NULL out its `v1_hash` to simulate a row created
+/// before this PR shipped.
+async fn publish_then_null_v1(
+    pool: &PgPool,
+    storage: Arc<InMemoryStorageClient>,
+    user_token: &str,
+    toml: &[u8],
+) -> String {
+    let state = make_state_with_storage(pool.clone(), storage);
+    let app = crate::routes::create_router(state);
+    let hash = publish_filter_helper(app, user_token, toml, &[]).await;
+    sqlx::query("UPDATE filters SET v1_hash = NULL WHERE content_hash = $1")
+        .bind(&hash)
+        .execute(pool)
+        .await
+        .unwrap();
+    hash
+}
+
+#[crdb_test_macro::crdb_test(migrations = "./migrations")]
+async fn backfill_v1_populates_null_rows(pool: PgPool) {
+    let storage = Arc::new(InMemoryStorageClient::new());
+    let (_, alice) = insert_test_user(&pool, "alice_bf").await;
+    let (_, bob) = insert_test_user(&pool, "bob_bf").await;
+    let service_token = insert_service_token(&pool, "bf-test").await;
+
+    let alice_toml: &[u8] = b"command = \"alice-tool\"\n";
+    let bob_toml: &[u8] = b"command = \"bob-tool\"\n";
+
+    let alice_hash = publish_then_null_v1(&pool, Arc::clone(&storage), &alice, alice_toml).await;
+    let bob_hash = publish_then_null_v1(&pool, Arc::clone(&storage), &bob, bob_toml).await;
+
+    let app =
+        crate::routes::create_router(make_state_with_storage(pool.clone(), Arc::clone(&storage)));
+    let resp = post_json(
+        app,
+        &service_token,
+        URI,
+        &serde_json::json!({ "limit": 100 }),
+    )
+    .await;
+    assert_eq!(resp.status(), StatusCode::OK);
+
+    let body = resp.into_body().collect().await.unwrap().to_bytes();
+    let json: serde_json::Value = serde_json::from_slice(&body).unwrap();
+    assert_eq!(json["processed"], 2);
+    assert_eq!(json["updated"], 2);
+    assert!(json["failed"].as_array().unwrap().is_empty());
+
+    let alice_v1: Option<String> =
+        sqlx::query_scalar("SELECT v1_hash FROM filters WHERE content_hash = $1")
+            .bind(&alice_hash)
+            .fetch_one(&pool)
+            .await
+            .unwrap();
+    assert_eq!(alice_v1.unwrap(), expected_v1(alice_toml));
+
+    let bob_v1: Option<String> =
+        sqlx::query_scalar("SELECT v1_hash FROM filters WHERE content_hash = $1")
+            .bind(&bob_hash)
+            .fetch_one(&pool)
+            .await
+            .unwrap();
+    assert_eq!(bob_v1.unwrap(), expected_v1(bob_toml));
+}
+
+#[crdb_test_macro::crdb_test(migrations = "./migrations")]
+async fn backfill_v1_skips_already_populated(pool: PgPool) {
+    let storage = Arc::new(InMemoryStorageClient::new());
+    let (_, user) = insert_test_user(&pool, "alice_skip").await;
+    let service_token = insert_service_token(&pool, "bf-skip").await;
+
+    publish_then_null_v1(&pool, Arc::clone(&storage), &user, VALID_FILTER_TOML).await;
+
+    let app =
+        crate::routes::create_router(make_state_with_storage(pool.clone(), Arc::clone(&storage)));
+    let resp = post_json(
+        app,
+        &service_token,
+        URI,
+        &serde_json::json!({ "limit": 10 }),
+    )
+    .await;
+    assert_eq!(resp.status(), StatusCode::OK);
+    let body = resp.into_body().collect().await.unwrap().to_bytes();
+    let first: serde_json::Value = serde_json::from_slice(&body).unwrap();
+    assert_eq!(first["processed"], 1);
+    assert_eq!(first["updated"], 1);
+
+    let app =
+        crate::routes::create_router(make_state_with_storage(pool.clone(), Arc::clone(&storage)));
+    let resp = post_json(
+        app,
+        &service_token,
+        URI,
+        &serde_json::json!({ "limit": 10 }),
+    )
+    .await;
+    assert_eq!(resp.status(), StatusCode::OK);
+    let body = resp.into_body().collect().await.unwrap().to_bytes();
+    let second: serde_json::Value = serde_json::from_slice(&body).unwrap();
+    assert_eq!(
+        second["processed"], 0,
+        "second invocation should find no candidates"
+    );
+}
+
+/// Drive a backfill against a single corrupted-R2 row and assert that:
+/// the call succeeds with `processed=1, updated=0`, the row appears in
+/// `failed[]`, and the row's `v1_hash` remains NULL. Used by every
+/// per-failure-mode test below.
+async fn run_backfill_expecting_one_failure(
+    pool: &PgPool,
+    storage: Arc<InMemoryStorageClient>,
+    service_token: &str,
+    hash: &str,
+) {
+    let app =
+        crate::routes::create_router(make_state_with_storage(pool.clone(), Arc::clone(&storage)));
+    let resp = post_json(app, service_token, URI, &serde_json::json!({ "limit": 10 })).await;
+    assert_eq!(resp.status(), StatusCode::OK);
+
+    let body = resp.into_body().collect().await.unwrap().to_bytes();
+    let json: serde_json::Value = serde_json::from_slice(&body).unwrap();
+    assert_eq!(json["processed"], 1);
+    assert_eq!(json["updated"], 0);
+    let failed = json["failed"].as_array().unwrap();
+    assert_eq!(failed.len(), 1);
+    assert_eq!(failed[0]["content_hash"], hash);
+
+    let v1: Option<String> =
+        sqlx::query_scalar("SELECT v1_hash FROM filters WHERE content_hash = $1")
+            .bind(hash)
+            .fetch_one(pool)
+            .await
+            .unwrap();
+    assert!(
+        v1.is_none(),
+        "v1_hash must remain NULL when backfill failed"
+    );
+}
+
+async fn r2_key_for(pool: &PgPool, hash: &str) -> String {
+    sqlx::query_scalar("SELECT r2_key FROM filters WHERE content_hash = $1")
+        .bind(hash)
+        .fetch_one(pool)
+        .await
+        .unwrap()
+}
+
+/// Each kind of corrupted R2 state that `compute_and_store_v1` should
+/// surface as a per-row failure (without aborting the batch or writing
+/// `v1_hash`).
+enum CorruptR2 {
+    /// Object deleted — `storage.get` returns None.
+    Missing,
+    /// Bytes that fail both UTF-8 decode and TOML parse, exercising both
+    /// post-fetch failure branches in `compute_and_store_v1`.
+    Unparseable,
+}
+
+#[crdb_test_macro::crdb_test(migrations = "./migrations")]
+async fn backfill_v1_reports_per_row_failures(pool: PgPool) {
+    let service_token = insert_service_token(&pool, "bf-failures").await;
+
+    for (idx, kind) in [CorruptR2::Missing, CorruptR2::Unparseable]
+        .into_iter()
+        .enumerate()
+    {
+        let storage = Arc::new(InMemoryStorageClient::new());
+        let (_, user) = insert_test_user(&pool, &format!("alice_fail_{idx}")).await;
+        let hash =
+            publish_then_null_v1(&pool, Arc::clone(&storage), &user, VALID_FILTER_TOML).await;
+        let r2_key = r2_key_for(&pool, &hash).await;
+        match kind {
+            CorruptR2::Missing => storage.delete(&r2_key).await.unwrap(),
+            CorruptR2::Unparseable => {
+                storage
+                    .put(&r2_key, b"\xff\xfe not valid toml [[[".to_vec())
+                    .await
+                    .unwrap();
+            }
+        }
+        run_backfill_expecting_one_failure(&pool, storage, &service_token, &hash).await;
+        // Reset for next iteration so the second case starts from a clean
+        // "v1_hash IS NULL" set (the failure left the row NULL — we just
+        // want to make sure the backfill's `LIMIT` finds the right row).
+ sqlx::query("DELETE FROM filter_tests") + .execute(&pool) + .await + .unwrap(); + sqlx::query("DELETE FROM filters") + .execute(&pool) + .await + .unwrap(); + } +} + +#[crdb_test_macro::crdb_test(migrations = "./migrations")] +async fn backfill_v1_requires_service_token(pool: PgPool) { + let storage = Arc::new(InMemoryStorageClient::new()); + let app = crate::routes::create_router(make_state_with_storage(pool, storage)); + + let resp = app + .oneshot( + Request::builder() + .method("POST") + .uri("/api/filters/backfill-v1-hashes") + .header("content-type", "application/json") + .body(Body::from(b"{}".to_vec())) + .unwrap(), + ) + .await + .unwrap(); + + assert_eq!(resp.status(), StatusCode::UNAUTHORIZED); +} + +#[crdb_test_macro::crdb_test(migrations = "./migrations")] +async fn backfill_v1_rejects_invalid_bearer(pool: PgPool) { + let storage = Arc::new(InMemoryStorageClient::new()); + let app = crate::routes::create_router(make_state_with_storage(pool, storage)); + + let resp = post_json( + app, + "not-a-real-token", + URI, + &serde_json::json!({ "limit": 10 }), + ) + .await; + assert_eq!(resp.status(), StatusCode::UNAUTHORIZED); +} + +#[crdb_test_macro::crdb_test(migrations = "./migrations")] +async fn backfill_v1_respects_limit(pool: PgPool) { + let storage = Arc::new(InMemoryStorageClient::new()); + let (_, user) = insert_test_user(&pool, "alice_limit").await; + let service_token = insert_service_token(&pool, "bf-limit").await; + + // Publish 5 distinct filters; null their v1_hash. + for i in 0..5 { + let toml = format!("command = \"tool-{i}\"\n"); + publish_then_null_v1(&pool, Arc::clone(&storage), &user, toml.as_bytes()).await; + } + + let app = + crate::routes::create_router(make_state_with_storage(pool.clone(), Arc::clone(&storage))); + let resp = post_json(app, &service_token, URI, &serde_json::json!({ "limit": 2 })).await; + assert_eq!(resp.status(), StatusCode::OK); + + let body = resp.into_body().collect().await.unwrap().to_bytes(); + let json: serde_json::Value = serde_json::from_slice(&body).unwrap(); + assert_eq!(json["processed"], 2, "should respect requested limit"); + assert_eq!(json["updated"], 2); + + let still_null: i64 = sqlx::query_scalar("SELECT COUNT(*) FROM filters WHERE v1_hash IS NULL") + .fetch_one(&pool) + .await + .unwrap(); + assert_eq!(still_null, 3, "3 rows should still be unprocessed"); +} + +#[crdb_test_macro::crdb_test(migrations = "./migrations")] +async fn backfill_v1_caps_limit_at_max(pool: PgPool) { + let storage = Arc::new(InMemoryStorageClient::new()); + let service_token = insert_service_token(&pool, "bf-cap").await; + let app = + crate::routes::create_router(make_state_with_storage(pool.clone(), Arc::clone(&storage))); + + // No filters seeded; just confirm the call succeeds with an out-of-range + // limit (clamped, not rejected). 
+    let resp = post_json(
+        app,
+        &service_token,
+        URI,
+        &serde_json::json!({ "limit": 1_000_000 }),
+    )
+    .await;
+    assert_eq!(resp.status(), StatusCode::OK);
+
+    let body = resp.into_body().collect().await.unwrap().to_bytes();
+    let json: serde_json::Value = serde_json::from_slice(&body).unwrap();
+    assert_eq!(json["processed"], 0);
+}
diff --git a/crates/tokf-server/src/routes/filters/mod.rs b/crates/tokf-server/src/routes/filters/mod.rs
index 68befa3b..583310af 100644
--- a/crates/tokf-server/src/routes/filters/mod.rs
+++ b/crates/tokf-server/src/routes/filters/mod.rs
@@ -1,4 +1,7 @@
 mod backfill;
+#[cfg(test)]
+#[allow(clippy::unwrap_used, clippy::expect_used)]
+mod backfill_tests;
 mod publish;
 mod regenerate;
 mod search;
@@ -9,7 +12,7 @@ mod search_tests;
 pub mod test_helpers;
 mod update_tests;
 
-pub use backfill::backfill_versions;
+pub use backfill::{backfill_v1_hashes, backfill_versions};
 pub use publish::publish_filter;
 pub use publish::stdlib::publish_stdlib;
 pub use regenerate::regenerate_examples;
diff --git a/crates/tokf-server/src/routes/filters/publish/mod.rs b/crates/tokf-server/src/routes/filters/publish/mod.rs
index d6d4a786..1184298d 100644
--- a/crates/tokf-server/src/routes/filters/publish/mod.rs
+++ b/crates/tokf-server/src/routes/filters/publish/mod.rs
@@ -88,6 +88,7 @@ async fn parse_multipart(
 /// Grouped fields for inserting a filter record.
 struct FilterInsert<'a> {
     content_hash: &'a str,
+    v1_hash: &'a str,
     command_pattern: &'a str,
     canonical_command: &'a str,
     author_id: i64,
@@ -95,23 +96,62 @@
     safety_passed: bool,
 }
 
-/// Returns `(author_username, was_new)` for the filter with `content_hash`.
+/// Resolved view of a publish attempt: the author and `content_hash` of
+/// the row that ends up in the registry. For v1-equivalent duplicates
+/// these come from the *existing* row, not the rejected submission.
+struct UpsertResult {
+    author: String,
+    content_hash: String,
+    is_new: bool,
+}
+
+/// Insert a new filter row, or resolve a duplicate.
+///
+/// Two duplicate paths: byte-identical (caught by `ON CONFLICT (content_hash)`)
+/// and v1-equivalent (caught by the pre-check below — same canonical TOML
+/// shape, different `content_hash`). Both return the existing row's author
+/// and canonical hash. Legacy rows with NULL `v1_hash` are excluded from the
+/// v1-equivalence check via SQL three-valued logic until they are backfilled.
 ///
-/// Attempts an INSERT and uses the result to distinguish new vs duplicate.
-/// Uploading to R2 before this call means orphaned objects are possible on DB
-/// failure, but they are harmless (no user-visible state and R2 uploads are
-/// idempotent on retry).
+/// Uploading to R2 before this call means orphaned objects are possible on
+/// DB failure, but they are harmless (no user-visible state and R2 uploads
+/// are idempotent on retry).
 async fn upsert_filter_record(
     state: &AppState,
     insert: &FilterInsert<'_>,
     author_username: &str,
-) -> Result<(String, bool), AppError> {
+) -> Result<UpsertResult, AppError> {
+    if let Some((existing_hash, existing_author)) = sqlx::query_as::<_, (String, String)>(
+        "SELECT f.content_hash, u.username FROM filters f
+         JOIN users u ON u.id = f.author_id
+         WHERE f.v1_hash = $1 AND f.content_hash <> $2
+         LIMIT 1",
+    )
+    .bind(insert.v1_hash)
+    .bind(insert.content_hash)
+    .fetch_optional(&state.db)
+    .await?
+ { + tracing::info!( + new_hash = %insert.content_hash, + existing_hash = %existing_hash, + v1 = %insert.v1_hash, + "publish rejected as v1-equivalent of existing filter", + ); + return Ok(UpsertResult { + author: existing_author, + content_hash: existing_hash, + is_new: false, + }); + } + let result = sqlx::query( - "INSERT INTO filters (content_hash, command_pattern, canonical_command, author_id, r2_key, safety_passed) - VALUES ($1, $2, $3, $4, $5, $6) + "INSERT INTO filters (content_hash, v1_hash, command_pattern, canonical_command, author_id, r2_key, safety_passed) + VALUES ($1, $2, $3, $4, $5, $6, $7) ON CONFLICT (content_hash) DO NOTHING", ) .bind(insert.content_hash) + .bind(insert.v1_hash) .bind(insert.command_pattern) .bind(insert.canonical_command) .bind(insert.author_id) @@ -121,7 +161,7 @@ async fn upsert_filter_record( .await?; if result.rows_affected() == 0 { - // Duplicate — fetch the original author's username + // Byte-identical duplicate — fetch the original author's username. let existing_author: String = sqlx::query_scalar( "SELECT u.username FROM filters f JOIN users u ON u.id = f.author_id @@ -130,10 +170,18 @@ async fn upsert_filter_record( .bind(insert.content_hash) .fetch_one(&state.db) .await?; - return Ok((existing_author, false)); + return Ok(UpsertResult { + author: existing_author, + content_hash: insert.content_hash.to_string(), + is_new: false, + }); } - Ok((author_username.to_string(), true)) + Ok(UpsertResult { + author: author_username.to_string(), + content_hash: insert.content_hash.to_string(), + is_new: true, + }) } pub async fn upload_tests( @@ -195,6 +243,7 @@ pub(super) async fn set_examples_generated_at(pool: &sqlx::PgPool, content_hash: /// Validated, hashed filter ready for storage. pub(super) struct PreparedFilter { pub(super) content_hash: String, + pub(super) v1_hash: String, pub(super) command_pattern: String, pub(super) canonical_command: String, pub(super) config: FilterConfig, @@ -252,8 +301,11 @@ pub(super) fn validate_and_prepare( } let content_hash = canonical_hash(&config).map_err(|e| format!("hash error: {e}"))?; + let v1_hash = + tokf_common::canonical_v1::hash(toml_str).map_err(|e| format!("v1 hash error: {e}"))?; Ok(PreparedFilter { content_hash, + v1_hash, command_pattern, canonical_command, config, @@ -411,15 +463,19 @@ pub async fn publish_filter( let insert = FilterInsert { content_hash: &prepared.content_hash, + v1_hash: &prepared.v1_hash, command_pattern: &prepared.command_pattern, canonical_command: &prepared.canonical_command, author_id: auth.user_id, r2_key: &r2_key, safety_passed, }; - let (author, is_new) = upsert_filter_record(&state, &insert, &auth.username).await?; + let upserted = upsert_filter_record(&state, &insert, &auth.username).await?; - // Upload examples to R2 AFTER insert so set_examples_generated_at finds the row. + // Below, examples + test rows are keyed by `prepared.content_hash` (the + // submitted hash) while the response uses `upserted.content_hash` (the + // existing-row hash on a v1-collision). Safe because examples uploads + // tolerate orphans and `insert_filter_tests` is gated on `is_new`. 
if let Ok((examples_json, _)) = examples_result { if let Err(e) = storage::upload_examples(&*state.storage, &prepared.content_hash, examples_json).await @@ -430,7 +486,7 @@ pub async fn publish_filter( } } - if is_new { + if upserted.is_new { insert_filter_tests(&state.db, &prepared.content_hash, &test_r2_keys).await?; // Fire-and-forget: materialize per-filter metadata + catalog index to R2 @@ -441,8 +497,8 @@ pub async fn publish_filter( ); } - let registry_url = format!("{}/filters/{}", state.public_url, prepared.content_hash); - let status = if is_new { + let registry_url = format!("{}/filters/{}", state.public_url, upserted.content_hash); + let status = if upserted.is_new { StatusCode::CREATED } else { StatusCode::OK @@ -451,9 +507,9 @@ pub async fn publish_filter( status, crate::routes::ip::rate_limit_headers(&rl), Json(PublishFilterResponse { - content_hash: prepared.content_hash, + content_hash: upserted.content_hash, command_pattern: prepared.command_pattern, - author, + author: upserted.author, registry_url, }), )) diff --git a/crates/tokf-server/src/routes/filters/publish/stdlib/mod.rs b/crates/tokf-server/src/routes/filters/publish/stdlib/mod.rs index 3f782d78..b3ff2c86 100644 --- a/crates/tokf-server/src/routes/filters/publish/stdlib/mod.rs +++ b/crates/tokf-server/src/routes/filters/publish/stdlib/mod.rs @@ -369,11 +369,12 @@ async fn persist_filter( upload_tests(state, &prepared.content_hash, prepared.test_files.clone()).await?; sqlx::query( - "INSERT INTO filters (content_hash, command_pattern, canonical_command, author_id, r2_key, is_stdlib) - VALUES ($1, $2, $3, $4, $5, TRUE) - ON CONFLICT (content_hash) DO UPDATE SET is_stdlib = TRUE", + "INSERT INTO filters (content_hash, v1_hash, command_pattern, canonical_command, author_id, r2_key, is_stdlib) + VALUES ($1, $2, $3, $4, $5, $6, TRUE) + ON CONFLICT (content_hash) DO UPDATE SET is_stdlib = TRUE, v1_hash = COALESCE(filters.v1_hash, EXCLUDED.v1_hash)", ) .bind(&prepared.content_hash) + .bind(&prepared.v1_hash) .bind(&prepared.command_pattern) .bind(&prepared.canonical_command) .bind(author_id) diff --git a/crates/tokf-server/src/routes/filters/publish/stdlib/tests.rs b/crates/tokf-server/src/routes/filters/publish/stdlib/tests.rs index 375853bb..6e387436 100644 --- a/crates/tokf-server/src/routes/filters/publish/stdlib/tests.rs +++ b/crates/tokf-server/src/routes/filters/publish/stdlib/tests.rs @@ -7,7 +7,7 @@ use axum::{ use http_body_util::BodyExt; use tower::ServiceExt; -use crate::routes::filters::test_helpers::make_state; +use crate::routes::filters::test_helpers::{expected_v1, make_state}; use crate::routes::test_helpers::insert_service_token; use crate::storage::mock::InMemoryStorageClient; @@ -303,3 +303,22 @@ async fn publish_stdlib_rejects_invalid_username(pool: PgPool) { "error should mention username: {error}" ); } + +#[crdb_test_macro::crdb_test(migrations = "./migrations")] +async fn publish_stdlib_stores_v1_hash(pool: PgPool) { + let token = insert_service_token(&pool, "ci-test").await; + let state = make_state(pool.clone()); + let app = crate::routes::create_router(state); + + let req = make_valid_request(); + let resp = post_stdlib(app, &token, &req).await; + assert_eq!(resp.status(), StatusCode::CREATED); + + // Stdlib publish goes through its own INSERT; assert v1_hash is populated. 
+    let v1_hash: Option<String> = sqlx::query_scalar("SELECT v1_hash FROM filters LIMIT 1")
+        .fetch_one(&pool)
+        .await
+        .unwrap();
+    let v1_hash = v1_hash.expect("stdlib publish must populate v1_hash");
+    assert_eq!(v1_hash, expected_v1(&req.filters[0].filter_toml));
+}
diff --git a/crates/tokf-server/src/routes/filters/publish/tests.rs b/crates/tokf-server/src/routes/filters/publish/tests.rs
index aae86b80..618764b8 100644
--- a/crates/tokf-server/src/routes/filters/publish/tests.rs
+++ b/crates/tokf-server/src/routes/filters/publish/tests.rs
@@ -15,8 +15,8 @@ use crate::storage::mock::InMemoryStorageClient;
 use crate::storage::StorageClient as _;
 
 use super::super::test_helpers::{
-    DEFAULT_PASSING_TEST, MIT_ACCEPT, insert_test_user, make_multipart, make_state,
-    make_state_with_storage, post_filter,
+    DEFAULT_PASSING_TEST, MIT_ACCEPT, expected_v1, insert_test_user, make_multipart, make_state,
+    make_state_with_storage, post_filter, publish_filter_helper,
 };
 
 const VALID_FILTER_TOML: &[u8] = b"command = \"my-tool\"\n";
@@ -536,3 +536,106 @@ async fn publish_filter_stores_examples_in_r2(pool: PgPool) {
         examples_json["safety"]
     );
 }
+
+// ── v1_hash persistence + collision tests ─────────────────────────────────
+
+#[crdb_test_macro::crdb_test(migrations = "./migrations")]
+async fn publish_stores_v1_hash_on_new_row(pool: PgPool) {
+    let (_, token) = insert_test_user(&pool, "alice_v1_store").await;
+    let app = crate::routes::create_router(make_state(pool.clone()));
+    let hash = publish_filter_helper(app, &token, VALID_FILTER_TOML, &[]).await;
+
+    let v1: Option<String> =
+        sqlx::query_scalar("SELECT v1_hash FROM filters WHERE content_hash = $1")
+            .bind(&hash)
+            .fetch_one(&pool)
+            .await
+            .unwrap();
+    assert_eq!(v1.unwrap(), expected_v1(VALID_FILTER_TOML));
+}
+
+/// Two byte-different but canonically-equivalent filters published by
+/// different authors: the second is rejected as a v1-equivalent of the
+/// first; `is_new` is false; only one row exists in the DB; the response
+/// reports the original author.
+#[crdb_test_macro::crdb_test(migrations = "./migrations")]
+async fn publish_v1_collision_returns_existing_author(pool: PgPool) {
+    let (_, alice_token) = insert_test_user(&pool, "alice_v1").await;
+    let (_, bob_token) = insert_test_user(&pool, "bob_v1").await;
+
+    // `command = "x"` vs `command = ["x"]` — these parse to different
+    // `CommandPattern` enum variants (`Single` vs `Multiple`), so
+    // `canonical_hash` (which serialises the parsed FilterConfig) yields
+    // different content_hashes. canonical_v1 collapses both into the
+    // single-string form, so they share a v1_hash. This is the smallest
+    // fixture that exercises the v1-collision branch in
+    // `upsert_filter_record` rather than the byte-equal duplicate path.
+    let alice_toml: &[u8] = b"command = \"my-tool\"\n";
+    let bob_toml: &[u8] = b"command = [\"my-tool\"]\n";
+
+    // Sanity check: same v1, different content_hash.
+ let v1_alice = expected_v1(alice_toml); + let v1_bob = expected_v1(bob_toml); + assert_eq!( + v1_alice, v1_bob, + "test setup: filters must canonicalise to the same v1 hash" + ); + let alice_cfg: tokf_common::config::types::FilterConfig = + toml::from_str(std::str::from_utf8(alice_toml).unwrap()).unwrap(); + let bob_cfg: tokf_common::config::types::FilterConfig = + toml::from_str(std::str::from_utf8(bob_toml).unwrap()).unwrap(); + assert_ne!( + tokf_common::hash::canonical_hash(&alice_cfg).unwrap(), + tokf_common::hash::canonical_hash(&bob_cfg).unwrap(), + "test setup: content_hashes must differ — otherwise the test exercises \ + the byte-duplicate path, not the v1-collision branch", + ); + + let app = crate::routes::create_router(make_state(pool.clone())); + let alice_resp = post_filter( + app, + &alice_token, + &[("filter", alice_toml), MIT_ACCEPT, DEFAULT_PASSING_TEST], + ) + .await; + assert_eq!(alice_resp.status(), StatusCode::CREATED); + let alice_body = alice_resp.into_body().collect().await.unwrap().to_bytes(); + let alice_json: serde_json::Value = serde_json::from_slice(&alice_body).unwrap(); + let alice_content_hash = alice_json["content_hash"].as_str().unwrap().to_string(); + + let app = crate::routes::create_router(make_state(pool.clone())); + let bob_resp = post_filter( + app, + &bob_token, + &[("filter", bob_toml), MIT_ACCEPT, DEFAULT_PASSING_TEST], + ) + .await; + assert_eq!( + bob_resp.status(), + StatusCode::OK, + "v1-equivalent republish should be 200 OK (treated as duplicate)" + ); + + let bob_body = bob_resp.into_body().collect().await.unwrap().to_bytes(); + let bob_json: serde_json::Value = serde_json::from_slice(&bob_body).unwrap(); + assert_eq!( + bob_json["author"], "alice_v1", + "response should report the original author, not bob" + ); + assert_eq!( + bob_json["content_hash"].as_str().unwrap(), + alice_content_hash, + "response should report the existing content_hash, not bob's" + ); + + // Only one row exists in DB. 
+    let row_count: i64 = sqlx::query_scalar("SELECT COUNT(*) FROM filters WHERE v1_hash = $1")
+        .bind(&v1_alice)
+        .fetch_one(&pool)
+        .await
+        .unwrap();
+    assert_eq!(
+        row_count, 1,
+        "only one row should exist for canonically-equivalent filters"
+    );
+}
diff --git a/crates/tokf-server/src/routes/filters/regenerate_tests.rs b/crates/tokf-server/src/routes/filters/regenerate_tests.rs
index de5da728..26907d37 100644
--- a/crates/tokf-server/src/routes/filters/regenerate_tests.rs
+++ b/crates/tokf-server/src/routes/filters/regenerate_tests.rs
@@ -1,23 +1,21 @@
 use std::sync::Arc;
 
-use axum::{
-    body::Body,
-    http::{Request, StatusCode},
-};
+use axum::http::StatusCode;
 use http_body_util::BodyExt;
 use sqlx::PgPool;
-use tower::ServiceExt;
 
 use crate::routes::test_helpers::insert_service_token;
 use crate::storage::StorageClient as _;
 use crate::storage::mock::InMemoryStorageClient;
 
 use super::super::test_helpers::{
-    insert_test_user, make_state_with_storage, publish_filter_helper,
+    insert_test_user, make_state_with_storage, post_json, publish_filter_helper,
 };
 
 const VALID_FILTER_TOML: &[u8] = b"command = \"my-tool\"\n";
 
+const URI: &str = "/api/filters/regenerate-examples";
+
 fn make_state_with_mem_storage(
     pool: PgPool,
     storage: Arc<InMemoryStorageClient>,
@@ -30,17 +28,7 @@ async fn post_regenerate(
     token: &str,
     body: &serde_json::Value,
 ) -> axum::response::Response {
-    app.oneshot(
-        Request::builder()
-            .method("POST")
-            .uri("/api/filters/regenerate-examples")
-            .header("authorization", format!("Bearer {token}"))
-            .header("content-type", "application/json")
-            .body(Body::from(serde_json::to_vec(body).unwrap()))
-            .unwrap(),
-    )
-    .await
-    .unwrap()
+    post_json(app, token, URI, body).await
 }
 
 // ── Tests ────────────────────────────────────────────────────────────────────
diff --git a/crates/tokf-server/src/routes/filters/search.rs b/crates/tokf-server/src/routes/filters/search.rs
index bbf8790e..23023458 100644
--- a/crates/tokf-server/src/routes/filters/search.rs
+++ b/crates/tokf-server/src/routes/filters/search.rs
@@ -70,6 +70,17 @@ pub struct TestFilePayload {
 pub struct DownloadPayload {
     pub filter_toml: String,
     pub test_files: Vec<TestFilePayload>,
+    /// `canonical_hash` recomputed by the current server binary on the
+    /// downloaded TOML. The URL hash is a stable lookup key; this is the
+    /// authoritative identity under the current `FilterConfig` schema. May
+    /// differ from the URL hash when the filter was published before recent
+    /// schema additions — see issue #350.
+    pub content_hash: String,
+    /// Canonical v1 hash (schema-independent; see ADR-0002). Stable
+    /// across `FilterConfig` schema additions, the long-term identity for
+    /// all filters going forward. Clients aware of v1 verify against
+    /// this; older clients ignore the field.
+    pub v1_hash: String,
 }
 
 // ── Helpers ───────────────────────────────────────────────────────────────────
@@ -328,6 +339,22 @@ pub async fn download_filter(
     let filter_toml = String::from_utf8(filter_bytes)
         .map_err(|_| AppError::Internal("filter TOML is not valid UTF-8".to_string()))?;
 
+    // Recompute the canonical hash under the current `FilterConfig` schema.
+    // For filters published before schema additions this differs from the URL
+    // hash; the client uses this as the authoritative identity. See #350.
+    let content_hash = recompute_content_hash(&filter_toml).map_err(|e| {
+        tracing::warn!("hash recompute failed for {hash}: {e}");
+        AppError::Internal("filter content cannot be re-hashed".to_string())
+    })?;
+
+    // Compute the schema-independent v1 hash (ADR-0002). Stable across
+    // future `FilterConfig` schema additions; the long-term identity for
+    // every filter going forward.
+    let v1_hash = tokf_common::canonical_v1::hash(&filter_toml).map_err(|e| {
+        tracing::warn!("v1 hash computation failed for {hash}: {e}");
+        AppError::Internal("filter content cannot be hashed under v1".to_string())
+    })?;
+
     let test_keys: Vec<String> =
         sqlx::query_scalar("SELECT r2_key FROM filter_tests WHERE filter_hash = $1")
             .bind(&hash)
@@ -364,17 +391,67 @@
         Json(DownloadPayload {
             filter_toml,
             test_files,
+            content_hash,
+            v1_hash,
         }),
     ))
 }
 
+/// Parse `toml_str` into a [`FilterConfig`] and return its current
+/// `canonical_hash`. Pulled out as a free function so the download handler
+/// stays under the line-count guideline and so unit tests can exercise it
+/// without standing up the full HTTP stack.
+///
+/// We compute on every download rather than backfilling the `filters.content_hash`
+/// DB column. The DB column is the **lookup key** (URL identity, immutable);
+/// the recomputed value is the **content identity** under the current
+/// schema and may change as `FilterConfig` evolves. Backfilling would
+/// require a coordinated re-key / migration we explicitly defer; per-request
+/// cost is one TOML parse + one SHA-256, negligible against R2 latency.
+fn recompute_content_hash(toml_str: &str) -> Result<String, String> {
+    let config: tokf_common::config::types::FilterConfig =
+        toml::from_str(toml_str).map_err(|e| format!("invalid filter TOML: {e}"))?;
+    tokf_common::hash::canonical_hash(&config).map_err(|e| format!("hash error: {e}"))
+}
+
 // ── Tests ─────────────────────────────────────────────────────────────────────
 // DB integration tests live in `search_tests.rs` (same pattern as sync/sync_tests).
 
 #[cfg(test)]
 #[allow(clippy::unwrap_used)]
 mod tests {
-    use super::escape_ilike;
+    use super::{escape_ilike, recompute_content_hash};
+
+    #[test]
+    fn recompute_content_hash_matches_canonical_hash() {
+        let toml = r#"command = "git push""#;
+        let cfg: tokf_common::config::types::FilterConfig = toml::from_str(toml).unwrap();
+        let direct = tokf_common::hash::canonical_hash(&cfg).unwrap();
+        let recomputed = recompute_content_hash(toml).unwrap();
+        assert_eq!(direct, recomputed);
+    }
+
+    #[test]
+    fn recompute_content_hash_strips_comments_and_whitespace() {
+        // Two TOMLs that parse to the same FilterConfig must produce the same
+        // hash — proving the recompute is over the parsed shape, not the raw
+        // bytes. This is the property the download endpoint relies on.
+ let a = r#"command = "git push""#; + let b = "# attribution comment\ncommand = \"git push\"\n\n"; + assert_eq!( + recompute_content_hash(a).unwrap(), + recompute_content_hash(b).unwrap() + ); + } + + #[test] + fn recompute_content_hash_errors_on_invalid_toml() { + let err = recompute_content_hash("this is not [[[ valid toml").unwrap_err(); + assert!( + err.contains("invalid filter TOML"), + "expected parse-error message, got: {err}" + ); + } #[test] fn escape_ilike_leaves_normal_text_unchanged() { diff --git a/crates/tokf-server/src/routes/filters/search_tests.rs b/crates/tokf-server/src/routes/filters/search_tests.rs index 7e02199a..f9bd869b 100644 --- a/crates/tokf-server/src/routes/filters/search_tests.rs +++ b/crates/tokf-server/src/routes/filters/search_tests.rs @@ -206,6 +206,21 @@ async fn download_returns_toml_content(pool: PgPool) { json["filter_toml"].as_str().unwrap(), std::str::from_utf8(filter_toml).unwrap() ); + // For a freshly-published filter the URL hash and the server's + // recomputed content_hash must agree — they're computed by the same + // canonical_hash on the same parsed FilterConfig. + assert_eq!( + json["content_hash"].as_str().unwrap(), + hash, + "content_hash should match URL hash for a freshly-published filter" + ); + // v1_hash is schema-independent (ADR-0002) — always present, always + // formatted `v1:<64-hex>`. + let v1_hash = json["v1_hash"].as_str().expect("v1_hash field present"); + assert!( + v1_hash.starts_with("v1:") && v1_hash.len() == 67, + "v1_hash must be `v1:<64-hex>`, got {v1_hash}" + ); // publish_filter_helper auto-adds a default passing test when none provided let test_files = json["test_files"].as_array().unwrap(); assert_eq!(test_files.len(), 1, "expected 1 auto-added default test"); diff --git a/crates/tokf-server/src/routes/filters/test_helpers.rs b/crates/tokf-server/src/routes/filters/test_helpers.rs index 09efd16c..eb031b9c 100644 --- a/crates/tokf-server/src/routes/filters/test_helpers.rs +++ b/crates/tokf-server/src/routes/filters/test_helpers.rs @@ -148,6 +148,35 @@ pub async fn get_request(app: axum::Router, token: &str, uri: &str) -> axum::res .unwrap() } +/// Compute the canonical-v1 hash of TOML for assertion in tests. Accepts +/// `&[u8]` or `&str`. Panics if the bytes aren't UTF-8 or +/// `canonical_v1::hash` errors — both impossible for the small inline +/// fixtures used here. +pub fn expected_v1(toml: impl AsRef<[u8]>) -> String { + tokf_common::canonical_v1::hash(std::str::from_utf8(toml.as_ref()).unwrap()).unwrap() +} + +/// POST a JSON body to a URI with a bearer token. Used by every endpoint +/// test whose handler accepts `Json<...>` over `application/json`. +pub async fn post_json( + app: axum::Router, + token: &str, + uri: &str, + body: &serde_json::Value, +) -> axum::response::Response { + app.oneshot( + Request::builder() + .method("POST") + .uri(uri) + .header("authorization", format!("Bearer {token}")) + .header("content-type", "application/json") + .body(Body::from(serde_json::to_vec(body).unwrap())) + .unwrap(), + ) + .await + .unwrap() +} + /// POST `/api/filters` helper that returns the full response (for publish-specific tests). 
 pub async fn post_filter(
     app: axum::Router,
diff --git a/crates/tokf-server/src/routes/mod.rs b/crates/tokf-server/src/routes/mod.rs
index 002635df..8c862c15 100644
--- a/crates/tokf-server/src/routes/mod.rs
+++ b/crates/tokf-server/src/routes/mod.rs
@@ -55,6 +55,10 @@ pub fn create_router(state: AppState) -> Router {
             "/api/filters/backfill-versions",
             post(filters::backfill_versions),
         )
+        .route(
+            "/api/filters/backfill-v1-hashes",
+            post(filters::backfill_v1_hashes),
+        )
         .route("/api/sync", post(sync::sync_usage))
         .route("/api/catalog/refresh", post(catalog::refresh_catalog))
        .route("/api/catalog/grouped", get(catalog::get_grouped_catalog))
diff --git a/crates/tokf-server/src/storage/mod.rs b/crates/tokf-server/src/storage/mod.rs
index 0ca4f742..690d1084 100644
--- a/crates/tokf-server/src/storage/mod.rs
+++ b/crates/tokf-server/src/storage/mod.rs
@@ -18,6 +18,21 @@ pub trait StorageClient: Send + Sync {
     async fn delete(&self, key: &str) -> anyhow::Result<()>;
 }
 
+/// Fetch an object as a UTF-8 string. Errors when the key is missing or
+/// when the bytes are not valid UTF-8.
+///
+/// # Errors
+///
+/// Returns an error if the storage call fails, the key does not exist, or
+/// the content is not valid UTF-8.
+pub async fn get_utf8(storage: &dyn StorageClient, key: &str) -> anyhow::Result<String> {
+    let bytes = storage
+        .get(key)
+        .await?
+        .ok_or_else(|| anyhow::anyhow!("storage object missing: {key}"))?;
+    String::from_utf8(bytes).map_err(|e| anyhow::anyhow!("storage object not UTF-8: {e}"))
+}
+
 /// Upload a filter TOML if not already stored. Returns the R2 key.
 ///
 /// Key format: `filters/{content_hash}/filter.toml`
diff --git a/docs/adr/0002-canonical-v1-hash.md b/docs/adr/0002-canonical-v1-hash.md
new file mode 100644
index 00000000..3e4b568c
--- /dev/null
+++ b/docs/adr/0002-canonical-v1-hash.md
@@ -0,0 +1,171 @@
+# ADR-0002: Canonical v1 Filter Hash
+
+## Status
+
+Accepted (2026-04-28).
+
+## Context
+
+`tokf_common::hash::canonical_hash(&FilterConfig)` produces a SHA-256 over `serde_json::to_vec(filter_config)`. The output is sensitive to every detail of the in-memory `FilterConfig` shape: adding a default-valued field with `#[serde(default)]` silently changes the hash for every filter that doesn't reference the new field, because that field is now serialised into the JSON.
+
+This caused issue #350: filters published before recent schema additions cannot be re-verified by current clients, even though their content is unchanged. The current `filters` table also contains accidental duplicates — multiple rows for the same TOML, each with its own `content_hash` because they were published under different schema generations. Statistics, leaderboards, and search results are split across these duplicates.
+
+This ADR specifies **canonical-v1**: a frozen, byte-stable, schema-decoupled hash function that the project commits to going forward. v1 hashes a normalised TOML byte stream derived from the input filter, not the parsed Rust structure. Adding fields to `FilterConfig` does not affect v1 hashes for filters that do not use those fields.
+
+## Hash output format
+
+```
+v1:<64-character lowercase hexadecimal SHA-256 digest>
+```
+
+Total length: 67 characters. Consumers identify the hash version by the `v1:` prefix.
+
+## Algorithm
+
+```
+canonical_v1(toml_str: bytes) -> "v1:" + <64 lowercase hex digits>
+
+1. Parse toml_str as TOML 1.0 to a `toml::Value` tree. On parse error, return Err.
+2. Walk the tree, applying these transformations in order:
+   a. For each path in the policy table marked unordered, sort that array.
+   b. For each `command` field: if its value is a single-entry array, replace
+      with the scalar form.
+   c. Recursively prune entries equal to `false`, `[]`, or `{}` (TOML-level
+      defaults).
+3. Emit via `toml::to_string(&value)`.
+4. Compute SHA-256 over the emitted UTF-8 bytes.
+5. Format as "v1:" + 64 lowercase hex digits.
+```
+
+This is parse → normalise → emit → hash. Conceptually a TOML round-trip with three normalisation passes. The implementation is ~50 lines.
+
+## Input contract
+
+- TOML 1.0, UTF-8 encoded, no leading byte-order mark.
+- The root must be a table (TOML's normal case).
+- Floats must be finite. `inf`, `-inf`, and `nan` cause an error rather than a hash.
+
+The canonicaliser never inspects `FilterConfig`. Any TOML the parser accepts is hashable; the output is independent of whether the document deserialises into the current Rust structure.
+
+## Walk: it's a `toml::Value` tree
+
+"Walk the tree" means recursing through `toml::Value::Table` and `toml::Value::Array` nodes — a generic TOML AST, no Rust-type knowledge. The policy table matches by **structural path in the TOML**, not by `FilterConfig` field type. The path `skip` matches the top-level `skip` key in the document; `[on_success].skip` matches the `skip` key inside the `on_success` table. New fields added to `FilterConfig` later have no effect on the canonicaliser unless the user writes them in their TOML — and even then, default policy (preserve order) handles them safely.
+
+## Policy table
+
+For each known field path, the policy is **ordered** (preserve source order, the safe default for new fields too) or **unordered** (sort by the values' canonical representation).
+
+| Path | Policy | Reason |
+|---|---|---|
+| `skip` | unordered | match-any: line is skipped if any pattern matches; order is irrelevant |
+| `keep` | unordered | match-any |
+| `[on_success].skip` | unordered | same |
+| `[on_failure].skip` | unordered | same |
+| `[[step]]` | ordered | pipeline; runs top-to-bottom |
+| `[[match_output]]` | ordered | first-match-wins |
+| `[[section]]` | ordered | state-machine transitions, evaluated in order |
+| `[[replace]]` | ordered | applied sequentially; later rules see earlier rules' output |
+| `[[variant]]` | ordered | priority; first match delegates |
+| `[[chunk]]` | ordered | sub-block parsing depends on document position |
+| `[[chunk]].[[body_extract]]` | ordered | same |
+| `[[chunk]].[[aggregate]]` | ordered | same |
+| `passthrough_args` | ordered | shell argument order matters |
+| `command` (when `Multiple`) | ordered | patterns matched left-to-right by specificity |
+
+Default policy for any path not in the table: **ordered**. New fields the canonicaliser doesn't recognise are emitted in source order, which is always safe.
+
+Sort key for unordered arrays: each array element's canonical TOML byte representation, compared byte-wise. For arrays of strings (the common case — `skip = ["a", "b"]`), this is a lexicographic byte sort of the strings. For arrays of objects, it would be a byte sort of each object's emitted form (in practice, no unordered array-of-tables fields exist, so this case is reserved for future use).
+
+`parse.group.labels` is already a `BTreeMap` and emits in key order naturally; it does not need to appear in the policy table.
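+
+To make the walk concrete, here is a minimal sketch of the pass in Rust. It is illustrative rather than the shipped `tokf_common` implementation: the policy table is abbreviated, the names (`Policy`, `policy_for`, `is_default`, `canonicalise`) are invented for the sketch, and hashing is shown via the `sha2` crate.
+
+```rust
+use sha2::{Digest, Sha256};
+use toml::Value;
+
+#[derive(Clone, Copy, PartialEq)]
+enum Policy {
+    Ordered,
+    Unordered,
+}
+
+/// Policy lookup keyed by structural path ("skip", "on_success.skip", ...).
+/// Anything not listed falls back to Ordered, the safe default.
+fn policy_for(path: &str) -> Policy {
+    match path {
+        "skip" | "keep" | "on_success.skip" | "on_failure.skip" => Policy::Unordered,
+        _ => Policy::Ordered,
+    }
+}
+
+/// True for the TOML-level defaults that step 2c prunes.
+fn is_default(v: &Value) -> bool {
+    match v {
+        Value::Boolean(b) => !*b,
+        Value::Array(a) => a.is_empty(),
+        Value::Table(t) => t.is_empty(),
+        _ => false,
+    }
+}
+
+fn canonicalise(value: &mut Value, path: &str) {
+    match value {
+        Value::Table(table) => {
+            let keys: Vec<String> = table.keys().cloned().collect();
+            for key in keys {
+                let child_path =
+                    if path.is_empty() { key.clone() } else { format!("{path}.{key}") };
+                let child = table.get_mut(&key).expect("key just listed");
+                canonicalise(child, &child_path);
+                // Step 2b: collapse `command = ["x"]` to `command = "x"`.
+                if key == "command" {
+                    if let Value::Array(items) = child {
+                        if items.len() == 1 {
+                            *child = items[0].clone();
+                        }
+                    }
+                }
+                // Step 2c: prune defaults bottom-up, so a table emptied by
+                // pruning its children is itself pruned at the next level up.
+                if table.get(&key).is_some_and(is_default) {
+                    table.remove(&key);
+                }
+            }
+        }
+        Value::Array(items) => {
+            for item in items.iter_mut() {
+                canonicalise(item, path);
+            }
+            // Step 2a: in practice only string arrays are unordered.
+            if policy_for(path) == Policy::Unordered {
+                items.sort_by(|a, b| a.as_str().cmp(&b.as_str()));
+            }
+        }
+        _ => {}
+    }
+}
+
+/// Parse → normalise → emit → hash (steps 1–5).
+fn canonical_v1(toml_str: &str) -> Result<String, String> {
+    let mut value: Value = toml_str.parse().map_err(|e| format!("parse error: {e}"))?;
+    canonicalise(&mut value, "");
+    let emitted = toml::to_string(&value).map_err(|e| format!("emit error: {e}"))?;
+    let digest = Sha256::digest(emitted.as_bytes());
+    let hex: String = digest.iter().map(|b| format!("{b:02x}")).collect();
+    Ok(format!("v1:{hex}"))
+}
+```
+
+The sketch makes the freeze surface visible: the only moving parts are the policy table, the pruning rules, and the `toml` crate's parser and emitter, which is exactly the surface the corpus below pins down.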
+ +## Special form: `command` + +The TOML schema admits two equivalent forms: + +```toml +command = "git push" # Single +command = ["git push"] # Multiple with one entry +``` + +These produce identical filtering behaviour. Canonical form: emit `command = "x"` whenever there is exactly one pattern; emit `command = ["x", "y"]` only when there are two or more. A filter author who switches between the two forms does not unintentionally change the hash. + +## Default omission + +Recursively, in the AST, drop: + +- Boolean values equal to `false`. +- Arrays with zero elements. +- Tables with zero key/value pairs. + +This makes `dedup = false` indistinguishable from omitting `dedup`, and `skip = []` indistinguishable from omitting `skip`. Exact consequences: + +- `0`-valued integers are **not** omitted. `dedup_window = 0` is meaningfully different from `dedup_window` being absent. +- Empty strings `""` are **not** omitted. They are a value in their own right. +- Boolean `true` is never omitted. + +The omission is recursive: a table whose only contents become empty after pruning (e.g., `[on_success]` with `output = ""` and `skip = []` only) becomes an empty table itself and is then pruned. This collapses chains of trivial wrappers cleanly. + +## Dependency on the `toml` crate + +v1's emission is delegated to `toml::to_string(&value)`. This is a deliberate choice that trades implementation simplicity for a dependency on the crate's emission stability. To make the dependency explicit and controlled: + +- Pin `toml` in `Cargo.toml` to an **exact** version: `toml = "=1.0.X"` rather than `toml = "1.0"`. +- Treat every `toml` version bump as a CI gate. The frozen test corpus (see below) is the contract: if any corpus entry's hash changes after the bump, either stay pinned or ship v2. +- In practice the toml crate's emission is stable for the inputs we care about — sorted-key tables (`toml::Value`'s `Table` is a `BTreeMap`), straightforward scalar values, no exotic Unicode patterns. Risk surface is low. + +If a future toml release does break the corpus and we cannot pin (because of, say, a security fix we need to take), v2 ships at that moment. v1 hashes already in the wild remain forever computable by checking out the older crate version. + +## Test corpus + +The implementation is paired with a frozen corpus at `crates/tokf-common/tests/hash_corpus/v1/`. Each fixture is a `.toml` with a `.expected` containing the recorded `v1:` output. CI asserts every fixture still produces its expected hash on every build. **Modifying an existing `.expected` value in place is forbidden** — that silently invalidates a real-world filter's identity. + +The corpus must include, at minimum, fixtures exercising: + +- A minimal filter (`command = "x"` only). +- Each unordered field with values written in non-canonical order, plus a fixture in canonical order — both must hash identically. +- Each ordered field with multiple entries; reordering them must produce a *different* hash. +- The `command` collapse: `command = "x"`, `command = ["x"]`, and `command = ["x", "y"]` — first two equal, third different. +- Default-omission cases: `dedup = false`, `skip = []`, empty `[fallback]` table — all must hash identically to the same TOML with those keys absent. +- Strings exercising every escape: backslash, quote, newline, tab, control characters. +- Integers including negative, large, and the `i64` boundary; equivalent inputs from hex/octal/binary forms (which the parser normalises) hash identically to their decimal form. 
+- Floats including subnormal, very-small, very-large, integer-valued.
+- Unicode strings in NFC and NFD forms. Nothing in the v1 pipeline normalises Unicode (the parser and emitter preserve string content byte-for-byte), so the two forms are distinct inputs and hash differently; the fixtures record that behaviour explicitly.
+
+A fixture-listing test asserts that every directory under `tests/hash_corpus/<version>/` corresponds to a registered hash version, that every `.toml` has a sibling `.expected`, and that no orphan `.expected` files exist.
+
+## When to ship v2
+
+A new version (v2, v3, …) is shipped only when one of the following is true:
+
+1. **A bug is discovered in v1's canonicaliser** that produces non-deterministic output for some valid input. v2 is the corrected version; v1 remains as-is for filters already published.
+2. **A `toml` crate upgrade** changes our corpus output and the upgrade cannot be deferred (security fix, etc.). v2 lives with the new emission; v1 is computed by checking out the old crate.
+3. **The policy table needs an existing entry's policy *changed*** (e.g., reclassifying `[[step]]` as unordered). This would alter v1 output for existing filters, so it requires a new version.
+
+Adding new entries to the policy table for fields that did not exist when v1 shipped is **not** a v2 trigger, provided the addition uses the conservative default (ordered) or the obviously-correct semantic (unordered for set-like fields). Such additions are recorded as v1 spec amendments in this ADR's revision history below.
+
+When v2 ships, both v1 and v2 hashers exist in `tokf_common::hash::KNOWN_VERSIONS`. The server emits both during the migration window. Clients verify against whichever they recognise. v1's implementation stays in the codebase — there is no value in losing the ability to verify v1 hashes already in the wild.
+
+## Worked equivalence examples
+
+All inputs in each row hash to the same v1 output:
+
+| Input A | Input B | Reason |
+|---|---|---|
+| `command = "git push"` | `command = "git push"\n\n` | toml-crate emission collapses whitespace |
+| `command = "x"\n# old filter\nskip = []` | `command = "x"` | comments stripped at parse, default omitted |
+| `skip = ["a", "b"]` | `skip = ["b", "a"]` | policy table: `skip` unordered |
+| `command = "x"` | `command = ["x"]` | special-form collapse |
+| `[parse.group.labels]\nA = "added"\nM = "modified"` | `[parse.group.labels]\nM = "modified"\nA = "added"` | `Table` is a `BTreeMap`, sorted on emit |
+| `dedup = false\ncommand = "x"` | `command = "x"` | default omission |
+
+These intentionally hash differently:
+
+- `command = "x"` vs `command = "y"` — different content.
+- `skip = ["a"]` vs `strip_lines_matching = ["a"]` — alias renames change the canonical key bytes; same logical effect, different canonical form.
+- `[[step]]\nrun = "a"\n[[step]]\nrun = "b"` vs `[[step]]\nrun = "b"\n[[step]]\nrun = "a"` — `[[step]]` is ordered.
+- `command = ["a", "b"]` vs `command = ["b", "a"]` — `Multiple` patterns are ordered (specificity).
+- `dedup = true` vs (no `dedup` key) — explicit `true` is preserved; only `false` is omitted.
+
+## Revision history
+
+- **2026-04-28** — Initial draft.
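+
+## Appendix: corpus gate sketch (non-normative)
+
+A rough shape for the CI gate described in the Test corpus section. The fixture layout (each `.toml` with a sibling `.expected`) comes from this ADR; the test name and the exact traversal are illustrative, and `tokf_common::canonical_v1::hash` is assumed to be the public entry point.
+
+```rust
+use std::path::Path;
+
+/// Every fixture must keep hashing to its recorded value, and every
+/// `.expected` must have a sibling `.toml` (no orphans).
+#[test]
+fn v1_corpus_is_frozen() {
+    let dir = Path::new("tests/hash_corpus/v1");
+    for entry in std::fs::read_dir(dir).expect("corpus directory exists") {
+        let path = entry.expect("readable dir entry").path();
+        match path.extension().and_then(|e| e.to_str()) {
+            Some("toml") => {
+                let toml_str = std::fs::read_to_string(&path).expect("readable fixture");
+                let expected = std::fs::read_to_string(path.with_extension("expected"))
+                    .expect("every .toml has a sibling .expected");
+                let actual =
+                    tokf_common::canonical_v1::hash(&toml_str).expect("hashable fixture");
+                // Never update `.expected` in place: a change here means a
+                // published filter's identity silently changed.
+                assert_eq!(
+                    actual,
+                    expected.trim(),
+                    "frozen hash changed: {}",
+                    path.display()
+                );
+            }
+            Some("expected") => {
+                assert!(
+                    path.with_extension("toml").exists(),
+                    "orphan .expected with no .toml: {}",
+                    path.display()
+                );
+            }
+            _ => {}
+        }
+    }
+}
+```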