diff --git a/diff/INTERNAL.md b/diff/INTERNAL.md new file mode 100644 index 000000000..a3c5d52b1 --- /dev/null +++ b/diff/INTERNAL.md @@ -0,0 +1,175 @@ +# Implementation and Working Principles of Patience Diff + +Patience Diff was first proposed by Bram Cohen. It is essentially a heuristic +text-partitioning strategy that can cooperate with other diff algorithms. Its +core idea is as follows: + +- For the text blocks `old` and `new`, first count the occurrences of each + line, then use the lines whose contents appear exactly once in `old` and + exactly once in `new` as candidate anchors. + +Suppose `old` looks like this: + +``` +1| Ruby: ruby-lang.org +2| # +3| Python: python.org +4| # +5| MoonBit: www.moonbitlang.com +6| # +7| Perl: use.perl.org +``` + +In this example, within `old`, `#` appears more than once, so it is not unique +on the `old` side. The other four lines are unique in `old`, and they can +serve as anchors only if they also appear exactly once in `new`. + +- Then, among the lines that are unique on both sides, select those that + appear in both blocks to form a candidate anchor sequence. + +Suppose `new` looks like this: + +``` +1| Python: python.org +2| # +3| MoonBit: www.moonbitlang.com +4| # +5| Javascript: tc39.es +6| # +7| Ruby: ruby-lang.org +``` + +Then the candidate anchor sequence is: + +``` +"Ruby: ruby-lang.org": old index = 1, new index = 7 +"Python: python.org": old index = 3, new index = 1 +"MoonBit: www.moonbitlang.com": old index = 5, new index = 3 +``` + +The candidate anchor sequence must ensure that one column of `index` values is +ordered. Here we arrange it from top to bottom in ascending order of `old +index`. + +- Next, within this candidate sequence, search for the longest increasing + subsequence by `new index`. Once this is done, the indices on both sides are + in ascending order. 
+ +The sequence found from the candidate anchor sequence above is: + +``` +"Python: python.org": old index = 3, new index = 1 +"MoonBit: www.moonbitlang.com": old index = 5, new index = 3 +``` + +- Finally, split the two text blocks according to the anchors, and apply a + basic diff algorithm to the resulting subranges. + +There is another approach commonly described online: apply patience again to +each subrange until no suitable anchors can be found. This was the first +version of Patience proposed by Bram Cohen. In a later blog post, he argued +that in practical use, a single split did not appear significantly worse than +recursive splitting, while being simpler, so this document still uses the +single-split approach. + +Its basic principle is this simple. In many real code edits, this heuristic +works well. + +Its implementation is basically just the process above. The only relatively +complicated part is finding the longest increasing subsequence, which uses an +algorithm called `Patience sort` (also the origin of the name Patience diff). + +## Patience Sort + +The name `Patience Sort` is said to come from a solitaire card game called +`Patience`. At the start of the game there is a shuffled deck of cards +(corresponding to an unordered `new index` list). By dealing the cards one by +one into a series of piles on the table according to a few rules, the longest +increasing subsequence can be found. + +Below, we use an array containing the numbers 1 through 13 as an example. Each +number appears only once, because they are all filtered candidate anchors. + +``` +5 9 4 6 12 8 7 1 10 11 3 2 13 +``` + +First, take out `5`. Since the table is currently empty, create a new pile to +hold it. + +``` +9 4 6 12 8 7 1 10 11 3 2 13 +------------------------------------------------- + + +5 +``` + +Next, take out `9`. Since `9` is greater than `5`, it cannot be placed on top +of `5`, so it can only go to the right of `5`, forming a new pile. 
This time, +unlike the first step, we need to record some extra information. The last +number `9` compares against is `5`, so we create a *back pointer* from `9` to +`5`. + +``` +4 6 12 8 7 1 10 11 3 2 13 9 -> 5 +--------------------------------------------- + + +5 9 +``` + +Next, take out `4`. Since `4` is smaller than `5`, place it directly on top of +`5` without recording a back pointer. + +``` +6 12 8 7 1 10 11 3 2 13 9 -> 5 +----------------------------------------- + + +4 +5 9 +``` + +The following steps work in the same way. Take out `6`: `6` is greater than +`4` but smaller than `9`, so place it on top of `9` and record a back pointer +from `6` to `4`. + +``` +12 8 7 1 10 11 3 2 13 9 -> 5 +------------------------------------- 6 -> 4 + + +4 6 +5 9 +``` + +By repeating this process, we eventually get the following piles and back +pointer records: + +``` + 9 -> 5 + 6 -> 4 + 12 -> 6 + 8 -> 6 + 7 -> 6 + 10 -> 7 + 11 -> 10 + 3 -> 1 + 2 -> 1 + 13 -> 11 + + 2 +1 3 7 +4 6 8 +5 9 12 10 11 13 +``` + +Finally, start from `13`, the top card of the rightmost pile, and follow the +back pointers: `13 -> 11 -> 10 -> 7 -> 6 -> 4`. Reverse this sequence to get +one longest increasing subsequence. + +When translating this process into code, there is one optimization that can be +made: because the top elements of these piles are ordered, once the number of +piles becomes large, binary search can be used to find the final position for a +new card. diff --git a/diff/README.mbt.md b/diff/README.mbt.md new file mode 100644 index 000000000..b9389836f --- /dev/null +++ b/diff/README.mbt.md @@ -0,0 +1,119 @@ +# Diff + +Compute edit scripts between two sequences using the Myers diff algorithm by +default, or patience diff when you pass `algorithm=@diff.Patience`. + +`Diff` works with any element type that implements `Hash + Eq`. Constructing a +`Diff[T]` bundles the source arrays with the edit script. 
Call `group` on the +result to split far-apart changes into separate `Hunk[T]` values for +unified-diff-style output. + +## Compute A Diff + +`Diff(old~, new~)` computes the full sequence of `Delete`, `Insert`, and +`Equal` operations, accessible via the `edits` field. + +```mbt check +///| +test "Diff computes deletes inserts and equals" { + let old = ["apple", "pear", "banana"][:] + let new = ["apple", "banana", "coconut"][:] + + let d = @diff.Diff(old~, new~) + + assert_eq(d.edits.length(), 4) + assert_true( + d.edits[:] + is [ + Equal(old_index=0, new_index=0, len=1), + Delete(old_index=1, new_index=1, len=1), + Equal(old_index=2, new_index=1, len=1), + Insert(old_index=3, new_index=2, len=1), + ], + ) +} +``` + +## Prefer Unique Anchors With Patience Diff + +Pass `algorithm=@diff.Patience` to `Diff(old~, new~, algorithm=@diff.Patience)` to enable +patience diff. This first finds elements that appear exactly once in both +inputs and uses them as anchors, then runs Myers diff on the unmatched ranges +between those anchors. This can produce more stable result when repeated +elements move around. + +```mbt check +///| +test "patience diff keeps unique anchors in place" { + let old = ["unique", "dup", "dup"][:] + let new = ["dup", "unique", "dup"][:] + + let myers = @diff.Diff(old~, new~) + let patience = @diff.Diff(old~, new~, algorithm=@diff.Patience) + + assert_true( + myers.edits[:] + is [ + Delete(old_index=0, new_index=0, len=1), + Equal(old_index=1, new_index=0, len=1), + Insert(old_index=2, new_index=1, len=1), + Equal(old_index=2, new_index=2, len=1), + ], + ) + assert_true( + patience.edits[:] + is [ + Insert(old_index=0, new_index=0, len=1), + Equal(old_index=0, new_index=1, len=2), + Delete(old_index=2, new_index=3, len=1), + ], + ) +} +``` + +## Group Into Hunks And Render + +`group` splits the edit script into `Hunk[T]` values, keeping `radius` lines +of surrounding context (default 3). 
`radius` must be non-negative, and +`radius=0` emits hunks without surrounding context. Each `Hunk[T]` implements +`Show`, so you can print it directly as unified-diff output. + +```mbt check +///| +test "group splits distant changes into separate hunks" { + let old = [ + " aaaaaaaaaa", " bbbbbbbbbb", " cccccccccc", " dddddddddd", " eeeeeeeeee", + " ffffffffff", " gggggggggg", " hhhhhhhhhh", + ][:] + let new = [ + " aaaaaaaaaa", " xxxxxxxxxx", " cccccccccc", " dddddddddd", " eeeeeeeeee", + " ffffffffff", " yyyyyyyyyy", " hhhhhhhhhh", + ][:] + + let hunks = @diff.Diff(old~, new~).group(radius=1) + + assert_eq(hunks.length(), 2) + assert_eq( + hunks[0].to_string(), + ( + #|@@ -1,3 +1,3 @@ + #| aaaaaaaaaa + #|- bbbbbbbbbb + #|+ xxxxxxxxxx + #| cccccccccc + #| + ), + ) + assert_eq( + hunks[1].to_string(), + ( + #|@@ -6,3 +6,3 @@ + #| ffffffffff + #|- gggggggggg + #|+ yyyyyyyyyy + #| hhhhhhhhhh + #| + ), + ) +} +``` diff --git a/diff/backpointer.mbt b/diff/backpointer.mbt new file mode 100644 index 000000000..15652b479 --- /dev/null +++ b/diff/backpointer.mbt @@ -0,0 +1,38 @@ +// Copyright 2026 International Digital Economy Academy +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +///| +/// A node in the predecessor chain recovered from the patience-sorting phase. +/// +/// `value` stores the current pile entry, and `prev` points to the entry chosen +/// from the previous pile so the final increasing chain can be reconstructed +/// once the last pile is known. 
+priv struct BackPointer { + value : (Int, Int) + prev : BackPointer? +} + +///| +/// Materialize the predecessor chain ending at `self`, preserving left-to-right +/// subsequence order. +fn BackPointer::to_array(self : BackPointer) -> Array[(Int, Int)] { + let result = [] + let mut self = self + while self.prev is Some(prev) { + result.push(self.value) + self = prev + } + result.push(self.value) + return result.rev() +} diff --git a/diff/compact.mbt b/diff/compact.mbt new file mode 100644 index 000000000..fa29d91e5 --- /dev/null +++ b/diff/compact.mbt @@ -0,0 +1,50 @@ +// Copyright 2026 International Digital Economy Academy +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +///| +/// A view over `source` filtered through a stable list of retained indices. +/// +/// `indices[i]` stores the original position in `source` for the `i`th visible +/// element. This lets the Myers implementation skip values that cannot match +/// while still translating results back to the original coordinates. +priv struct CompactedElements[T]((ArrayView[T], FixedArray[Int])) + +///| +/// Read the `index`th retained element. +fn[T] CompactedElements::op_get(self : CompactedElements[T], index : Int) -> T { + let (source, indices) = self.0 + source[indices[index]] +} + +///| +/// Map a retained position back to its original index in `source`. 
+fn[T] CompactedElements::get_index( + self : CompactedElements[T], + index : Int, +) -> Int { + let (_, indices) = self.0 + indices[index] +} + +///| +/// Return the number of retained elements visible through this compacted view. +fn[T] CompactedElements::indices_length(self : CompactedElements[T]) -> Int { + self.0.1.length() +} + +///| +/// Return the length of the underlying unfiltered source sequence. +fn[T] CompactedElements::source_length(self : CompactedElements[T]) -> Int { + self.0.0.length() +} diff --git a/diff/diag_wbtest.mbt b/diff/diag_wbtest.mbt new file mode 100644 index 000000000..b53a1e4bf --- /dev/null +++ b/diff/diag_wbtest.mbt @@ -0,0 +1,145 @@ +// Copyright 2026 International Digital Economy Academy +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +///| +fn[T] identity_compacted(source : Array[T]) -> CompactedElements[T] { + let indices = FixedArray::make(source.length(), 0) + for i = 0; i < source.length(); i = i + 1 { + indices[i] = i + } + CompactedElements((source[:], indices)) +} + +///| +fn run_diag_range( + old : Array[Int], + new : Array[Int], + lhs_offset : Int, + lhs_limit : Int, + rhs_offset : Int, + rhs_limit : Int, + too_expensive : Int, + find_minimal : Bool, +) -> Partition { + let forward_search_diagonal = FixedArray::make( + old.length() + new.length() + 3, + 0, + ) + let backward_search_diagonal = FixedArray::make( + old.length() + new.length() + 3, + 0, + ) + let lhs = identity_compacted(old) + let rhs = identity_compacted(new) + let diagonal_shift = new.length() + 1 + diag( + forward_search_diagonal~, + backward_search_diagonal~, + diagonal_shift~, + lhs~, + rhs~, + lhs_offset~, + lhs_limit~, + rhs_offset~, + rhs_limit~, + too_expensive~, + find_minimal~, + ) +} + +///| +fn run_diag( + old : Array[Int], + new : Array[Int], + too_expensive : Int, + find_minimal : Bool, +) -> Partition { + run_diag_range( + old, + new, + 0, + old.length(), + 0, + new.length(), + too_expensive, + find_minimal, + ) +} + +///| +fn patterned_array(length : Int, shift : Int, modulo : Int) -> Array[Int] { + let result = Array::make(length, 0) + for i = 0; i < length; i = i + 1 { + result[i] = (i + shift) % modulo + } + result +} + +///| +test "diag reaches lower diagonal boundary when old is longer" { + let partition = run_diag([1, 2, 3, 4, 5], [10], 1024, true) + assert_true(partition.lo_minimal) + assert_true(partition.hi_minimal) + assert_true(partition.lhs_midpoint >= 0 && partition.lhs_midpoint <= 5) + assert_true(partition.rhs_midpoint >= 0 && partition.rhs_midpoint <= 1) +} + +///| +test "diag reaches upper diagonal boundary when new is longer" { + let partition = run_diag([1], [10, 11, 12, 13, 14], 1024, true) + assert_true(partition.lo_minimal) + assert_true(partition.hi_minimal) + 
assert_true(partition.lhs_midpoint >= 0 && partition.lhs_midpoint <= 1) + assert_true(partition.rhs_midpoint >= 0 && partition.rhs_midpoint <= 5) +} + +///| +test "diag odd parity can return from forward search" { + let partition = run_diag([1, 2], [9], 1024, true) + assert_true(partition.lo_minimal) + assert_true(partition.hi_minimal) +} + +///| +test "diag heuristic path can choose either side" { + let saw_forward = @ref.Ref::new(false) + let saw_backward = @ref.Ref::new(false) + for n = 3; n <= 8; n = n + 1 { + for m = 3; m <= 8; m = m + 1 { + for old_shift = 0; old_shift < 5; old_shift = old_shift + 1 { + for new_shift = 0; new_shift < 5; new_shift = new_shift + 1 { + let old = patterned_array(n, old_shift, 5) + let new = patterned_array(m, new_shift, 5) + for xoff = 0; xoff <= 1; xoff = xoff + 1 { + for yoff = 0; yoff <= 1; yoff = yoff + 1 { + if xoff < n && yoff < m { + let partition = run_diag_range( + old, new, xoff, n, yoff, m, 0, false, + ) + if partition.lo_minimal && !partition.hi_minimal { + saw_forward.val = true + } + if !partition.lo_minimal && partition.hi_minimal { + saw_backward.val = true + } + } + } + } + } + } + } + } + assert_true(saw_forward.val) + assert_true(saw_backward.val) +} diff --git a/diff/diff.mbt b/diff/diff.mbt new file mode 100644 index 000000000..eec1e44ca --- /dev/null +++ b/diff/diff.mbt @@ -0,0 +1,743 @@ +// Copyright 2026 International Digital Economy Academy +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +///| +/// A partition is the midpoint of the shortest edit script for a specified portion of two +/// vectors. +/// +/// `lhs_midpoint`, `rhs_midpoint` is the midpoint discovered. The diagonal number `lhs_midpoint - rhs_midpoint` +/// equals the number of inserted elements minus the number of deleted elements (counting only elements before the midpoint). +/// +/// `lo_minimal` is true iff the minimal edit script for the left half of the partition is +/// known; similarly for `hi_minimal`. +#valtype +priv struct Partition { + lhs_midpoint : Int // position of midpoint in sequence lhs + rhs_midpoint : Int // position of midpoint in sequence rhs + lo_minimal : Bool // whether left half is optimal + hi_minimal : Bool // whether right half is optimal +} + +///| +/// Find the midpoint of the shortest edit script for a specified portion of the two +/// vectors. +/// +/// Scan from the beginnings of the vectors, and simultaneously from the ends, doing a +/// breadth-first search through the space of edit-sequence. When the two searches meet, we +/// have found the midpoint of the shortest edit sequence. +/// +/// If `find_minimal` is true, find the minimal edit script regardless of expense. +/// Otherwise, if the search is too expensive, use heuristics to stop the search and report +/// a suboptimal answer. +/// +/// This function assumes that the first elements of the specified portions of the two +/// vectors do not match, and likewise that the last elements do not match. The caller must +/// trim matching elements from the beginning and end of the portions it is going to +/// specify. +/// +/// If we return the "wrong" partitions, the worst this can do is cause suboptimal diff +/// output. It cannot cause incorrect diff output. 
+/// +/// Parameter mapping to Myers algorithm concepts: +/// - `forward_search_diagonal~`: forward search diagonal array; for diagonal `k`, +/// `forward_search_diagonal[k]` stores the farthest reachable x coordinate +/// - `backward_search_diagonal~`: backward search diagonal array; for diagonal `k`, +/// `backward_search_diagonal[k]` stores the nearest reachable x coordinate +/// - `diagonal_shift~`: diagonal-to-array index shift. Diagonal `k` is stored at `sh + k`, +/// so negative diagonals are representable; with `sh = rhs_len + 1`, the algorithm can +/// safely access neighbor/sentinel diagonals `k±1` and `k±2` +/// - `lhs_offset~`, `lhs_limit~`: search range for sequence lhs `[lhs_offset, lhs_limit)` +/// - `rhs_offset~`, `rhs_limit~`: search range for sequence rhs `[rhs_offset, rhs_limit)` +/// - `too_expensive~`: computation cost upper limit, preventing algorithm from running too long in difficult cases +/// - `find_minimal~`: whether to force finding minimal edit script (regardless of computation cost) +/// +fn[T : Eq] diag( + forward_search_diagonal~ : FixedArray[Int], // forward search diagonal array - stores farthest reachable point on each diagonal + backward_search_diagonal~ : FixedArray[Int], // backward search diagonal array - stores nearest reachable point on each diagonal + diagonal_shift~ : Int, // diagonal index shift: store diagonal k at [sh + k], including negative k and k±1/k±2 neighbors + lhs~ : CompactedElements[T], // compacted elements for sequence lhs + rhs~ : CompactedElements[T], // compacted elements for sequence rhs + lhs_offset~ : Int, // starting position of sequence lhs - defines left boundary of search window + lhs_limit~ : Int, // ending position of sequence lhs - defines right boundary of search window + rhs_offset~ : Int, // starting position of sequence rhs - defines upper boundary of search window + rhs_limit~ : Int, // ending position of sequence rhs - defines lower boundary of search window + too_expensive~ : Int, // 
computation cost threshold - use heuristic algorithm when exceeded + find_minimal~ : Bool, // optimality flag - whether optimal solution must be found +) -> Partition { + // Calculate range of valid diagonals + // In edit graph, diagonal k = x - y, valid range determined by search window + let dmin = lhs_offset - rhs_limit // minimum valid diagonal - corresponds to maximum insertions + let dmax = lhs_limit - rhs_offset // maximum valid diagonal - corresponds to maximum deletions + let fmid = lhs_offset - rhs_offset // center diagonal for forward search - diagonal corresponding to starting point + let bmid = lhs_limit - rhs_limit // center diagonal for backward search - diagonal corresponding to end point + + // Determine parity: whether southeast corner is on odd diagonal relative to northwest corner + // This determines when forward and backward searches meet (they meet in same step only when parity differs) + let odd = ((fmid - bmid) & 1) != 0 + + // Initialize search starting points + // diagonal_shift maps diagonal k to index diagonal_shift+k; with diagonal_shift=rhs_limit + 1 we can index diagonals in [-(rhs_limit + 1), lhs_limit + 1] + forward_search_diagonal[diagonal_shift + fmid] = lhs_offset // forward search starts from (lhs_offset, rhs_offset) + backward_search_diagonal[diagonal_shift + bmid] = lhs_limit // backward search starts from (lhs_limit, rhs_limit) + + // Main loop: alternate forward and backward search, increasing edit distance by one each time + // c: current edit distance (search depth) + // fmin, fmax: diagonal range for forward search + // bmin, bmax: diagonal range for backward search + for c = 1, fmin = fmid, fmax = fmid, bmin = bmid, bmax = bmid { + // Extend diagonal range for forward search + // Each iteration expands search range as edit distance increases by 1 + let fmin = if fmin > dmin { + forward_search_diagonal[diagonal_shift + fmin - 2] = -1 // mark unused diagonal + fmin - 1 // extend to next diagonal + } else { + fmin + 1 // 
boundary constraint + } + let fmax = if fmax < dmax { + forward_search_diagonal[diagonal_shift + fmax + 2] = -1 // mark unused diagonal + fmax + 1 // extend to previous diagonal + } else { + fmax - 1 // boundary constraint + } + + // Process all diagonals for forward search + let mut forward_result : Partition? = None + let mut d = fmax + while d >= fmin { + // Get farthest reachable points on adjacent diagonals + // tlo: x coordinate on lower-left diagonal (d-1), position after deletion + // thi: x coordinate on upper-right diagonal (d+1), position after insertion + let tlo = forward_search_diagonal[diagonal_shift + d - 1] + let thi = forward_search_diagonal[diagonal_shift + d + 1] + + // Choose better path: transfer from diagonal that reaches farther x coordinate + // This embodies Myers algorithm's greedy strategy: prioritize paths that go farther + let x = if tlo >= thi { tlo + 1 } else { thi } + + // Search for matching sequence along diagonal (Snake) + // This is key optimization of Myers algorithm: freely extend matching elements + let (x_cur, y_cur) = for x_cur = x, y_cur = x - d { + if x_cur < lhs_limit && y_cur < rhs_limit && lhs[x_cur] == rhs[y_cur] { + continue x_cur + 1, y_cur + 1 + } else { + break (x_cur, y_cur) + } + } + + // Update farthest reachable point on current diagonal + forward_search_diagonal[diagonal_shift + d] = x_cur + + // Check if meeting with backward search + // Meeting conditions: parity matches && diagonal is in backward search range && forward/backward search points overlap + if odd && + bmin <= d && + d <= bmax && + backward_search_diagonal[diagonal_shift + d] <= + forward_search_diagonal[diagonal_shift + d] { + forward_result = Some({ + lhs_midpoint: x_cur, + rhs_midpoint: y_cur, + lo_minimal: true, // forward path is optimal + hi_minimal: true, // backward path is also optimal + }) + break // found meeting point, exit early + } + d -= 2 // Myers algorithm characteristic: only check diagonals with same parity + } + + // If 
forward search found solution, return directly + match forward_result { + Some(result) => break result + None => () + } + + // Similarly extend diagonal range for backward search + let bmin = if bmin > dmin { + backward_search_diagonal[diagonal_shift + bmin - 2] = @int.MAX_VALUE // mark unused diagonal as max value + bmin - 1 + } else { + bmin + 1 + } + let bmax = if bmax < dmax { + backward_search_diagonal[diagonal_shift + bmax + 2] = @int.MAX_VALUE // mark unused diagonal as max value + bmax + 1 + } else { + bmax - 1 + } + + // Process all diagonals for backward search + let mut backward_result : Partition? = None + let mut d = bmax + while d >= bmin { + // Get nearest reachable points on adjacent diagonals (backward search in reverse direction) + let tlo = backward_search_diagonal[diagonal_shift + d - 1] + let thi = backward_search_diagonal[diagonal_shift + d + 1] + + // Backward search selection strategy: choose point with smaller x coordinate (closer to start) + let x = if tlo < thi { tlo } else { thi - 1 } + + // Search backward for matching sequence (reverse Snake) + let (x_cur, y_cur) = for x_cur = x, y_cur = x - d { + if x_cur > lhs_offset && + y_cur > rhs_offset && + lhs[x_cur - 1] == rhs[y_cur - 1] { + continue x_cur - 1, y_cur - 1 + } else { + break (x_cur, y_cur) + } + } + + // Update nearest reachable point on current diagonal + backward_search_diagonal[diagonal_shift + d] = x_cur + + // Check if meeting with forward search (meeting condition for even case) + if !odd && + fmin <= d && + d <= fmax && + backward_search_diagonal[diagonal_shift + d] <= + forward_search_diagonal[diagonal_shift + d] { + backward_result = Some({ + lhs_midpoint: x_cur, + rhs_midpoint: y_cur, + lo_minimal: true, + hi_minimal: true, + }) + break + } + d -= 2 + } + + // If backward search found solution, return directly + match backward_result { + Some(result) => break result + None => () + } + + // Heuristic handling: if computation cost too high, abandon finding optimal 
solution + // This is important mechanism for Myers algorithm to handle difficult cases + if !find_minimal && c >= too_expensive { + // Find diagonal in forward search that maximizes x + y + // This indicates processing maximum number of elements overall + let mut fxybest = -1 + let mut fxbest = fmax + let mut d = fmax + while d >= fmin { + let x = lhs_limit.min(forward_search_diagonal[diagonal_shift + d]) // limit to valid range + let y = x - d + let (x, y) = if rhs_limit < y { + (rhs_limit + d, rhs_limit) + } else { + (x, y) + } + if fxybest < x + y { + fxybest = x + y + fxbest = x + } + d -= 2 + } + + // Find diagonal in backward search that minimizes x + y + // This indicates reasonable partition point closest to start + let mut bxybest = @int.MAX_VALUE + let mut bxbest = bmax + let mut d = bmax + while d >= bmin { + let x = lhs_offset.max(backward_search_diagonal[diagonal_shift + d]) // limit to valid range + let y = x - d + let (x, y) = if y < rhs_offset { + (rhs_offset + d, rhs_offset) + } else { + (x, y) + } + if x + y < bxybest { + bxybest = x + y + bxbest = x + } + d -= 2 + } + + // Choose better heuristic partition point + // Compare "quality" of forward and backward, choose side that processes more elements + if lhs_limit + rhs_limit - bxybest < fxybest - (lhs_offset + rhs_offset) { + break { + lhs_midpoint: fxbest, + rhs_midpoint: fxybest - fxbest, + lo_minimal: true, + hi_minimal: false, // right half is not optimal + } + } else { + break { + lhs_midpoint: bxbest, + rhs_midpoint: bxybest - bxbest, + lo_minimal: false, // left half is not optimal + hi_minimal: true, + } + } + } else { + // Continue next round of search, edit distance increases by 1 + continue c + 1, fmin, fmax, bmin, bmax + } + } +} + +///| +/// Main diff loop that computes the differences between two arrays +fn[T : Eq] diff_loop( + cutoff : Int?, // computation cost threshold + lhs : CompactedElements[T], + rhs : CompactedElements[T], +) -> (FixedArray[Bool], FixedArray[Bool]) { + // 
Allocate working arrays for Myers algorithm + // Array size is n+m+3, sufficient to contain all possible diagonals + let forward_search_diagonal = FixedArray::make( + lhs.indices_length() + rhs.indices_length() + 3, + 0, + ) // forward search array + let backward_search_diagonal = FixedArray::make( + lhs.indices_length() + rhs.indices_length() + 3, + 0, + ) // backward search array + // shift k by rhs_indices.length()+1 so + // [-(rhs_indices.length()+1), lhs_indices.length()+1] -> [0, lhs_indices.length()+rhs_indices.length()+2] + let diagonal_shift = rhs.indices_length() + 1 + // Determine computation cost threshold + let too_expensive = match cutoff { + Some(c) => c + None => { + // Default strategy: calculate reasonable threshold based on problem size + // Use bit operations to quickly compute approximate square root + let diags = lhs.indices_length() + rhs.indices_length() + 3 + for acc = 1, diags_cur = diags { + if diags_cur != 0 { + continue acc << 1, diags_cur >> 2 + } else { + break acc.max(4096) // at least 4096, ensuring reasonable performance + } + } + } + } + + // Initialize change marker arrays + // true indicates element is deleted/inserted, false indicates element unchanged + let lhs_change_markers = FixedArray::make(lhs.source_length(), true) + let rhs_change_markers = FixedArray::make(rhs.source_length(), true) + + // Preset common elements as unchanged + for i in 0.. 
Unit { + // Optimization: skip matching prefix + // This is important optimization of Myers algorithm, reducing data to process + let (lhs_offset, rhs_offset) = for lhs_offset = lhs_offset, rhs_offset = rhs_offset { + if lhs_offset < lhs_limit && + rhs_offset < rhs_limit && + lhs[lhs_offset] == rhs[rhs_offset] { + continue lhs_offset + 1, rhs_offset + 1 + } else { + break (lhs_offset, rhs_offset) + } + } + + // Optimization: skip matching suffix + let (lhs_limit, rhs_limit) = for lhs_limit = lhs_limit, rhs_limit = rhs_limit { + if lhs_limit > lhs_offset && + rhs_limit > rhs_offset && + lhs[lhs_limit - 1] == rhs[rhs_limit - 1] { + continue lhs_limit - 1, rhs_limit - 1 + } else { + break (lhs_limit, rhs_limit) + } + } + + // Handle boundary cases + if lhs_offset == lhs_limit { + // Only insertions: A part exhausted, remaining B part all insertions + for y in rhs_offset.. FixedArray[Int] { + let n = a.length() + let present : Map[T, Bool] = Map::new(capacity=b.length()) + + for i in 0.. { + ai[k] = i // record original index of this element + k += 1 + } + None => () // doesn't exist in b, skip + } + } + let result = FixedArray::make(k, 0) + for i in 0.. (FixedArray[Bool], FixedArray[Bool]) { + // Generate indexers: find elements that also exist in other array + let lhs_indices = make_indexer(lhs, rhs) // indices of lhs elements that exist in rhs + let rhs_indices = make_indexer(rhs, lhs) // indices of rhs elements that exist in lhs + + let lhs = CompactedElements((lhs, lhs_indices)) + let rhs = CompactedElements((rhs, rhs_indices)) + // Call main loop for difference computation + diff_loop(cutoff, lhs, rhs) +} + +///| +/// `iter_matches(old~, new~, cutoff?)` diffs `old` and `new` (as in /usr/bin/diff), +/// and returns index pair of longest common subsequence in increasing order. +/// +/// The `cutoff` is an upper bound on the minimum edit distance between `old~` and `new~`. 
When +/// `cutoff` is exceeded, `iter_matches` returns a correct, but not necessarily minimal +/// diff. It defaults to about `sqrt(old.length() + new.length())`. +fn[T : Eq + Hash] iter_matches( + cutoff~ : Int?, // optional computation cost limit + old~ : ArrayView[T], // original array + new~ : ArrayView[T], // new array +) -> Iter2[Int, Int] { + // First compute difference markers + let (d1, d2) = diff_(cutoff, old, new) + let mut i1 = 0 + let mut i2 = 0 + fn go() { + // traverse two arrays, find unchanged element pairs + if i1 >= d1.length() || i2 >= d2.length() { + return None // reached end of arrays + } else if !d1[i1] { + // old[i1] unchanged + if !d2[i2] { + // new[i2] also unchanged - found matching pair + let matching_pair = (i1, i2) + i1 += 1 + i2 += 1 + return Some(matching_pair) + } else { + // new[i2] changed (insertion) - skip element in new + i2 += 1 + go() + } + } else if !d2[i2] { + // old[i1] changed but new[i2] didn't (deletion) - skip element in old + i1 += 1 + go() + } else { + // both elements changed - skip both and find next possible match + i1 += 1 + i2 += 1 + go() + } + } + + Iter2::new(go) +} + +///| +fn[T : Eq + Hash] append_myers_matches( + matches : Array[(Int, Int)], + cutoff : Int?, + old : ArrayView[T], + new : ArrayView[T], + old_offset : Int, + new_offset : Int, +) -> Unit { + for old_idx, new_idx in iter_matches(cutoff~, old~, new~) { + matches.push((old_offset + old_idx, new_offset + new_idx)) + } +} + +///| +fn[T : Eq + Hash] append_patience_matches( + matches : Array[(Int, Int)], + cutoff : Int?, + old : ArrayView[T], + new : ArrayView[T], + old_offset : Int, + new_offset : Int, +) -> Unit { + let anchors = unique_lcs(old~, new~) + if anchors.length() == 0 { + append_myers_matches(matches, cutoff, old, new, old_offset, new_offset) + return + } + + // Patience diff only selects one layer of unique anchors here. 
+ // Bram Cohen noted that recursively searching subranges did not feel + // better in practice: https://bramcohen.livejournal.com/73318.html + // so unmatched gaps fall back to Myers instead of recursing. + let mut prev_old_idx = 0 + let mut prev_new_idx = 0 + for i in 0.. Array[(Int, Int)] { + let matches = Array::new(capacity=old.length().min(new.length())) + match algorithm { + Patience => append_patience_matches(matches, cutoff, old, new, 0, 0) + Myers => append_myers_matches(matches, cutoff, old, new, 0, 0) + } + matches +} + +///| +pub struct Diff[T] { + old : ArrayView[T] + new : ArrayView[T] + edits : Array[Edit] + + fn[T : Hash + Eq] new( + old~ : ArrayView[T], + new~ : ArrayView[T], + cutoff? : Int, + algorithm? : DiffAlgorithm, + ) -> Diff[T] +} + +///| +/// `Diff::new(old~, new~, cutoff?, algorithm?)` diffs `old` and `new` +/// (as in /usr/bin/diff), +/// and returns a `Diff[T]` bundling the source arrays with the edit script +/// that transforms `old` into `new`. +/// +/// The `cutoff` is an upper bound on the minimum edit distance between `old~` and `new~`. When +/// `cutoff` is exceeded, the result is a correct, but not necessarily minimal +/// diff. It defaults to about `sqrt(old.length() + new.length())`. +/// +/// The `algorithm` selects the diff strategy. `Myers` (the default) uses the +/// classic Myers O(ND) algorithm. `Patience` first finds elements unique to +/// both inputs as anchors, then runs Myers on the unmatched ranges between them. +/// +fn[T : Hash + Eq] Diff::new( + old~ : ArrayView[T], + new~ : ArrayView[T], + cutoff? : Int, + algorithm? 
: DiffAlgorithm = Myers, +) -> Diff[T] { + let result : Array[Edit] = Array::new(capacity=old.length() + new.length()) + let matches = collect_matches(cutoff, old, new, algorithm) + let mut prev_old_idx = 0 + let mut prev_new_idx = 0 + let mut equal_old_idx = 0 + let mut equal_new_idx = 0 + let mut consecutive_equal_length = 0 + for pair in matches { + let (old_idx, new_idx) = pair + // emit delete + if prev_old_idx != old_idx { + if consecutive_equal_length != 0 { + result.push( + Equal( + old_index=equal_old_idx, + new_index=equal_new_idx, + len=consecutive_equal_length, + ), + ) + } + consecutive_equal_length = 0 + result.push( + Delete( + old_index=prev_old_idx, + new_index=prev_new_idx, + len=old_idx - prev_old_idx, + ), + ) + prev_old_idx = old_idx + } + // emit insert + if prev_new_idx != new_idx { + if consecutive_equal_length != 0 { + result.push( + Equal( + old_index=equal_old_idx, + new_index=equal_new_idx, + len=consecutive_equal_length, + ), + ) + } + consecutive_equal_length = 0 + result.push( + Insert( + old_index=prev_old_idx, + new_index=prev_new_idx, + len=new_idx - prev_new_idx, + ), + ) + } + prev_old_idx = old_idx + 1 + prev_new_idx = new_idx + 1 + // emit equal + if consecutive_equal_length == 0 { + equal_old_idx = old_idx + equal_new_idx = new_idx + consecutive_equal_length = 1 + } else { + consecutive_equal_length += 1 + } + } nobreak { + // emit remain equal + if consecutive_equal_length != 0 { + result.push( + Equal( + old_index=equal_old_idx, + new_index=equal_new_idx, + len=consecutive_equal_length, + ), + ) + } + // emit remaining deletions + if prev_old_idx != old.length() { + result.push( + Delete( + old_index=prev_old_idx, + new_index=prev_new_idx, + len=old.length() - prev_old_idx, + ), + ) + // update prev_old_idx + prev_old_idx = old.length() + } + + // emit remaining insertions + + if prev_new_idx != new.length() { + result.push( + Insert( + old_index=prev_old_idx, + new_index=prev_new_idx, + len=new.length() - prev_new_idx, + ), 
+ ) + } + return { old, new, edits: result } + } +} + +///| +pub fn[T] Diff::group(self : Diff[T], radius? : Int = 3) -> Array[Hunk[T]] { + group_edits(self.edits, radius~, old=self.old, new=self.new) +} diff --git a/diff/diff_test.mbt b/diff/diff_test.mbt new file mode 100644 index 000000000..a4c631aa9 --- /dev/null +++ b/diff/diff_test.mbt @@ -0,0 +1,444 @@ +// Copyright 2026 International Digital Economy Academy +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +///| +fn[T : Show] edits_to_string(d : @diff.Diff[T]) -> String { + let lines = Array::new() + for edit in d.edits { + let (prefix, slice) = match edit { + Insert(new_index~, len~, ..) => + ("+", d.new.view(start=new_index, end=new_index + len)) + Delete(old_index~, len~, ..) => + ("-", d.old.view(start=old_index, end=old_index + len)) + Equal(old_index~, len~, ..) 
=> + (" ", d.old.view(start=old_index, end=old_index + len)) + } + for s in slice { + lines.push("\{prefix} \{s}") + } + } + lines.join("\n") +} + +///| +fn[T : Show] hunks_to_string(hunks : Array[@diff.Hunk[T]]) -> String { + hunks.map(fn(h) { h.to_string() }).join("\n") +} + +///| +struct User { + id : Int + name : String +} derive(Hash, Eq) + +///| +impl Show for User with output(self, logger) { + logger.write_string( + "User { id: " + self.id.to_string() + ", name: " + self.name + " }", + ) +} + +///| +enum Token { + Start + Stop + Num(Int) + Word(String) +} derive(Hash, Eq) + +///| +impl Show for Token with output(self, logger) { + match self { + Start => logger.write_string("Start") + Stop => logger.write_string("Stop") + Num(n) => logger.write_string("Num(" + n.to_string() + ")") + Word(s) => logger.write_string("Word(\"" + s + "\")") + } +} + +///| +test "condense into one equal" { + let old = [ + "apple", "pineapple", "banana", "tomato", "carrot", "tomato", "potato", "cabbage", + ][:] + let new = [ + "apple", "pineapple", "banana", "tomato", "carrot", "tomato", "potato", "cabbage", + ][:] + assert_eq(old, new) + let d = @diff.Diff(old~, new~) + let patience = @diff.Diff(old~, new~, algorithm=@diff.Patience) + inspect( + edits_to_string(d), + content=( + #| apple + #| pineapple + #| banana + #| tomato + #| carrot + #| tomato + #| potato + #| cabbage + ), + ) + inspect( + edits_to_string(patience), + content=( + #| apple + #| pineapple + #| banana + #| tomato + #| carrot + #| tomato + #| potato + #| cabbage + ), + ) +} + +///| +test "mixed" { + let old = ["carrot", "tomato", "potato", "cabbage"][:] + let new = ["apple", "banana", "tomato", "pineapple"][:] + let d = @diff.Diff(old~, new~) + let patience = @diff.Diff(old~, new~, algorithm=@diff.Patience) + inspect( + edits_to_string(d), + content=( + #|- carrot + #|+ apple + #|+ banana + #| tomato + #|- potato + #|- cabbage + #|+ pineapple + ), + ) + inspect( + edits_to_string(patience), + content=( + #|- 
carrot + #|+ apple + #|+ banana + #| tomato + #|- potato + #|- cabbage + #|+ pineapple + ), + ) +} + +///| +test "delete then insert to hunks" { + let old = ["fn main() {", " println(\"foo\")", "}"][:] + let new = ["fn main() {", " println(\"bar\")", "}"][:] + let d = @diff.Diff(old~, new~) + let patience = @diff.Diff(old~, new~, algorithm=@diff.Patience) + inspect( + edits_to_string(d), + content=( + #| fn main() { + #|- println("foo") + #|+ println("bar") + #| } + ), + ) + inspect( + edits_to_string(patience), + content=( + #| fn main() { + #|- println("foo") + #|+ println("bar") + #| } + ), + ) + inspect( + hunks_to_string(d.group()), + content=( + #|@@ -1,3 +1,3 @@ + #| fn main() { + #|- println("foo") + #|+ println("bar") + #| } + #| + ), + ) +} + +///| +test "delete then insert to hunks without context" { + let old = ["fn main() {", " println(\"foo\")", "}"][:] + let new = ["fn main() {", " println(\"bar\")", "}"][:] + let d = @diff.Diff(old~, new~) + let patience = @diff.Diff(old~, new~, algorithm=@diff.Patience) + inspect( + hunks_to_string(d.group(radius=0)), + content=( + #|@@ -2 +2 @@ + #|- println("foo") + #|+ println("bar") + #| + ), + ) + inspect( + hunks_to_string(patience.group(radius=0)), + content=( + #|@@ -2 +2 @@ + #|- println("foo") + #|+ println("bar") + #| + ), + ) +} + +///| +test "replacement insert uses post-delete old cursor" { + let old = ["x", "a"][:] + let new = ["y", "z", "a"][:] + let d = @diff.Diff(old~, new~) + assert_true( + d.edits[:] + is [ + Delete(old_index=0, new_index=0, len=1), + Insert(old_index=1, new_index=0, len=2), + Equal(old_index=1, new_index=2, len=1), + ], + ) +} + +///| +test "panic group with negative radius" { + let old = ["fn main() {", "}"][:] + let new = old + ignore(@diff.Diff(old~, new~).group(radius=-1)) +} + +///| +test "large text to hunks" { + let old = [ + "alert", "chop", "brave", "arise", "bring", "boy", "answer", "count", "begin", + "casual", "dot", "bottom", + ][:] + let new = [ + "alert", "chop", 
"brave", "arise", "bring", "joy", "answer", "count", "begin", + "casual", "dot", "bottom", + ][:] + let d = @diff.Diff(old~, new~) + let patience = @diff.Diff(old~, new~, algorithm=@diff.Patience) + inspect( + edits_to_string(d), + content=( + #| alert + #| chop + #| brave + #| arise + #| bring + #|- boy + #|+ joy + #| answer + #| count + #| begin + #| casual + #| dot + #| bottom + ), + ) + inspect( + edits_to_string(patience), + content=( + #| alert + #| chop + #| brave + #| arise + #| bring + #|- boy + #|+ joy + #| answer + #| count + #| begin + #| casual + #| dot + #| bottom + ), + ) + inspect( + hunks_to_string(d.group()), + content=( + #|@@ -3,7 +3,7 @@ + #| brave + #| arise + #| bring + #|-boy + #|+joy + #| answer + #| count + #| begin + #| + ), + ) + inspect( + hunks_to_string(d.group(radius=2)), + content=( + #|@@ -4,5 +4,5 @@ + #| arise + #| bring + #|-boy + #|+joy + #| answer + #| count + #| + ), + ) +} + +///| +test "diff with Int elements" { + let old = [1, 2, 3, 4][:] + let new = [1, 2, 4, 5][:] + let d = @diff.Diff(old~, new~) + let patience = @diff.Diff(old~, new~, algorithm=@diff.Patience) + inspect( + edits_to_string(d), + content=( + #| 1 + #| 2 + #|- 3 + #| 4 + #|+ 5 + ), + ) + inspect( + edits_to_string(patience), + content=( + #| 1 + #| 2 + #|- 3 + #| 4 + #|+ 5 + ), + ) +} + +///| +test "diff with Bool elements" { + let old = [true, false, false][:] + let new = [true, false, true][:] + let d = @diff.Diff(old~, new~) + let patience = @diff.Diff(old~, new~, algorithm=@diff.Patience) + inspect( + edits_to_string(d), + content=( + #| true + #| false + #|- false + #|+ true + ), + ) + inspect( + edits_to_string(patience), + content=( + #| true + #| false + #|- false + #|+ true + ), + ) +} + +///| +test "diff with struct elements" { + let old = [ + { id: 1, name: "alice" }, + { id: 2, name: "bob" }, + { id: 3, name: "charlie" }, + ][:] + let new = [ + { id: 1, name: "alice" }, + { id: 3, name: "charlie" }, + { id: 4, name: "david" }, + ][:] + let d = 
@diff.Diff(old~, new~) + let patience = @diff.Diff(old~, new~, algorithm=@diff.Patience) + inspect( + edits_to_string(d), + content=( + #| User { id: 1, name: alice } + #|- User { id: 2, name: bob } + #| User { id: 3, name: charlie } + #|+ User { id: 4, name: david } + ), + ) + inspect( + edits_to_string(patience), + content=( + #| User { id: 1, name: alice } + #|- User { id: 2, name: bob } + #| User { id: 3, name: charlie } + #|+ User { id: 4, name: david } + ), + ) +} + +///| +test "diff with enum elements" { + let old = [Start, Word("alpha"), Num(1), Stop][:] + let new = [Start, Num(1), Word("beta"), Stop][:] + let d = @diff.Diff(old~, new~) + let patience = @diff.Diff(old~, new~, algorithm=@diff.Patience) + inspect( + edits_to_string(d), + content=( + #| Start + #|- Word("alpha") + #| Num(1) + #|+ Word("beta") + #| Stop + ), + ) + inspect( + edits_to_string(patience), + content=( + #| Start + #|- Word("alpha") + #| Num(1) + #|+ Word("beta") + #| Stop + ), + ) +} + +///| +test "patience diff prefers unique anchors" { + let old = ["unique", "dup", "dup"][:] + let new = ["dup", "unique", "dup"][:] + let myers = @diff.Diff(old~, new~) + let patience = @diff.Diff(old~, new~, algorithm=@diff.Patience) + inspect( + edits_to_string(myers), + content=( + #|- unique + #| dup + #|+ unique + #| dup + ), + ) + inspect( + edits_to_string(patience), + content=( + #|+ dup + #| unique + #| dup + #|- dup + ), + ) +} diff --git a/diff/diff_wbtest.mbt b/diff/diff_wbtest.mbt new file mode 100644 index 000000000..285e8a3b6 --- /dev/null +++ b/diff/diff_wbtest.mbt @@ -0,0 +1,576 @@ +// Copyright 2026 International Digital Economy Academy +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +///| +fn print_edits( + cutoff? : Int, + old : ArrayView[String], + new : ArrayView[String], +) -> String { + let mut prev_old_idx = 0 + let mut prev_new_idx = 0 + let result = Array::new(capacity=old.length() + new.length()) + let callback = fn(old_idx, new_idx) { + for i in prev_old_idx.. String::make(3, x)) + let new = ['a', 'x', 'c', 'y', 'e', 'z'].map(x => String::make(3, x)) + inspect( + print_edits(old[:], new[:]), + content=( + #| aaa + #|- bbb + #|+ xxx + #| ccc + #|- ddd + #|+ yyy + #| eee + #|+ zzz + ), + ) +} + +///| +test { + let old = ['a', 'o', 'b', 'c'].map(x => String::make(3, x)) + let new = ['a', 'e', 'b', 'c'].map(x => String::make(3, x)) + inspect( + print_edits(old[:], new[:]), + content=( + #| aaa + #|- ooo + #|+ eee + #| bbb + #| ccc + ), + ) +} + +///| +test "identical arrays" { + let old = ['a', 'b', 'c', 'd', 'e'].map(x => String::make(3, x)) + let new = ['a', 'b', 'c', 'd', 'e'].map(x => String::make(3, x)) + inspect( + print_edits(old[:], new[:]), + content=( + #| aaa + #| bbb + #| ccc + #| ddd + #| eee + ), + ) +} + +///| +test "completely different arrays" { + let old = ['a', 'b', 'c'].map(x => String::make(3, x)) + let new = ['x', 'y', 'z'].map(x => String::make(3, x)) + inspect( + print_edits(old[:], new[:]), + content=( + #|- aaa + #|- bbb + #|- ccc + #|+ xxx + #|+ yyy + #|+ zzz + ), + ) +} + +///| +test "fully insert" { + let old : Array[String] = [] + let new = ['a', 'b', 'c'].map(x => String::make(3, x)) + inspect( + print_edits(old[:], new[:]), + content=( + #|+ aaa + #|+ bbb + #|+ ccc + ), + ) +} + +///| 
+test "fully delete" { + let old = ['a', 'b', 'c'].map(x => String::make(3, x)) + let new : Array[String] = [] + inspect( + print_edits(old[:], new[:]), + content=( + #|- aaa + #|- bbb + #|- ccc + ), + ) +} + +///| +test "insert at beginning" { + let old = ['b', 'c', 'd'].map(x => String::make(3, x)) + let new = ['a', 'b', 'c', 'd'].map(x => String::make(3, x)) + inspect( + print_edits(old[:], new[:]), + content=( + #|+ aaa + #| bbb + #| ccc + #| ddd + ), + ) +} + +///| +test "insert at end" { + let old = ['a', 'b', 'c'].map(x => String::make(3, x)) + let new = ['a', 'b', 'c', 'd'].map(x => String::make(3, x)) + inspect( + print_edits(old[:], new[:]), + content=( + #| aaa + #| bbb + #| ccc + #|+ ddd + ), + ) +} + +///| +test "delete at beginning" { + let old = ['a', 'b', 'c', 'd'].map(x => String::make(3, x)) + let new = ['b', 'c', 'd'].map(x => String::make(3, x)) + inspect( + print_edits(old[:], new[:]), + content=( + #|- aaa + #| bbb + #| ccc + #| ddd + ), + ) +} + +///| +test "delete at end" { + let old = ['a', 'b', 'c', 'd'].map(x => String::make(3, x)) + let new = ['a', 'b', 'c'].map(x => String::make(3, x)) + inspect( + print_edits(old[:], new[:]), + content=( + #| aaa + #| bbb + #| ccc + #|- ddd + ), + ) +} + +///| +test "insert multiple in middle" { + let old = ['a', 'b', 'e', 'f'].map(x => String::make(3, x)) + let new = ['a', 'b', 'c', 'd', 'e', 'f'].map(x => String::make(3, x)) + inspect( + print_edits(old[:], new[:]), + content=( + #| aaa + #| bbb + #|+ ccc + #|+ ddd + #| eee + #| fff + ), + ) +} + +///| +test "delete multiple in middle" { + let old = ['a', 'b', 'c', 'd', 'e', 'f'].map(x => String::make(3, x)) + let new = ['a', 'b', 'e', 'f'].map(x => String::make(3, x)) + inspect( + print_edits(old[:], new[:]), + content=( + #| aaa + #| bbb + #|- ccc + #|- ddd + #| eee + #| fff + ), + ) +} + +///| +test "arrays with duplicates" { + let old = ['a', 'b', 'a', 'c'].map(x => String::make(3, x)) + let new = ['a', 'a', 'b', 'c'].map(x => String::make(3, x)) 
+ inspect( + print_edits(old[:], new[:]), + content=( + #| aaa + #|- bbb + #| aaa + #|+ bbb + #| ccc + ), + ) +} + +///| +test "single element arrays" { + let old = ['a'].map(x => String::make(3, x)) + let new = ['b'].map(x => String::make(3, x)) + inspect( + print_edits(old[:], new[:]), + content=( + #|- aaa + #|+ bbb + ), + ) +} + +///| +test "long sequence LCS" { + let old = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j'].map(x0 => { + String::make(3, x0) + }) + let new = ['a', 'x', 'c', 'y', 'e', 'z', 'g', 'w', 'i', 'v'].map(x0 => { + String::make(3, x0) + }) + inspect( + print_edits(old[:], new[:]), + content=( + #| aaa + #|- bbb + #|+ xxx + #| ccc + #|- ddd + #|+ yyy + #| eee + #|- fff + #|+ zzz + #| ggg + #|- hhh + #|+ www + #| iii + #|- jjj + #|+ vvv + ), + ) +} + +///| +test "reverse order" { + let old = ['a', 'b', 'c', 'd'].map(x => String::make(3, x)) + let new = ['d', 'c', 'b', 'a'].map(x => String::make(3, x)) + inspect( + print_edits(old[:], new[:]), + content=( + #|- aaa + #|- bbb + #|- ccc + #| ddd + #|+ ccc + #|+ bbb + #|+ aaa + ), + ) +} + +///| +test "integer arrays diff" { + let old = [1, 2, 3, 4, 5][:] + let new = [1, 3, 5, 6, 7][:] + let result = Array::new() + for old_idx, new_idx in iter_matches(old~, new~, cutoff=None) { + result.push((old_idx, new_idx)) + } + + // Should find matching elements: (0,0)=1, (2,1)=3, (4,2)=5 + inspect(result, content="[(0, 0), (2, 1), (4, 2)]") +} + +///| +test "many duplicate elements" { + let old = ['a', 'a', 'a', 'b', 'a', 'a'].map(x => String::make(3, x)) + let new = ['a', 'a', 'b', 'a', 'a', 'a'].map(x => String::make(3, x)) + inspect( + print_edits(old[:], new[:]), + content=( + #| aaa + #| aaa + #|- aaa + #| bbb + #|+ aaa + #| aaa + #| aaa + ), + ) +} + +///| +test "alternating pattern" { + let old = ['a', 'b', 'a', 'b', 'a', 'b'].map(x => String::make(3, x)) + let new = ['b', 'a', 'b', 'a', 'b', 'a'].map(x => String::make(3, x)) + inspect( + print_edits(old[:], new[:]), + content=( + #|- aaa + #| bbb 
+ #| aaa + #| bbb + #| aaa + #| bbb + #|+ aaa + ), + ) +} + +///| +test "nested insertions and deletions" { + let old = ['a', 'x', 'b', 'y', 'c', 'z', 'd'].map(x => String::make(3, x)) + let new = ['a', 'b', 'c', 'd', 'e'].map(x => String::make(3, x)) + inspect( + print_edits(old[:], new[:]), + content=( + #| aaa + #|- xxx + #| bbb + #|- yyy + #| ccc + #|- zzz + #| ddd + #|+ eee + ), + ) +} + +///| +test "iter_matches callback verification" { + let old = ['a', 'b', 'c', 'd'].map(x => String::make(3, x))[:] + let new = ['a', 'x', 'c', 'y'].map(x => String::make(3, x))[:] + let matches = Array::new() + let old_indices = Array::new() + let new_indices = Array::new() + for old_idx, new_idx in iter_matches(old~, new~, cutoff=None) { + matches.push((old_idx, new_idx)) + old_indices.push(old_idx) + new_indices.push(new_idx) + } + + // Verify that matched elements are indeed equal + for i = 0; i < matches.length(); i = i + 1 { + let (old_idx, new_idx) = matches[i] + inspect(old[old_idx] == new[new_idx], content="true") + } + + // Verify that indices are increasing + for i = 1; i < old_indices.length(); i = i + 1 { + inspect(old_indices[i] > old_indices[i - 1], content="true") + inspect(new_indices[i] > new_indices[i - 1], content="true") + } +} + +///| +test "string array edge cases" { + let old = ["", "a", "", "b", ""][:] + let new = ["", "b", "", "a", ""][:] + let result = Array::new() + for old_idx, new_idx in iter_matches(old~, new~, cutoff=None) { + result.push((old_idx, new_idx)) + } + + // Should find some matches (empty strings will match) + inspect(result.length() > 0, content="true") +} + +///| +test "large array performance" { + let size = 50 + let old = Array::make(size, 0) + let new = Array::make(size, 0) + + // Create regular pattern data + for i = 0; i < size; i = i + 1 { + old[i] = i + new[i] = if i % 2 == 0 { i } else { i + 100 } // Half same, half different + } + let match_count = @ref.Ref::new(0) + for _old_idx, _new_idx in iter_matches(old=old[:], 
new=new[:], cutoff=None) { + match_count.val = match_count.val + 1 + } + + // Should find approximately half matches + inspect(match_count.val > 10, content="true") + inspect(match_count.val < 40, content="true") +} + +///| +test "single difference" { + let old = ['a', 'b', 'c', 'd', 'e'].map(x => String::make(3, x)) + let new = ['a', 'b', 'x', 'd', 'e'].map(x => String::make(3, x)) + inspect( + print_edits(old[:], new[:]), + content=( + #| aaa + #| bbb + #|- ccc + #|+ xxx + #| ddd + #| eee + ), + ) +} + +///| +test "very short arrays" { + let old = ['a'].map(x => String::make(3, x)) + let new : Array[String] = [] + inspect( + print_edits(old[:], new[:]), + content=( + #|- aaa + ), + ) +} + +///| +test "partial overlap sequence" { + let old = ['a', 'b', 'c', 'd'].map(x => String::make(3, x)) + let new = ['c', 'd', 'e', 'f'].map(x => String::make(3, x)) + inspect( + print_edits(old[:], new[:]), + content=( + #|- aaa + #|- bbb + #| ccc + #| ddd + #|+ eee + #|+ fff + ), + ) +} + +///| +test "iter_matches parameter order" { + let old = ['a', 'b', 'c'].map(x => String::make(3, x))[:] + let new = ['a', 'x', 'c'].map(x => String::make(3, x))[:] + let matches = Array::new() + let all_equal = @ref.Ref::new(true) + for old_idx, new_idx in iter_matches(old~, new~, cutoff=None) { + // Verify parameter order: old_idx should correspond to old array, new_idx to new array + let old_val = old[old_idx] + let new_val = new[new_idx] + if old_val != new_val { + all_equal.val = false + } + matches.push("old[\{old_idx}]=\{old_val}, new[\{new_idx}]=\{new_val}") + } + + // Verify that all matched elements are indeed equal + assert_true(all_equal.val) + // Should have matches for 'a' and 'c' + inspect(matches.length(), content="2") +} + +///| +test "repeated subsequence" { + let old = ['a', 'b', 'a', 'b', 'c'].map(x => String::make(3, x))[:] + let new = ['a', 'b', 'c', 'a', 'b'].map(x => String::make(3, x))[:] + let result = Array::new() + for old_idx, new_idx in iter_matches(old~, new~, 
cutoff=None) { + result.push((old_idx, new_idx)) + } + + // Verify that some matches were found + inspect(result.length() >= 3, content="true") +} + +///| +test "diff symmetry test" { + let arr1 = ['a', 'b', 'c'].map(x => String::make(3, x))[:] + let arr2 = ['a', 'x', 'c'].map(x => String::make(3, x))[:] + + // Calculate diff from arr1 -> arr2 + let matches1 = Array::new() + for old_idx, new_idx in iter_matches(old=arr1, new=arr2, cutoff=None) { + matches1.push((old_idx, new_idx)) + } + + // Calculate diff from arr2 -> arr1 + let matches2 = Array::new() + for old_idx, new_idx in iter_matches(old=arr2, new=arr1, cutoff=None) { + matches2.push((old_idx, new_idx)) + } + + // Both directions should have the same number of matches + @test.assert_eq(matches1.length(), matches2.length()) +} + +///| +test "diff sub array" { + let old = [ + "Apple", "Banana", "Orange", "Grape", "Strawberry", "Watermelon", "Peach", "Pear", + "Pineapple", "Cherry", + ] + let new = [ + "Apple", "Banana", "Orange", "Grape", "Strawberry", "Watermelon", "Peach", "Pear", + "Pineapple", "Cherry", + ] + @test.assert_eq(old, new) + inspect( + print_edits(old[:4], new[:5]), + content=( + #| Apple + #| Banana + #| Orange + #| Grape + #|+ Strawberry + ), + ) + inspect( + print_edits(old[4:], new[5:]), + content=( + #|- Strawberry + #| Watermelon + #| Peach + #| Pear + #| Pineapple + #| Cherry + ), + ) +} diff --git a/diff/edit.mbt b/diff/edit.mbt new file mode 100644 index 000000000..c759af7af --- /dev/null +++ b/diff/edit.mbt @@ -0,0 +1,151 @@ +// Copyright 2026 International Digital Economy Academy +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +///| +using @debug {type Repr, trait Debug} + +///| +/// One step in an edit script that transforms `old` into `new`. +/// +/// Every constructor stores the start positions of the affected range in both +/// sequences. Length fields then describe how many elements are deleted, +/// inserted, or shared. Storing both coordinates on every step keeps the edit +/// self-contained, which simplifies deriving hunk headers and grouping edits +/// into hunks. +pub enum Edit { + /// Delete `len` elements starting at `old_index` from `old`. + /// + /// `new_index` is the aligned position in `new` where this deletion occurs. + /// Although a deletion contributes an empty range on the `new` side, storing + /// it directly simplifies hunk-header generation and edit grouping. + Delete(old_index~ : Int, new_index~ : Int, len~ : Int) + /// Insert `len` elements starting at `new_index` from `new`. + /// + /// `old_index` is the aligned position in `old` immediately before this + /// insertion point. Although an insertion contributes an empty range on the + /// `old` side, storing it directly simplifies hunk-header generation and + /// edit grouping. + Insert(old_index~ : Int, new_index~ : Int, len~ : Int) + /// Keep `len` equal elements starting at `old_index` in `old` and `new_index` + /// in `new`. + Equal(old_index~ : Int, new_index~ : Int, len~ : Int) +} derive(Debug) + +///| +/// Return the half-open range `[start, end)` covered by this edit in `old`. +/// +/// Insertions consume no elements from `old`, so they map to an empty range. 
+fn Edit::old_range(self : Edit) -> (Int, Int) { + match self { + Insert(old_index~, ..) => (old_index, old_index) + Delete(old_index~, len~, ..) => (old_index, old_index + len) + Equal(old_index~, len~, ..) => (old_index, old_index + len) + } +} + +///| +/// Return the half-open range `[start, end)` covered by this edit in `new`. +/// +/// Deletions consume no elements from `new`, so they map to an empty range. +fn Edit::new_range(self : Edit) -> (Int, Int) { + match self { + Insert(new_index~, len~, ..) => (new_index, new_index + len) + Delete(new_index~, ..) => (new_index, new_index) + Equal(new_index~, len~, ..) => (new_index, new_index + len) + } +} + +///| +fn[T] Edit::view_from( + self : Edit, + old~ : ArrayView[T], + new~ : ArrayView[T], +) -> ArrayView[T] { + match self { + Insert(new_index~, len~, ..) => + new.view(start=new_index, end=new_index + len) + Delete(old_index~, len~, ..) => + old.view(start=old_index, end=old_index + len) + Equal(old_index~, len~, ..) => + old.view(start=old_index, end=old_index + len) + } +} + +///| +/// Isolate change clusters by eliminating ranges with no changes. +/// +/// This will leave holes behind in long periods of equal ranges so that +/// you can build things like unified diffs. +fn[T] group_edits( + edits : Array[Edit], + radius? 
: Int = 3, + old~ : ArrayView[T], + new~ : ArrayView[T], +) -> Array[Hunk[T]] { + guard radius >= 0 else { abort("radius must be non-negative") } + if edits.is_empty() { + return [] + } + let n = edits.length() + let mut pending = Array::new() + let result = Array::new() + for i, edit in edits { + match edit { + Equal(old_index~, new_index~, len~) => { + // Trim leading context for first edit + let (old_index, new_index, len) = if i == 0 { + let offset = len.saturating_sub(radius) + (old_index + offset, new_index + offset, len - offset) + } else { + (old_index, new_index, len) + } + // Trim trailing context for last edit + let len = if i == n - 1 { + len - len.saturating_sub(radius) + } else { + len + } + // Split if this equal range is large enough + if len > radius * 2 { + pending.push(Edit::Equal(old_index~, new_index~, len=radius)) + result.push(Hunk::{ edits: pending, old, new }) + let offset = len.saturating_sub(radius) + pending = [ + Edit::Equal( + old_index=old_index + offset, + new_index=new_index + offset, + len=len - offset, + ), + ] + } else if len > 0 { + pending.push(Edit::Equal(old_index~, new_index~, len~)) + } + } + _ => pending.push(edit) + } + } + if !(pending is [] || pending is [Equal(_)]) { + result.push(Hunk::{ edits: pending, old, new }) + } + result +} + +///| +fn Int::saturating_sub(self : Int, subtrahend : Int) -> Int { + if self < subtrahend { + 0 + } else { + self - subtrahend + } +} diff --git a/diff/edit_wbtest.mbt b/diff/edit_wbtest.mbt new file mode 100644 index 000000000..279f6a87a --- /dev/null +++ b/diff/edit_wbtest.mbt @@ -0,0 +1,72 @@ +// Copyright 2026 International Digital Economy Academy +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +///| +test "group_edit handles empty edits" { + assert_true( + group_edits([], old=([] : ArrayView[Int]), new=([] : ArrayView[Int])) is [], + ) +} + +///| +test "group_edit splits long equal blocks into multiple hunks" { + let edits : Array[Edit] = [ + Edit::Delete(old_index=0, new_index=0, len=1), + Edit::Equal(old_index=1, new_index=0, len=10), + Edit::Insert(old_index=11, new_index=10, len=2), + ] + let groups = group_edits( + edits, + old=([] : ArrayView[Int]), + new=([] : ArrayView[Int]), + ) + @test.assert_eq(groups.length(), 2) + assert_true( + groups[0].edits + is [ + Delete(old_index=0, new_index=0, len=1), + Equal(old_index=1, new_index=0, len=3), + ], + ) + assert_true( + groups[1].edits + is [ + Equal(old_index=8, new_index=7, len=3), + Insert(old_index=11, new_index=10, len=2), + ], + ) +} + +///| +test "range display for single and empty span" { + inspect( + HunkHeader::new([Equal(old_index=4, new_index=2, len=1)]).to_string(), + content="@@ -5 +3 @@", + ) + inspect( + HunkHeader::new([Insert(old_index=4, new_index=2, len=1)]).to_string(), + content="@@ -4,0 +3 @@", + ) +} + +///| +test "range display spans replacement ending with insert" { + inspect( + HunkHeader::new([ + Delete(old_index=1, new_index=1, len=1), + Insert(old_index=1, new_index=1, len=1), + ]).to_string(), + content="@@ -2 +2 @@", + ) +} diff --git a/diff/hunk.mbt b/diff/hunk.mbt new file mode 100644 index 000000000..f2861b503 --- /dev/null +++ b/diff/hunk.mbt @@ -0,0 +1,95 @@ +// Copyright 2026 International Digital Economy Academy +// +// Licensed under the 
Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +///| +priv struct Range(Int, Int) // line number, length + +///| +impl Show for Range with output(self, logger) { + let mut beginning = self.0 + 1 // from array index to line number + let len = self.1 - self.0 + if len == 1 { + logger.write_string(beginning.to_string()) + } else { + if len == 0 { + // empty ranges begin at line just before the range + beginning -= 1 + } + logger.write_string(beginning.to_string()) + logger.write_char(',') + logger.write_string(len.to_string()) + } +} + +///| +priv struct HunkHeader(Range, Range) + +///| +/// Creates a hunk header from a (non empty) array of edit +fn HunkHeader::new(edits : ArrayView[Edit]) -> Self { + let (first_old_start, first_old_end) = edits[0].old_range() + let (first_new_start, first_new_end) = edits[0].new_range() + let mut old_end = first_old_end + let mut new_end = first_new_end + let mut found_old_end = false + let mut found_new_end = false + let mut i = edits.length() + while i > 0 { + i -= 1 + let edit = edits[i] + let (edit_old_start, edit_old_end) = edit.old_range() + let (edit_new_start, edit_new_end) = edit.new_range() + if !found_old_end && edit_old_start != edit_old_end { + old_end = edit_old_end + found_old_end = true + } + if !found_new_end && edit_new_start != edit_new_end { + new_end = edit_new_end + found_new_end = true + } + if found_old_end && found_new_end { + break + } + } + HunkHeader(Range(first_old_start, old_end), Range(first_new_start, new_end)) +} + 
+///|
+/// Renders the `@@ -old +new @@` separator line for one hunk.
+impl Show for HunkHeader with output(self, logger) {
+  logger.write_string("@@ -\{self.0} +\{self.1} @@")
+}
+
+///|
+/// One display unit of a grouped diff: a slice of edits together with views
+/// of the two input arrays that the edit indices refer to.
+pub struct Hunk[T] {
+  edits : ArrayView[Edit]
+  old : ArrayView[T]
+  new : ArrayView[T]
+}
+
+///|
+/// Renders the hunk in unified-diff style: the header line, then one line
+/// per element prefixed with "+" (insert), "-" (delete) or " " (equal).
+pub impl[T : Show] Show for Hunk[T] with output(self, logger) {
+  let header = HunkHeader::new(self.edits).to_string()
+  logger.write_string(header)
+  logger.write_char('\n')
+  for edit in self.edits {
+    // The slice is computed the same way for every edit kind; only the
+    // prefix depends on it, so hoist the duplicated `view_from` call.
+    let slice = edit.view_from(old=self.old, new=self.new)
+    let prefix = match edit {
+      Insert(..) => "+"
+      Delete(..) => "-"
+      Equal(..) => " "
+    }
+    for s in slice {
+      logger.write_string("\{prefix}\{s}\n")
+    }
+  }
+}
diff --git a/diff/moon.pkg b/diff/moon.pkg
new file mode 100644
index 000000000..31dbaea08
--- /dev/null
+++ b/diff/moon.pkg
@@ -0,0 +1,12 @@
+import {
+  "moonbitlang/core/builtin",
+  "moonbitlang/core/int",
+  "moonbitlang/core/debug",
+  "moonbitlang/core/hashmap",
+}
+
+import {
+  "moonbitlang/core/json",
+  "moonbitlang/core/ref",
+  "moonbitlang/core/test",
+} for "wbtest"
diff --git a/diff/pile.mbt b/diff/pile.mbt
new file mode 100644
index 000000000..6a22c5cbf
--- /dev/null
+++ b/diff/pile.mbt
@@ -0,0 +1,32 @@
+// Copyright 2026 International Digital Economy Academy
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+///|
+/// `Pile[T]` models one pile in patience sorting.
+priv struct Pile(Array[BackPointer])
+
+///|
+/// Creates a pile containing a single card, which is its visible top.
+fn Pile::new(top : BackPointer) -> Pile {
+  Pile([top])
+}
+
+///|
+/// Covers the current top with a new card; the new card becomes the top.
+fn Pile::push(self : Pile, t : BackPointer) -> Unit {
+  self.0.push(t)
+}
+
+///|
+/// The visible (most recently pushed) card. Indexing is safe because a pile
+/// is created non-empty by `Pile::new` and only ever grows via `Pile::push`.
+fn Pile::top(self : Pile) -> BackPointer {
+  self.0[self.0.length() - 1]
+}
diff --git a/diff/piles.mbt b/diff/piles.mbt
new file mode 100644
index 000000000..2c642ddc7
--- /dev/null
+++ b/diff/piles.mbt
@@ -0,0 +1,79 @@
+// Copyright 2026 International Digital Economy Academy
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+///|
+/// The pile array maintained while selecting a monotone chain of unique
+/// matches.
+///
+/// The patience-sorting step only compares pile tops, so each pile stores the
+/// current tail candidate for one subsequence length.
+priv struct Piles(Array[Pile])
+
+///|
+/// True while no card has been dealt yet.
+fn Piles::is_empty(self : Piles) -> Bool {
+  self.0.is_empty()
+}
+
+///|
+/// Start a new pile with a single visible card.
+fn Piles::put_back(self : Piles, singleton : BackPointer) -> Unit {
+  self.0.push(Pile::new(singleton))
+}
+
+///|
+/// The rightmost pile (its top ends the longest chain found so far), or
+/// `None` when no pile exists yet.
+fn Piles::last(self : Piles) -> Pile? {
+  self.0.last()
+}
+
+///|
+/// Indexing sugar so `self[i]` reads pile `i`.
+fn Piles::op_get(self : Piles, i : Int) -> Pile {
+  self.0[i]
+}
+
+///|
+/// Insert a unique-match candidate into the pile structure.
+///
+/// `new_idx` is the candidate index in `new~`. `place` points back into the
+/// array returned by `find_unique`, so the chosen chain can later be mapped
+/// back to `(old_idx, new_idx)` pairs.
+fn Piles::put_by_binary_search(
+  self : Piles,
+  new_idx~ : Int,
+  place~ : Int,
+) -> Unit {
+  // Precondition: at least one pile exists (`unique_lcs` seeds the first
+  // card via `put_back` before ever calling this); otherwise the `self[0]`
+  // fallback below would index out of bounds.
+  // Search invariant: tops of piles `0..=lo` are < `new_idx`, tops of
+  // piles `hi..` are >= `new_idx`.
+  let mut lo = -1
+  let mut hi = self.0.length()
+  while lo + 1 < hi {
+    let mid = (lo + hi) / 2
+    // Find the rightmost pile whose top still ends before `new_idx`.
+    if self[mid].top().value.0 < new_idx {
+      lo = mid
+    } else {
+      hi = mid
+    }
+  }
+  if lo >= 0 {
+    // Remember which chain this candidate extends: the top of pile `lo`.
+    let prev = Some(self[lo].top())
+    // Place the candidate on the next pile; appending a new pile means the
+    // candidate extends the longest chain seen so far.
+    if lo + 1 < self.0.length() {
+      self[lo + 1].push(BackPointer::{ value: (new_idx, place), prev })
+    } else {
+      self.put_back(BackPointer::{ value: (new_idx, place), prev })
+    }
+  } else {
+    // new_idx is smaller than all pile tops; place on pile 0 with no predecessor.
+    self[0].push(BackPointer::{ value: (new_idx, place), prev: None })
+  }
+}
diff --git a/diff/pkg.generated.mbti b/diff/pkg.generated.mbti
new file mode 100644
index 000000000..e16743f5c
--- /dev/null
+++ b/diff/pkg.generated.mbti
@@ -0,0 +1,43 @@
+// Generated using `moon info`, DON'T EDIT IT
+package "moonbitlang/core/diff"
+
+import {
+  "moonbitlang/core/debug",
+}
+
+// Values
+
+// Errors
+
+// Types and methods
+pub struct Diff[T] {
+  old : ArrayView[T]
+  new : ArrayView[T]
+  edits : Array[Edit]
+
+  fn[T : Hash + Eq] new(old~ : ArrayView[T], new~ : ArrayView[T], cutoff? : Int, algorithm? : DiffAlgorithm) -> Diff[T]
+}
+pub fn[T] Diff::group(Self[T], radius?
: Int) -> Array[Hunk[T]]
+
+pub(all) enum DiffAlgorithm {
+  Myers
+  Patience
+}
+
+pub enum Edit {
+  Delete(old_index~ : Int, new_index~ : Int, len~ : Int)
+  Insert(old_index~ : Int, new_index~ : Int, len~ : Int)
+  Equal(old_index~ : Int, new_index~ : Int, len~ : Int)
+} derive(@debug.Debug)
+
+pub struct Hunk[T] {
+  edits : ArrayView[Edit]
+  old : ArrayView[T]
+  new : ArrayView[T]
+}
+pub impl[T : Show] Show for Hunk[T]
+
+// Type aliases
+
+// Traits
+
diff --git a/diff/unique.mbt b/diff/unique.mbt
new file mode 100644
index 000000000..633ff2e89
--- /dev/null
+++ b/diff/unique.mbt
@@ -0,0 +1,66 @@
+// Copyright 2026 International Digital Economy Academy
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+///|
+/// Collect elements that occur exactly once in both sequences.
+///
+/// The returned pairs are sorted by `old~` index so `unique_lcs` can run the
+/// patience-sorting step on the corresponding `new~` indices.
+fn[T : Eq + Hash] find_unique(
+  old~ : ArrayView[T],
+  new~ : ArrayView[T],
+) -> Array[(Int, Int)] {
+  // Track where a value first appears on each side and how often it appears.
+  struct CountRecord {
+    old_idx : Int // index of the first occurrence in `old~`
+    mut old_count : Int // total occurrences in `old~`
+    mut new_idx : Int // index of the first occurrence in `new~`; -1 until seen
+    mut new_count : Int // total occurrences in `new~`
+  }
+  let match_lines = @hashmap.new()
+  for i = 0; i < old.length(); i = i + 1 {
+    if match_lines.get(old[i]) is Some((count_record : CountRecord)) {
+      count_record.old_count = count_record.old_count + 1
+    } else {
+      match_lines[old[i]] = CountRecord::{
+        old_idx: i,
+        old_count: 1,
+        new_idx: -1,
+        new_count: 0,
+      }
+    }
+  }
+  // Values appearing only in `new~` never enter the map; that is fine,
+  // since a value absent from `old~` can never be unique on both sides.
+  for i = 0; i < new.length(); i = i + 1 {
+    if match_lines.get(new[i]) is Some((count_record : CountRecord)) {
+      count_record.new_count = count_record.new_count + 1
+      if count_record.new_idx == -1 {
+        count_record.new_idx = i
+      }
+    }
+  }
+  // Only unique-on-both-sides values can serve as stable patience-diff
+  // anchors.
+  let match_lines = match_lines
+    .iter()
+    .filter(p => p.1.new_count == 1 && p.1.old_count == 1)
+    .map(p => p.1)
+  let unique_match_lines = []
+  for record in match_lines {
+    unique_match_lines.push((record.old_idx, record.new_idx))
+  }
+  // Sorting by `old_idx` turns the LIS over `new_idx` into a common
+  // subsequence in both arrays.
+  unique_match_lines.sort_by_key(pair => pair.0)
+  return unique_match_lines
+}
diff --git a/diff/unique_lcs.mbt b/diff/unique_lcs.mbt
new file mode 100644
index 000000000..5366e43a4
--- /dev/null
+++ b/diff/unique_lcs.mbt
@@ -0,0 +1,52 @@
+// Copyright 2026 International Digital Economy Academy
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+ +///| +/// `unique_lcs(old~, new~)` computes the chain of unique matching index pairs used by +/// patience diff. +/// +/// It first collects elements that appear exactly once in both `old~` and `new~`, ordered +/// by their position in `old~`. It then runs the patience sorting step of patience diff on +/// the corresponding indices in `new~`, which is equivalent to finding the longest +/// increasing subsequence of those unique matches. +/// +/// The returned `(old_idx, new_idx)` pairs are increasing in both arrays and serve as the +/// anchor matches for diffing the remaining unmatched ranges. +fn[T : Eq + Hash] unique_lcs( + old~ : ArrayView[T], + new~ : ArrayView[T], +) -> ArrayView[(Int, Int)] { + let matches = find_unique(old~, new~) + let piles = Piles(Array::new(capacity=matches.length())) + for place = 0; place < matches.length(); place = place + 1 { + let (_, new_idx) = matches[place] + // Each pile top stores `(new_idx, place)`: the candidate tail in `new~` + // and the position needed to recover the original `(old_idx, new_idx)` + // pair after backtracking. + if piles.is_empty() { + piles.put_back(BackPointer::{ value: (new_idx, place), prev: None }) + } else { + piles.put_by_binary_search(new_idx~, place~) + } + } + guard piles.last() is Some(head) else { return [] } + // The top of the last pile ends one longest chain; following the + // backpointers yields the full chain in order. + let seq = [] + for pair in head.top().to_array().iter() { + let (_, place) = pair + seq.push(matches[place]) + } + return seq +} diff --git a/diff/unique_lcs_wbtest.mbt b/diff/unique_lcs_wbtest.mbt new file mode 100644 index 000000000..85356821e --- /dev/null +++ b/diff/unique_lcs_wbtest.mbt @@ -0,0 +1,214 @@ +// Copyright 2026 International Digital Economy Academy +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +///| +test "basic case with common suffix" { + let old = ['1', '2', '3', '4', '5'] + let new = ['3', '4', '5', '6', '7'] + @json.json_inspect(unique_lcs(old~, new~), content=[[2, 0], [3, 1], [4, 2]]) +} + +///| +test "reordered elements" { + let old = ['1', '2', '3'] + let new = ['3', '2', '1'] + @json.json_inspect(unique_lcs(old~, new~), content=[[2, 0]]) +} + +///| +test "string array reordering" { + let old = ["foo", "bar", "baz", "qux"] + let new = ["baz", "qux", "foo", "bar"] + @json.json_inspect(unique_lcs(old~, new~), content=[[2, 0], [3, 1]]) +} + +///| +test "partial reordering with common elements" { + let old = ["apple", "banana", "cherry", "date"] + let new = ["apple", "cherry", "banana", "date"] + @json.json_inspect(unique_lcs(old~, new~), content=[[0, 0], [2, 1], [3, 3]]) +} + +///| +test "empty sequences" { + let old : Array[String] = [] + let new : Array[String] = [] + @json.json_inspect(unique_lcs(old=old[:], new=new[:]), content=[]) +} + +///| +test "one empty sequence" { + let old = ["a", "b", "c"] + let new : Array[String] = [] + @json.json_inspect(unique_lcs(old=old[:], new=new[:]), content=[]) +} + +///| +test "other empty sequence" { + let old : Array[String] = [] + let new = ["x", "y", "z"] + @json.json_inspect(unique_lcs(old=old[:], new=new[:]), content=[]) +} + +///| +test "identical sequences" { + let old = ["a", "b", "c", "d"] + let new = ["a", "b", "c", "d"] + @json.json_inspect(unique_lcs(old~, new~), content=[ + [0, 0], + [1, 1], + [2, 2], + [3, 3], + ]) +} + +///| +test "completely different sequences" { + let old 
= ["a", "b", "c"] + let new = ["x", "y", "z"] + @json.json_inspect(unique_lcs(old~, new~), content=[]) +} + +///| +test "single element match" { + let old = ["a", "b", "c"] + let new = ["x", "b", "y"] + @json.json_inspect(unique_lcs(old~, new~), content=[[1, 1]]) +} + +///| +test "duplicate elements in old sequence" { + let old = ["a", "b", "a", "c"] + let new = ["x", "a", "y", "c"] + @json.json_inspect(unique_lcs(old~, new~), content=[[3, 3]]) +} + +///| +test "duplicate elements in new sequence" { + let old = ["a", "b", "c"] + let new = ["x", "b", "b", "y"] + @json.json_inspect(unique_lcs(old~, new~), content=[]) +} + +///| +test "duplicate elements in both sequences" { + let old = ["a", "b", "b", "c"] + let new = ["x", "b", "b", "y"] + @json.json_inspect(unique_lcs(old~, new~), content=[]) +} + +///| +test "long common subsequence" { + let old = ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10"] + let new = ["0", "2", "4", "6", "8", "10", "11"] + @json.json_inspect(unique_lcs(old~, new~), content=[ + [1, 1], + [3, 2], + [5, 3], + [7, 4], + [9, 5], + ]) +} + +///| +test "reverse order" { + let old = ["a", "b", "c", "d"] + let new = ["d", "c", "b", "a"] + @json.json_inspect(unique_lcs(old~, new~), content=[[3, 0]]) +} + +///| +test "single character sequences" { + let old = ["a"] + let new = ["a"] + @json.json_inspect(unique_lcs(old~, new~), content=[[0, 0]]) +} + +///| +test "single character no match" { + let old = ["a"] + let new = ["b"] + @json.json_inspect(unique_lcs(old~, new~), content=[]) +} + +///| +test "prefix and suffix overlap" { + let old = ["start", "middle1", "middle2", "end"] + let new = ["start", "new1", "new2", "end"] + @json.json_inspect(unique_lcs(old~, new~), content=[[0, 0], [3, 3]]) +} + +///| +test "interleaved unique elements" { + let old = ["a1", "b1", "a2", "b2", "a3"] + let new = ["b1", "a1", "b2", "a2", "a3"] + @json.json_inspect(unique_lcs(old~, new~), content=[[1, 0], [3, 2], [4, 4]]) +} + +///| +test "numeric sequences" { + let 
old = [1, 3, 5, 7, 9]
+  let new = [2, 3, 4, 7, 8, 9]
+  @json.json_inspect(unique_lcs(old~, new~), content=[[1, 1], [3, 3], [4, 5]])
+}
+
+///|
+test "mixed types - booleans" {
+  let old = [true, false, true, false]
+  let new = [false, true, false, true]
+  // Each boolean occurs twice on both sides, so nothing is unique on either
+  // side: there are no candidate anchors and the result is empty.
+  @json.json_inspect(unique_lcs(old~, new~), content=[])
+}
+
+///|
+test "large gap in middle" {
+  let old = ["start", "gap1", "gap2", "gap3", "gap4", "end"]
+  let new = ["start", "different", "end"]
+  @json.json_inspect(unique_lcs(old~, new~), content=[[0, 0], [5, 2]])
+}
+
+///|
+test "insertion at beginning" {
+  let old = ["b", "c", "d"]
+  let new = ["a", "b", "c", "d"]
+  @json.json_inspect(unique_lcs(old~, new~), content=[[0, 1], [1, 2], [2, 3]])
+}
+
+///|
+test "insertion at end" {
+  let old = ["a", "b", "c"]
+  let new = ["a", "b", "c", "d"]
+  @json.json_inspect(unique_lcs(old~, new~), content=[[0, 0], [1, 1], [2, 2]])
+}
+
+///|
+test "deletion from beginning" {
+  let old = ["a", "b", "c", "d"]
+  let new = ["b", "c", "d"]
+  @json.json_inspect(unique_lcs(old~, new~), content=[[1, 0], [2, 1], [3, 2]])
+}
+
+///|
+test "deletion from end" {
+  let old = ["a", "b", "c", "d"]
+  let new = ["a", "b", "c"]
+  @json.json_inspect(unique_lcs(old~, new~), content=[[0, 0], [1, 1], [2, 2]])
+}
+
+///|
+test "complex pattern with multiple unique matches" {
+  let old = ["alpha", "beta", "gamma", "delta", "epsilon", "zeta"]
+  let new = ["gamma", "alpha", "theta", "epsilon", "beta", "eta"]
+  @json.json_inspect(unique_lcs(old~, new~), content=[[2, 0], [4, 3]])
+}
diff --git a/diff/unique_wbtest.mbt b/diff/unique_wbtest.mbt
new file mode 100644
index 000000000..dff4d6338
--- /dev/null
+++ b/diff/unique_wbtest.mbt
@@ -0,0 +1,34 @@
+// Copyright 2026 International Digital Economy Academy
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +///| +test { + let old = [1, 2, 3, 4, 5] + let new = [3, 4, 5, 6, 7] + @json.json_inspect(find_unique(old~, new~), content=[[2, 0], [3, 1], [4, 2]]) +} + +///| +test { + let old : Array[Int] = [] + let new = [] + @json.json_inspect(find_unique(old~, new~), content=[]) +} + +///| +test { + let old = [1, 2, 3] + let new : Array[Int] = [3, 2, 1] + @json.json_inspect(find_unique(old~, new~), content=[[0, 2], [1, 1], [2, 0]]) +}