1 //! Myers' diff algorithm.
3 //! * time: `O((N+M)D)`
6 //! See [the original article by Eugene W. Myers](http://www.xmailserver.org/diff2.pdf)
9 //! The implementation of this algorithm is based on the implementation by
14 //! At present this implementation of Myers' does not implement any more advanced
15 //! heuristics that would solve some pathological cases. For instance passing two
16 //! large and completely distinct sequences to the algorithm will make it spin
17 //! without making reasonable progress. Currently the only protection in the
18 //! library against this is to pass a deadline to the diffing algorithm.
20 //! For potential improvements here see [similar#15](https://github.com/mitsuhiko/similar/issues/15).
22 use std
::ops
::{Index, IndexMut, Range}
;
23 use std
::time
::Instant
;
25 use crate::algorithms
::utils
::{common_prefix_len, common_suffix_len, is_empty_range}
;
26 use crate::algorithms
::DiffHook
;
28 /// Myers' diff algorithm.
30 /// Diff `old`, between indices `old_range` and `new` between indices `new_range`.
31 pub fn diff
<Old
, New
, D
>(
34 old_range
: Range
<usize>,
36 new_range
: Range
<usize>,
37 ) -> Result
<(), D
::Error
>
39 Old
: Index
<usize> + ?Sized
,
40 New
: Index
<usize> + ?Sized
,
42 New
::Output
: PartialEq
<Old
::Output
>,
44 diff_deadline(d
, old
, old_range
, new
, new_range
, None
)
47 /// Myers' diff algorithm with deadline.
49 /// Diff `old`, between indices `old_range` and `new` between indices `new_range`.
51 /// This diff is done with an optional deadline that defines the maximal
52 /// execution time permitted before it bails and falls back to an approximation.
53 pub fn diff_deadline
<Old
, New
, D
>(
56 old_range
: Range
<usize>,
58 new_range
: Range
<usize>,
59 deadline
: Option
<Instant
>,
60 ) -> Result
<(), D
::Error
>
62 Old
: Index
<usize> + ?Sized
,
63 New
: Index
<usize> + ?Sized
,
65 New
::Output
: PartialEq
<Old
::Output
>,
67 let max_d
= max_d(old_range
.len(), new_range
.len());
68 let mut vb
= V
::new(max_d
);
69 let mut vf
= V
::new(max_d
);
71 d
, old
, old_range
, new
, new_range
, &mut vf
, &mut vb
, deadline
,
76 // A D-path is a path which starts at (0,0) that has exactly D non-diagonal
77 // edges. All D-paths consist of a (D - 1)-path followed by a non-diagonal edge
78 // and then a possibly empty sequence of diagonal edges called a snake.
80 /// `V` contains the endpoints of the furthest reaching `D-paths`. For each
81 /// recorded endpoint `(x,y)` in diagonal `k`, we only need to retain `x` because
82 /// `y` can be computed from `x - k`. In other words, `V` is an array of integers
83 /// where `V[k]` contains the row index of the endpoint of the furthest reaching
84 /// path in diagonal `k`.
86 /// We can't use a traditional Vec to represent `V` since we use `k` as an index
87 /// and it can take on negative values. So instead `V` is represented as a
88 /// light-weight wrapper around a Vec plus an `offset` which is the maximum value
89 /// `k` can take on in order to map negative `k`'s back to a value >= 0.
93 v
: Vec
<usize>, // Look into initializing this to -1 and storing isize
97 fn new(max_d
: usize) -> Self {
99 offset
: max_d
as isize,
100 v
: vec
![0; 2 * max_d
],
104 fn len(&self) -> usize {
109 impl Index
<isize> for V
{
112 fn index(&self, index
: isize) -> &Self::Output
{
113 &self.v
[(index
+ self.offset
) as usize]
117 impl IndexMut
<isize> for V
{
118 fn index_mut(&mut self, index
: isize) -> &mut Self::Output
{
119 &mut self.v
[(index
+ self.offset
) as usize]
123 fn max_d(len1
: usize, len2
: usize) -> usize {
124 // XXX look into reducing the need to have the additional '+ 1'
125 (len1
+ len2
+ 1) / 2 + 1
129 fn split_at(range
: Range
<usize>, at
: usize) -> (Range
<usize>, Range
<usize>) {
130 (range
.start
..at
, at
..range
.end
)
133 /// A `Snake` is a sequence of diagonal edges in the edit graph. Normally
134 /// a snake has a start end end point (and it is possible for a snake to have
135 /// a length of zero, meaning the start and end points are the same) however
136 /// we do not need the end point which is why it's not implemented here.
138 /// The divide part of a divide-and-conquer strategy. A D-path has D+1 snakes
139 /// some of which may be empty. The divide step requires finding the ceil(D/2) +
140 /// 1 or middle snake of an optimal D-path. The idea for doing so is to
141 /// simultaneously run the basic algorithm in both the forward and reverse
142 /// directions until furthest reaching forward and reverse paths starting at
143 /// opposing corners 'overlap'.
144 fn find_middle_snake
<Old
, New
>(
146 old_range
: Range
<usize>,
148 new_range
: Range
<usize>,
151 deadline
: Option
<Instant
>,
152 ) -> Option
<(usize, usize)>
154 Old
: Index
<usize> + ?Sized
,
155 New
: Index
<usize> + ?Sized
,
156 New
::Output
: PartialEq
<Old
::Output
>,
158 let n
= old_range
.len();
159 let m
= new_range
.len();
161 // By Lemma 1 in the paper, the optimal edit script length is odd or even as
162 // `delta` is odd or even.
163 let delta
= n
as isize - m
as isize;
164 let odd
= delta
& 1 == 1;
166 // The initial point at (0, -1)
168 // The initial point at (N, M+1)
171 // We only need to explore ceil(D/2) + 1
172 let d_max
= max_d(n
, m
);
173 assert
!(vf
.len() >= d_max
);
174 assert
!(vb
.len() >= d_max
);
176 for d
in 0..d_max
as isize {
177 // are we running for too long?
178 if let Some(deadline
) = deadline
{
179 if Instant
::now() > deadline
{
185 for k
in (-d
..=d
).rev().step_by(2) {
186 let mut x
= if k
== -d
|| (k
!= d
&& vf
[k
- 1] < vf
[k
+ 1]) {
191 let y
= (x
as isize - k
) as usize;
193 // The coordinate of the start of a snake
194 let (x0
, y0
) = (x
, y
);
195 // While these sequences are identical, keep moving through the
196 // graph with no cost
197 if x
< old_range
.len() && y
< new_range
.len() {
198 let advance
= common_prefix_len(
200 old_range
.start
+ x
..old_range
.end
,
202 new_range
.start
+ y
..new_range
.end
,
207 // This is the new best x value
210 // Only check for connections from the forward search when N - M is
211 // odd and when there is a reciprocal k line coming from the other
213 if odd
&& (k
- delta
).abs() <= (d
- 1) {
214 // TODO optimize this so we don't have to compare against n
215 if vf
[k
] + vb
[-(k
- delta
)] >= n
{
217 return Some((x0
+ old_range
.start
, y0
+ new_range
.start
));
223 for k
in (-d
..=d
).rev().step_by(2) {
224 let mut x
= if k
== -d
|| (k
!= d
&& vb
[k
- 1] < vb
[k
+ 1]) {
229 let mut y
= (x
as isize - k
) as usize;
231 // The coordinate of the start of a snake
233 let advance
= common_suffix_len(
235 old_range
.start
..old_range
.start
+ n
- x
,
237 new_range
.start
..new_range
.start
+ m
- y
,
243 // This is the new best x value
246 if !odd
&& (k
- delta
).abs() <= d
{
247 // TODO optimize this so we don't have to compare against n
248 if vb
[k
] + vf
[-(k
- delta
)] >= n
{
250 return Some((n
- x
+ old_range
.start
, m
- y
+ new_range
.start
));
255 // TODO: Maybe there's an opportunity to optimize and bail early?
262 #[allow(clippy::too_many_arguments)]
263 fn conquer
<Old
, New
, D
>(
266 mut old_range
: Range
<usize>,
268 mut new_range
: Range
<usize>,
271 deadline
: Option
<Instant
>,
272 ) -> Result
<(), D
::Error
>
274 Old
: Index
<usize> + ?Sized
,
275 New
: Index
<usize> + ?Sized
,
277 New
::Output
: PartialEq
<Old
::Output
>,
279 // Check for common prefix
280 let common_prefix_len
= common_prefix_len(old
, old_range
.clone(), new
, new_range
.clone());
281 if common_prefix_len
> 0 {
282 d
.equal(old_range
.start
, new_range
.start
, common_prefix_len
)?
;
284 old_range
.start
+= common_prefix_len
;
285 new_range
.start
+= common_prefix_len
;
287 // Check for common suffix
288 let common_suffix_len
= common_suffix_len(old
, old_range
.clone(), new
, new_range
.clone());
289 let common_suffix
= (
290 old_range
.end
- common_suffix_len
,
291 new_range
.end
- common_suffix_len
,
293 old_range
.end
-= common_suffix_len
;
294 new_range
.end
-= common_suffix_len
;
296 if is_empty_range(&old_range
) && is_empty_range(&new_range
) {
298 } else if is_empty_range(&new_range
) {
299 d
.delete(old_range
.start
, old_range
.len(), new_range
.start
)?
;
300 } else if is_empty_range(&old_range
) {
301 d
.insert(old_range
.start
, new_range
.start
, new_range
.len())?
;
302 } else if let Some((x_start
, y_start
)) = find_middle_snake(
311 let (old_a
, old_b
) = split_at(old_range
, x_start
);
312 let (new_a
, new_b
) = split_at(new_range
, y_start
);
313 conquer(d
, old
, old_a
, new
, new_a
, vf
, vb
, deadline
)?
;
314 conquer(d
, old
, old_b
, new
, new_b
, vf
, vb
, deadline
)?
;
318 old_range
.end
- old_range
.start
,
324 new_range
.end
- new_range
.start
,
328 if common_suffix_len
> 0 {
329 d
.equal(common_suffix
.0, common_suffix
.1, common_suffix_len
)?
;
336 fn test_find_middle_snake() {
337 let a
= &b
"ABCABBA"[..];
338 let b
= &b
"CBABAC"[..];
339 let max_d
= max_d(a
.len(), b
.len());
340 let mut vf
= V
::new(max_d
);
341 let mut vb
= V
::new(max_d
);
342 let (x_start
, y_start
) =
343 find_middle_snake(a
, 0..a
.len(), b
, 0..b
.len(), &mut vf
, &mut vb
, None
).unwrap();
344 assert_eq
!(x_start
, 4);
345 assert_eq
!(y_start
, 1);
350 let a
: &[usize] = &[0, 1, 2, 3, 4];
351 let b
: &[usize] = &[0, 1, 2, 9, 4];
353 let mut d
= crate::algorithms
::Replace
::new(crate::algorithms
::Capture
::new());
354 diff(&mut d
, a
, 0..a
.len(), b
, 0..b
.len()).unwrap();
355 insta
::assert_debug_snapshot
!(d
.into_inner().ops());
359 fn test_contiguous() {
360 let a
: &[usize] = &[0, 1, 2, 3, 4, 4, 4, 5];
361 let b
: &[usize] = &[0, 1, 2, 8, 9, 4, 4, 7];
363 let mut d
= crate::algorithms
::Replace
::new(crate::algorithms
::Capture
::new());
364 diff(&mut d
, a
, 0..a
.len(), b
, 0..b
.len()).unwrap();
365 insta
::assert_debug_snapshot
!(d
.into_inner().ops());
370 let a
: &[usize] = &[0, 1, 3, 4, 5];
371 let b
: &[usize] = &[0, 1, 4, 5, 8, 9];
373 let mut d
= crate::algorithms
::Capture
::new();
374 diff(&mut d
, a
, 0..a
.len(), b
, 0..b
.len()).unwrap();
375 insta
::assert_debug_snapshot
!(d
.ops());
379 fn test_deadline_reached() {
381 use std
::time
::Duration
;
383 let a
= (0..100).collect
::<Vec
<_
>>();
384 let mut b
= (0..100).collect
::<Vec
<_
>>();
389 struct SlowIndex
<'a
>(&'a
[usize]);
391 impl<'a
> Index
<usize> for SlowIndex
<'a
> {
394 fn index(&self, index
: usize) -> &Self::Output
{
395 std
::thread
::sleep(Duration
::from_millis(1));
400 let slow_a
= SlowIndex(&a
);
401 let slow_b
= SlowIndex(&b
);
403 // don't give it enough time to do anything interesting
404 let mut d
= crate::algorithms
::Replace
::new(crate::algorithms
::Capture
::new());
411 Some(Instant
::now() + Duration
::from_millis(50)),
414 insta
::assert_debug_snapshot
!(d
.into_inner().ops());