]> git.proxmox.com Git - rustc.git/blob - vendor/similar/src/algorithms/myers.rs
New upstream version 1.71.1+dfsg1
[rustc.git] / vendor / similar / src / algorithms / myers.rs
1 //! Myers' diff algorithm.
2 //!
3 //! * time: `O((N+M)D)`
4 //! * space `O(N+M)`
5 //!
6 //! See [the original article by Eugene W. Myers](http://www.xmailserver.org/diff2.pdf)
7 //! describing it.
8 //!
9 //! The implementation of this algorithm is based on the implementation by
10 //! Brandon Williams.
11 //!
12 //! # Heuristics
13 //!
14 //! At present this implementation of Myers' does not implement any more advanced
15 //! heuristics that would solve some pathological cases. For instance passing two
16 //! large and completely distinct sequences to the algorithm will make it spin
17 //! without making reasonable progress. Currently the only protection in the
18 //! library against this is to pass a deadline to the diffing algorithm.
19 //!
20 //! For potential improvements here see [similar#15](https://github.com/mitsuhiko/similar/issues/15).
21
22 use std::ops::{Index, IndexMut, Range};
23 use std::time::Instant;
24
25 use crate::algorithms::utils::{common_prefix_len, common_suffix_len, is_empty_range};
26 use crate::algorithms::DiffHook;
27
28 /// Myers' diff algorithm.
29 ///
30 /// Diff `old`, between indices `old_range` and `new` between indices `new_range`.
31 pub fn diff<Old, New, D>(
32 d: &mut D,
33 old: &Old,
34 old_range: Range<usize>,
35 new: &New,
36 new_range: Range<usize>,
37 ) -> Result<(), D::Error>
38 where
39 Old: Index<usize> + ?Sized,
40 New: Index<usize> + ?Sized,
41 D: DiffHook,
42 New::Output: PartialEq<Old::Output>,
43 {
44 diff_deadline(d, old, old_range, new, new_range, None)
45 }
46
47 /// Myers' diff algorithm with deadline.
48 ///
49 /// Diff `old`, between indices `old_range` and `new` between indices `new_range`.
50 ///
51 /// This diff is done with an optional deadline that defines the maximal
52 /// execution time permitted before it bails and falls back to an approximation.
53 pub fn diff_deadline<Old, New, D>(
54 d: &mut D,
55 old: &Old,
56 old_range: Range<usize>,
57 new: &New,
58 new_range: Range<usize>,
59 deadline: Option<Instant>,
60 ) -> Result<(), D::Error>
61 where
62 Old: Index<usize> + ?Sized,
63 New: Index<usize> + ?Sized,
64 D: DiffHook,
65 New::Output: PartialEq<Old::Output>,
66 {
67 let max_d = max_d(old_range.len(), new_range.len());
68 let mut vb = V::new(max_d);
69 let mut vf = V::new(max_d);
70 conquer(
71 d, old, old_range, new, new_range, &mut vf, &mut vb, deadline,
72 )?;
73 d.finish()
74 }
75
76 // A D-path is a path which starts at (0,0) that has exactly D non-diagonal
77 // edges. All D-paths consist of a (D - 1)-path followed by a non-diagonal edge
78 // and then a possibly empty sequence of diagonal edges called a snake.
79
80 /// `V` contains the endpoints of the furthest reaching `D-paths`. For each
81 /// recorded endpoint `(x,y)` in diagonal `k`, we only need to retain `x` because
82 /// `y` can be computed from `x - k`. In other words, `V` is an array of integers
83 /// where `V[k]` contains the row index of the endpoint of the furthest reaching
84 /// path in diagonal `k`.
85 ///
86 /// We can't use a traditional Vec to represent `V` since we use `k` as an index
87 /// and it can take on negative values. So instead `V` is represented as a
88 /// light-weight wrapper around a Vec plus an `offset` which is the maximum value
89 /// `k` can take on in order to map negative `k`'s back to a value >= 0.
90 #[derive(Debug)]
91 struct V {
92 offset: isize,
93 v: Vec<usize>, // Look into initializing this to -1 and storing isize
94 }
95
96 impl V {
97 fn new(max_d: usize) -> Self {
98 Self {
99 offset: max_d as isize,
100 v: vec![0; 2 * max_d],
101 }
102 }
103
104 fn len(&self) -> usize {
105 self.v.len()
106 }
107 }
108
109 impl Index<isize> for V {
110 type Output = usize;
111
112 fn index(&self, index: isize) -> &Self::Output {
113 &self.v[(index + self.offset) as usize]
114 }
115 }
116
117 impl IndexMut<isize> for V {
118 fn index_mut(&mut self, index: isize) -> &mut Self::Output {
119 &mut self.v[(index + self.offset) as usize]
120 }
121 }
122
123 fn max_d(len1: usize, len2: usize) -> usize {
124 // XXX look into reducing the need to have the additional '+ 1'
125 (len1 + len2 + 1) / 2 + 1
126 }
127
128 #[inline(always)]
129 fn split_at(range: Range<usize>, at: usize) -> (Range<usize>, Range<usize>) {
130 (range.start..at, at..range.end)
131 }
132
133 /// A `Snake` is a sequence of diagonal edges in the edit graph. Normally
134 /// a snake has a start end end point (and it is possible for a snake to have
135 /// a length of zero, meaning the start and end points are the same) however
136 /// we do not need the end point which is why it's not implemented here.
137 ///
138 /// The divide part of a divide-and-conquer strategy. A D-path has D+1 snakes
139 /// some of which may be empty. The divide step requires finding the ceil(D/2) +
140 /// 1 or middle snake of an optimal D-path. The idea for doing so is to
141 /// simultaneously run the basic algorithm in both the forward and reverse
142 /// directions until furthest reaching forward and reverse paths starting at
143 /// opposing corners 'overlap'.
144 fn find_middle_snake<Old, New>(
145 old: &Old,
146 old_range: Range<usize>,
147 new: &New,
148 new_range: Range<usize>,
149 vf: &mut V,
150 vb: &mut V,
151 deadline: Option<Instant>,
152 ) -> Option<(usize, usize)>
153 where
154 Old: Index<usize> + ?Sized,
155 New: Index<usize> + ?Sized,
156 New::Output: PartialEq<Old::Output>,
157 {
158 let n = old_range.len();
159 let m = new_range.len();
160
161 // By Lemma 1 in the paper, the optimal edit script length is odd or even as
162 // `delta` is odd or even.
163 let delta = n as isize - m as isize;
164 let odd = delta & 1 == 1;
165
166 // The initial point at (0, -1)
167 vf[1] = 0;
168 // The initial point at (N, M+1)
169 vb[1] = 0;
170
171 // We only need to explore ceil(D/2) + 1
172 let d_max = max_d(n, m);
173 assert!(vf.len() >= d_max);
174 assert!(vb.len() >= d_max);
175
176 for d in 0..d_max as isize {
177 // are we running for too long?
178 if let Some(deadline) = deadline {
179 if Instant::now() > deadline {
180 break;
181 }
182 }
183
184 // Forward path
185 for k in (-d..=d).rev().step_by(2) {
186 let mut x = if k == -d || (k != d && vf[k - 1] < vf[k + 1]) {
187 vf[k + 1]
188 } else {
189 vf[k - 1] + 1
190 };
191 let y = (x as isize - k) as usize;
192
193 // The coordinate of the start of a snake
194 let (x0, y0) = (x, y);
195 // While these sequences are identical, keep moving through the
196 // graph with no cost
197 if x < old_range.len() && y < new_range.len() {
198 let advance = common_prefix_len(
199 old,
200 old_range.start + x..old_range.end,
201 new,
202 new_range.start + y..new_range.end,
203 );
204 x += advance;
205 }
206
207 // This is the new best x value
208 vf[k] = x;
209
210 // Only check for connections from the forward search when N - M is
211 // odd and when there is a reciprocal k line coming from the other
212 // direction.
213 if odd && (k - delta).abs() <= (d - 1) {
214 // TODO optimize this so we don't have to compare against n
215 if vf[k] + vb[-(k - delta)] >= n {
216 // Return the snake
217 return Some((x0 + old_range.start, y0 + new_range.start));
218 }
219 }
220 }
221
222 // Backward path
223 for k in (-d..=d).rev().step_by(2) {
224 let mut x = if k == -d || (k != d && vb[k - 1] < vb[k + 1]) {
225 vb[k + 1]
226 } else {
227 vb[k - 1] + 1
228 };
229 let mut y = (x as isize - k) as usize;
230
231 // The coordinate of the start of a snake
232 if x < n && y < m {
233 let advance = common_suffix_len(
234 old,
235 old_range.start..old_range.start + n - x,
236 new,
237 new_range.start..new_range.start + m - y,
238 );
239 x += advance;
240 y += advance;
241 }
242
243 // This is the new best x value
244 vb[k] = x;
245
246 if !odd && (k - delta).abs() <= d {
247 // TODO optimize this so we don't have to compare against n
248 if vb[k] + vf[-(k - delta)] >= n {
249 // Return the snake
250 return Some((n - x + old_range.start, m - y + new_range.start));
251 }
252 }
253 }
254
255 // TODO: Maybe there's an opportunity to optimize and bail early?
256 }
257
258 // deadline reached
259 None
260 }
261
262 #[allow(clippy::too_many_arguments)]
263 fn conquer<Old, New, D>(
264 d: &mut D,
265 old: &Old,
266 mut old_range: Range<usize>,
267 new: &New,
268 mut new_range: Range<usize>,
269 vf: &mut V,
270 vb: &mut V,
271 deadline: Option<Instant>,
272 ) -> Result<(), D::Error>
273 where
274 Old: Index<usize> + ?Sized,
275 New: Index<usize> + ?Sized,
276 D: DiffHook,
277 New::Output: PartialEq<Old::Output>,
278 {
279 // Check for common prefix
280 let common_prefix_len = common_prefix_len(old, old_range.clone(), new, new_range.clone());
281 if common_prefix_len > 0 {
282 d.equal(old_range.start, new_range.start, common_prefix_len)?;
283 }
284 old_range.start += common_prefix_len;
285 new_range.start += common_prefix_len;
286
287 // Check for common suffix
288 let common_suffix_len = common_suffix_len(old, old_range.clone(), new, new_range.clone());
289 let common_suffix = (
290 old_range.end - common_suffix_len,
291 new_range.end - common_suffix_len,
292 );
293 old_range.end -= common_suffix_len;
294 new_range.end -= common_suffix_len;
295
296 if is_empty_range(&old_range) && is_empty_range(&new_range) {
297 // Do nothing
298 } else if is_empty_range(&new_range) {
299 d.delete(old_range.start, old_range.len(), new_range.start)?;
300 } else if is_empty_range(&old_range) {
301 d.insert(old_range.start, new_range.start, new_range.len())?;
302 } else if let Some((x_start, y_start)) = find_middle_snake(
303 old,
304 old_range.clone(),
305 new,
306 new_range.clone(),
307 vf,
308 vb,
309 deadline,
310 ) {
311 let (old_a, old_b) = split_at(old_range, x_start);
312 let (new_a, new_b) = split_at(new_range, y_start);
313 conquer(d, old, old_a, new, new_a, vf, vb, deadline)?;
314 conquer(d, old, old_b, new, new_b, vf, vb, deadline)?;
315 } else {
316 d.delete(
317 old_range.start,
318 old_range.end - old_range.start,
319 new_range.start,
320 )?;
321 d.insert(
322 old_range.start,
323 new_range.start,
324 new_range.end - new_range.start,
325 )?;
326 }
327
328 if common_suffix_len > 0 {
329 d.equal(common_suffix.0, common_suffix.1, common_suffix_len)?;
330 }
331
332 Ok(())
333 }
334
335 #[test]
336 fn test_find_middle_snake() {
337 let a = &b"ABCABBA"[..];
338 let b = &b"CBABAC"[..];
339 let max_d = max_d(a.len(), b.len());
340 let mut vf = V::new(max_d);
341 let mut vb = V::new(max_d);
342 let (x_start, y_start) =
343 find_middle_snake(a, 0..a.len(), b, 0..b.len(), &mut vf, &mut vb, None).unwrap();
344 assert_eq!(x_start, 4);
345 assert_eq!(y_start, 1);
346 }
347
348 #[test]
349 fn test_diff() {
350 let a: &[usize] = &[0, 1, 2, 3, 4];
351 let b: &[usize] = &[0, 1, 2, 9, 4];
352
353 let mut d = crate::algorithms::Replace::new(crate::algorithms::Capture::new());
354 diff(&mut d, a, 0..a.len(), b, 0..b.len()).unwrap();
355 insta::assert_debug_snapshot!(d.into_inner().ops());
356 }
357
358 #[test]
359 fn test_contiguous() {
360 let a: &[usize] = &[0, 1, 2, 3, 4, 4, 4, 5];
361 let b: &[usize] = &[0, 1, 2, 8, 9, 4, 4, 7];
362
363 let mut d = crate::algorithms::Replace::new(crate::algorithms::Capture::new());
364 diff(&mut d, a, 0..a.len(), b, 0..b.len()).unwrap();
365 insta::assert_debug_snapshot!(d.into_inner().ops());
366 }
367
368 #[test]
369 fn test_pat() {
370 let a: &[usize] = &[0, 1, 3, 4, 5];
371 let b: &[usize] = &[0, 1, 4, 5, 8, 9];
372
373 let mut d = crate::algorithms::Capture::new();
374 diff(&mut d, a, 0..a.len(), b, 0..b.len()).unwrap();
375 insta::assert_debug_snapshot!(d.ops());
376 }
377
378 #[test]
379 fn test_deadline_reached() {
380 use std::ops::Index;
381 use std::time::Duration;
382
383 let a = (0..100).collect::<Vec<_>>();
384 let mut b = (0..100).collect::<Vec<_>>();
385 b[10] = 99;
386 b[50] = 99;
387 b[25] = 99;
388
389 struct SlowIndex<'a>(&'a [usize]);
390
391 impl<'a> Index<usize> for SlowIndex<'a> {
392 type Output = usize;
393
394 fn index(&self, index: usize) -> &Self::Output {
395 std::thread::sleep(Duration::from_millis(1));
396 &self.0[index]
397 }
398 }
399
400 let slow_a = SlowIndex(&a);
401 let slow_b = SlowIndex(&b);
402
403 // don't give it enough time to do anything interesting
404 let mut d = crate::algorithms::Replace::new(crate::algorithms::Capture::new());
405 diff_deadline(
406 &mut d,
407 &slow_a,
408 0..a.len(),
409 &slow_b,
410 0..b.len(),
411 Some(Instant::now() + Duration::from_millis(50)),
412 )
413 .unwrap();
414 insta::assert_debug_snapshot!(d.into_inner().ops());
415 }