]> git.proxmox.com Git - rustc.git/blob - vendor/gix/src/object/tree/diff/tracked.rs
New upstream version 1.70.0+dfsg2
[rustc.git] / vendor / gix / src / object / tree / diff / tracked.rs
1 use std::ops::Range;
2
3 use gix_diff::tree::visit::Change;
4 use gix_object::tree::EntryMode;
5
6 use crate::{
7 bstr::BStr,
8 ext::ObjectIdExt,
9 object::tree::diff::{
10 change::DiffLineStats,
11 rewrites::{CopySource, Outcome},
12 Rewrites,
13 },
14 Repository, Tree,
15 };
16
17 /// A set of tracked items allows to figure out their relations by figuring out their similarity.
18 pub struct Item {
19 /// The underlying raw change
20 change: Change,
21 /// That slice into the backing for paths.
22 location: Range<usize>,
23 /// If true, this item was already emitted, i.e. seen by the caller.
24 emitted: bool,
25 }
26
27 impl Item {
28 fn location<'a>(&self, backing: &'a [u8]) -> &'a BStr {
29 backing[self.location.clone()].as_ref()
30 }
31 fn entry_mode_compatible(&self, mode: EntryMode) -> bool {
32 use EntryMode::*;
33 matches!(
34 (mode, self.change.entry_mode()),
35 (Blob | BlobExecutable, Blob | BlobExecutable) | (Link, Link)
36 )
37 }
38
39 fn is_source_for_destination_of(&self, kind: visit::Kind, dest_item_mode: EntryMode) -> bool {
40 self.entry_mode_compatible(dest_item_mode)
41 && match kind {
42 visit::Kind::RenameTarget => !self.emitted && matches!(self.change, Change::Deletion { .. }),
43 visit::Kind::CopyDestination => {
44 matches!(self.change, Change::Modification { .. })
45 }
46 }
47 }
48 }
49
50 pub struct State {
51 items: Vec<Item>,
52 path_backing: Vec<u8>,
53 rewrites: Rewrites,
54 tracking: Option<gix_diff::tree::recorder::Location>,
55 }
56
57 pub mod visit {
58 use crate::{bstr::BStr, object::tree::diff::change::DiffLineStats};
59
60 pub struct Source<'a> {
61 pub mode: gix_object::tree::EntryMode,
62 pub id: gix_hash::ObjectId,
63 pub kind: Kind,
64 pub location: &'a BStr,
65 pub diff: Option<DiffLineStats>,
66 }
67
68 #[derive(Debug, Copy, Clone, Eq, PartialEq)]
69 pub enum Kind {
70 RenameTarget,
71 CopyDestination,
72 }
73
74 pub struct Destination<'a> {
75 pub change: gix_diff::tree::visit::Change,
76 pub location: &'a BStr,
77 }
78 }
79
80 impl State {
81 pub(crate) fn new(renames: Rewrites, tracking: Option<gix_diff::tree::recorder::Location>) -> Self {
82 State {
83 items: vec![],
84 path_backing: vec![],
85 rewrites: renames,
86 tracking,
87 }
88 }
89 }
90
91 /// build state and find matches.
92 impl State {
93 /// We may refuse the push if that information isn't needed for what we have to track.
94 pub fn try_push_change(&mut self, change: Change, location: &BStr) -> Option<Change> {
95 if !change.entry_mode().is_blob_or_symlink() {
96 return Some(change);
97 }
98 let keep = match (self.rewrites.copies, &change) {
99 (Some(_find_copies), _) => true,
100 (None, Change::Modification { .. }) => false,
101 (None, _) => true,
102 };
103
104 if !keep {
105 return Some(change);
106 }
107
108 let start = self.path_backing.len();
109 self.path_backing.extend_from_slice(location);
110 self.items.push(Item {
111 location: start..self.path_backing.len(),
112 change,
113 emitted: false,
114 });
115 None
116 }
117
118 /// Can only be called once effectively as it alters its own state.
119 ///
120 /// `cb(destination, source)` is called for each item, either with `Some(source)` if it's
121 /// the destination of a copy or rename, or with `None` for source if no relation to other
122 /// items in the tracked set exist.
123 pub fn emit(
124 &mut self,
125 mut cb: impl FnMut(visit::Destination<'_>, Option<visit::Source<'_>>) -> gix_diff::tree::visit::Action,
126 src_tree: &Tree<'_>,
127 ) -> Result<Outcome, crate::object::tree::diff::for_each::Error> {
128 fn by_id_and_location(a: &Item, b: &Item) -> std::cmp::Ordering {
129 a.change.oid().cmp(b.change.oid()).then_with(|| {
130 a.location
131 .start
132 .cmp(&b.location.start)
133 .then(a.location.end.cmp(&b.location.end))
134 })
135 }
136 self.items.sort_by(by_id_and_location);
137
138 let mut out = Outcome {
139 options: self.rewrites,
140 ..Default::default()
141 };
142 out = self.match_pairs_of_kind(
143 visit::Kind::RenameTarget,
144 &mut cb,
145 self.rewrites.percentage,
146 out,
147 src_tree.repo,
148 )?;
149
150 if let Some(copies) = self.rewrites.copies {
151 out = self.match_pairs_of_kind(
152 visit::Kind::CopyDestination,
153 &mut cb,
154 copies.percentage,
155 out,
156 src_tree.repo,
157 )?;
158
159 match copies.source {
160 CopySource::FromSetOfModifiedFiles => {}
161 CopySource::FromSetOfModifiedFilesAndSourceTree => {
162 src_tree
163 .traverse()
164 .breadthfirst(&mut tree_to_events::Delegate::new(self))?;
165 self.items.sort_by(by_id_and_location);
166
167 out = self.match_pairs_of_kind(
168 visit::Kind::CopyDestination,
169 &mut cb,
170 copies.percentage,
171 out,
172 src_tree.repo,
173 )?;
174 }
175 }
176 }
177
178 self.items
179 .sort_by(|a, b| a.location(&self.path_backing).cmp(b.location(&self.path_backing)));
180 for item in self.items.drain(..).filter(|item| !item.emitted) {
181 if cb(
182 visit::Destination {
183 location: item.location(&self.path_backing),
184 change: item.change,
185 },
186 None,
187 ) == gix_diff::tree::visit::Action::Cancel
188 {
189 break;
190 }
191 }
192 Ok(out)
193 }
194
195 fn match_pairs_of_kind(
196 &mut self,
197 kind: visit::Kind,
198 cb: &mut impl FnMut(visit::Destination<'_>, Option<visit::Source<'_>>) -> gix_diff::tree::visit::Action,
199 percentage: Option<f32>,
200 mut out: Outcome,
201 repo: &Repository,
202 ) -> Result<Outcome, crate::object::tree::diff::for_each::Error> {
203 // we try to cheaply reduce the set of possibilities first, before possibly looking more exhaustively.
204 let needs_second_pass = !needs_exact_match(percentage);
205 if self.match_pairs(cb, None /* by identity */, kind, repo, &mut out)? == gix_diff::tree::visit::Action::Cancel
206 {
207 return Ok(out);
208 }
209 if needs_second_pass {
210 let is_limited = if self.rewrites.limit == 0 {
211 false
212 } else if let Some(permutations) = permutations_over_limit(&self.items, self.rewrites.limit, kind) {
213 match kind {
214 visit::Kind::RenameTarget => {
215 out.num_similarity_checks_skipped_for_rename_tracking_due_to_limit = permutations;
216 }
217 visit::Kind::CopyDestination => {
218 out.num_similarity_checks_skipped_for_copy_tracking_due_to_limit = permutations;
219 }
220 }
221 true
222 } else {
223 false
224 };
225 if !is_limited {
226 self.match_pairs(cb, self.rewrites.percentage, kind, repo, &mut out)?;
227 }
228 }
229 Ok(out)
230 }
231
232 fn match_pairs(
233 &mut self,
234 cb: &mut impl FnMut(visit::Destination<'_>, Option<visit::Source<'_>>) -> gix_diff::tree::visit::Action,
235 percentage: Option<f32>,
236 kind: visit::Kind,
237 repo: &Repository,
238 stats: &mut Outcome,
239 ) -> Result<gix_diff::tree::visit::Action, crate::object::tree::diff::for_each::Error> {
240 // TODO(perf): reuse object data and interner state and interned tokens, make these available to `find_match()`
241 let mut dest_ofs = 0;
242 while let Some((mut dest_idx, dest)) = self.items[dest_ofs..].iter().enumerate().find_map(|(idx, item)| {
243 (!item.emitted && matches!(item.change, Change::Addition { .. })).then_some((idx, item))
244 }) {
245 dest_idx += dest_ofs;
246 dest_ofs = dest_idx + 1;
247 let src =
248 find_match(&self.items, dest, dest_idx, percentage, kind, repo, stats)?.map(|(src_idx, src, diff)| {
249 let (id, mode) = src.change.oid_and_entry_mode();
250 let id = id.to_owned();
251 let location = src.location(&self.path_backing);
252 (
253 visit::Source {
254 mode,
255 id,
256 kind,
257 location,
258 diff,
259 },
260 src_idx,
261 )
262 });
263 if src.is_none() {
264 continue;
265 }
266 let location = dest.location(&self.path_backing);
267 let change = dest.change.clone();
268 let dest = visit::Destination { change, location };
269 self.items[dest_idx].emitted = true;
270 if let Some(src_idx) = src.as_ref().map(|t| t.1) {
271 self.items[src_idx].emitted = true;
272 }
273 if cb(dest, src.map(|t| t.0)) == gix_diff::tree::visit::Action::Cancel {
274 return Ok(gix_diff::tree::visit::Action::Cancel);
275 }
276 }
277 Ok(gix_diff::tree::visit::Action::Continue)
278 }
279 }
280
281 fn permutations_over_limit(items: &[Item], limit: usize, kind: visit::Kind) -> Option<usize> {
282 let (sources, destinations) = items
283 .iter()
284 .filter(|item| match kind {
285 visit::Kind::RenameTarget => !item.emitted,
286 visit::Kind::CopyDestination => true,
287 })
288 .fold((0, 0), |(mut src, mut dest), item| {
289 match item.change {
290 Change::Addition { .. } => {
291 dest += 1;
292 }
293 Change::Deletion { .. } => {
294 if kind == visit::Kind::RenameTarget {
295 src += 1
296 }
297 }
298 Change::Modification { .. } => {
299 if kind == visit::Kind::CopyDestination {
300 src += 1
301 }
302 }
303 }
304 (src, dest)
305 });
306 let permutations = sources * destinations;
307 (permutations > limit * limit).then_some(permutations)
308 }
309
310 fn needs_exact_match(percentage: Option<f32>) -> bool {
311 percentage.map_or(true, |p| p >= 1.0)
312 }
313
314 /// <src_idx, src, possibly diff stat>
315 type SourceTuple<'a> = (usize, &'a Item, Option<DiffLineStats>);
316
317 /// Find `item` in our set of items ignoring `item_idx` to avoid finding ourselves, by similarity indicated by `percentage`.
318 /// The latter can be `None` or `Some(x)` where `x>=1` for identity, and anything else for similarity.
319 /// We also ignore emitted items entirely.
320 /// Use `kind` to indicate what kind of match we are looking for, which might be deletions matching an `item` addition, or
321 /// any non-deletion otherwise.
322 /// Note that we always try to find by identity first even if a percentage is given as it's much faster and may reduce the set
323 /// of items to be searched.
324 fn find_match<'a>(
325 items: &'a [Item],
326 item: &Item,
327 item_idx: usize,
328 percentage: Option<f32>,
329 kind: visit::Kind,
330 repo: &Repository,
331 stats: &mut Outcome,
332 ) -> Result<Option<SourceTuple<'a>>, crate::object::tree::diff::for_each::Error> {
333 let (item_id, item_mode) = item.change.oid_and_entry_mode();
334 if needs_exact_match(percentage) || item_mode == gix_object::tree::EntryMode::Link {
335 let first_idx = items.partition_point(|a| a.change.oid() < item_id);
336 let range = match items.get(first_idx..).map(|items| {
337 let end = items
338 .iter()
339 .position(|a| a.change.oid() != item_id)
340 .map(|idx| first_idx + idx)
341 .unwrap_or(items.len());
342 first_idx..end
343 }) {
344 Some(range) => range,
345 None => return Ok(None),
346 };
347 if range.is_empty() {
348 return Ok(None);
349 }
350 let res = items[range.clone()].iter().enumerate().find_map(|(mut src_idx, src)| {
351 src_idx += range.start;
352 (src_idx != item_idx && src.is_source_for_destination_of(kind, item_mode)).then_some((src_idx, src, None))
353 });
354 if let Some(src) = res {
355 return Ok(Some(src));
356 }
357 } else {
358 let new = item_id.to_owned().attach(repo).object()?;
359 let percentage = percentage.expect("it's set to something below 1.0 and we assured this");
360 debug_assert!(
361 item.change.entry_mode().is_blob(),
362 "symlinks are matched exactly, and trees aren't used here"
363 );
364 let algo = repo.config.diff_algorithm()?;
365 for (can_idx, src) in items
366 .iter()
367 .enumerate()
368 .filter(|(src_idx, src)| *src_idx != item_idx && src.is_source_for_destination_of(kind, item_mode))
369 {
370 let old = src.change.oid().to_owned().attach(repo).object()?;
371 // TODO: make sure we get attribute handling and binary skips and filters right here. There is crate::object::blob::diff::Platform
372 // which should have facilities for that one day, but we don't use it because we need newlines in our tokens.
373 let tokens = gix_diff::blob::intern::InternedInput::new(
374 gix_diff::blob::sources::byte_lines_with_terminator(&old.data),
375 gix_diff::blob::sources::byte_lines_with_terminator(&new.data),
376 );
377 let counts = gix_diff::blob::diff(
378 algo,
379 &tokens,
380 gix_diff::blob::sink::Counter::new(diff::Statistics {
381 removed_bytes: 0,
382 input: &tokens,
383 }),
384 );
385 let similarity = (old.data.len() - counts.wrapped) as f32 / old.data.len().max(new.data.len()) as f32;
386 stats.num_similarity_checks += 1;
387 if similarity >= percentage {
388 return Ok(Some((
389 can_idx,
390 src,
391 DiffLineStats {
392 removals: counts.removals,
393 insertions: counts.insertions,
394 before: tokens.before.len().try_into().expect("interner handles only u32"),
395 after: tokens.after.len().try_into().expect("interner handles only u32"),
396 }
397 .into(),
398 )));
399 }
400 }
401 }
402 Ok(None)
403 }
404
405 mod diff {
406 use std::ops::Range;
407
408 pub struct Statistics<'a, 'data> {
409 pub removed_bytes: usize,
410 pub input: &'a gix_diff::blob::intern::InternedInput<&'data [u8]>,
411 }
412
413 impl<'a, 'data> gix_diff::blob::Sink for Statistics<'a, 'data> {
414 type Out = usize;
415
416 fn process_change(&mut self, before: Range<u32>, _after: Range<u32>) {
417 self.removed_bytes = self.input.before[before.start as usize..before.end as usize]
418 .iter()
419 .map(|token| self.input.interner[*token].len())
420 .sum();
421 }
422
423 fn finish(self) -> Self::Out {
424 self.removed_bytes
425 }
426 }
427 }
428
429 mod tree_to_events {
430 use gix_diff::tree::visit::Change;
431 use gix_object::tree::EntryRef;
432
433 use crate::bstr::BStr;
434
435 pub struct Delegate<'a> {
436 parent: &'a mut super::State,
437 recorder: gix_traverse::tree::Recorder,
438 }
439
440 impl<'a> Delegate<'a> {
441 pub fn new(parent: &'a mut super::State) -> Self {
442 let tracking = parent.tracking.map(|t| match t {
443 gix_diff::tree::recorder::Location::FileName => gix_traverse::tree::recorder::Location::FileName,
444 gix_diff::tree::recorder::Location::Path => gix_traverse::tree::recorder::Location::Path,
445 });
446 Self {
447 parent,
448 recorder: gix_traverse::tree::Recorder::default().track_location(tracking),
449 }
450 }
451 }
452
453 impl gix_traverse::tree::Visit for Delegate<'_> {
454 fn pop_front_tracked_path_and_set_current(&mut self) {
455 self.recorder.pop_front_tracked_path_and_set_current()
456 }
457
458 fn push_back_tracked_path_component(&mut self, component: &BStr) {
459 self.recorder.push_back_tracked_path_component(component)
460 }
461
462 fn push_path_component(&mut self, component: &BStr) {
463 self.recorder.push_path_component(component)
464 }
465
466 fn pop_path_component(&mut self) {
467 self.recorder.pop_path_component();
468 }
469
470 fn visit_tree(&mut self, _entry: &EntryRef<'_>) -> gix_traverse::tree::visit::Action {
471 gix_traverse::tree::visit::Action::Continue
472 }
473
474 fn visit_nontree(&mut self, entry: &EntryRef<'_>) -> gix_traverse::tree::visit::Action {
475 if entry.mode.is_blob() {
476 self.parent.try_push_change(
477 Change::Modification {
478 previous_entry_mode: entry.mode,
479 previous_oid: gix_hash::ObjectId::null(entry.oid.kind()),
480 entry_mode: entry.mode,
481 oid: entry.oid.to_owned(),
482 },
483 self.recorder.path(),
484 );
485 // make sure these aren't viable to be emitted anymore.
486 self.parent.items.last_mut().expect("just pushed").emitted = true;
487 }
488 gix_traverse::tree::visit::Action::Continue
489 }
490 }
491 }