3 use gix_diff
::tree
::visit
::Change
;
4 use gix_object
::tree
::EntryMode
;
10 change
::DiffLineStats
,
11 rewrites
::{CopySource, Outcome}
,
17 /// A set of tracked items allows to figure out their relations by figuring out their similarity.
19 /// The underlying raw change
21 /// That slice into the backing for paths.
22 location
: Range
<usize>,
23 /// If true, this item was already emitted, i.e. seen by the caller.
28 fn location
<'a
>(&self, backing
: &'a
[u8]) -> &'a BStr
{
29 backing
[self.location
.clone()].as_ref()
31 fn entry_mode_compatible(&self, mode
: EntryMode
) -> bool
{
34 (mode
, self.change
.entry_mode()),
35 (Blob
| BlobExecutable
, Blob
| BlobExecutable
) | (Link
, Link
)
39 fn is_source_for_destination_of(&self, kind
: visit
::Kind
, dest_item_mode
: EntryMode
) -> bool
{
40 self.entry_mode_compatible(dest_item_mode
)
42 visit
::Kind
::RenameTarget
=> !self.emitted
&& matches
!(self.change
, Change
::Deletion { .. }
),
43 visit
::Kind
::CopyDestination
=> {
44 matches
!(self.change
, Change
::Modification { .. }
)
52 path_backing
: Vec
<u8>,
54 tracking
: Option
<gix_diff
::tree
::recorder
::Location
>,
58 use crate::{bstr::BStr, object::tree::diff::change::DiffLineStats}
;
60 pub struct Source
<'a
> {
61 pub mode
: gix_object
::tree
::EntryMode
,
62 pub id
: gix_hash
::ObjectId
,
64 pub location
: &'a BStr
,
65 pub diff
: Option
<DiffLineStats
>,
68 #[derive(Debug, Copy, Clone, Eq, PartialEq)]
74 pub struct Destination
<'a
> {
75 pub change
: gix_diff
::tree
::visit
::Change
,
76 pub location
: &'a BStr
,
81 pub(crate) fn new(renames
: Rewrites
, tracking
: Option
<gix_diff
::tree
::recorder
::Location
>) -> Self {
91 /// build state and find matches.
93 /// We may refuse the push if that information isn't needed for what we have to track.
94 pub fn try_push_change(&mut self, change
: Change
, location
: &BStr
) -> Option
<Change
> {
95 if !change
.entry_mode().is_blob_or_symlink() {
98 let keep
= match (self.rewrites
.copies
, &change
) {
99 (Some(_find_copies
), _
) => true,
100 (None
, Change
::Modification { .. }
) => false,
108 let start
= self.path_backing
.len();
109 self.path_backing
.extend_from_slice(location
);
110 self.items
.push(Item
{
111 location
: start
..self.path_backing
.len(),
118 /// Can only be called once effectively as it alters its own state.
120 /// `cb(destination, source)` is called for each item, either with `Some(source)` if it's
121 /// the destination of a copy or rename, or with `None` for source if no relation to other
122 /// items in the tracked set exist.
125 mut cb
: impl FnMut(visit
::Destination
<'_
>, Option
<visit
::Source
<'_
>>) -> gix_diff
::tree
::visit
::Action
,
127 ) -> Result
<Outcome
, crate::object
::tree
::diff
::for_each
::Error
> {
128 fn by_id_and_location(a
: &Item
, b
: &Item
) -> std
::cmp
::Ordering
{
129 a
.change
.oid().cmp(b
.change
.oid()).then_with(|| {
132 .cmp(&b
.location
.start
)
133 .then(a
.location
.end
.cmp(&b
.location
.end
))
136 self.items
.sort_by(by_id_and_location
);
138 let mut out
= Outcome
{
139 options
: self.rewrites
,
142 out
= self.match_pairs_of_kind(
143 visit
::Kind
::RenameTarget
,
145 self.rewrites
.percentage
,
150 if let Some(copies
) = self.rewrites
.copies
{
151 out
= self.match_pairs_of_kind(
152 visit
::Kind
::CopyDestination
,
159 match copies
.source
{
160 CopySource
::FromSetOfModifiedFiles
=> {}
161 CopySource
::FromSetOfModifiedFilesAndSourceTree
=> {
164 .breadthfirst(&mut tree_to_events
::Delegate
::new(self))?
;
165 self.items
.sort_by(by_id_and_location
);
167 out
= self.match_pairs_of_kind(
168 visit
::Kind
::CopyDestination
,
179 .sort_by(|a
, b
| a
.location(&self.path_backing
).cmp(b
.location(&self.path_backing
)));
180 for item
in self.items
.drain(..).filter(|item
| !item
.emitted
) {
183 location
: item
.location(&self.path_backing
),
187 ) == gix_diff
::tree
::visit
::Action
::Cancel
195 fn match_pairs_of_kind(
198 cb
: &mut impl FnMut(visit
::Destination
<'_
>, Option
<visit
::Source
<'_
>>) -> gix_diff
::tree
::visit
::Action
,
199 percentage
: Option
<f32>,
202 ) -> Result
<Outcome
, crate::object
::tree
::diff
::for_each
::Error
> {
203 // we try to cheaply reduce the set of possibilities first, before possibly looking more exhaustively.
204 let needs_second_pass
= !needs_exact_match(percentage
);
205 if self.match_pairs(cb
, None
/* by identity */, kind
, repo
, &mut out
)?
== gix_diff
::tree
::visit
::Action
::Cancel
209 if needs_second_pass
{
210 let is_limited
= if self.rewrites
.limit
== 0 {
212 } else if let Some(permutations
) = permutations_over_limit(&self.items
, self.rewrites
.limit
, kind
) {
214 visit
::Kind
::RenameTarget
=> {
215 out
.num_similarity_checks_skipped_for_rename_tracking_due_to_limit
= permutations
;
217 visit
::Kind
::CopyDestination
=> {
218 out
.num_similarity_checks_skipped_for_copy_tracking_due_to_limit
= permutations
;
226 self.match_pairs(cb
, self.rewrites
.percentage
, kind
, repo
, &mut out
)?
;
234 cb
: &mut impl FnMut(visit
::Destination
<'_
>, Option
<visit
::Source
<'_
>>) -> gix_diff
::tree
::visit
::Action
,
235 percentage
: Option
<f32>,
239 ) -> Result
<gix_diff
::tree
::visit
::Action
, crate::object
::tree
::diff
::for_each
::Error
> {
240 // TODO(perf): reuse object data and interner state and interned tokens, make these available to `find_match()`
241 let mut dest_ofs
= 0;
242 while let Some((mut dest_idx
, dest
)) = self.items
[dest_ofs
..].iter().enumerate().find_map(|(idx
, item
)| {
243 (!item
.emitted
&& matches
!(item
.change
, Change
::Addition { .. }
)).then_some((idx
, item
))
245 dest_idx
+= dest_ofs
;
246 dest_ofs
= dest_idx
+ 1;
248 find_match(&self.items
, dest
, dest_idx
, percentage
, kind
, repo
, stats
)?
.map(|(src_idx
, src
, diff
)| {
249 let (id
, mode
) = src
.change
.oid_and_entry_mode();
250 let id
= id
.to_owned();
251 let location
= src
.location(&self.path_backing
);
266 let location
= dest
.location(&self.path_backing
);
267 let change
= dest
.change
.clone();
268 let dest
= visit
::Destination { change, location }
;
269 self.items
[dest_idx
].emitted
= true;
270 if let Some(src_idx
) = src
.as_ref().map(|t
| t
.1) {
271 self.items
[src_idx
].emitted
= true;
273 if cb(dest
, src
.map(|t
| t
.0)) == gix_diff
::tree
::visit
::Action
::Cancel
{
274 return Ok(gix_diff
::tree
::visit
::Action
::Cancel
);
277 Ok(gix_diff
::tree
::visit
::Action
::Continue
)
281 fn permutations_over_limit(items
: &[Item
], limit
: usize, kind
: visit
::Kind
) -> Option
<usize> {
282 let (sources
, destinations
) = items
284 .filter(|item
| match kind
{
285 visit
::Kind
::RenameTarget
=> !item
.emitted
,
286 visit
::Kind
::CopyDestination
=> true,
288 .fold((0, 0), |(mut src
, mut dest
), item
| {
290 Change
::Addition { .. }
=> {
293 Change
::Deletion { .. }
=> {
294 if kind
== visit
::Kind
::RenameTarget
{
298 Change
::Modification { .. }
=> {
299 if kind
== visit
::Kind
::CopyDestination
{
306 let permutations
= sources
* destinations
;
307 (permutations
> limit
* limit
).then_some(permutations
)
310 fn needs_exact_match(percentage
: Option
<f32>) -> bool
{
311 percentage
.map_or(true, |p
| p
>= 1.0)
314 /// <src_idx, src, possibly diff stat>
315 type SourceTuple
<'a
> = (usize, &'a Item
, Option
<DiffLineStats
>);
317 /// Find `item` in our set of items ignoring `item_idx` to avoid finding ourselves, by similarity indicated by `percentage`.
318 /// The latter can be `None` or `Some(x)` where `x>=1` for identity, and anything else for similarity.
319 /// We also ignore emitted items entirely.
320 /// Use `kind` to indicate what kind of match we are looking for, which might be deletions matching an `item` addition, or
321 /// any non-deletion otherwise.
322 /// Note that we always try to find by identity first even if a percentage is given as it's much faster and may reduce the set
323 /// of items to be searched.
328 percentage
: Option
<f32>,
332 ) -> Result
<Option
<SourceTuple
<'a
>>, crate::object
::tree
::diff
::for_each
::Error
> {
333 let (item_id
, item_mode
) = item
.change
.oid_and_entry_mode();
334 if needs_exact_match(percentage
) || item_mode
== gix_object
::tree
::EntryMode
::Link
{
335 let first_idx
= items
.partition_point(|a
| a
.change
.oid() < item_id
);
336 let range
= match items
.get(first_idx
..).map(|items
| {
339 .position(|a
| a
.change
.oid() != item_id
)
340 .map(|idx
| first_idx
+ idx
)
341 .unwrap_or(items
.len());
344 Some(range
) => range
,
345 None
=> return Ok(None
),
347 if range
.is_empty() {
350 let res
= items
[range
.clone()].iter().enumerate().find_map(|(mut src_idx
, src
)| {
351 src_idx
+= range
.start
;
352 (src_idx
!= item_idx
&& src
.is_source_for_destination_of(kind
, item_mode
)).then_some((src_idx
, src
, None
))
354 if let Some(src
) = res
{
355 return Ok(Some(src
));
358 let new
= item_id
.to_owned().attach(repo
).object()?
;
359 let percentage
= percentage
.expect("it's set to something below 1.0 and we assured this");
361 item
.change
.entry_mode().is_blob(),
362 "symlinks are matched exactly, and trees aren't used here"
364 let algo
= repo
.config
.diff_algorithm()?
;
365 for (can_idx
, src
) in items
368 .filter(|(src_idx
, src
)| *src_idx
!= item_idx
&& src
.is_source_for_destination_of(kind
, item_mode
))
370 let old
= src
.change
.oid().to_owned().attach(repo
).object()?
;
371 // TODO: make sure we get attribute handling and binary skips and filters right here. There is crate::object::blob::diff::Platform
372 // which should have facilities for that one day, but we don't use it because we need newlines in our tokens.
373 let tokens
= gix_diff
::blob
::intern
::InternedInput
::new(
374 gix_diff
::blob
::sources
::byte_lines_with_terminator(&old
.data
),
375 gix_diff
::blob
::sources
::byte_lines_with_terminator(&new
.data
),
377 let counts
= gix_diff
::blob
::diff(
380 gix_diff
::blob
::sink
::Counter
::new(diff
::Statistics
{
385 let similarity
= (old
.data
.len() - counts
.wrapped
) as f32 / old
.data
.len().max(new
.data
.len()) as f32;
386 stats
.num_similarity_checks
+= 1;
387 if similarity
>= percentage
{
392 removals
: counts
.removals
,
393 insertions
: counts
.insertions
,
394 before
: tokens
.before
.len().try_into().expect("interner handles only u32"),
395 after
: tokens
.after
.len().try_into().expect("interner handles only u32"),
408 pub struct Statistics
<'a
, 'data
> {
409 pub removed_bytes
: usize,
410 pub input
: &'a gix_diff
::blob
::intern
::InternedInput
<&'data
[u8]>,
413 impl<'a
, 'data
> gix_diff
::blob
::Sink
for Statistics
<'a
, 'data
> {
416 fn process_change(&mut self, before
: Range
<u32>, _after
: Range
<u32>) {
417 self.removed_bytes
= self.input
.before
[before
.start
as usize..before
.end
as usize]
419 .map(|token
| self.input
.interner
[*token
].len())
423 fn finish(self) -> Self::Out
{
430 use gix_diff
::tree
::visit
::Change
;
431 use gix_object
::tree
::EntryRef
;
433 use crate::bstr
::BStr
;
435 pub struct Delegate
<'a
> {
436 parent
: &'a
mut super::State
,
437 recorder
: gix_traverse
::tree
::Recorder
,
440 impl<'a
> Delegate
<'a
> {
441 pub fn new(parent
: &'a
mut super::State
) -> Self {
442 let tracking
= parent
.tracking
.map(|t
| match t
{
443 gix_diff
::tree
::recorder
::Location
::FileName
=> gix_traverse
::tree
::recorder
::Location
::FileName
,
444 gix_diff
::tree
::recorder
::Location
::Path
=> gix_traverse
::tree
::recorder
::Location
::Path
,
448 recorder
: gix_traverse
::tree
::Recorder
::default().track_location(tracking
),
453 impl gix_traverse
::tree
::Visit
for Delegate
<'_
> {
454 fn pop_front_tracked_path_and_set_current(&mut self) {
455 self.recorder
.pop_front_tracked_path_and_set_current()
458 fn push_back_tracked_path_component(&mut self, component
: &BStr
) {
459 self.recorder
.push_back_tracked_path_component(component
)
462 fn push_path_component(&mut self, component
: &BStr
) {
463 self.recorder
.push_path_component(component
)
466 fn pop_path_component(&mut self) {
467 self.recorder
.pop_path_component();
470 fn visit_tree(&mut self, _entry
: &EntryRef
<'_
>) -> gix_traverse
::tree
::visit
::Action
{
471 gix_traverse
::tree
::visit
::Action
::Continue
474 fn visit_nontree(&mut self, entry
: &EntryRef
<'_
>) -> gix_traverse
::tree
::visit
::Action
{
475 if entry
.mode
.is_blob() {
476 self.parent
.try_push_change(
477 Change
::Modification
{
478 previous_entry_mode
: entry
.mode
,
479 previous_oid
: gix_hash
::ObjectId
::null(entry
.oid
.kind()),
480 entry_mode
: entry
.mode
,
481 oid
: entry
.oid
.to_owned(),
483 self.recorder
.path(),
485 // make sure these aren't viable to be emitted anymore.
486 self.parent
.items
.last_mut().expect("just pushed").emitted
= true;
488 gix_traverse
::tree
::visit
::Action
::Continue