2 * Copyright (C) the libgit2 contributors. All rights reserved.
4 * This file is part of libgit2, distributed under the GNU GPL v2 with
5 * a Linking Exception. For full terms see the included COPYING file.
9 #include "git2/config.h"
10 #include "git2/blob.h"
11 #include "git2/sys/hashsig.h"
18 git_diff_delta
*git_diff__delta_dup(
19 const git_diff_delta
*d
, git_pool
*pool
)
21 git_diff_delta
*delta
= git__malloc(sizeof(git_diff_delta
));
25 memcpy(delta
, d
, sizeof(git_diff_delta
));
26 GIT_DIFF_FLAG__CLEAR_INTERNAL(delta
->flags
);
28 if (d
->old_file
.path
!= NULL
) {
29 delta
->old_file
.path
= git_pool_strdup(pool
, d
->old_file
.path
);
30 if (delta
->old_file
.path
== NULL
)
34 if (d
->new_file
.path
!= d
->old_file
.path
&& d
->new_file
.path
!= NULL
) {
35 delta
->new_file
.path
= git_pool_strdup(pool
, d
->new_file
.path
);
36 if (delta
->new_file
.path
== NULL
)
39 delta
->new_file
.path
= delta
->old_file
.path
;
49 git_diff_delta
*git_diff__merge_like_cgit(
50 const git_diff_delta
*a
,
51 const git_diff_delta
*b
,
56 /* Emulate C git for merging two diffs (a la 'git diff <sha>').
58 * When C git does a diff between the work dir and a tree, it actually
59 * diffs with the index but uses the workdir contents. This emulates
60 * those choices so we can emulate the type of diff.
62 * We have three file descriptions here, let's call them:
64 * f2 = a->new_file AND b->old_file
68 /* If one of the diffs is a conflict, just dup it */
69 if (b
->status
== GIT_DELTA_CONFLICTED
)
70 return git_diff__delta_dup(b
, pool
);
71 if (a
->status
== GIT_DELTA_CONFLICTED
)
72 return git_diff__delta_dup(a
, pool
);
74 /* if f2 == f3 or f2 is deleted, then just dup the 'a' diff */
75 if (b
->status
== GIT_DELTA_UNMODIFIED
|| a
->status
== GIT_DELTA_DELETED
)
76 return git_diff__delta_dup(a
, pool
);
78 /* otherwise, base this diff on the 'b' diff */
79 if ((dup
= git_diff__delta_dup(b
, pool
)) == NULL
)
82 /* If 'a' status is uninteresting, then we're done */
83 if (a
->status
== GIT_DELTA_UNMODIFIED
||
84 a
->status
== GIT_DELTA_UNTRACKED
||
85 a
->status
== GIT_DELTA_UNREADABLE
)
88 assert(b
->status
!= GIT_DELTA_UNMODIFIED
);
90 /* A cgit exception is that the diff of a file that is only in the
91 * index (i.e. not in HEAD nor workdir) is given as empty.
93 if (dup
->status
== GIT_DELTA_DELETED
) {
94 if (a
->status
== GIT_DELTA_ADDED
) {
95 dup
->status
= GIT_DELTA_UNMODIFIED
;
98 /* else don't overwrite DELETE status */
100 dup
->status
= a
->status
;
101 dup
->nfiles
= a
->nfiles
;
104 git_oid_cpy(&dup
->old_file
.id
, &a
->old_file
.id
);
105 dup
->old_file
.mode
= a
->old_file
.mode
;
106 dup
->old_file
.size
= a
->old_file
.size
;
107 dup
->old_file
.flags
= a
->old_file
.flags
;
113 git_diff
*onto
, const git_diff
*from
, git_diff__merge_cb cb
)
118 git_diff_delta
*delta
;
119 bool ignore_case
, reversed
;
122 assert(onto
&& from
);
124 if (!from
->deltas
.length
)
127 ignore_case
= ((onto
->opts
.flags
& GIT_DIFF_IGNORE_CASE
) != 0);
128 reversed
= ((onto
->opts
.flags
& GIT_DIFF_REVERSE
) != 0);
130 if (ignore_case
!= ((from
->opts
.flags
& GIT_DIFF_IGNORE_CASE
) != 0) ||
131 reversed
!= ((from
->opts
.flags
& GIT_DIFF_REVERSE
) != 0)) {
132 giterr_set(GITERR_INVALID
,
133 "Attempt to merge diffs created with conflicting options");
138 &onto_new
, onto
->deltas
.length
, git_diff_delta__cmp
) < 0 ||
139 git_pool_init(&onto_pool
, 1, 0) < 0)
142 for (i
= 0, j
= 0; i
< onto
->deltas
.length
|| j
< from
->deltas
.length
; ) {
143 git_diff_delta
*o
= GIT_VECTOR_GET(&onto
->deltas
, i
);
144 const git_diff_delta
*f
= GIT_VECTOR_GET(&from
->deltas
, j
);
145 int cmp
= !f
? -1 : !o
? 1 :
146 STRCMP_CASESELECT(ignore_case
, o
->old_file
.path
, f
->old_file
.path
);
149 delta
= git_diff__delta_dup(o
, &onto_pool
);
151 } else if (cmp
> 0) {
152 delta
= git_diff__delta_dup(f
, &onto_pool
);
155 const git_diff_delta
*left
= reversed
? f
: o
;
156 const git_diff_delta
*right
= reversed
? o
: f
;
158 delta
= cb(left
, right
, &onto_pool
);
163 /* the ignore rules for the target may not match the source
164 * or the result of a merged delta could be skippable...
166 if (delta
&& git_diff_delta__should_skip(&onto
->opts
, delta
)) {
171 if ((error
= !delta
? -1 : git_vector_insert(&onto_new
, delta
)) < 0)
176 git_vector_swap(&onto
->deltas
, &onto_new
);
177 git_pool_swap(&onto
->pool
, &onto_pool
);
179 if ((onto
->opts
.flags
& GIT_DIFF_REVERSE
) != 0)
180 onto
->old_src
= from
->old_src
;
182 onto
->new_src
= from
->new_src
;
184 /* prefix strings also come from old pool, so recreate those.*/
185 onto
->opts
.old_prefix
=
186 git_pool_strdup_safe(&onto
->pool
, onto
->opts
.old_prefix
);
187 onto
->opts
.new_prefix
=
188 git_pool_strdup_safe(&onto
->pool
, onto
->opts
.new_prefix
);
191 git_vector_free_deep(&onto_new
);
192 git_pool_clear(&onto_pool
);
197 int git_diff_merge(git_diff
*onto
, const git_diff
*from
)
199 return git_diff__merge(onto
, from
, git_diff__merge_like_cgit
);
202 int git_diff_find_similar__hashsig_for_file(
203 void **out
, const git_diff_file
*f
, const char *path
, void *p
)
205 git_hashsig_option_t opt
= (git_hashsig_option_t
)(intptr_t)p
;
208 return git_hashsig_create_fromfile((git_hashsig
**)out
, path
, opt
);
211 int git_diff_find_similar__hashsig_for_buf(
212 void **out
, const git_diff_file
*f
, const char *buf
, size_t len
, void *p
)
214 git_hashsig_option_t opt
= (git_hashsig_option_t
)(intptr_t)p
;
217 return git_hashsig_create((git_hashsig
**)out
, buf
, len
, opt
);
220 void git_diff_find_similar__hashsig_free(void *sig
, void *payload
)
223 git_hashsig_free(sig
);
226 int git_diff_find_similar__calc_similarity(
227 int *score
, void *siga
, void *sigb
, void *payload
)
232 error
= git_hashsig_compare(siga
, sigb
);
240 #define DEFAULT_THRESHOLD 50
241 #define DEFAULT_BREAK_REWRITE_THRESHOLD 60
242 #define DEFAULT_RENAME_LIMIT 200
244 static int normalize_find_opts(
246 git_diff_find_options
*opts
,
247 const git_diff_find_options
*given
)
249 git_config
*cfg
= NULL
;
250 git_hashsig_option_t hashsig_opts
;
252 GITERR_CHECK_VERSION(given
, GIT_DIFF_FIND_OPTIONS_VERSION
, "git_diff_find_options");
254 if (diff
->repo
!= NULL
&&
255 git_repository_config__weakptr(&cfg
, diff
->repo
) < 0)
259 memcpy(opts
, given
, sizeof(*opts
));
262 (given
->flags
& GIT_DIFF_FIND_ALL
) == GIT_DIFF_FIND_BY_CONFIG
)
265 git_config__get_string_force(cfg
, "diff.renames", "true");
268 if (!git__parse_bool(&boolval
, rule
) && !boolval
)
269 /* don't set FIND_RENAMES if bool value is false */;
270 else if (!strcasecmp(rule
, "copies") || !strcasecmp(rule
, "copy"))
271 opts
->flags
|= GIT_DIFF_FIND_RENAMES
| GIT_DIFF_FIND_COPIES
;
273 opts
->flags
|= GIT_DIFF_FIND_RENAMES
;
278 /* some flags imply others */
280 if (opts
->flags
& GIT_DIFF_FIND_EXACT_MATCH_ONLY
) {
281 /* if we are only looking for exact matches, then don't turn
282 * MODIFIED items into ADD/DELETE pairs because it's too picky
284 opts
->flags
&= ~(GIT_DIFF_FIND_REWRITES
| GIT_DIFF_BREAK_REWRITES
);
286 /* similarly, don't look for self-rewrites to split */
287 opts
->flags
&= ~GIT_DIFF_FIND_RENAMES_FROM_REWRITES
;
290 if (opts
->flags
& GIT_DIFF_FIND_RENAMES_FROM_REWRITES
)
291 opts
->flags
|= GIT_DIFF_FIND_RENAMES
;
293 if (opts
->flags
& GIT_DIFF_FIND_COPIES_FROM_UNMODIFIED
)
294 opts
->flags
|= GIT_DIFF_FIND_COPIES
;
296 if (opts
->flags
& GIT_DIFF_BREAK_REWRITES
)
297 opts
->flags
|= GIT_DIFF_FIND_REWRITES
;
299 #define USE_DEFAULT(X) ((X) == 0 || (X) > 100)
301 if (USE_DEFAULT(opts
->rename_threshold
))
302 opts
->rename_threshold
= DEFAULT_THRESHOLD
;
304 if (USE_DEFAULT(opts
->rename_from_rewrite_threshold
))
305 opts
->rename_from_rewrite_threshold
= DEFAULT_THRESHOLD
;
307 if (USE_DEFAULT(opts
->copy_threshold
))
308 opts
->copy_threshold
= DEFAULT_THRESHOLD
;
310 if (USE_DEFAULT(opts
->break_rewrite_threshold
))
311 opts
->break_rewrite_threshold
= DEFAULT_BREAK_REWRITE_THRESHOLD
;
315 if (!opts
->rename_limit
) {
316 opts
->rename_limit
= git_config__get_int_force(
317 cfg
, "diff.renamelimit", DEFAULT_RENAME_LIMIT
);
319 if (opts
->rename_limit
<= 0)
320 opts
->rename_limit
= DEFAULT_RENAME_LIMIT
;
323 /* assign the internal metric with whitespace flag as payload */
325 opts
->metric
= git__malloc(sizeof(git_diff_similarity_metric
));
326 GITERR_CHECK_ALLOC(opts
->metric
);
328 opts
->metric
->file_signature
= git_diff_find_similar__hashsig_for_file
;
329 opts
->metric
->buffer_signature
= git_diff_find_similar__hashsig_for_buf
;
330 opts
->metric
->free_signature
= git_diff_find_similar__hashsig_free
;
331 opts
->metric
->similarity
= git_diff_find_similar__calc_similarity
;
333 if (opts
->flags
& GIT_DIFF_FIND_IGNORE_WHITESPACE
)
334 hashsig_opts
= GIT_HASHSIG_IGNORE_WHITESPACE
;
335 else if (opts
->flags
& GIT_DIFF_FIND_DONT_IGNORE_WHITESPACE
)
336 hashsig_opts
= GIT_HASHSIG_NORMAL
;
338 hashsig_opts
= GIT_HASHSIG_SMART_WHITESPACE
;
339 hashsig_opts
|= GIT_HASHSIG_ALLOW_SMALL_FILES
;
340 opts
->metric
->payload
= (void *)hashsig_opts
;
346 static int insert_delete_side_of_split(
347 git_diff
*diff
, git_vector
*onto
, const git_diff_delta
*delta
)
349 /* make new record for DELETED side of split */
350 git_diff_delta
*deleted
= git_diff__delta_dup(delta
, &diff
->pool
);
351 GITERR_CHECK_ALLOC(deleted
);
353 deleted
->status
= GIT_DELTA_DELETED
;
355 memset(&deleted
->new_file
, 0, sizeof(deleted
->new_file
));
356 deleted
->new_file
.path
= deleted
->old_file
.path
;
357 deleted
->new_file
.flags
|= GIT_DIFF_FLAG_VALID_ID
;
359 return git_vector_insert(onto
, deleted
);
362 static int apply_splits_and_deletes(
363 git_diff
*diff
, size_t expected_size
, bool actually_split
)
365 git_vector onto
= GIT_VECTOR_INIT
;
367 git_diff_delta
*delta
;
369 if (git_vector_init(&onto
, expected_size
, git_diff_delta__cmp
) < 0)
372 /* build new delta list without TO_DELETE and splitting TO_SPLIT */
373 git_vector_foreach(&diff
->deltas
, i
, delta
) {
374 if ((delta
->flags
& GIT_DIFF_FLAG__TO_DELETE
) != 0)
377 if ((delta
->flags
& GIT_DIFF_FLAG__TO_SPLIT
) != 0 && actually_split
) {
378 delta
->similarity
= 0;
380 if (insert_delete_side_of_split(diff
, &onto
, delta
) < 0)
383 if (diff
->new_src
== GIT_ITERATOR_TYPE_WORKDIR
)
384 delta
->status
= GIT_DELTA_UNTRACKED
;
386 delta
->status
= GIT_DELTA_ADDED
;
388 memset(&delta
->old_file
, 0, sizeof(delta
->old_file
));
389 delta
->old_file
.path
= delta
->new_file
.path
;
390 delta
->old_file
.flags
|= GIT_DIFF_FLAG_VALID_ID
;
393 /* clean up delta before inserting into new list */
394 GIT_DIFF_FLAG__CLEAR_INTERNAL(delta
->flags
);
396 if (delta
->status
!= GIT_DELTA_COPIED
&&
397 delta
->status
!= GIT_DELTA_RENAMED
&&
398 (delta
->status
!= GIT_DELTA_MODIFIED
|| actually_split
))
399 delta
->similarity
= 0;
401 /* insert into new list */
402 if (git_vector_insert(&onto
, delta
) < 0)
406 /* cannot return an error past this point */
408 /* free deltas from old list that didn't make it to the new one */
409 git_vector_foreach(&diff
->deltas
, i
, delta
) {
410 if ((delta
->flags
& GIT_DIFF_FLAG__TO_DELETE
) != 0)
414 /* swap new delta list into place */
415 git_vector_swap(&diff
->deltas
, &onto
);
416 git_vector_free(&onto
);
417 git_vector_sort(&diff
->deltas
);
422 git_vector_free_deep(&onto
);
427 GIT_INLINE(git_diff_file
*) similarity_get_file(git_diff
*diff
, size_t idx
)
429 git_diff_delta
*delta
= git_vector_get(&diff
->deltas
, idx
/ 2);
430 return (idx
& 1) ? &delta
->new_file
: &delta
->old_file
;
435 git_iterator_type_t src
;
436 git_repository
*repo
;
439 git_odb_object
*odb_obj
;
443 static int similarity_init(
444 similarity_info
*info
, git_diff
*diff
, size_t file_idx
)
446 info
->idx
= file_idx
;
447 info
->src
= (file_idx
& 1) ? diff
->new_src
: diff
->old_src
;
448 info
->repo
= diff
->repo
;
449 info
->file
= similarity_get_file(diff
, file_idx
);
450 info
->odb_obj
= NULL
;
452 git_buf_init(&info
->data
, 0);
454 if (info
->file
->size
> 0 || info
->src
== GIT_ITERATOR_TYPE_WORKDIR
)
457 return git_diff_file__resolve_zero_size(
458 info
->file
, &info
->odb_obj
, info
->repo
);
461 static int similarity_sig(
462 similarity_info
*info
,
463 const git_diff_find_options
*opts
,
467 git_diff_file
*file
= info
->file
;
469 if (info
->src
== GIT_ITERATOR_TYPE_WORKDIR
) {
470 if ((error
= git_buf_joinpath(
471 &info
->data
, git_repository_workdir(info
->repo
), file
->path
)) < 0)
474 /* if path is not a regular file, just skip this item */
475 if (!git_path_isfile(info
->data
.ptr
))
478 /* TODO: apply wd-to-odb filters to file data if necessary */
480 error
= opts
->metric
->file_signature(
481 &cache
[info
->idx
], info
->file
,
482 info
->data
.ptr
, opts
->metric
->payload
);
484 /* if we didn't initially know the size, we might have an odb_obj
485 * around from earlier, so convert that, otherwise load the blob now
487 if (info
->odb_obj
!= NULL
)
488 error
= git_object__from_odb_object(
489 (git_object
**)&info
->blob
, info
->repo
,
490 info
->odb_obj
, GIT_OBJ_BLOB
);
492 error
= git_blob_lookup(&info
->blob
, info
->repo
, &file
->id
);
495 /* if lookup fails, just skip this item in similarity calc */
500 /* index size may not be actual blob size if filtered */
501 if (file
->size
!= git_blob_rawsize(info
->blob
))
502 file
->size
= git_blob_rawsize(info
->blob
);
504 sz
= (size_t)(git__is_sizet(file
->size
) ? file
->size
: -1);
506 error
= opts
->metric
->buffer_signature(
507 &cache
[info
->idx
], info
->file
,
508 git_blob_rawcontent(info
->blob
), sz
, opts
->metric
->payload
);
515 static void similarity_unload(similarity_info
*info
)
518 git_odb_object_free(info
->odb_obj
);
521 git_blob_free(info
->blob
);
523 git_buf_free(&info
->data
);
526 #define FLAG_SET(opts,flag_name) (((opts)->flags & flag_name) != 0)
528 /* - score < 0 means files cannot be compared
529 * - score >= 100 means files are exact match
530 * - score == 0 means files are completely different
532 static int similarity_measure(
535 const git_diff_find_options
*opts
,
540 git_diff_file
*a_file
= similarity_get_file(diff
, a_idx
);
541 git_diff_file
*b_file
= similarity_get_file(diff
, b_idx
);
542 bool exact_match
= FLAG_SET(opts
, GIT_DIFF_FIND_EXACT_MATCH_ONLY
);
544 similarity_info a_info
, b_info
;
548 /* don't try to compare files of different types */
549 if (GIT_MODE_TYPE(a_file
->mode
) != GIT_MODE_TYPE(b_file
->mode
))
552 /* if exact match is requested, force calculation of missing OIDs now */
554 if (git_oid_iszero(&a_file
->id
) &&
555 diff
->old_src
== GIT_ITERATOR_TYPE_WORKDIR
&&
556 !git_diff__oid_for_file(&a_file
->id
,
557 diff
, a_file
->path
, a_file
->mode
, a_file
->size
))
558 a_file
->flags
|= GIT_DIFF_FLAG_VALID_ID
;
560 if (git_oid_iszero(&b_file
->id
) &&
561 diff
->new_src
== GIT_ITERATOR_TYPE_WORKDIR
&&
562 !git_diff__oid_for_file(&b_file
->id
,
563 diff
, b_file
->path
, b_file
->mode
, b_file
->size
))
564 b_file
->flags
|= GIT_DIFF_FLAG_VALID_ID
;
567 /* check OID match as a quick test */
568 if (git_oid__cmp(&a_file
->id
, &b_file
->id
) == 0) {
573 /* don't calculate signatures if we are doing exact match */
579 memset(&a_info
, 0, sizeof(a_info
));
580 memset(&b_info
, 0, sizeof(b_info
));
582 /* set up similarity data (will try to update missing file sizes) */
583 if (!cache
[a_idx
] && (error
= similarity_init(&a_info
, diff
, a_idx
)) < 0)
585 if (!cache
[b_idx
] && (error
= similarity_init(&b_info
, diff
, b_idx
)) < 0)
588 /* check if file sizes are nowhere near each other */
589 if (a_file
->size
> 127 &&
590 b_file
->size
> 127 &&
591 (a_file
->size
> (b_file
->size
<< 3) ||
592 b_file
->size
> (a_file
->size
<< 3)))
595 /* update signature cache if needed */
597 if ((error
= similarity_sig(&a_info
, opts
, cache
)) < 0)
601 if ((error
= similarity_sig(&b_info
, opts
, cache
)) < 0)
605 /* calculate similarity provided that the metric choose to process
606 * both the a and b files (some may not if file is too big, etc).
608 if (cache
[a_idx
] && cache
[b_idx
])
609 error
= opts
->metric
->similarity(
610 score
, cache
[a_idx
], cache
[b_idx
], opts
->metric
->payload
);
613 similarity_unload(&a_info
);
614 similarity_unload(&b_info
);
619 static int calc_self_similarity(
621 const git_diff_find_options
*opts
,
625 int error
, similarity
= -1;
626 git_diff_delta
*delta
= GIT_VECTOR_GET(&diff
->deltas
, delta_idx
);
628 if ((delta
->flags
& GIT_DIFF_FLAG__HAS_SELF_SIMILARITY
) != 0)
631 error
= similarity_measure(
632 &similarity
, diff
, opts
, cache
, 2 * delta_idx
, 2 * delta_idx
+ 1);
636 if (similarity
>= 0) {
637 delta
->similarity
= (uint16_t)similarity
;
638 delta
->flags
|= GIT_DIFF_FLAG__HAS_SELF_SIMILARITY
;
644 static bool is_rename_target(
646 const git_diff_find_options
*opts
,
650 git_diff_delta
*delta
= GIT_VECTOR_GET(&diff
->deltas
, delta_idx
);
652 /* skip things that aren't plain blobs */
653 if (!GIT_MODE_ISBLOB(delta
->new_file
.mode
))
656 /* only consider ADDED, RENAMED, COPIED, and split MODIFIED as
657 * targets; maybe include UNTRACKED if requested.
659 switch (delta
->status
) {
660 case GIT_DELTA_UNMODIFIED
:
661 case GIT_DELTA_DELETED
:
662 case GIT_DELTA_IGNORED
:
663 case GIT_DELTA_CONFLICTED
:
666 case GIT_DELTA_MODIFIED
:
667 if (!FLAG_SET(opts
, GIT_DIFF_FIND_REWRITES
) &&
668 !FLAG_SET(opts
, GIT_DIFF_FIND_RENAMES_FROM_REWRITES
))
671 if (calc_self_similarity(diff
, opts
, delta_idx
, cache
) < 0)
674 if (FLAG_SET(opts
, GIT_DIFF_BREAK_REWRITES
) &&
675 delta
->similarity
< opts
->break_rewrite_threshold
) {
676 delta
->flags
|= GIT_DIFF_FLAG__TO_SPLIT
;
679 if (FLAG_SET(opts
, GIT_DIFF_FIND_RENAMES_FROM_REWRITES
) &&
680 delta
->similarity
< opts
->rename_from_rewrite_threshold
)
685 case GIT_DELTA_UNTRACKED
:
686 if (!FLAG_SET(opts
, GIT_DIFF_FIND_FOR_UNTRACKED
))
690 default: /* all other status values should be checked */
694 delta
->flags
|= GIT_DIFF_FLAG__IS_RENAME_TARGET
;
698 static bool is_rename_source(
700 const git_diff_find_options
*opts
,
704 git_diff_delta
*delta
= GIT_VECTOR_GET(&diff
->deltas
, delta_idx
);
706 /* skip things that aren't blobs */
707 if (!GIT_MODE_ISBLOB(delta
->old_file
.mode
))
710 switch (delta
->status
) {
711 case GIT_DELTA_ADDED
:
712 case GIT_DELTA_UNTRACKED
:
713 case GIT_DELTA_UNREADABLE
:
714 case GIT_DELTA_IGNORED
:
715 case GIT_DELTA_CONFLICTED
:
718 case GIT_DELTA_DELETED
:
719 case GIT_DELTA_TYPECHANGE
:
722 case GIT_DELTA_UNMODIFIED
:
723 if (!FLAG_SET(opts
, GIT_DIFF_FIND_COPIES_FROM_UNMODIFIED
))
725 if (FLAG_SET(opts
, GIT_DIFF_FIND_REMOVE_UNMODIFIED
))
726 delta
->flags
|= GIT_DIFF_FLAG__TO_DELETE
;
729 default: /* MODIFIED, RENAMED, COPIED */
730 /* if we're finding copies, this could be a source */
731 if (FLAG_SET(opts
, GIT_DIFF_FIND_COPIES
))
734 /* otherwise, this is only a source if we can split it */
735 if (!FLAG_SET(opts
, GIT_DIFF_FIND_REWRITES
) &&
736 !FLAG_SET(opts
, GIT_DIFF_FIND_RENAMES_FROM_REWRITES
))
739 if (calc_self_similarity(diff
, opts
, delta_idx
, cache
) < 0)
742 if (FLAG_SET(opts
, GIT_DIFF_BREAK_REWRITES
) &&
743 delta
->similarity
< opts
->break_rewrite_threshold
) {
744 delta
->flags
|= GIT_DIFF_FLAG__TO_SPLIT
;
748 if (FLAG_SET(opts
, GIT_DIFF_FIND_RENAMES_FROM_REWRITES
) &&
749 delta
->similarity
< opts
->rename_from_rewrite_threshold
)
755 delta
->flags
|= GIT_DIFF_FLAG__IS_RENAME_SOURCE
;
759 GIT_INLINE(bool) delta_is_split(git_diff_delta
*delta
)
761 return (delta
->status
== GIT_DELTA_TYPECHANGE
||
762 (delta
->flags
& GIT_DIFF_FLAG__TO_SPLIT
) != 0);
765 GIT_INLINE(bool) delta_is_new_only(git_diff_delta
*delta
)
767 return (delta
->status
== GIT_DELTA_ADDED
||
768 delta
->status
== GIT_DELTA_UNTRACKED
||
769 delta
->status
== GIT_DELTA_UNREADABLE
||
770 delta
->status
== GIT_DELTA_IGNORED
);
773 GIT_INLINE(void) delta_make_rename(
774 git_diff_delta
*to
, const git_diff_delta
*from
, uint16_t similarity
)
776 to
->status
= GIT_DELTA_RENAMED
;
777 to
->similarity
= similarity
;
779 memcpy(&to
->old_file
, &from
->old_file
, sizeof(to
->old_file
));
780 to
->flags
&= ~GIT_DIFF_FLAG__TO_SPLIT
;
788 int git_diff_find_similar(
790 const git_diff_find_options
*given_opts
)
793 int error
= 0, result
;
795 git_diff_delta
*src
, *tgt
;
796 git_diff_find_options opts
= GIT_DIFF_FIND_OPTIONS_INIT
;
797 size_t num_deltas
, num_srcs
= 0, num_tgts
= 0;
798 size_t tried_srcs
= 0, tried_tgts
= 0;
799 size_t num_rewrites
= 0, num_updates
= 0, num_bumped
= 0;
800 size_t sigcache_size
;
801 void **sigcache
= NULL
; /* cache of similarity metric file signatures */
802 diff_find_match
*tgt2src
= NULL
;
803 diff_find_match
*src2tgt
= NULL
;
804 diff_find_match
*tgt2src_copy
= NULL
;
805 diff_find_match
*best_match
;
808 if ((error
= normalize_find_opts(diff
, &opts
, given_opts
)) < 0)
811 num_deltas
= diff
->deltas
.length
;
813 /* TODO: maybe abort if deltas.length > rename_limit ??? */
814 if (!git__is_uint32(num_deltas
))
817 /* No flags set; nothing to do */
818 if ((opts
.flags
& GIT_DIFF_FIND_ALL
) == 0)
821 GITERR_CHECK_ALLOC_MULTIPLY(&sigcache_size
, num_deltas
, 2);
822 sigcache
= git__calloc(sigcache_size
, sizeof(void *));
823 GITERR_CHECK_ALLOC(sigcache
);
825 /* Label rename sources and targets
827 * This will also set self-similarity scores for MODIFIED files and
828 * mark them for splitting if break-rewrites is enabled
830 git_vector_foreach(&diff
->deltas
, t
, tgt
) {
831 if (is_rename_source(diff
, &opts
, t
, sigcache
))
834 if (is_rename_target(diff
, &opts
, t
, sigcache
))
837 if ((tgt
->flags
& GIT_DIFF_FLAG__TO_SPLIT
) != 0)
841 /* if there are no candidate srcs or tgts, we're done */
842 if (!num_srcs
|| !num_tgts
)
845 src2tgt
= git__calloc(num_deltas
, sizeof(diff_find_match
));
846 GITERR_CHECK_ALLOC(src2tgt
);
847 tgt2src
= git__calloc(num_deltas
, sizeof(diff_find_match
));
848 GITERR_CHECK_ALLOC(tgt2src
);
850 if (FLAG_SET(&opts
, GIT_DIFF_FIND_COPIES
)) {
851 tgt2src_copy
= git__calloc(num_deltas
, sizeof(diff_find_match
));
852 GITERR_CHECK_ALLOC(tgt2src_copy
);
856 * Find best-fit matches for rename / copy candidates
860 tried_tgts
= num_bumped
= 0;
862 git_vector_foreach(&diff
->deltas
, t
, tgt
) {
863 /* skip things that are not rename targets */
864 if ((tgt
->flags
& GIT_DIFF_FLAG__IS_RENAME_TARGET
) == 0)
869 git_vector_foreach(&diff
->deltas
, s
, src
) {
870 /* skip things that are not rename sources */
871 if ((src
->flags
& GIT_DIFF_FLAG__IS_RENAME_SOURCE
) == 0)
874 /* calculate similarity for this pair and find best match */
876 result
= -1; /* don't measure self-similarity here */
877 else if ((error
= similarity_measure(
878 &result
, diff
, &opts
, sigcache
, 2 * s
, 2 * t
+ 1)) < 0)
883 similarity
= (uint16_t)result
;
885 /* is this a better rename? */
886 if (tgt2src
[t
].similarity
< similarity
&&
887 src2tgt
[s
].similarity
< similarity
)
889 /* eject old mapping */
890 if (src2tgt
[s
].similarity
> 0) {
891 tgt2src
[src2tgt
[s
].idx
].similarity
= 0;
894 if (tgt2src
[t
].similarity
> 0) {
895 src2tgt
[tgt2src
[t
].idx
].similarity
= 0;
899 /* write new mapping */
901 tgt2src
[t
].similarity
= similarity
;
903 src2tgt
[s
].similarity
= similarity
;
906 /* keep best absolute match for copies */
907 if (tgt2src_copy
!= NULL
&&
908 tgt2src_copy
[t
].similarity
< similarity
)
910 tgt2src_copy
[t
].idx
= s
;
911 tgt2src_copy
[t
].similarity
= similarity
;
914 if (++tried_srcs
>= num_srcs
)
917 /* cap on maximum targets we'll examine (per "tgt" file) */
918 if (tried_srcs
> opts
.rename_limit
)
922 if (++tried_tgts
>= num_tgts
)
926 if (num_bumped
> 0) /* try again if we bumped some items */
927 goto find_best_matches
;
930 * Rewrite the diffs with renames / copies
933 git_vector_foreach(&diff
->deltas
, t
, tgt
) {
934 /* skip things that are not rename targets */
935 if ((tgt
->flags
& GIT_DIFF_FLAG__IS_RENAME_TARGET
) == 0)
938 /* check if this delta was the target of a similarity */
939 if (tgt2src
[t
].similarity
)
940 best_match
= &tgt2src
[t
];
941 else if (tgt2src_copy
&& tgt2src_copy
[t
].similarity
)
942 best_match
= &tgt2src_copy
[t
];
947 src
= GIT_VECTOR_GET(&diff
->deltas
, s
);
949 /* possible scenarios:
950 * 1. from DELETE to ADD/UNTRACK/IGNORE = RENAME
951 * 2. from DELETE to SPLIT/TYPECHANGE = RENAME + DELETE
952 * 3. from SPLIT/TYPECHANGE to ADD/UNTRACK/IGNORE = ADD + RENAME
953 * 4. from SPLIT/TYPECHANGE to SPLIT/TYPECHANGE = RENAME + SPLIT
954 * 5. from OTHER to ADD/UNTRACK/IGNORE = OTHER + COPY
957 if (src
->status
== GIT_DELTA_DELETED
) {
959 if (delta_is_new_only(tgt
)) {
961 if (best_match
->similarity
< opts
.rename_threshold
)
964 delta_make_rename(tgt
, src
, best_match
->similarity
);
966 src
->flags
|= GIT_DIFF_FLAG__TO_DELETE
;
969 assert(delta_is_split(tgt
));
971 if (best_match
->similarity
< opts
.rename_from_rewrite_threshold
)
974 memcpy(&swap
, &tgt
->old_file
, sizeof(swap
));
976 delta_make_rename(tgt
, src
, best_match
->similarity
);
979 assert(src
->status
== GIT_DELTA_DELETED
);
980 memcpy(&src
->old_file
, &swap
, sizeof(src
->old_file
));
981 memset(&src
->new_file
, 0, sizeof(src
->new_file
));
982 src
->new_file
.path
= src
->old_file
.path
;
983 src
->new_file
.flags
|= GIT_DIFF_FLAG_VALID_ID
;
987 if (src2tgt
[t
].similarity
> 0 && src2tgt
[t
].idx
> t
) {
988 /* what used to be at src t is now at src s */
989 tgt2src
[src2tgt
[t
].idx
].idx
= s
;
994 else if (delta_is_split(src
)) {
996 if (delta_is_new_only(tgt
)) {
998 if (best_match
->similarity
< opts
.rename_threshold
)
1001 delta_make_rename(tgt
, src
, best_match
->similarity
);
1003 src
->status
= (diff
->new_src
== GIT_ITERATOR_TYPE_WORKDIR
) ?
1004 GIT_DELTA_UNTRACKED
: GIT_DELTA_ADDED
;
1006 memset(&src
->old_file
, 0, sizeof(src
->old_file
));
1007 src
->old_file
.path
= src
->new_file
.path
;
1008 src
->old_file
.flags
|= GIT_DIFF_FLAG_VALID_ID
;
1010 src
->flags
&= ~GIT_DIFF_FLAG__TO_SPLIT
;
1015 assert(delta_is_split(src
));
1017 if (best_match
->similarity
< opts
.rename_from_rewrite_threshold
)
1020 memcpy(&swap
, &tgt
->old_file
, sizeof(swap
));
1022 delta_make_rename(tgt
, src
, best_match
->similarity
);
1026 memcpy(&src
->old_file
, &swap
, sizeof(src
->old_file
));
1028 /* if we've just swapped the new element into the correct
1029 * place, clear the SPLIT flag
1031 if (tgt2src
[s
].idx
== t
&&
1032 tgt2src
[s
].similarity
>
1033 opts
.rename_from_rewrite_threshold
) {
1034 src
->status
= GIT_DELTA_RENAMED
;
1035 src
->similarity
= tgt2src
[s
].similarity
;
1036 tgt2src
[s
].similarity
= 0;
1037 src
->flags
&= ~GIT_DIFF_FLAG__TO_SPLIT
;
1040 /* otherwise, if we just overwrote a source, update mapping */
1041 else if (src2tgt
[t
].similarity
> 0 && src2tgt
[t
].idx
> t
) {
1042 /* what used to be at src t is now at src s */
1043 tgt2src
[src2tgt
[t
].idx
].idx
= s
;
1050 else if (FLAG_SET(&opts
, GIT_DIFF_FIND_COPIES
)) {
1051 if (tgt2src_copy
[t
].similarity
< opts
.copy_threshold
)
1054 /* always use best possible source for copy */
1055 best_match
= &tgt2src_copy
[t
];
1056 src
= GIT_VECTOR_GET(&diff
->deltas
, best_match
->idx
);
1058 if (delta_is_split(tgt
)) {
1059 error
= insert_delete_side_of_split(diff
, &diff
->deltas
, tgt
);
1065 if (!delta_is_split(tgt
) && !delta_is_new_only(tgt
))
1068 tgt
->status
= GIT_DELTA_COPIED
;
1069 tgt
->similarity
= best_match
->similarity
;
1071 memcpy(&tgt
->old_file
, &src
->old_file
, sizeof(tgt
->old_file
));
1072 tgt
->flags
&= ~GIT_DIFF_FLAG__TO_SPLIT
;
1079 * Actually split and delete entries as needed
1082 if (num_rewrites
> 0 || num_updates
> 0)
1083 error
= apply_splits_and_deletes(
1084 diff
, diff
->deltas
.length
- num_rewrites
,
1085 FLAG_SET(&opts
, GIT_DIFF_BREAK_REWRITES
) &&
1086 !FLAG_SET(&opts
, GIT_DIFF_BREAK_REWRITES_FOR_RENAMES_ONLY
));
1091 git__free(tgt2src_copy
);
1094 for (t
= 0; t
< num_deltas
* 2; ++t
) {
1095 if (sigcache
[t
] != NULL
)
1096 opts
.metric
->free_signature(sigcache
[t
], opts
.metric
->payload
);
1098 git__free(sigcache
);
1101 if (!given_opts
|| !given_opts
->metric
)
1102 git__free(opts
.metric
);