2 * Copyright (C) the libgit2 contributors. All rights reserved.
4 * This file is part of libgit2, distributed under the GNU GPL v2 with
5 * a Linking Exception. For full terms see the included COPYING file.
8 #include "diff_tform.h"
10 #include "git2/config.h"
11 #include "git2/blob.h"
12 #include "git2/sys/hashsig.h"
15 #include "diff_generate.h"
20 git_diff_delta
*git_diff__delta_dup(
21 const git_diff_delta
*d
, git_pool
*pool
)
23 git_diff_delta
*delta
= git__malloc(sizeof(git_diff_delta
));
27 memcpy(delta
, d
, sizeof(git_diff_delta
));
28 GIT_DIFF_FLAG__CLEAR_INTERNAL(delta
->flags
);
30 if (d
->old_file
.path
!= NULL
) {
31 delta
->old_file
.path
= git_pool_strdup(pool
, d
->old_file
.path
);
32 if (delta
->old_file
.path
== NULL
)
36 if (d
->new_file
.path
!= d
->old_file
.path
&& d
->new_file
.path
!= NULL
) {
37 delta
->new_file
.path
= git_pool_strdup(pool
, d
->new_file
.path
);
38 if (delta
->new_file
.path
== NULL
)
41 delta
->new_file
.path
= delta
->old_file
.path
;
51 git_diff_delta
*git_diff__merge_like_cgit(
52 const git_diff_delta
*a
,
53 const git_diff_delta
*b
,
58 /* Emulate C git for merging two diffs (a la 'git diff <sha>').
60 * When C git does a diff between the work dir and a tree, it actually
61 * diffs with the index but uses the workdir contents. This emulates
62 * those choices so we can emulate the type of diff.
64 * We have three file descriptions here, let's call them:
66 * f2 = a->new_file AND b->old_file
70 /* If one of the diffs is a conflict, just dup it */
71 if (b
->status
== GIT_DELTA_CONFLICTED
)
72 return git_diff__delta_dup(b
, pool
);
73 if (a
->status
== GIT_DELTA_CONFLICTED
)
74 return git_diff__delta_dup(a
, pool
);
76 /* if f2 == f3 or f2 is deleted, then just dup the 'a' diff */
77 if (b
->status
== GIT_DELTA_UNMODIFIED
|| a
->status
== GIT_DELTA_DELETED
)
78 return git_diff__delta_dup(a
, pool
);
80 /* otherwise, base this diff on the 'b' diff */
81 if ((dup
= git_diff__delta_dup(b
, pool
)) == NULL
)
84 /* If 'a' status is uninteresting, then we're done */
85 if (a
->status
== GIT_DELTA_UNMODIFIED
||
86 a
->status
== GIT_DELTA_UNTRACKED
||
87 a
->status
== GIT_DELTA_UNREADABLE
)
90 GIT_ASSERT_WITH_RETVAL(b
->status
!= GIT_DELTA_UNMODIFIED
, NULL
);
92 /* A cgit exception is that the diff of a file that is only in the
93 * index (i.e. not in HEAD nor workdir) is given as empty.
95 if (dup
->status
== GIT_DELTA_DELETED
) {
96 if (a
->status
== GIT_DELTA_ADDED
) {
97 dup
->status
= GIT_DELTA_UNMODIFIED
;
100 /* else don't overwrite DELETE status */
102 dup
->status
= a
->status
;
103 dup
->nfiles
= a
->nfiles
;
106 git_oid_cpy(&dup
->old_file
.id
, &a
->old_file
.id
);
107 dup
->old_file
.mode
= a
->old_file
.mode
;
108 dup
->old_file
.size
= a
->old_file
.size
;
109 dup
->old_file
.flags
= a
->old_file
.flags
;
115 git_diff
*onto
, const git_diff
*from
, git_diff__merge_cb cb
)
120 git_diff_delta
*delta
;
121 bool ignore_case
, reversed
;
124 GIT_ASSERT_ARG(onto
);
125 GIT_ASSERT_ARG(from
);
127 if (!from
->deltas
.length
)
130 ignore_case
= ((onto
->opts
.flags
& GIT_DIFF_IGNORE_CASE
) != 0);
131 reversed
= ((onto
->opts
.flags
& GIT_DIFF_REVERSE
) != 0);
133 if (ignore_case
!= ((from
->opts
.flags
& GIT_DIFF_IGNORE_CASE
) != 0) ||
134 reversed
!= ((from
->opts
.flags
& GIT_DIFF_REVERSE
) != 0)) {
135 git_error_set(GIT_ERROR_INVALID
,
136 "attempt to merge diffs created with conflicting options");
140 if (git_vector_init(&onto_new
, onto
->deltas
.length
, git_diff_delta__cmp
) < 0 ||
141 git_pool_init(&onto_pool
, 1) < 0)
144 for (i
= 0, j
= 0; i
< onto
->deltas
.length
|| j
< from
->deltas
.length
; ) {
145 git_diff_delta
*o
= GIT_VECTOR_GET(&onto
->deltas
, i
);
146 const git_diff_delta
*f
= GIT_VECTOR_GET(&from
->deltas
, j
);
147 int cmp
= !f
? -1 : !o
? 1 :
148 STRCMP_CASESELECT(ignore_case
, o
->old_file
.path
, f
->old_file
.path
);
151 delta
= git_diff__delta_dup(o
, &onto_pool
);
153 } else if (cmp
> 0) {
154 delta
= git_diff__delta_dup(f
, &onto_pool
);
157 const git_diff_delta
*left
= reversed
? f
: o
;
158 const git_diff_delta
*right
= reversed
? o
: f
;
160 delta
= cb(left
, right
, &onto_pool
);
165 /* the ignore rules for the target may not match the source
166 * or the result of a merged delta could be skippable...
168 if (delta
&& git_diff_delta__should_skip(&onto
->opts
, delta
)) {
173 if ((error
= !delta
? -1 : git_vector_insert(&onto_new
, delta
)) < 0)
178 git_vector_swap(&onto
->deltas
, &onto_new
);
179 git_pool_swap(&onto
->pool
, &onto_pool
);
181 if ((onto
->opts
.flags
& GIT_DIFF_REVERSE
) != 0)
182 onto
->old_src
= from
->old_src
;
184 onto
->new_src
= from
->new_src
;
186 /* prefix strings also come from old pool, so recreate those.*/
187 onto
->opts
.old_prefix
=
188 git_pool_strdup_safe(&onto
->pool
, onto
->opts
.old_prefix
);
189 onto
->opts
.new_prefix
=
190 git_pool_strdup_safe(&onto
->pool
, onto
->opts
.new_prefix
);
193 git_vector_free_deep(&onto_new
);
194 git_pool_clear(&onto_pool
);
199 int git_diff_merge(git_diff
*onto
, const git_diff
*from
)
201 return git_diff__merge(onto
, from
, git_diff__merge_like_cgit
);
204 int git_diff_find_similar__hashsig_for_file(
205 void **out
, const git_diff_file
*f
, const char *path
, void *p
)
207 git_hashsig_option_t opt
= (git_hashsig_option_t
)(intptr_t)p
;
210 return git_hashsig_create_fromfile((git_hashsig
**)out
, path
, opt
);
213 int git_diff_find_similar__hashsig_for_buf(
214 void **out
, const git_diff_file
*f
, const char *buf
, size_t len
, void *p
)
216 git_hashsig_option_t opt
= (git_hashsig_option_t
)(intptr_t)p
;
219 return git_hashsig_create((git_hashsig
**)out
, buf
, len
, opt
);
222 void git_diff_find_similar__hashsig_free(void *sig
, void *payload
)
225 git_hashsig_free(sig
);
228 int git_diff_find_similar__calc_similarity(
229 int *score
, void *siga
, void *sigb
, void *payload
)
234 error
= git_hashsig_compare(siga
, sigb
);
242 #define DEFAULT_THRESHOLD 50
243 #define DEFAULT_BREAK_REWRITE_THRESHOLD 60
244 #define DEFAULT_RENAME_LIMIT 200
246 static int normalize_find_opts(
248 git_diff_find_options
*opts
,
249 const git_diff_find_options
*given
)
251 git_config
*cfg
= NULL
;
252 git_hashsig_option_t hashsig_opts
;
254 GIT_ERROR_CHECK_VERSION(given
, GIT_DIFF_FIND_OPTIONS_VERSION
, "git_diff_find_options");
256 if (diff
->repo
!= NULL
&&
257 git_repository_config__weakptr(&cfg
, diff
->repo
) < 0)
261 memcpy(opts
, given
, sizeof(*opts
));
264 (given
->flags
& GIT_DIFF_FIND_ALL
) == GIT_DIFF_FIND_BY_CONFIG
)
268 git_config__get_string_force(cfg
, "diff.renames", "true");
271 if (!git__parse_bool(&boolval
, rule
) && !boolval
)
272 /* don't set FIND_RENAMES if bool value is false */;
273 else if (!strcasecmp(rule
, "copies") || !strcasecmp(rule
, "copy"))
274 opts
->flags
|= GIT_DIFF_FIND_RENAMES
| GIT_DIFF_FIND_COPIES
;
276 opts
->flags
|= GIT_DIFF_FIND_RENAMES
;
280 /* set default flag */
281 opts
->flags
|= GIT_DIFF_FIND_RENAMES
;
285 /* some flags imply others */
287 if (opts
->flags
& GIT_DIFF_FIND_EXACT_MATCH_ONLY
) {
288 /* if we are only looking for exact matches, then don't turn
289 * MODIFIED items into ADD/DELETE pairs because it's too picky
291 opts
->flags
&= ~(GIT_DIFF_FIND_REWRITES
| GIT_DIFF_BREAK_REWRITES
);
293 /* similarly, don't look for self-rewrites to split */
294 opts
->flags
&= ~GIT_DIFF_FIND_RENAMES_FROM_REWRITES
;
297 if (opts
->flags
& GIT_DIFF_FIND_RENAMES_FROM_REWRITES
)
298 opts
->flags
|= GIT_DIFF_FIND_RENAMES
;
300 if (opts
->flags
& GIT_DIFF_FIND_COPIES_FROM_UNMODIFIED
)
301 opts
->flags
|= GIT_DIFF_FIND_COPIES
;
303 if (opts
->flags
& GIT_DIFF_BREAK_REWRITES
)
304 opts
->flags
|= GIT_DIFF_FIND_REWRITES
;
306 #define USE_DEFAULT(X) ((X) == 0 || (X) > 100)
308 if (USE_DEFAULT(opts
->rename_threshold
))
309 opts
->rename_threshold
= DEFAULT_THRESHOLD
;
311 if (USE_DEFAULT(opts
->rename_from_rewrite_threshold
))
312 opts
->rename_from_rewrite_threshold
= DEFAULT_THRESHOLD
;
314 if (USE_DEFAULT(opts
->copy_threshold
))
315 opts
->copy_threshold
= DEFAULT_THRESHOLD
;
317 if (USE_DEFAULT(opts
->break_rewrite_threshold
))
318 opts
->break_rewrite_threshold
= DEFAULT_BREAK_REWRITE_THRESHOLD
;
322 if (!opts
->rename_limit
) {
324 opts
->rename_limit
= git_config__get_int_force(
325 cfg
, "diff.renamelimit", DEFAULT_RENAME_LIMIT
);
328 if (opts
->rename_limit
<= 0)
329 opts
->rename_limit
= DEFAULT_RENAME_LIMIT
;
332 /* assign the internal metric with whitespace flag as payload */
334 opts
->metric
= git__malloc(sizeof(git_diff_similarity_metric
));
335 GIT_ERROR_CHECK_ALLOC(opts
->metric
);
337 opts
->metric
->file_signature
= git_diff_find_similar__hashsig_for_file
;
338 opts
->metric
->buffer_signature
= git_diff_find_similar__hashsig_for_buf
;
339 opts
->metric
->free_signature
= git_diff_find_similar__hashsig_free
;
340 opts
->metric
->similarity
= git_diff_find_similar__calc_similarity
;
342 if (opts
->flags
& GIT_DIFF_FIND_IGNORE_WHITESPACE
)
343 hashsig_opts
= GIT_HASHSIG_IGNORE_WHITESPACE
;
344 else if (opts
->flags
& GIT_DIFF_FIND_DONT_IGNORE_WHITESPACE
)
345 hashsig_opts
= GIT_HASHSIG_NORMAL
;
347 hashsig_opts
= GIT_HASHSIG_SMART_WHITESPACE
;
348 hashsig_opts
|= GIT_HASHSIG_ALLOW_SMALL_FILES
;
349 opts
->metric
->payload
= (void *)hashsig_opts
;
355 static int insert_delete_side_of_split(
356 git_diff
*diff
, git_vector
*onto
, const git_diff_delta
*delta
)
358 /* make new record for DELETED side of split */
359 git_diff_delta
*deleted
= git_diff__delta_dup(delta
, &diff
->pool
);
360 GIT_ERROR_CHECK_ALLOC(deleted
);
362 deleted
->status
= GIT_DELTA_DELETED
;
364 memset(&deleted
->new_file
, 0, sizeof(deleted
->new_file
));
365 deleted
->new_file
.path
= deleted
->old_file
.path
;
366 deleted
->new_file
.flags
|= GIT_DIFF_FLAG_VALID_ID
;
368 return git_vector_insert(onto
, deleted
);
371 static int apply_splits_and_deletes(
372 git_diff
*diff
, size_t expected_size
, bool actually_split
)
374 git_vector onto
= GIT_VECTOR_INIT
;
376 git_diff_delta
*delta
;
378 if (git_vector_init(&onto
, expected_size
, git_diff_delta__cmp
) < 0)
381 /* build new delta list without TO_DELETE and splitting TO_SPLIT */
382 git_vector_foreach(&diff
->deltas
, i
, delta
) {
383 if ((delta
->flags
& GIT_DIFF_FLAG__TO_DELETE
) != 0)
386 if ((delta
->flags
& GIT_DIFF_FLAG__TO_SPLIT
) != 0 && actually_split
) {
387 delta
->similarity
= 0;
389 if (insert_delete_side_of_split(diff
, &onto
, delta
) < 0)
392 if (diff
->new_src
== GIT_ITERATOR_WORKDIR
)
393 delta
->status
= GIT_DELTA_UNTRACKED
;
395 delta
->status
= GIT_DELTA_ADDED
;
397 memset(&delta
->old_file
, 0, sizeof(delta
->old_file
));
398 delta
->old_file
.path
= delta
->new_file
.path
;
399 delta
->old_file
.flags
|= GIT_DIFF_FLAG_VALID_ID
;
402 /* clean up delta before inserting into new list */
403 GIT_DIFF_FLAG__CLEAR_INTERNAL(delta
->flags
);
405 if (delta
->status
!= GIT_DELTA_COPIED
&&
406 delta
->status
!= GIT_DELTA_RENAMED
&&
407 (delta
->status
!= GIT_DELTA_MODIFIED
|| actually_split
))
408 delta
->similarity
= 0;
410 /* insert into new list */
411 if (git_vector_insert(&onto
, delta
) < 0)
415 /* cannot return an error past this point */
417 /* free deltas from old list that didn't make it to the new one */
418 git_vector_foreach(&diff
->deltas
, i
, delta
) {
419 if ((delta
->flags
& GIT_DIFF_FLAG__TO_DELETE
) != 0)
423 /* swap new delta list into place */
424 git_vector_swap(&diff
->deltas
, &onto
);
425 git_vector_free(&onto
);
426 git_vector_sort(&diff
->deltas
);
431 git_vector_free_deep(&onto
);
436 GIT_INLINE(git_diff_file
*) similarity_get_file(git_diff
*diff
, size_t idx
)
438 git_diff_delta
*delta
= git_vector_get(&diff
->deltas
, idx
/ 2);
439 return (idx
& 1) ? &delta
->new_file
: &delta
->old_file
;
445 git_repository
*repo
;
448 git_odb_object
*odb_obj
;
452 static int similarity_init(
453 similarity_info
*info
, git_diff
*diff
, size_t file_idx
)
455 info
->idx
= file_idx
;
456 info
->src
= (file_idx
& 1) ? diff
->new_src
: diff
->old_src
;
457 info
->repo
= diff
->repo
;
458 info
->file
= similarity_get_file(diff
, file_idx
);
459 info
->odb_obj
= NULL
;
461 git_buf_init(&info
->data
, 0);
463 if (info
->file
->size
> 0 || info
->src
== GIT_ITERATOR_WORKDIR
)
466 return git_diff_file__resolve_zero_size(
467 info
->file
, &info
->odb_obj
, info
->repo
);
470 static int similarity_sig(
471 similarity_info
*info
,
472 const git_diff_find_options
*opts
,
476 git_diff_file
*file
= info
->file
;
478 if (info
->src
== GIT_ITERATOR_WORKDIR
) {
479 if ((error
= git_repository_workdir_path(
480 &info
->data
, info
->repo
, file
->path
)) < 0)
483 /* if path is not a regular file, just skip this item */
484 if (!git_path_isfile(info
->data
.ptr
))
487 /* TODO: apply wd-to-odb filters to file data if necessary */
489 error
= opts
->metric
->file_signature(
490 &cache
[info
->idx
], info
->file
,
491 info
->data
.ptr
, opts
->metric
->payload
);
493 /* if we didn't initially know the size, we might have an odb_obj
494 * around from earlier, so convert that, otherwise load the blob now
496 if (info
->odb_obj
!= NULL
)
497 error
= git_object__from_odb_object(
498 (git_object
**)&info
->blob
, info
->repo
,
499 info
->odb_obj
, GIT_OBJECT_BLOB
);
501 error
= git_blob_lookup(&info
->blob
, info
->repo
, &file
->id
);
504 /* if lookup fails, just skip this item in similarity calc */
509 /* index size may not be actual blob size if filtered */
510 if (file
->size
!= git_blob_rawsize(info
->blob
))
511 file
->size
= git_blob_rawsize(info
->blob
);
513 sz
= git__is_sizet(file
->size
) ? (size_t)file
->size
: (size_t)-1;
515 error
= opts
->metric
->buffer_signature(
516 &cache
[info
->idx
], info
->file
,
517 git_blob_rawcontent(info
->blob
), sz
, opts
->metric
->payload
);
524 static void similarity_unload(similarity_info
*info
)
527 git_odb_object_free(info
->odb_obj
);
530 git_blob_free(info
->blob
);
532 git_buf_dispose(&info
->data
);
535 #define FLAG_SET(opts,flag_name) (((opts)->flags & flag_name) != 0)
537 /* - score < 0 means files cannot be compared
538 * - score >= 100 means files are exact match
539 * - score == 0 means files are completely different
541 static int similarity_measure(
544 const git_diff_find_options
*opts
,
549 git_diff_file
*a_file
= similarity_get_file(diff
, a_idx
);
550 git_diff_file
*b_file
= similarity_get_file(diff
, b_idx
);
551 bool exact_match
= FLAG_SET(opts
, GIT_DIFF_FIND_EXACT_MATCH_ONLY
);
553 similarity_info a_info
, b_info
;
557 /* don't try to compare things that aren't files */
558 if (!GIT_MODE_ISBLOB(a_file
->mode
) || !GIT_MODE_ISBLOB(b_file
->mode
))
561 /* if exact match is requested, force calculation of missing OIDs now */
563 if (git_oid_is_zero(&a_file
->id
) &&
564 diff
->old_src
== GIT_ITERATOR_WORKDIR
&&
565 !git_diff__oid_for_file(&a_file
->id
,
566 diff
, a_file
->path
, a_file
->mode
, a_file
->size
))
567 a_file
->flags
|= GIT_DIFF_FLAG_VALID_ID
;
569 if (git_oid_is_zero(&b_file
->id
) &&
570 diff
->new_src
== GIT_ITERATOR_WORKDIR
&&
571 !git_diff__oid_for_file(&b_file
->id
,
572 diff
, b_file
->path
, b_file
->mode
, b_file
->size
))
573 b_file
->flags
|= GIT_DIFF_FLAG_VALID_ID
;
576 /* check OID match as a quick test */
577 if (git_oid__cmp(&a_file
->id
, &b_file
->id
) == 0) {
582 /* don't calculate signatures if we are doing exact match */
588 memset(&a_info
, 0, sizeof(a_info
));
589 memset(&b_info
, 0, sizeof(b_info
));
591 /* set up similarity data (will try to update missing file sizes) */
592 if (!cache
[a_idx
] && (error
= similarity_init(&a_info
, diff
, a_idx
)) < 0)
594 if (!cache
[b_idx
] && (error
= similarity_init(&b_info
, diff
, b_idx
)) < 0)
597 /* check if file sizes are nowhere near each other */
598 if (a_file
->size
> 127 &&
599 b_file
->size
> 127 &&
600 (a_file
->size
> (b_file
->size
<< 3) ||
601 b_file
->size
> (a_file
->size
<< 3)))
604 /* update signature cache if needed */
606 if ((error
= similarity_sig(&a_info
, opts
, cache
)) < 0)
610 if ((error
= similarity_sig(&b_info
, opts
, cache
)) < 0)
614 /* calculate similarity provided that the metric choose to process
615 * both the a and b files (some may not if file is too big, etc).
617 if (cache
[a_idx
] && cache
[b_idx
])
618 error
= opts
->metric
->similarity(
619 score
, cache
[a_idx
], cache
[b_idx
], opts
->metric
->payload
);
622 similarity_unload(&a_info
);
623 similarity_unload(&b_info
);
628 static int calc_self_similarity(
630 const git_diff_find_options
*opts
,
634 int error
, similarity
= -1;
635 git_diff_delta
*delta
= GIT_VECTOR_GET(&diff
->deltas
, delta_idx
);
637 if ((delta
->flags
& GIT_DIFF_FLAG__HAS_SELF_SIMILARITY
) != 0)
640 error
= similarity_measure(
641 &similarity
, diff
, opts
, cache
, 2 * delta_idx
, 2 * delta_idx
+ 1);
645 if (similarity
>= 0) {
646 delta
->similarity
= (uint16_t)similarity
;
647 delta
->flags
|= GIT_DIFF_FLAG__HAS_SELF_SIMILARITY
;
653 static bool is_rename_target(
655 const git_diff_find_options
*opts
,
659 git_diff_delta
*delta
= GIT_VECTOR_GET(&diff
->deltas
, delta_idx
);
661 /* skip things that aren't plain blobs */
662 if (!GIT_MODE_ISBLOB(delta
->new_file
.mode
))
665 /* only consider ADDED, RENAMED, COPIED, and split MODIFIED as
666 * targets; maybe include UNTRACKED if requested.
668 switch (delta
->status
) {
669 case GIT_DELTA_UNMODIFIED
:
670 case GIT_DELTA_DELETED
:
671 case GIT_DELTA_IGNORED
:
672 case GIT_DELTA_CONFLICTED
:
675 case GIT_DELTA_MODIFIED
:
676 if (!FLAG_SET(opts
, GIT_DIFF_FIND_REWRITES
) &&
677 !FLAG_SET(opts
, GIT_DIFF_FIND_RENAMES_FROM_REWRITES
))
680 if (calc_self_similarity(diff
, opts
, delta_idx
, cache
) < 0)
683 if (FLAG_SET(opts
, GIT_DIFF_BREAK_REWRITES
) &&
684 delta
->similarity
< opts
->break_rewrite_threshold
) {
685 delta
->flags
|= GIT_DIFF_FLAG__TO_SPLIT
;
688 if (FLAG_SET(opts
, GIT_DIFF_FIND_RENAMES_FROM_REWRITES
) &&
689 delta
->similarity
< opts
->rename_from_rewrite_threshold
) {
690 delta
->flags
|= GIT_DIFF_FLAG__TO_SPLIT
;
696 case GIT_DELTA_UNTRACKED
:
697 if (!FLAG_SET(opts
, GIT_DIFF_FIND_FOR_UNTRACKED
))
701 default: /* all other status values should be checked */
705 delta
->flags
|= GIT_DIFF_FLAG__IS_RENAME_TARGET
;
709 static bool is_rename_source(
711 const git_diff_find_options
*opts
,
715 git_diff_delta
*delta
= GIT_VECTOR_GET(&diff
->deltas
, delta_idx
);
717 /* skip things that aren't blobs */
718 if (!GIT_MODE_ISBLOB(delta
->old_file
.mode
))
721 switch (delta
->status
) {
722 case GIT_DELTA_ADDED
:
723 case GIT_DELTA_UNTRACKED
:
724 case GIT_DELTA_UNREADABLE
:
725 case GIT_DELTA_IGNORED
:
726 case GIT_DELTA_CONFLICTED
:
729 case GIT_DELTA_DELETED
:
730 case GIT_DELTA_TYPECHANGE
:
733 case GIT_DELTA_UNMODIFIED
:
734 if (!FLAG_SET(opts
, GIT_DIFF_FIND_COPIES_FROM_UNMODIFIED
))
736 if (FLAG_SET(opts
, GIT_DIFF_FIND_REMOVE_UNMODIFIED
))
737 delta
->flags
|= GIT_DIFF_FLAG__TO_DELETE
;
740 default: /* MODIFIED, RENAMED, COPIED */
741 /* if we're finding copies, this could be a source */
742 if (FLAG_SET(opts
, GIT_DIFF_FIND_COPIES
))
745 /* otherwise, this is only a source if we can split it */
746 if (!FLAG_SET(opts
, GIT_DIFF_FIND_REWRITES
) &&
747 !FLAG_SET(opts
, GIT_DIFF_FIND_RENAMES_FROM_REWRITES
))
750 if (calc_self_similarity(diff
, opts
, delta_idx
, cache
) < 0)
753 if (FLAG_SET(opts
, GIT_DIFF_BREAK_REWRITES
) &&
754 delta
->similarity
< opts
->break_rewrite_threshold
) {
755 delta
->flags
|= GIT_DIFF_FLAG__TO_SPLIT
;
759 if (FLAG_SET(opts
, GIT_DIFF_FIND_RENAMES_FROM_REWRITES
) &&
760 delta
->similarity
< opts
->rename_from_rewrite_threshold
)
766 delta
->flags
|= GIT_DIFF_FLAG__IS_RENAME_SOURCE
;
770 GIT_INLINE(bool) delta_is_split(git_diff_delta
*delta
)
772 return (delta
->status
== GIT_DELTA_TYPECHANGE
||
773 (delta
->flags
& GIT_DIFF_FLAG__TO_SPLIT
) != 0);
776 GIT_INLINE(bool) delta_is_new_only(git_diff_delta
*delta
)
778 return (delta
->status
== GIT_DELTA_ADDED
||
779 delta
->status
== GIT_DELTA_UNTRACKED
||
780 delta
->status
== GIT_DELTA_UNREADABLE
||
781 delta
->status
== GIT_DELTA_IGNORED
);
784 GIT_INLINE(void) delta_make_rename(
785 git_diff_delta
*to
, const git_diff_delta
*from
, uint16_t similarity
)
787 to
->status
= GIT_DELTA_RENAMED
;
788 to
->similarity
= similarity
;
790 memcpy(&to
->old_file
, &from
->old_file
, sizeof(to
->old_file
));
791 to
->flags
&= ~GIT_DIFF_FLAG__TO_SPLIT
;
799 int git_diff_find_similar(
801 const git_diff_find_options
*given_opts
)
804 int error
= 0, result
;
806 git_diff_delta
*src
, *tgt
;
807 git_diff_find_options opts
= GIT_DIFF_FIND_OPTIONS_INIT
;
808 size_t num_deltas
, num_srcs
= 0, num_tgts
= 0;
809 size_t tried_srcs
= 0, tried_tgts
= 0;
810 size_t num_rewrites
= 0, num_updates
= 0, num_bumped
= 0;
811 size_t sigcache_size
;
812 void **sigcache
= NULL
; /* cache of similarity metric file signatures */
813 diff_find_match
*tgt2src
= NULL
;
814 diff_find_match
*src2tgt
= NULL
;
815 diff_find_match
*tgt2src_copy
= NULL
;
816 diff_find_match
*best_match
;
819 GIT_ASSERT_ARG(diff
);
821 if ((error
= normalize_find_opts(diff
, &opts
, given_opts
)) < 0)
824 num_deltas
= diff
->deltas
.length
;
826 /* TODO: maybe abort if deltas.length > rename_limit ??? */
827 if (!num_deltas
|| !git__is_uint32(num_deltas
))
830 /* No flags set; nothing to do */
831 if ((opts
.flags
& GIT_DIFF_FIND_ALL
) == 0)
834 GIT_ERROR_CHECK_ALLOC_MULTIPLY(&sigcache_size
, num_deltas
, 2);
835 sigcache
= git__calloc(sigcache_size
, sizeof(void *));
836 GIT_ERROR_CHECK_ALLOC(sigcache
);
838 /* Label rename sources and targets
840 * This will also set self-similarity scores for MODIFIED files and
841 * mark them for splitting if break-rewrites is enabled
843 git_vector_foreach(&diff
->deltas
, t
, tgt
) {
844 if (is_rename_source(diff
, &opts
, t
, sigcache
))
847 if (is_rename_target(diff
, &opts
, t
, sigcache
))
850 if ((tgt
->flags
& GIT_DIFF_FLAG__TO_SPLIT
) != 0)
854 /* if there are no candidate srcs or tgts, we're done */
855 if (!num_srcs
|| !num_tgts
)
858 src2tgt
= git__calloc(num_deltas
, sizeof(diff_find_match
));
859 GIT_ERROR_CHECK_ALLOC(src2tgt
);
860 tgt2src
= git__calloc(num_deltas
, sizeof(diff_find_match
));
861 GIT_ERROR_CHECK_ALLOC(tgt2src
);
863 if (FLAG_SET(&opts
, GIT_DIFF_FIND_COPIES
)) {
864 tgt2src_copy
= git__calloc(num_deltas
, sizeof(diff_find_match
));
865 GIT_ERROR_CHECK_ALLOC(tgt2src_copy
);
869 * Find best-fit matches for rename / copy candidates
873 tried_tgts
= num_bumped
= 0;
875 git_vector_foreach(&diff
->deltas
, t
, tgt
) {
876 /* skip things that are not rename targets */
877 if ((tgt
->flags
& GIT_DIFF_FLAG__IS_RENAME_TARGET
) == 0)
882 git_vector_foreach(&diff
->deltas
, s
, src
) {
883 /* skip things that are not rename sources */
884 if ((src
->flags
& GIT_DIFF_FLAG__IS_RENAME_SOURCE
) == 0)
887 /* calculate similarity for this pair and find best match */
889 result
= -1; /* don't measure self-similarity here */
890 else if ((error
= similarity_measure(
891 &result
, diff
, &opts
, sigcache
, 2 * s
, 2 * t
+ 1)) < 0)
896 similarity
= (uint16_t)result
;
898 /* is this a better rename? */
899 if (tgt2src
[t
].similarity
< similarity
&&
900 src2tgt
[s
].similarity
< similarity
)
902 /* eject old mapping */
903 if (src2tgt
[s
].similarity
> 0) {
904 tgt2src
[src2tgt
[s
].idx
].similarity
= 0;
907 if (tgt2src
[t
].similarity
> 0) {
908 src2tgt
[tgt2src
[t
].idx
].similarity
= 0;
912 /* write new mapping */
914 tgt2src
[t
].similarity
= similarity
;
916 src2tgt
[s
].similarity
= similarity
;
919 /* keep best absolute match for copies */
920 if (tgt2src_copy
!= NULL
&&
921 tgt2src_copy
[t
].similarity
< similarity
)
923 tgt2src_copy
[t
].idx
= s
;
924 tgt2src_copy
[t
].similarity
= similarity
;
927 if (++tried_srcs
>= num_srcs
)
930 /* cap on maximum targets we'll examine (per "tgt" file) */
931 if (tried_srcs
> opts
.rename_limit
)
935 if (++tried_tgts
>= num_tgts
)
939 if (num_bumped
> 0) /* try again if we bumped some items */
940 goto find_best_matches
;
943 * Rewrite the diffs with renames / copies
946 git_vector_foreach(&diff
->deltas
, t
, tgt
) {
947 /* skip things that are not rename targets */
948 if ((tgt
->flags
& GIT_DIFF_FLAG__IS_RENAME_TARGET
) == 0)
951 /* check if this delta was the target of a similarity */
952 if (tgt2src
[t
].similarity
)
953 best_match
= &tgt2src
[t
];
954 else if (tgt2src_copy
&& tgt2src_copy
[t
].similarity
)
955 best_match
= &tgt2src_copy
[t
];
960 src
= GIT_VECTOR_GET(&diff
->deltas
, s
);
962 /* possible scenarios:
963 * 1. from DELETE to ADD/UNTRACK/IGNORE = RENAME
964 * 2. from DELETE to SPLIT/TYPECHANGE = RENAME + DELETE
965 * 3. from SPLIT/TYPECHANGE to ADD/UNTRACK/IGNORE = ADD + RENAME
966 * 4. from SPLIT/TYPECHANGE to SPLIT/TYPECHANGE = RENAME + SPLIT
967 * 5. from OTHER to ADD/UNTRACK/IGNORE = OTHER + COPY
970 if (src
->status
== GIT_DELTA_DELETED
) {
972 if (delta_is_new_only(tgt
)) {
974 if (best_match
->similarity
< opts
.rename_threshold
)
977 delta_make_rename(tgt
, src
, best_match
->similarity
);
979 src
->flags
|= GIT_DIFF_FLAG__TO_DELETE
;
982 GIT_ASSERT(delta_is_split(tgt
));
984 if (best_match
->similarity
< opts
.rename_from_rewrite_threshold
)
987 memcpy(&swap
, &tgt
->old_file
, sizeof(swap
));
989 delta_make_rename(tgt
, src
, best_match
->similarity
);
992 GIT_ASSERT(src
->status
== GIT_DELTA_DELETED
);
993 memcpy(&src
->old_file
, &swap
, sizeof(src
->old_file
));
994 memset(&src
->new_file
, 0, sizeof(src
->new_file
));
995 src
->new_file
.path
= src
->old_file
.path
;
996 src
->new_file
.flags
|= GIT_DIFF_FLAG_VALID_ID
;
1000 if (src2tgt
[t
].similarity
> 0 && src2tgt
[t
].idx
> t
) {
1001 /* what used to be at src t is now at src s */
1002 tgt2src
[src2tgt
[t
].idx
].idx
= s
;
1007 else if (delta_is_split(src
)) {
1009 if (delta_is_new_only(tgt
)) {
1011 if (best_match
->similarity
< opts
.rename_threshold
)
1014 delta_make_rename(tgt
, src
, best_match
->similarity
);
1016 src
->status
= (diff
->new_src
== GIT_ITERATOR_WORKDIR
) ?
1017 GIT_DELTA_UNTRACKED
: GIT_DELTA_ADDED
;
1019 memset(&src
->old_file
, 0, sizeof(src
->old_file
));
1020 src
->old_file
.path
= src
->new_file
.path
;
1021 src
->old_file
.flags
|= GIT_DIFF_FLAG_VALID_ID
;
1023 src
->flags
&= ~GIT_DIFF_FLAG__TO_SPLIT
;
1028 GIT_ASSERT(delta_is_split(src
));
1030 if (best_match
->similarity
< opts
.rename_from_rewrite_threshold
)
1033 memcpy(&swap
, &tgt
->old_file
, sizeof(swap
));
1035 delta_make_rename(tgt
, src
, best_match
->similarity
);
1039 memcpy(&src
->old_file
, &swap
, sizeof(src
->old_file
));
1041 /* if we've just swapped the new element into the correct
1042 * place, clear the SPLIT and RENAME_TARGET flags
1044 if (tgt2src
[s
].idx
== t
&&
1045 tgt2src
[s
].similarity
>
1046 opts
.rename_from_rewrite_threshold
) {
1047 src
->status
= GIT_DELTA_RENAMED
;
1048 src
->similarity
= tgt2src
[s
].similarity
;
1049 tgt2src
[s
].similarity
= 0;
1050 src
->flags
&= ~(GIT_DIFF_FLAG__TO_SPLIT
| GIT_DIFF_FLAG__IS_RENAME_TARGET
);
1053 /* otherwise, if we just overwrote a source, update mapping */
1054 else if (src2tgt
[t
].similarity
> 0 && src2tgt
[t
].idx
> t
) {
1055 /* what used to be at src t is now at src s */
1056 tgt2src
[src2tgt
[t
].idx
].idx
= s
;
1063 else if (FLAG_SET(&opts
, GIT_DIFF_FIND_COPIES
)) {
1064 if (tgt2src_copy
[t
].similarity
< opts
.copy_threshold
)
1067 /* always use best possible source for copy */
1068 best_match
= &tgt2src_copy
[t
];
1069 src
= GIT_VECTOR_GET(&diff
->deltas
, best_match
->idx
);
1071 if (delta_is_split(tgt
)) {
1072 error
= insert_delete_side_of_split(diff
, &diff
->deltas
, tgt
);
1078 if (!delta_is_split(tgt
) && !delta_is_new_only(tgt
))
1081 tgt
->status
= GIT_DELTA_COPIED
;
1082 tgt
->similarity
= best_match
->similarity
;
1084 memcpy(&tgt
->old_file
, &src
->old_file
, sizeof(tgt
->old_file
));
1085 tgt
->flags
&= ~GIT_DIFF_FLAG__TO_SPLIT
;
1092 * Actually split and delete entries as needed
1095 if (num_rewrites
> 0 || num_updates
> 0)
1096 error
= apply_splits_and_deletes(
1097 diff
, diff
->deltas
.length
- num_rewrites
,
1098 FLAG_SET(&opts
, GIT_DIFF_BREAK_REWRITES
) &&
1099 !FLAG_SET(&opts
, GIT_DIFF_BREAK_REWRITES_FOR_RENAMES_ONLY
));
1104 git__free(tgt2src_copy
);
1107 for (t
= 0; t
< num_deltas
* 2; ++t
) {
1108 if (sigcache
[t
] != NULL
)
1109 opts
.metric
->free_signature(sigcache
[t
], opts
.metric
->payload
);
1111 git__free(sigcache
);
1114 if (!given_opts
|| !given_opts
->metric
)
1115 git__free(opts
.metric
);