]> git.proxmox.com Git - libgit2.git/blob - src/diff_tform.c
Merge pull request #3097 from libgit2/cmn/submodule-config-state
[libgit2.git] / src / diff_tform.c
1 /*
2 * Copyright (C) the libgit2 contributors. All rights reserved.
3 *
4 * This file is part of libgit2, distributed under the GNU GPL v2 with
5 * a Linking Exception. For full terms see the included COPYING file.
6 */
7 #include "common.h"
8
9 #include "git2/config.h"
10 #include "git2/blob.h"
11 #include "git2/sys/hashsig.h"
12
13 #include "diff.h"
14 #include "path.h"
15 #include "fileops.h"
16 #include "config.h"
17
18 git_diff_delta *git_diff__delta_dup(
19 const git_diff_delta *d, git_pool *pool)
20 {
21 git_diff_delta *delta = git__malloc(sizeof(git_diff_delta));
22 if (!delta)
23 return NULL;
24
25 memcpy(delta, d, sizeof(git_diff_delta));
26 GIT_DIFF_FLAG__CLEAR_INTERNAL(delta->flags);
27
28 if (d->old_file.path != NULL) {
29 delta->old_file.path = git_pool_strdup(pool, d->old_file.path);
30 if (delta->old_file.path == NULL)
31 goto fail;
32 }
33
34 if (d->new_file.path != d->old_file.path && d->new_file.path != NULL) {
35 delta->new_file.path = git_pool_strdup(pool, d->new_file.path);
36 if (delta->new_file.path == NULL)
37 goto fail;
38 } else {
39 delta->new_file.path = delta->old_file.path;
40 }
41
42 return delta;
43
44 fail:
45 git__free(delta);
46 return NULL;
47 }
48
49 git_diff_delta *git_diff__merge_like_cgit(
50 const git_diff_delta *a,
51 const git_diff_delta *b,
52 git_pool *pool)
53 {
54 git_diff_delta *dup;
55
56 /* Emulate C git for merging two diffs (a la 'git diff <sha>').
57 *
58 * When C git does a diff between the work dir and a tree, it actually
59 * diffs with the index but uses the workdir contents. This emulates
60 * those choices so we can emulate the type of diff.
61 *
62 * We have three file descriptions here, let's call them:
63 * f1 = a->old_file
64 * f2 = a->new_file AND b->old_file
65 * f3 = b->new_file
66 */
67
68 /* If one of the diffs is a conflict, just dup it */
69 if (b->status == GIT_DELTA_CONFLICTED)
70 return git_diff__delta_dup(b, pool);
71 if (a->status == GIT_DELTA_CONFLICTED)
72 return git_diff__delta_dup(a, pool);
73
74 /* if f2 == f3 or f2 is deleted, then just dup the 'a' diff */
75 if (b->status == GIT_DELTA_UNMODIFIED || a->status == GIT_DELTA_DELETED)
76 return git_diff__delta_dup(a, pool);
77
78 /* otherwise, base this diff on the 'b' diff */
79 if ((dup = git_diff__delta_dup(b, pool)) == NULL)
80 return NULL;
81
82 /* If 'a' status is uninteresting, then we're done */
83 if (a->status == GIT_DELTA_UNMODIFIED ||
84 a->status == GIT_DELTA_UNTRACKED ||
85 a->status == GIT_DELTA_UNREADABLE)
86 return dup;
87
88 assert(b->status != GIT_DELTA_UNMODIFIED);
89
90 /* A cgit exception is that the diff of a file that is only in the
91 * index (i.e. not in HEAD nor workdir) is given as empty.
92 */
93 if (dup->status == GIT_DELTA_DELETED) {
94 if (a->status == GIT_DELTA_ADDED) {
95 dup->status = GIT_DELTA_UNMODIFIED;
96 dup->nfiles = 2;
97 }
98 /* else don't overwrite DELETE status */
99 } else {
100 dup->status = a->status;
101 dup->nfiles = a->nfiles;
102 }
103
104 git_oid_cpy(&dup->old_file.id, &a->old_file.id);
105 dup->old_file.mode = a->old_file.mode;
106 dup->old_file.size = a->old_file.size;
107 dup->old_file.flags = a->old_file.flags;
108
109 return dup;
110 }
111
112 int git_diff__merge(
113 git_diff *onto, const git_diff *from, git_diff__merge_cb cb)
114 {
115 int error = 0;
116 git_pool onto_pool;
117 git_vector onto_new;
118 git_diff_delta *delta;
119 bool ignore_case, reversed;
120 unsigned int i, j;
121
122 assert(onto && from);
123
124 if (!from->deltas.length)
125 return 0;
126
127 ignore_case = ((onto->opts.flags & GIT_DIFF_IGNORE_CASE) != 0);
128 reversed = ((onto->opts.flags & GIT_DIFF_REVERSE) != 0);
129
130 if (ignore_case != ((from->opts.flags & GIT_DIFF_IGNORE_CASE) != 0) ||
131 reversed != ((from->opts.flags & GIT_DIFF_REVERSE) != 0)) {
132 giterr_set(GITERR_INVALID,
133 "Attempt to merge diffs created with conflicting options");
134 return -1;
135 }
136
137 if (git_vector_init(
138 &onto_new, onto->deltas.length, git_diff_delta__cmp) < 0 ||
139 git_pool_init(&onto_pool, 1, 0) < 0)
140 return -1;
141
142 for (i = 0, j = 0; i < onto->deltas.length || j < from->deltas.length; ) {
143 git_diff_delta *o = GIT_VECTOR_GET(&onto->deltas, i);
144 const git_diff_delta *f = GIT_VECTOR_GET(&from->deltas, j);
145 int cmp = !f ? -1 : !o ? 1 :
146 STRCMP_CASESELECT(ignore_case, o->old_file.path, f->old_file.path);
147
148 if (cmp < 0) {
149 delta = git_diff__delta_dup(o, &onto_pool);
150 i++;
151 } else if (cmp > 0) {
152 delta = git_diff__delta_dup(f, &onto_pool);
153 j++;
154 } else {
155 const git_diff_delta *left = reversed ? f : o;
156 const git_diff_delta *right = reversed ? o : f;
157
158 delta = cb(left, right, &onto_pool);
159 i++;
160 j++;
161 }
162
163 /* the ignore rules for the target may not match the source
164 * or the result of a merged delta could be skippable...
165 */
166 if (delta && git_diff_delta__should_skip(&onto->opts, delta)) {
167 git__free(delta);
168 continue;
169 }
170
171 if ((error = !delta ? -1 : git_vector_insert(&onto_new, delta)) < 0)
172 break;
173 }
174
175 if (!error) {
176 git_vector_swap(&onto->deltas, &onto_new);
177 git_pool_swap(&onto->pool, &onto_pool);
178
179 if ((onto->opts.flags & GIT_DIFF_REVERSE) != 0)
180 onto->old_src = from->old_src;
181 else
182 onto->new_src = from->new_src;
183
184 /* prefix strings also come from old pool, so recreate those.*/
185 onto->opts.old_prefix =
186 git_pool_strdup_safe(&onto->pool, onto->opts.old_prefix);
187 onto->opts.new_prefix =
188 git_pool_strdup_safe(&onto->pool, onto->opts.new_prefix);
189 }
190
191 git_vector_free_deep(&onto_new);
192 git_pool_clear(&onto_pool);
193
194 return error;
195 }
196
197 int git_diff_merge(git_diff *onto, const git_diff *from)
198 {
199 return git_diff__merge(onto, from, git_diff__merge_like_cgit);
200 }
201
202 int git_diff_find_similar__hashsig_for_file(
203 void **out, const git_diff_file *f, const char *path, void *p)
204 {
205 git_hashsig_option_t opt = (git_hashsig_option_t)(intptr_t)p;
206
207 GIT_UNUSED(f);
208 return git_hashsig_create_fromfile((git_hashsig **)out, path, opt);
209 }
210
211 int git_diff_find_similar__hashsig_for_buf(
212 void **out, const git_diff_file *f, const char *buf, size_t len, void *p)
213 {
214 git_hashsig_option_t opt = (git_hashsig_option_t)(intptr_t)p;
215
216 GIT_UNUSED(f);
217 return git_hashsig_create((git_hashsig **)out, buf, len, opt);
218 }
219
220 void git_diff_find_similar__hashsig_free(void *sig, void *payload)
221 {
222 GIT_UNUSED(payload);
223 git_hashsig_free(sig);
224 }
225
226 int git_diff_find_similar__calc_similarity(
227 int *score, void *siga, void *sigb, void *payload)
228 {
229 int error;
230
231 GIT_UNUSED(payload);
232 error = git_hashsig_compare(siga, sigb);
233 if (error < 0)
234 return error;
235
236 *score = error;
237 return 0;
238 }
239
240 #define DEFAULT_THRESHOLD 50
241 #define DEFAULT_BREAK_REWRITE_THRESHOLD 60
242 #define DEFAULT_RENAME_LIMIT 200
243
244 static int normalize_find_opts(
245 git_diff *diff,
246 git_diff_find_options *opts,
247 const git_diff_find_options *given)
248 {
249 git_config *cfg = NULL;
250 git_hashsig_option_t hashsig_opts;
251
252 GITERR_CHECK_VERSION(given, GIT_DIFF_FIND_OPTIONS_VERSION, "git_diff_find_options");
253
254 if (diff->repo != NULL &&
255 git_repository_config__weakptr(&cfg, diff->repo) < 0)
256 return -1;
257
258 if (given)
259 memcpy(opts, given, sizeof(*opts));
260
261 if (!given ||
262 (given->flags & GIT_DIFF_FIND_ALL) == GIT_DIFF_FIND_BY_CONFIG)
263 {
264 char *rule =
265 git_config__get_string_force(cfg, "diff.renames", "true");
266 int boolval;
267
268 if (!git__parse_bool(&boolval, rule) && !boolval)
269 /* don't set FIND_RENAMES if bool value is false */;
270 else if (!strcasecmp(rule, "copies") || !strcasecmp(rule, "copy"))
271 opts->flags |= GIT_DIFF_FIND_RENAMES | GIT_DIFF_FIND_COPIES;
272 else
273 opts->flags |= GIT_DIFF_FIND_RENAMES;
274
275 git__free(rule);
276 }
277
278 /* some flags imply others */
279
280 if (opts->flags & GIT_DIFF_FIND_EXACT_MATCH_ONLY) {
281 /* if we are only looking for exact matches, then don't turn
282 * MODIFIED items into ADD/DELETE pairs because it's too picky
283 */
284 opts->flags &= ~(GIT_DIFF_FIND_REWRITES | GIT_DIFF_BREAK_REWRITES);
285
286 /* similarly, don't look for self-rewrites to split */
287 opts->flags &= ~GIT_DIFF_FIND_RENAMES_FROM_REWRITES;
288 }
289
290 if (opts->flags & GIT_DIFF_FIND_RENAMES_FROM_REWRITES)
291 opts->flags |= GIT_DIFF_FIND_RENAMES;
292
293 if (opts->flags & GIT_DIFF_FIND_COPIES_FROM_UNMODIFIED)
294 opts->flags |= GIT_DIFF_FIND_COPIES;
295
296 if (opts->flags & GIT_DIFF_BREAK_REWRITES)
297 opts->flags |= GIT_DIFF_FIND_REWRITES;
298
299 #define USE_DEFAULT(X) ((X) == 0 || (X) > 100)
300
301 if (USE_DEFAULT(opts->rename_threshold))
302 opts->rename_threshold = DEFAULT_THRESHOLD;
303
304 if (USE_DEFAULT(opts->rename_from_rewrite_threshold))
305 opts->rename_from_rewrite_threshold = DEFAULT_THRESHOLD;
306
307 if (USE_DEFAULT(opts->copy_threshold))
308 opts->copy_threshold = DEFAULT_THRESHOLD;
309
310 if (USE_DEFAULT(opts->break_rewrite_threshold))
311 opts->break_rewrite_threshold = DEFAULT_BREAK_REWRITE_THRESHOLD;
312
313 #undef USE_DEFAULT
314
315 if (!opts->rename_limit) {
316 opts->rename_limit = git_config__get_int_force(
317 cfg, "diff.renamelimit", DEFAULT_RENAME_LIMIT);
318
319 if (opts->rename_limit <= 0)
320 opts->rename_limit = DEFAULT_RENAME_LIMIT;
321 }
322
323 /* assign the internal metric with whitespace flag as payload */
324 if (!opts->metric) {
325 opts->metric = git__malloc(sizeof(git_diff_similarity_metric));
326 GITERR_CHECK_ALLOC(opts->metric);
327
328 opts->metric->file_signature = git_diff_find_similar__hashsig_for_file;
329 opts->metric->buffer_signature = git_diff_find_similar__hashsig_for_buf;
330 opts->metric->free_signature = git_diff_find_similar__hashsig_free;
331 opts->metric->similarity = git_diff_find_similar__calc_similarity;
332
333 if (opts->flags & GIT_DIFF_FIND_IGNORE_WHITESPACE)
334 hashsig_opts = GIT_HASHSIG_IGNORE_WHITESPACE;
335 else if (opts->flags & GIT_DIFF_FIND_DONT_IGNORE_WHITESPACE)
336 hashsig_opts = GIT_HASHSIG_NORMAL;
337 else
338 hashsig_opts = GIT_HASHSIG_SMART_WHITESPACE;
339 hashsig_opts |= GIT_HASHSIG_ALLOW_SMALL_FILES;
340 opts->metric->payload = (void *)hashsig_opts;
341 }
342
343 return 0;
344 }
345
346 static int insert_delete_side_of_split(
347 git_diff *diff, git_vector *onto, const git_diff_delta *delta)
348 {
349 /* make new record for DELETED side of split */
350 git_diff_delta *deleted = git_diff__delta_dup(delta, &diff->pool);
351 GITERR_CHECK_ALLOC(deleted);
352
353 deleted->status = GIT_DELTA_DELETED;
354 deleted->nfiles = 1;
355 memset(&deleted->new_file, 0, sizeof(deleted->new_file));
356 deleted->new_file.path = deleted->old_file.path;
357 deleted->new_file.flags |= GIT_DIFF_FLAG_VALID_ID;
358
359 return git_vector_insert(onto, deleted);
360 }
361
362 static int apply_splits_and_deletes(
363 git_diff *diff, size_t expected_size, bool actually_split)
364 {
365 git_vector onto = GIT_VECTOR_INIT;
366 size_t i;
367 git_diff_delta *delta;
368
369 if (git_vector_init(&onto, expected_size, git_diff_delta__cmp) < 0)
370 return -1;
371
372 /* build new delta list without TO_DELETE and splitting TO_SPLIT */
373 git_vector_foreach(&diff->deltas, i, delta) {
374 if ((delta->flags & GIT_DIFF_FLAG__TO_DELETE) != 0)
375 continue;
376
377 if ((delta->flags & GIT_DIFF_FLAG__TO_SPLIT) != 0 && actually_split) {
378 delta->similarity = 0;
379
380 if (insert_delete_side_of_split(diff, &onto, delta) < 0)
381 goto on_error;
382
383 if (diff->new_src == GIT_ITERATOR_TYPE_WORKDIR)
384 delta->status = GIT_DELTA_UNTRACKED;
385 else
386 delta->status = GIT_DELTA_ADDED;
387 delta->nfiles = 1;
388 memset(&delta->old_file, 0, sizeof(delta->old_file));
389 delta->old_file.path = delta->new_file.path;
390 delta->old_file.flags |= GIT_DIFF_FLAG_VALID_ID;
391 }
392
393 /* clean up delta before inserting into new list */
394 GIT_DIFF_FLAG__CLEAR_INTERNAL(delta->flags);
395
396 if (delta->status != GIT_DELTA_COPIED &&
397 delta->status != GIT_DELTA_RENAMED &&
398 (delta->status != GIT_DELTA_MODIFIED || actually_split))
399 delta->similarity = 0;
400
401 /* insert into new list */
402 if (git_vector_insert(&onto, delta) < 0)
403 goto on_error;
404 }
405
406 /* cannot return an error past this point */
407
408 /* free deltas from old list that didn't make it to the new one */
409 git_vector_foreach(&diff->deltas, i, delta) {
410 if ((delta->flags & GIT_DIFF_FLAG__TO_DELETE) != 0)
411 git__free(delta);
412 }
413
414 /* swap new delta list into place */
415 git_vector_swap(&diff->deltas, &onto);
416 git_vector_free(&onto);
417 git_vector_sort(&diff->deltas);
418
419 return 0;
420
421 on_error:
422 git_vector_free_deep(&onto);
423
424 return -1;
425 }
426
427 GIT_INLINE(git_diff_file *) similarity_get_file(git_diff *diff, size_t idx)
428 {
429 git_diff_delta *delta = git_vector_get(&diff->deltas, idx / 2);
430 return (idx & 1) ? &delta->new_file : &delta->old_file;
431 }
432
433 typedef struct {
434 size_t idx;
435 git_iterator_type_t src;
436 git_repository *repo;
437 git_diff_file *file;
438 git_buf data;
439 git_odb_object *odb_obj;
440 git_blob *blob;
441 } similarity_info;
442
443 static int similarity_init(
444 similarity_info *info, git_diff *diff, size_t file_idx)
445 {
446 info->idx = file_idx;
447 info->src = (file_idx & 1) ? diff->new_src : diff->old_src;
448 info->repo = diff->repo;
449 info->file = similarity_get_file(diff, file_idx);
450 info->odb_obj = NULL;
451 info->blob = NULL;
452 git_buf_init(&info->data, 0);
453
454 if (info->file->size > 0 || info->src == GIT_ITERATOR_TYPE_WORKDIR)
455 return 0;
456
457 return git_diff_file__resolve_zero_size(
458 info->file, &info->odb_obj, info->repo);
459 }
460
461 static int similarity_sig(
462 similarity_info *info,
463 const git_diff_find_options *opts,
464 void **cache)
465 {
466 int error = 0;
467 git_diff_file *file = info->file;
468
469 if (info->src == GIT_ITERATOR_TYPE_WORKDIR) {
470 if ((error = git_buf_joinpath(
471 &info->data, git_repository_workdir(info->repo), file->path)) < 0)
472 return error;
473
474 /* if path is not a regular file, just skip this item */
475 if (!git_path_isfile(info->data.ptr))
476 return 0;
477
478 /* TODO: apply wd-to-odb filters to file data if necessary */
479
480 error = opts->metric->file_signature(
481 &cache[info->idx], info->file,
482 info->data.ptr, opts->metric->payload);
483 } else {
484 /* if we didn't initially know the size, we might have an odb_obj
485 * around from earlier, so convert that, otherwise load the blob now
486 */
487 if (info->odb_obj != NULL)
488 error = git_object__from_odb_object(
489 (git_object **)&info->blob, info->repo,
490 info->odb_obj, GIT_OBJ_BLOB);
491 else
492 error = git_blob_lookup(&info->blob, info->repo, &file->id);
493
494 if (error < 0) {
495 /* if lookup fails, just skip this item in similarity calc */
496 giterr_clear();
497 } else {
498 size_t sz;
499
500 /* index size may not be actual blob size if filtered */
501 if (file->size != git_blob_rawsize(info->blob))
502 file->size = git_blob_rawsize(info->blob);
503
504 sz = (size_t)(git__is_sizet(file->size) ? file->size : -1);
505
506 error = opts->metric->buffer_signature(
507 &cache[info->idx], info->file,
508 git_blob_rawcontent(info->blob), sz, opts->metric->payload);
509 }
510 }
511
512 return error;
513 }
514
515 static void similarity_unload(similarity_info *info)
516 {
517 if (info->odb_obj)
518 git_odb_object_free(info->odb_obj);
519
520 if (info->blob)
521 git_blob_free(info->blob);
522 else
523 git_buf_free(&info->data);
524 }
525
526 #define FLAG_SET(opts,flag_name) (((opts)->flags & flag_name) != 0)
527
528 /* - score < 0 means files cannot be compared
529 * - score >= 100 means files are exact match
530 * - score == 0 means files are completely different
531 */
532 static int similarity_measure(
533 int *score,
534 git_diff *diff,
535 const git_diff_find_options *opts,
536 void **cache,
537 size_t a_idx,
538 size_t b_idx)
539 {
540 git_diff_file *a_file = similarity_get_file(diff, a_idx);
541 git_diff_file *b_file = similarity_get_file(diff, b_idx);
542 bool exact_match = FLAG_SET(opts, GIT_DIFF_FIND_EXACT_MATCH_ONLY);
543 int error = 0;
544 similarity_info a_info, b_info;
545
546 *score = -1;
547
548 /* don't try to compare files of different types */
549 if (GIT_MODE_TYPE(a_file->mode) != GIT_MODE_TYPE(b_file->mode))
550 return 0;
551
552 /* if exact match is requested, force calculation of missing OIDs now */
553 if (exact_match) {
554 if (git_oid_iszero(&a_file->id) &&
555 diff->old_src == GIT_ITERATOR_TYPE_WORKDIR &&
556 !git_diff__oid_for_file(&a_file->id,
557 diff, a_file->path, a_file->mode, a_file->size))
558 a_file->flags |= GIT_DIFF_FLAG_VALID_ID;
559
560 if (git_oid_iszero(&b_file->id) &&
561 diff->new_src == GIT_ITERATOR_TYPE_WORKDIR &&
562 !git_diff__oid_for_file(&b_file->id,
563 diff, b_file->path, b_file->mode, b_file->size))
564 b_file->flags |= GIT_DIFF_FLAG_VALID_ID;
565 }
566
567 /* check OID match as a quick test */
568 if (git_oid__cmp(&a_file->id, &b_file->id) == 0) {
569 *score = 100;
570 return 0;
571 }
572
573 /* don't calculate signatures if we are doing exact match */
574 if (exact_match) {
575 *score = 0;
576 return 0;
577 }
578
579 memset(&a_info, 0, sizeof(a_info));
580 memset(&b_info, 0, sizeof(b_info));
581
582 /* set up similarity data (will try to update missing file sizes) */
583 if (!cache[a_idx] && (error = similarity_init(&a_info, diff, a_idx)) < 0)
584 return error;
585 if (!cache[b_idx] && (error = similarity_init(&b_info, diff, b_idx)) < 0)
586 goto cleanup;
587
588 /* check if file sizes are nowhere near each other */
589 if (a_file->size > 127 &&
590 b_file->size > 127 &&
591 (a_file->size > (b_file->size << 3) ||
592 b_file->size > (a_file->size << 3)))
593 goto cleanup;
594
595 /* update signature cache if needed */
596 if (!cache[a_idx]) {
597 if ((error = similarity_sig(&a_info, opts, cache)) < 0)
598 goto cleanup;
599 }
600 if (!cache[b_idx]) {
601 if ((error = similarity_sig(&b_info, opts, cache)) < 0)
602 goto cleanup;
603 }
604
605 /* calculate similarity provided that the metric choose to process
606 * both the a and b files (some may not if file is too big, etc).
607 */
608 if (cache[a_idx] && cache[b_idx])
609 error = opts->metric->similarity(
610 score, cache[a_idx], cache[b_idx], opts->metric->payload);
611
612 cleanup:
613 similarity_unload(&a_info);
614 similarity_unload(&b_info);
615
616 return error;
617 }
618
619 static int calc_self_similarity(
620 git_diff *diff,
621 const git_diff_find_options *opts,
622 size_t delta_idx,
623 void **cache)
624 {
625 int error, similarity = -1;
626 git_diff_delta *delta = GIT_VECTOR_GET(&diff->deltas, delta_idx);
627
628 if ((delta->flags & GIT_DIFF_FLAG__HAS_SELF_SIMILARITY) != 0)
629 return 0;
630
631 error = similarity_measure(
632 &similarity, diff, opts, cache, 2 * delta_idx, 2 * delta_idx + 1);
633 if (error < 0)
634 return error;
635
636 if (similarity >= 0) {
637 delta->similarity = (uint16_t)similarity;
638 delta->flags |= GIT_DIFF_FLAG__HAS_SELF_SIMILARITY;
639 }
640
641 return 0;
642 }
643
644 static bool is_rename_target(
645 git_diff *diff,
646 const git_diff_find_options *opts,
647 size_t delta_idx,
648 void **cache)
649 {
650 git_diff_delta *delta = GIT_VECTOR_GET(&diff->deltas, delta_idx);
651
652 /* skip things that aren't plain blobs */
653 if (!GIT_MODE_ISBLOB(delta->new_file.mode))
654 return false;
655
656 /* only consider ADDED, RENAMED, COPIED, and split MODIFIED as
657 * targets; maybe include UNTRACKED if requested.
658 */
659 switch (delta->status) {
660 case GIT_DELTA_UNMODIFIED:
661 case GIT_DELTA_DELETED:
662 case GIT_DELTA_IGNORED:
663 case GIT_DELTA_CONFLICTED:
664 return false;
665
666 case GIT_DELTA_MODIFIED:
667 if (!FLAG_SET(opts, GIT_DIFF_FIND_REWRITES) &&
668 !FLAG_SET(opts, GIT_DIFF_FIND_RENAMES_FROM_REWRITES))
669 return false;
670
671 if (calc_self_similarity(diff, opts, delta_idx, cache) < 0)
672 return false;
673
674 if (FLAG_SET(opts, GIT_DIFF_BREAK_REWRITES) &&
675 delta->similarity < opts->break_rewrite_threshold) {
676 delta->flags |= GIT_DIFF_FLAG__TO_SPLIT;
677 break;
678 }
679 if (FLAG_SET(opts, GIT_DIFF_FIND_RENAMES_FROM_REWRITES) &&
680 delta->similarity < opts->rename_from_rewrite_threshold)
681 break;
682
683 return false;
684
685 case GIT_DELTA_UNTRACKED:
686 if (!FLAG_SET(opts, GIT_DIFF_FIND_FOR_UNTRACKED))
687 return false;
688 break;
689
690 default: /* all other status values should be checked */
691 break;
692 }
693
694 delta->flags |= GIT_DIFF_FLAG__IS_RENAME_TARGET;
695 return true;
696 }
697
698 static bool is_rename_source(
699 git_diff *diff,
700 const git_diff_find_options *opts,
701 size_t delta_idx,
702 void **cache)
703 {
704 git_diff_delta *delta = GIT_VECTOR_GET(&diff->deltas, delta_idx);
705
706 /* skip things that aren't blobs */
707 if (!GIT_MODE_ISBLOB(delta->old_file.mode))
708 return false;
709
710 switch (delta->status) {
711 case GIT_DELTA_ADDED:
712 case GIT_DELTA_UNTRACKED:
713 case GIT_DELTA_UNREADABLE:
714 case GIT_DELTA_IGNORED:
715 case GIT_DELTA_CONFLICTED:
716 return false;
717
718 case GIT_DELTA_DELETED:
719 case GIT_DELTA_TYPECHANGE:
720 break;
721
722 case GIT_DELTA_UNMODIFIED:
723 if (!FLAG_SET(opts, GIT_DIFF_FIND_COPIES_FROM_UNMODIFIED))
724 return false;
725 if (FLAG_SET(opts, GIT_DIFF_FIND_REMOVE_UNMODIFIED))
726 delta->flags |= GIT_DIFF_FLAG__TO_DELETE;
727 break;
728
729 default: /* MODIFIED, RENAMED, COPIED */
730 /* if we're finding copies, this could be a source */
731 if (FLAG_SET(opts, GIT_DIFF_FIND_COPIES))
732 break;
733
734 /* otherwise, this is only a source if we can split it */
735 if (!FLAG_SET(opts, GIT_DIFF_FIND_REWRITES) &&
736 !FLAG_SET(opts, GIT_DIFF_FIND_RENAMES_FROM_REWRITES))
737 return false;
738
739 if (calc_self_similarity(diff, opts, delta_idx, cache) < 0)
740 return false;
741
742 if (FLAG_SET(opts, GIT_DIFF_BREAK_REWRITES) &&
743 delta->similarity < opts->break_rewrite_threshold) {
744 delta->flags |= GIT_DIFF_FLAG__TO_SPLIT;
745 break;
746 }
747
748 if (FLAG_SET(opts, GIT_DIFF_FIND_RENAMES_FROM_REWRITES) &&
749 delta->similarity < opts->rename_from_rewrite_threshold)
750 break;
751
752 return false;
753 }
754
755 delta->flags |= GIT_DIFF_FLAG__IS_RENAME_SOURCE;
756 return true;
757 }
758
759 GIT_INLINE(bool) delta_is_split(git_diff_delta *delta)
760 {
761 return (delta->status == GIT_DELTA_TYPECHANGE ||
762 (delta->flags & GIT_DIFF_FLAG__TO_SPLIT) != 0);
763 }
764
765 GIT_INLINE(bool) delta_is_new_only(git_diff_delta *delta)
766 {
767 return (delta->status == GIT_DELTA_ADDED ||
768 delta->status == GIT_DELTA_UNTRACKED ||
769 delta->status == GIT_DELTA_UNREADABLE ||
770 delta->status == GIT_DELTA_IGNORED);
771 }
772
773 GIT_INLINE(void) delta_make_rename(
774 git_diff_delta *to, const git_diff_delta *from, uint16_t similarity)
775 {
776 to->status = GIT_DELTA_RENAMED;
777 to->similarity = similarity;
778 to->nfiles = 2;
779 memcpy(&to->old_file, &from->old_file, sizeof(to->old_file));
780 to->flags &= ~GIT_DIFF_FLAG__TO_SPLIT;
781 }
782
783 typedef struct {
784 size_t idx;
785 uint16_t similarity;
786 } diff_find_match;
787
788 int git_diff_find_similar(
789 git_diff *diff,
790 const git_diff_find_options *given_opts)
791 {
792 size_t s, t;
793 int error = 0, result;
794 uint16_t similarity;
795 git_diff_delta *src, *tgt;
796 git_diff_find_options opts = GIT_DIFF_FIND_OPTIONS_INIT;
797 size_t num_deltas, num_srcs = 0, num_tgts = 0;
798 size_t tried_srcs = 0, tried_tgts = 0;
799 size_t num_rewrites = 0, num_updates = 0, num_bumped = 0;
800 size_t sigcache_size;
801 void **sigcache = NULL; /* cache of similarity metric file signatures */
802 diff_find_match *tgt2src = NULL;
803 diff_find_match *src2tgt = NULL;
804 diff_find_match *tgt2src_copy = NULL;
805 diff_find_match *best_match;
806 git_diff_file swap;
807
808 if ((error = normalize_find_opts(diff, &opts, given_opts)) < 0)
809 return error;
810
811 num_deltas = diff->deltas.length;
812
813 /* TODO: maybe abort if deltas.length > rename_limit ??? */
814 if (!git__is_uint32(num_deltas))
815 goto cleanup;
816
817 /* No flags set; nothing to do */
818 if ((opts.flags & GIT_DIFF_FIND_ALL) == 0)
819 goto cleanup;
820
821 GITERR_CHECK_ALLOC_MULTIPLY(&sigcache_size, num_deltas, 2);
822 sigcache = git__calloc(sigcache_size, sizeof(void *));
823 GITERR_CHECK_ALLOC(sigcache);
824
825 /* Label rename sources and targets
826 *
827 * This will also set self-similarity scores for MODIFIED files and
828 * mark them for splitting if break-rewrites is enabled
829 */
830 git_vector_foreach(&diff->deltas, t, tgt) {
831 if (is_rename_source(diff, &opts, t, sigcache))
832 ++num_srcs;
833
834 if (is_rename_target(diff, &opts, t, sigcache))
835 ++num_tgts;
836
837 if ((tgt->flags & GIT_DIFF_FLAG__TO_SPLIT) != 0)
838 num_rewrites++;
839 }
840
841 /* if there are no candidate srcs or tgts, we're done */
842 if (!num_srcs || !num_tgts)
843 goto cleanup;
844
845 src2tgt = git__calloc(num_deltas, sizeof(diff_find_match));
846 GITERR_CHECK_ALLOC(src2tgt);
847 tgt2src = git__calloc(num_deltas, sizeof(diff_find_match));
848 GITERR_CHECK_ALLOC(tgt2src);
849
850 if (FLAG_SET(&opts, GIT_DIFF_FIND_COPIES)) {
851 tgt2src_copy = git__calloc(num_deltas, sizeof(diff_find_match));
852 GITERR_CHECK_ALLOC(tgt2src_copy);
853 }
854
855 /*
856 * Find best-fit matches for rename / copy candidates
857 */
858
859 find_best_matches:
860 tried_tgts = num_bumped = 0;
861
862 git_vector_foreach(&diff->deltas, t, tgt) {
863 /* skip things that are not rename targets */
864 if ((tgt->flags & GIT_DIFF_FLAG__IS_RENAME_TARGET) == 0)
865 continue;
866
867 tried_srcs = 0;
868
869 git_vector_foreach(&diff->deltas, s, src) {
870 /* skip things that are not rename sources */
871 if ((src->flags & GIT_DIFF_FLAG__IS_RENAME_SOURCE) == 0)
872 continue;
873
874 /* calculate similarity for this pair and find best match */
875 if (s == t)
876 result = -1; /* don't measure self-similarity here */
877 else if ((error = similarity_measure(
878 &result, diff, &opts, sigcache, 2 * s, 2 * t + 1)) < 0)
879 goto cleanup;
880
881 if (result < 0)
882 continue;
883 similarity = (uint16_t)result;
884
885 /* is this a better rename? */
886 if (tgt2src[t].similarity < similarity &&
887 src2tgt[s].similarity < similarity)
888 {
889 /* eject old mapping */
890 if (src2tgt[s].similarity > 0) {
891 tgt2src[src2tgt[s].idx].similarity = 0;
892 num_bumped++;
893 }
894 if (tgt2src[t].similarity > 0) {
895 src2tgt[tgt2src[t].idx].similarity = 0;
896 num_bumped++;
897 }
898
899 /* write new mapping */
900 tgt2src[t].idx = s;
901 tgt2src[t].similarity = similarity;
902 src2tgt[s].idx = t;
903 src2tgt[s].similarity = similarity;
904 }
905
906 /* keep best absolute match for copies */
907 if (tgt2src_copy != NULL &&
908 tgt2src_copy[t].similarity < similarity)
909 {
910 tgt2src_copy[t].idx = s;
911 tgt2src_copy[t].similarity = similarity;
912 }
913
914 if (++tried_srcs >= num_srcs)
915 break;
916
917 /* cap on maximum targets we'll examine (per "tgt" file) */
918 if (tried_srcs > opts.rename_limit)
919 break;
920 }
921
922 if (++tried_tgts >= num_tgts)
923 break;
924 }
925
926 if (num_bumped > 0) /* try again if we bumped some items */
927 goto find_best_matches;
928
929 /*
930 * Rewrite the diffs with renames / copies
931 */
932
933 git_vector_foreach(&diff->deltas, t, tgt) {
934 /* skip things that are not rename targets */
935 if ((tgt->flags & GIT_DIFF_FLAG__IS_RENAME_TARGET) == 0)
936 continue;
937
938 /* check if this delta was the target of a similarity */
939 if (tgt2src[t].similarity)
940 best_match = &tgt2src[t];
941 else if (tgt2src_copy && tgt2src_copy[t].similarity)
942 best_match = &tgt2src_copy[t];
943 else
944 continue;
945
946 s = best_match->idx;
947 src = GIT_VECTOR_GET(&diff->deltas, s);
948
949 /* possible scenarios:
950 * 1. from DELETE to ADD/UNTRACK/IGNORE = RENAME
951 * 2. from DELETE to SPLIT/TYPECHANGE = RENAME + DELETE
952 * 3. from SPLIT/TYPECHANGE to ADD/UNTRACK/IGNORE = ADD + RENAME
953 * 4. from SPLIT/TYPECHANGE to SPLIT/TYPECHANGE = RENAME + SPLIT
954 * 5. from OTHER to ADD/UNTRACK/IGNORE = OTHER + COPY
955 */
956
957 if (src->status == GIT_DELTA_DELETED) {
958
959 if (delta_is_new_only(tgt)) {
960
961 if (best_match->similarity < opts.rename_threshold)
962 continue;
963
964 delta_make_rename(tgt, src, best_match->similarity);
965
966 src->flags |= GIT_DIFF_FLAG__TO_DELETE;
967 num_rewrites++;
968 } else {
969 assert(delta_is_split(tgt));
970
971 if (best_match->similarity < opts.rename_from_rewrite_threshold)
972 continue;
973
974 memcpy(&swap, &tgt->old_file, sizeof(swap));
975
976 delta_make_rename(tgt, src, best_match->similarity);
977 num_rewrites--;
978
979 assert(src->status == GIT_DELTA_DELETED);
980 memcpy(&src->old_file, &swap, sizeof(src->old_file));
981 memset(&src->new_file, 0, sizeof(src->new_file));
982 src->new_file.path = src->old_file.path;
983 src->new_file.flags |= GIT_DIFF_FLAG_VALID_ID;
984
985 num_updates++;
986
987 if (src2tgt[t].similarity > 0 && src2tgt[t].idx > t) {
988 /* what used to be at src t is now at src s */
989 tgt2src[src2tgt[t].idx].idx = s;
990 }
991 }
992 }
993
994 else if (delta_is_split(src)) {
995
996 if (delta_is_new_only(tgt)) {
997
998 if (best_match->similarity < opts.rename_threshold)
999 continue;
1000
1001 delta_make_rename(tgt, src, best_match->similarity);
1002
1003 src->status = (diff->new_src == GIT_ITERATOR_TYPE_WORKDIR) ?
1004 GIT_DELTA_UNTRACKED : GIT_DELTA_ADDED;
1005 src->nfiles = 1;
1006 memset(&src->old_file, 0, sizeof(src->old_file));
1007 src->old_file.path = src->new_file.path;
1008 src->old_file.flags |= GIT_DIFF_FLAG_VALID_ID;
1009
1010 src->flags &= ~GIT_DIFF_FLAG__TO_SPLIT;
1011 num_rewrites--;
1012
1013 num_updates++;
1014 } else {
1015 assert(delta_is_split(src));
1016
1017 if (best_match->similarity < opts.rename_from_rewrite_threshold)
1018 continue;
1019
1020 memcpy(&swap, &tgt->old_file, sizeof(swap));
1021
1022 delta_make_rename(tgt, src, best_match->similarity);
1023 num_rewrites--;
1024 num_updates++;
1025
1026 memcpy(&src->old_file, &swap, sizeof(src->old_file));
1027
1028 /* if we've just swapped the new element into the correct
1029 * place, clear the SPLIT flag
1030 */
1031 if (tgt2src[s].idx == t &&
1032 tgt2src[s].similarity >
1033 opts.rename_from_rewrite_threshold) {
1034 src->status = GIT_DELTA_RENAMED;
1035 src->similarity = tgt2src[s].similarity;
1036 tgt2src[s].similarity = 0;
1037 src->flags &= ~GIT_DIFF_FLAG__TO_SPLIT;
1038 num_rewrites--;
1039 }
1040 /* otherwise, if we just overwrote a source, update mapping */
1041 else if (src2tgt[t].similarity > 0 && src2tgt[t].idx > t) {
1042 /* what used to be at src t is now at src s */
1043 tgt2src[src2tgt[t].idx].idx = s;
1044 }
1045
1046 num_updates++;
1047 }
1048 }
1049
1050 else if (FLAG_SET(&opts, GIT_DIFF_FIND_COPIES)) {
1051 if (tgt2src_copy[t].similarity < opts.copy_threshold)
1052 continue;
1053
1054 /* always use best possible source for copy */
1055 best_match = &tgt2src_copy[t];
1056 src = GIT_VECTOR_GET(&diff->deltas, best_match->idx);
1057
1058 if (delta_is_split(tgt)) {
1059 error = insert_delete_side_of_split(diff, &diff->deltas, tgt);
1060 if (error < 0)
1061 goto cleanup;
1062 num_rewrites--;
1063 }
1064
1065 if (!delta_is_split(tgt) && !delta_is_new_only(tgt))
1066 continue;
1067
1068 tgt->status = GIT_DELTA_COPIED;
1069 tgt->similarity = best_match->similarity;
1070 tgt->nfiles = 2;
1071 memcpy(&tgt->old_file, &src->old_file, sizeof(tgt->old_file));
1072 tgt->flags &= ~GIT_DIFF_FLAG__TO_SPLIT;
1073
1074 num_updates++;
1075 }
1076 }
1077
1078 /*
1079 * Actually split and delete entries as needed
1080 */
1081
1082 if (num_rewrites > 0 || num_updates > 0)
1083 error = apply_splits_and_deletes(
1084 diff, diff->deltas.length - num_rewrites,
1085 FLAG_SET(&opts, GIT_DIFF_BREAK_REWRITES) &&
1086 !FLAG_SET(&opts, GIT_DIFF_BREAK_REWRITES_FOR_RENAMES_ONLY));
1087
1088 cleanup:
1089 git__free(tgt2src);
1090 git__free(src2tgt);
1091 git__free(tgt2src_copy);
1092
1093 if (sigcache) {
1094 for (t = 0; t < num_deltas * 2; ++t) {
1095 if (sigcache[t] != NULL)
1096 opts.metric->free_signature(sigcache[t], opts.metric->payload);
1097 }
1098 git__free(sigcache);
1099 }
1100
1101 if (!given_opts || !given_opts->metric)
1102 git__free(opts.metric);
1103
1104 return error;
1105 }
1106
1107 #undef FLAG_SET