]> git.proxmox.com Git - libgit2.git/blob - src/diff_tform.c
00ce1cbf63d15d31496b77b1c4defb5ccc2a3dd0
[libgit2.git] / src / diff_tform.c
1 /*
2 * Copyright (C) the libgit2 contributors. All rights reserved.
3 *
4 * This file is part of libgit2, distributed under the GNU GPL v2 with
5 * a Linking Exception. For full terms see the included COPYING file.
6 */
7
8 #include "diff_tform.h"
9
10 #include "git2/config.h"
11 #include "git2/blob.h"
12 #include "git2/sys/hashsig.h"
13
14 #include "diff.h"
15 #include "diff_generate.h"
16 #include "path.h"
17 #include "fileops.h"
18 #include "config.h"
19
20 git_diff_delta *git_diff__delta_dup(
21 const git_diff_delta *d, git_pool *pool)
22 {
23 git_diff_delta *delta = git__malloc(sizeof(git_diff_delta));
24 if (!delta)
25 return NULL;
26
27 memcpy(delta, d, sizeof(git_diff_delta));
28 GIT_DIFF_FLAG__CLEAR_INTERNAL(delta->flags);
29
30 if (d->old_file.path != NULL) {
31 delta->old_file.path = git_pool_strdup(pool, d->old_file.path);
32 if (delta->old_file.path == NULL)
33 goto fail;
34 }
35
36 if (d->new_file.path != d->old_file.path && d->new_file.path != NULL) {
37 delta->new_file.path = git_pool_strdup(pool, d->new_file.path);
38 if (delta->new_file.path == NULL)
39 goto fail;
40 } else {
41 delta->new_file.path = delta->old_file.path;
42 }
43
44 return delta;
45
46 fail:
47 git__free(delta);
48 return NULL;
49 }
50
51 git_diff_delta *git_diff__merge_like_cgit(
52 const git_diff_delta *a,
53 const git_diff_delta *b,
54 git_pool *pool)
55 {
56 git_diff_delta *dup;
57
58 /* Emulate C git for merging two diffs (a la 'git diff <sha>').
59 *
60 * When C git does a diff between the work dir and a tree, it actually
61 * diffs with the index but uses the workdir contents. This emulates
62 * those choices so we can emulate the type of diff.
63 *
64 * We have three file descriptions here, let's call them:
65 * f1 = a->old_file
66 * f2 = a->new_file AND b->old_file
67 * f3 = b->new_file
68 */
69
70 /* If one of the diffs is a conflict, just dup it */
71 if (b->status == GIT_DELTA_CONFLICTED)
72 return git_diff__delta_dup(b, pool);
73 if (a->status == GIT_DELTA_CONFLICTED)
74 return git_diff__delta_dup(a, pool);
75
76 /* if f2 == f3 or f2 is deleted, then just dup the 'a' diff */
77 if (b->status == GIT_DELTA_UNMODIFIED || a->status == GIT_DELTA_DELETED)
78 return git_diff__delta_dup(a, pool);
79
80 /* otherwise, base this diff on the 'b' diff */
81 if ((dup = git_diff__delta_dup(b, pool)) == NULL)
82 return NULL;
83
84 /* If 'a' status is uninteresting, then we're done */
85 if (a->status == GIT_DELTA_UNMODIFIED ||
86 a->status == GIT_DELTA_UNTRACKED ||
87 a->status == GIT_DELTA_UNREADABLE)
88 return dup;
89
90 assert(b->status != GIT_DELTA_UNMODIFIED);
91
92 /* A cgit exception is that the diff of a file that is only in the
93 * index (i.e. not in HEAD nor workdir) is given as empty.
94 */
95 if (dup->status == GIT_DELTA_DELETED) {
96 if (a->status == GIT_DELTA_ADDED) {
97 dup->status = GIT_DELTA_UNMODIFIED;
98 dup->nfiles = 2;
99 }
100 /* else don't overwrite DELETE status */
101 } else {
102 dup->status = a->status;
103 dup->nfiles = a->nfiles;
104 }
105
106 git_oid_cpy(&dup->old_file.id, &a->old_file.id);
107 dup->old_file.mode = a->old_file.mode;
108 dup->old_file.size = a->old_file.size;
109 dup->old_file.flags = a->old_file.flags;
110
111 return dup;
112 }
113
114 int git_diff__merge(
115 git_diff *onto, const git_diff *from, git_diff__merge_cb cb)
116 {
117 int error = 0;
118 git_pool onto_pool;
119 git_vector onto_new;
120 git_diff_delta *delta;
121 bool ignore_case, reversed;
122 unsigned int i, j;
123
124 assert(onto && from);
125
126 if (!from->deltas.length)
127 return 0;
128
129 ignore_case = ((onto->opts.flags & GIT_DIFF_IGNORE_CASE) != 0);
130 reversed = ((onto->opts.flags & GIT_DIFF_REVERSE) != 0);
131
132 if (ignore_case != ((from->opts.flags & GIT_DIFF_IGNORE_CASE) != 0) ||
133 reversed != ((from->opts.flags & GIT_DIFF_REVERSE) != 0)) {
134 git_error_set(GIT_ERROR_INVALID,
135 "attempt to merge diffs created with conflicting options");
136 return -1;
137 }
138
139 if (git_vector_init(&onto_new, onto->deltas.length, git_diff_delta__cmp) < 0)
140 return -1;
141
142 git_pool_init(&onto_pool, 1);
143
144 for (i = 0, j = 0; i < onto->deltas.length || j < from->deltas.length; ) {
145 git_diff_delta *o = GIT_VECTOR_GET(&onto->deltas, i);
146 const git_diff_delta *f = GIT_VECTOR_GET(&from->deltas, j);
147 int cmp = !f ? -1 : !o ? 1 :
148 STRCMP_CASESELECT(ignore_case, o->old_file.path, f->old_file.path);
149
150 if (cmp < 0) {
151 delta = git_diff__delta_dup(o, &onto_pool);
152 i++;
153 } else if (cmp > 0) {
154 delta = git_diff__delta_dup(f, &onto_pool);
155 j++;
156 } else {
157 const git_diff_delta *left = reversed ? f : o;
158 const git_diff_delta *right = reversed ? o : f;
159
160 delta = cb(left, right, &onto_pool);
161 i++;
162 j++;
163 }
164
165 /* the ignore rules for the target may not match the source
166 * or the result of a merged delta could be skippable...
167 */
168 if (delta && git_diff_delta__should_skip(&onto->opts, delta)) {
169 git__free(delta);
170 continue;
171 }
172
173 if ((error = !delta ? -1 : git_vector_insert(&onto_new, delta)) < 0)
174 break;
175 }
176
177 if (!error) {
178 git_vector_swap(&onto->deltas, &onto_new);
179 git_pool_swap(&onto->pool, &onto_pool);
180
181 if ((onto->opts.flags & GIT_DIFF_REVERSE) != 0)
182 onto->old_src = from->old_src;
183 else
184 onto->new_src = from->new_src;
185
186 /* prefix strings also come from old pool, so recreate those.*/
187 onto->opts.old_prefix =
188 git_pool_strdup_safe(&onto->pool, onto->opts.old_prefix);
189 onto->opts.new_prefix =
190 git_pool_strdup_safe(&onto->pool, onto->opts.new_prefix);
191 }
192
193 git_vector_free_deep(&onto_new);
194 git_pool_clear(&onto_pool);
195
196 return error;
197 }
198
199 int git_diff_merge(git_diff *onto, const git_diff *from)
200 {
201 return git_diff__merge(onto, from, git_diff__merge_like_cgit);
202 }
203
204 int git_diff_find_similar__hashsig_for_file(
205 void **out, const git_diff_file *f, const char *path, void *p)
206 {
207 git_hashsig_option_t opt = (git_hashsig_option_t)(intptr_t)p;
208
209 GIT_UNUSED(f);
210 return git_hashsig_create_fromfile((git_hashsig **)out, path, opt);
211 }
212
213 int git_diff_find_similar__hashsig_for_buf(
214 void **out, const git_diff_file *f, const char *buf, size_t len, void *p)
215 {
216 git_hashsig_option_t opt = (git_hashsig_option_t)(intptr_t)p;
217
218 GIT_UNUSED(f);
219 return git_hashsig_create((git_hashsig **)out, buf, len, opt);
220 }
221
222 void git_diff_find_similar__hashsig_free(void *sig, void *payload)
223 {
224 GIT_UNUSED(payload);
225 git_hashsig_free(sig);
226 }
227
228 int git_diff_find_similar__calc_similarity(
229 int *score, void *siga, void *sigb, void *payload)
230 {
231 int error;
232
233 GIT_UNUSED(payload);
234 error = git_hashsig_compare(siga, sigb);
235 if (error < 0)
236 return error;
237
238 *score = error;
239 return 0;
240 }
241
242 #define DEFAULT_THRESHOLD 50
243 #define DEFAULT_BREAK_REWRITE_THRESHOLD 60
244 #define DEFAULT_RENAME_LIMIT 200
245
246 static int normalize_find_opts(
247 git_diff *diff,
248 git_diff_find_options *opts,
249 const git_diff_find_options *given)
250 {
251 git_config *cfg = NULL;
252 git_hashsig_option_t hashsig_opts;
253
254 GIT_ERROR_CHECK_VERSION(given, GIT_DIFF_FIND_OPTIONS_VERSION, "git_diff_find_options");
255
256 if (diff->repo != NULL &&
257 git_repository_config__weakptr(&cfg, diff->repo) < 0)
258 return -1;
259
260 if (given)
261 memcpy(opts, given, sizeof(*opts));
262
263 if (!given ||
264 (given->flags & GIT_DIFF_FIND_ALL) == GIT_DIFF_FIND_BY_CONFIG)
265 {
266 if (cfg) {
267 char *rule =
268 git_config__get_string_force(cfg, "diff.renames", "true");
269 int boolval;
270
271 if (!git__parse_bool(&boolval, rule) && !boolval)
272 /* don't set FIND_RENAMES if bool value is false */;
273 else if (!strcasecmp(rule, "copies") || !strcasecmp(rule, "copy"))
274 opts->flags |= GIT_DIFF_FIND_RENAMES | GIT_DIFF_FIND_COPIES;
275 else
276 opts->flags |= GIT_DIFF_FIND_RENAMES;
277
278 git__free(rule);
279 } else {
280 /* set default flag */
281 opts->flags |= GIT_DIFF_FIND_RENAMES;
282 }
283 }
284
285 /* some flags imply others */
286
287 if (opts->flags & GIT_DIFF_FIND_EXACT_MATCH_ONLY) {
288 /* if we are only looking for exact matches, then don't turn
289 * MODIFIED items into ADD/DELETE pairs because it's too picky
290 */
291 opts->flags &= ~(GIT_DIFF_FIND_REWRITES | GIT_DIFF_BREAK_REWRITES);
292
293 /* similarly, don't look for self-rewrites to split */
294 opts->flags &= ~GIT_DIFF_FIND_RENAMES_FROM_REWRITES;
295 }
296
297 if (opts->flags & GIT_DIFF_FIND_RENAMES_FROM_REWRITES)
298 opts->flags |= GIT_DIFF_FIND_RENAMES;
299
300 if (opts->flags & GIT_DIFF_FIND_COPIES_FROM_UNMODIFIED)
301 opts->flags |= GIT_DIFF_FIND_COPIES;
302
303 if (opts->flags & GIT_DIFF_BREAK_REWRITES)
304 opts->flags |= GIT_DIFF_FIND_REWRITES;
305
306 #define USE_DEFAULT(X) ((X) == 0 || (X) > 100)
307
308 if (USE_DEFAULT(opts->rename_threshold))
309 opts->rename_threshold = DEFAULT_THRESHOLD;
310
311 if (USE_DEFAULT(opts->rename_from_rewrite_threshold))
312 opts->rename_from_rewrite_threshold = DEFAULT_THRESHOLD;
313
314 if (USE_DEFAULT(opts->copy_threshold))
315 opts->copy_threshold = DEFAULT_THRESHOLD;
316
317 if (USE_DEFAULT(opts->break_rewrite_threshold))
318 opts->break_rewrite_threshold = DEFAULT_BREAK_REWRITE_THRESHOLD;
319
320 #undef USE_DEFAULT
321
322 if (!opts->rename_limit) {
323 if (cfg) {
324 opts->rename_limit = git_config__get_int_force(
325 cfg, "diff.renamelimit", DEFAULT_RENAME_LIMIT);
326 }
327
328 if (opts->rename_limit <= 0)
329 opts->rename_limit = DEFAULT_RENAME_LIMIT;
330 }
331
332 /* assign the internal metric with whitespace flag as payload */
333 if (!opts->metric) {
334 opts->metric = git__malloc(sizeof(git_diff_similarity_metric));
335 GIT_ERROR_CHECK_ALLOC(opts->metric);
336
337 opts->metric->file_signature = git_diff_find_similar__hashsig_for_file;
338 opts->metric->buffer_signature = git_diff_find_similar__hashsig_for_buf;
339 opts->metric->free_signature = git_diff_find_similar__hashsig_free;
340 opts->metric->similarity = git_diff_find_similar__calc_similarity;
341
342 if (opts->flags & GIT_DIFF_FIND_IGNORE_WHITESPACE)
343 hashsig_opts = GIT_HASHSIG_IGNORE_WHITESPACE;
344 else if (opts->flags & GIT_DIFF_FIND_DONT_IGNORE_WHITESPACE)
345 hashsig_opts = GIT_HASHSIG_NORMAL;
346 else
347 hashsig_opts = GIT_HASHSIG_SMART_WHITESPACE;
348 hashsig_opts |= GIT_HASHSIG_ALLOW_SMALL_FILES;
349 opts->metric->payload = (void *)hashsig_opts;
350 }
351
352 return 0;
353 }
354
355 static int insert_delete_side_of_split(
356 git_diff *diff, git_vector *onto, const git_diff_delta *delta)
357 {
358 /* make new record for DELETED side of split */
359 git_diff_delta *deleted = git_diff__delta_dup(delta, &diff->pool);
360 GIT_ERROR_CHECK_ALLOC(deleted);
361
362 deleted->status = GIT_DELTA_DELETED;
363 deleted->nfiles = 1;
364 memset(&deleted->new_file, 0, sizeof(deleted->new_file));
365 deleted->new_file.path = deleted->old_file.path;
366 deleted->new_file.flags |= GIT_DIFF_FLAG_VALID_ID;
367
368 return git_vector_insert(onto, deleted);
369 }
370
371 static int apply_splits_and_deletes(
372 git_diff *diff, size_t expected_size, bool actually_split)
373 {
374 git_vector onto = GIT_VECTOR_INIT;
375 size_t i;
376 git_diff_delta *delta;
377
378 if (git_vector_init(&onto, expected_size, git_diff_delta__cmp) < 0)
379 return -1;
380
381 /* build new delta list without TO_DELETE and splitting TO_SPLIT */
382 git_vector_foreach(&diff->deltas, i, delta) {
383 if ((delta->flags & GIT_DIFF_FLAG__TO_DELETE) != 0)
384 continue;
385
386 if ((delta->flags & GIT_DIFF_FLAG__TO_SPLIT) != 0 && actually_split) {
387 delta->similarity = 0;
388
389 if (insert_delete_side_of_split(diff, &onto, delta) < 0)
390 goto on_error;
391
392 if (diff->new_src == GIT_ITERATOR_TYPE_WORKDIR)
393 delta->status = GIT_DELTA_UNTRACKED;
394 else
395 delta->status = GIT_DELTA_ADDED;
396 delta->nfiles = 1;
397 memset(&delta->old_file, 0, sizeof(delta->old_file));
398 delta->old_file.path = delta->new_file.path;
399 delta->old_file.flags |= GIT_DIFF_FLAG_VALID_ID;
400 }
401
402 /* clean up delta before inserting into new list */
403 GIT_DIFF_FLAG__CLEAR_INTERNAL(delta->flags);
404
405 if (delta->status != GIT_DELTA_COPIED &&
406 delta->status != GIT_DELTA_RENAMED &&
407 (delta->status != GIT_DELTA_MODIFIED || actually_split))
408 delta->similarity = 0;
409
410 /* insert into new list */
411 if (git_vector_insert(&onto, delta) < 0)
412 goto on_error;
413 }
414
415 /* cannot return an error past this point */
416
417 /* free deltas from old list that didn't make it to the new one */
418 git_vector_foreach(&diff->deltas, i, delta) {
419 if ((delta->flags & GIT_DIFF_FLAG__TO_DELETE) != 0)
420 git__free(delta);
421 }
422
423 /* swap new delta list into place */
424 git_vector_swap(&diff->deltas, &onto);
425 git_vector_free(&onto);
426 git_vector_sort(&diff->deltas);
427
428 return 0;
429
430 on_error:
431 git_vector_free_deep(&onto);
432
433 return -1;
434 }
435
436 GIT_INLINE(git_diff_file *) similarity_get_file(git_diff *diff, size_t idx)
437 {
438 git_diff_delta *delta = git_vector_get(&diff->deltas, idx / 2);
439 return (idx & 1) ? &delta->new_file : &delta->old_file;
440 }
441
442 typedef struct {
443 size_t idx;
444 git_iterator_type_t src;
445 git_repository *repo;
446 git_diff_file *file;
447 git_buf data;
448 git_odb_object *odb_obj;
449 git_blob *blob;
450 } similarity_info;
451
452 static int similarity_init(
453 similarity_info *info, git_diff *diff, size_t file_idx)
454 {
455 info->idx = file_idx;
456 info->src = (file_idx & 1) ? diff->new_src : diff->old_src;
457 info->repo = diff->repo;
458 info->file = similarity_get_file(diff, file_idx);
459 info->odb_obj = NULL;
460 info->blob = NULL;
461 git_buf_init(&info->data, 0);
462
463 if (info->file->size > 0 || info->src == GIT_ITERATOR_TYPE_WORKDIR)
464 return 0;
465
466 return git_diff_file__resolve_zero_size(
467 info->file, &info->odb_obj, info->repo);
468 }
469
470 static int similarity_sig(
471 similarity_info *info,
472 const git_diff_find_options *opts,
473 void **cache)
474 {
475 int error = 0;
476 git_diff_file *file = info->file;
477
478 if (info->src == GIT_ITERATOR_TYPE_WORKDIR) {
479 if ((error = git_buf_joinpath(
480 &info->data, git_repository_workdir(info->repo), file->path)) < 0)
481 return error;
482
483 /* if path is not a regular file, just skip this item */
484 if (!git_path_isfile(info->data.ptr))
485 return 0;
486
487 /* TODO: apply wd-to-odb filters to file data if necessary */
488
489 error = opts->metric->file_signature(
490 &cache[info->idx], info->file,
491 info->data.ptr, opts->metric->payload);
492 } else {
493 /* if we didn't initially know the size, we might have an odb_obj
494 * around from earlier, so convert that, otherwise load the blob now
495 */
496 if (info->odb_obj != NULL)
497 error = git_object__from_odb_object(
498 (git_object **)&info->blob, info->repo,
499 info->odb_obj, GIT_OBJECT_BLOB);
500 else
501 error = git_blob_lookup(&info->blob, info->repo, &file->id);
502
503 if (error < 0) {
504 /* if lookup fails, just skip this item in similarity calc */
505 git_error_clear();
506 } else {
507 size_t sz;
508
509 /* index size may not be actual blob size if filtered */
510 if (file->size != git_blob_rawsize(info->blob))
511 file->size = git_blob_rawsize(info->blob);
512
513 sz = (size_t)(git__is_sizet(file->size) ? file->size : -1);
514
515 error = opts->metric->buffer_signature(
516 &cache[info->idx], info->file,
517 git_blob_rawcontent(info->blob), sz, opts->metric->payload);
518 }
519 }
520
521 return error;
522 }
523
524 static void similarity_unload(similarity_info *info)
525 {
526 if (info->odb_obj)
527 git_odb_object_free(info->odb_obj);
528
529 if (info->blob)
530 git_blob_free(info->blob);
531 else
532 git_buf_dispose(&info->data);
533 }
534
535 #define FLAG_SET(opts,flag_name) (((opts)->flags & flag_name) != 0)
536
537 /* - score < 0 means files cannot be compared
538 * - score >= 100 means files are exact match
539 * - score == 0 means files are completely different
540 */
541 static int similarity_measure(
542 int *score,
543 git_diff *diff,
544 const git_diff_find_options *opts,
545 void **cache,
546 size_t a_idx,
547 size_t b_idx)
548 {
549 git_diff_file *a_file = similarity_get_file(diff, a_idx);
550 git_diff_file *b_file = similarity_get_file(diff, b_idx);
551 bool exact_match = FLAG_SET(opts, GIT_DIFF_FIND_EXACT_MATCH_ONLY);
552 int error = 0;
553 similarity_info a_info, b_info;
554
555 *score = -1;
556
557 /* don't try to compare things that aren't files */
558 if (!GIT_MODE_ISBLOB(a_file->mode) || !GIT_MODE_ISBLOB(b_file->mode))
559 return 0;
560
561 /* if exact match is requested, force calculation of missing OIDs now */
562 if (exact_match) {
563 if (git_oid_iszero(&a_file->id) &&
564 diff->old_src == GIT_ITERATOR_TYPE_WORKDIR &&
565 !git_diff__oid_for_file(&a_file->id,
566 diff, a_file->path, a_file->mode, a_file->size))
567 a_file->flags |= GIT_DIFF_FLAG_VALID_ID;
568
569 if (git_oid_iszero(&b_file->id) &&
570 diff->new_src == GIT_ITERATOR_TYPE_WORKDIR &&
571 !git_diff__oid_for_file(&b_file->id,
572 diff, b_file->path, b_file->mode, b_file->size))
573 b_file->flags |= GIT_DIFF_FLAG_VALID_ID;
574 }
575
576 /* check OID match as a quick test */
577 if (git_oid__cmp(&a_file->id, &b_file->id) == 0) {
578 *score = 100;
579 return 0;
580 }
581
582 /* don't calculate signatures if we are doing exact match */
583 if (exact_match) {
584 *score = 0;
585 return 0;
586 }
587
588 memset(&a_info, 0, sizeof(a_info));
589 memset(&b_info, 0, sizeof(b_info));
590
591 /* set up similarity data (will try to update missing file sizes) */
592 if (!cache[a_idx] && (error = similarity_init(&a_info, diff, a_idx)) < 0)
593 return error;
594 if (!cache[b_idx] && (error = similarity_init(&b_info, diff, b_idx)) < 0)
595 goto cleanup;
596
597 /* check if file sizes are nowhere near each other */
598 if (a_file->size > 127 &&
599 b_file->size > 127 &&
600 (a_file->size > (b_file->size << 3) ||
601 b_file->size > (a_file->size << 3)))
602 goto cleanup;
603
604 /* update signature cache if needed */
605 if (!cache[a_idx]) {
606 if ((error = similarity_sig(&a_info, opts, cache)) < 0)
607 goto cleanup;
608 }
609 if (!cache[b_idx]) {
610 if ((error = similarity_sig(&b_info, opts, cache)) < 0)
611 goto cleanup;
612 }
613
614 /* calculate similarity provided that the metric choose to process
615 * both the a and b files (some may not if file is too big, etc).
616 */
617 if (cache[a_idx] && cache[b_idx])
618 error = opts->metric->similarity(
619 score, cache[a_idx], cache[b_idx], opts->metric->payload);
620
621 cleanup:
622 similarity_unload(&a_info);
623 similarity_unload(&b_info);
624
625 return error;
626 }
627
628 static int calc_self_similarity(
629 git_diff *diff,
630 const git_diff_find_options *opts,
631 size_t delta_idx,
632 void **cache)
633 {
634 int error, similarity = -1;
635 git_diff_delta *delta = GIT_VECTOR_GET(&diff->deltas, delta_idx);
636
637 if ((delta->flags & GIT_DIFF_FLAG__HAS_SELF_SIMILARITY) != 0)
638 return 0;
639
640 error = similarity_measure(
641 &similarity, diff, opts, cache, 2 * delta_idx, 2 * delta_idx + 1);
642 if (error < 0)
643 return error;
644
645 if (similarity >= 0) {
646 delta->similarity = (uint16_t)similarity;
647 delta->flags |= GIT_DIFF_FLAG__HAS_SELF_SIMILARITY;
648 }
649
650 return 0;
651 }
652
653 static bool is_rename_target(
654 git_diff *diff,
655 const git_diff_find_options *opts,
656 size_t delta_idx,
657 void **cache)
658 {
659 git_diff_delta *delta = GIT_VECTOR_GET(&diff->deltas, delta_idx);
660
661 /* skip things that aren't plain blobs */
662 if (!GIT_MODE_ISBLOB(delta->new_file.mode))
663 return false;
664
665 /* only consider ADDED, RENAMED, COPIED, and split MODIFIED as
666 * targets; maybe include UNTRACKED if requested.
667 */
668 switch (delta->status) {
669 case GIT_DELTA_UNMODIFIED:
670 case GIT_DELTA_DELETED:
671 case GIT_DELTA_IGNORED:
672 case GIT_DELTA_CONFLICTED:
673 return false;
674
675 case GIT_DELTA_MODIFIED:
676 if (!FLAG_SET(opts, GIT_DIFF_FIND_REWRITES) &&
677 !FLAG_SET(opts, GIT_DIFF_FIND_RENAMES_FROM_REWRITES))
678 return false;
679
680 if (calc_self_similarity(diff, opts, delta_idx, cache) < 0)
681 return false;
682
683 if (FLAG_SET(opts, GIT_DIFF_BREAK_REWRITES) &&
684 delta->similarity < opts->break_rewrite_threshold) {
685 delta->flags |= GIT_DIFF_FLAG__TO_SPLIT;
686 break;
687 }
688 if (FLAG_SET(opts, GIT_DIFF_FIND_RENAMES_FROM_REWRITES) &&
689 delta->similarity < opts->rename_from_rewrite_threshold) {
690 delta->flags |= GIT_DIFF_FLAG__TO_SPLIT;
691 break;
692 }
693
694 return false;
695
696 case GIT_DELTA_UNTRACKED:
697 if (!FLAG_SET(opts, GIT_DIFF_FIND_FOR_UNTRACKED))
698 return false;
699 break;
700
701 default: /* all other status values should be checked */
702 break;
703 }
704
705 delta->flags |= GIT_DIFF_FLAG__IS_RENAME_TARGET;
706 return true;
707 }
708
709 static bool is_rename_source(
710 git_diff *diff,
711 const git_diff_find_options *opts,
712 size_t delta_idx,
713 void **cache)
714 {
715 git_diff_delta *delta = GIT_VECTOR_GET(&diff->deltas, delta_idx);
716
717 /* skip things that aren't blobs */
718 if (!GIT_MODE_ISBLOB(delta->old_file.mode))
719 return false;
720
721 switch (delta->status) {
722 case GIT_DELTA_ADDED:
723 case GIT_DELTA_UNTRACKED:
724 case GIT_DELTA_UNREADABLE:
725 case GIT_DELTA_IGNORED:
726 case GIT_DELTA_CONFLICTED:
727 return false;
728
729 case GIT_DELTA_DELETED:
730 case GIT_DELTA_TYPECHANGE:
731 break;
732
733 case GIT_DELTA_UNMODIFIED:
734 if (!FLAG_SET(opts, GIT_DIFF_FIND_COPIES_FROM_UNMODIFIED))
735 return false;
736 if (FLAG_SET(opts, GIT_DIFF_FIND_REMOVE_UNMODIFIED))
737 delta->flags |= GIT_DIFF_FLAG__TO_DELETE;
738 break;
739
740 default: /* MODIFIED, RENAMED, COPIED */
741 /* if we're finding copies, this could be a source */
742 if (FLAG_SET(opts, GIT_DIFF_FIND_COPIES))
743 break;
744
745 /* otherwise, this is only a source if we can split it */
746 if (!FLAG_SET(opts, GIT_DIFF_FIND_REWRITES) &&
747 !FLAG_SET(opts, GIT_DIFF_FIND_RENAMES_FROM_REWRITES))
748 return false;
749
750 if (calc_self_similarity(diff, opts, delta_idx, cache) < 0)
751 return false;
752
753 if (FLAG_SET(opts, GIT_DIFF_BREAK_REWRITES) &&
754 delta->similarity < opts->break_rewrite_threshold) {
755 delta->flags |= GIT_DIFF_FLAG__TO_SPLIT;
756 break;
757 }
758
759 if (FLAG_SET(opts, GIT_DIFF_FIND_RENAMES_FROM_REWRITES) &&
760 delta->similarity < opts->rename_from_rewrite_threshold)
761 break;
762
763 return false;
764 }
765
766 delta->flags |= GIT_DIFF_FLAG__IS_RENAME_SOURCE;
767 return true;
768 }
769
770 GIT_INLINE(bool) delta_is_split(git_diff_delta *delta)
771 {
772 return (delta->status == GIT_DELTA_TYPECHANGE ||
773 (delta->flags & GIT_DIFF_FLAG__TO_SPLIT) != 0);
774 }
775
776 GIT_INLINE(bool) delta_is_new_only(git_diff_delta *delta)
777 {
778 return (delta->status == GIT_DELTA_ADDED ||
779 delta->status == GIT_DELTA_UNTRACKED ||
780 delta->status == GIT_DELTA_UNREADABLE ||
781 delta->status == GIT_DELTA_IGNORED);
782 }
783
784 GIT_INLINE(void) delta_make_rename(
785 git_diff_delta *to, const git_diff_delta *from, uint16_t similarity)
786 {
787 to->status = GIT_DELTA_RENAMED;
788 to->similarity = similarity;
789 to->nfiles = 2;
790 memcpy(&to->old_file, &from->old_file, sizeof(to->old_file));
791 to->flags &= ~GIT_DIFF_FLAG__TO_SPLIT;
792 }
793
794 typedef struct {
795 size_t idx;
796 uint16_t similarity;
797 } diff_find_match;
798
799 int git_diff_find_similar(
800 git_diff *diff,
801 const git_diff_find_options *given_opts)
802 {
803 size_t s, t;
804 int error = 0, result;
805 uint16_t similarity;
806 git_diff_delta *src, *tgt;
807 git_diff_find_options opts = GIT_DIFF_FIND_OPTIONS_INIT;
808 size_t num_deltas, num_srcs = 0, num_tgts = 0;
809 size_t tried_srcs = 0, tried_tgts = 0;
810 size_t num_rewrites = 0, num_updates = 0, num_bumped = 0;
811 size_t sigcache_size;
812 void **sigcache = NULL; /* cache of similarity metric file signatures */
813 diff_find_match *tgt2src = NULL;
814 diff_find_match *src2tgt = NULL;
815 diff_find_match *tgt2src_copy = NULL;
816 diff_find_match *best_match;
817 git_diff_file swap;
818
819 assert(diff);
820
821 if ((error = normalize_find_opts(diff, &opts, given_opts)) < 0)
822 return error;
823
824 num_deltas = diff->deltas.length;
825
826 /* TODO: maybe abort if deltas.length > rename_limit ??? */
827 if (!num_deltas || !git__is_uint32(num_deltas))
828 goto cleanup;
829
830 /* No flags set; nothing to do */
831 if ((opts.flags & GIT_DIFF_FIND_ALL) == 0)
832 goto cleanup;
833
834 GIT_ERROR_CHECK_ALLOC_MULTIPLY(&sigcache_size, num_deltas, 2);
835 sigcache = git__calloc(sigcache_size, sizeof(void *));
836 GIT_ERROR_CHECK_ALLOC(sigcache);
837
838 /* Label rename sources and targets
839 *
840 * This will also set self-similarity scores for MODIFIED files and
841 * mark them for splitting if break-rewrites is enabled
842 */
843 git_vector_foreach(&diff->deltas, t, tgt) {
844 if (is_rename_source(diff, &opts, t, sigcache))
845 ++num_srcs;
846
847 if (is_rename_target(diff, &opts, t, sigcache))
848 ++num_tgts;
849
850 if ((tgt->flags & GIT_DIFF_FLAG__TO_SPLIT) != 0)
851 num_rewrites++;
852 }
853
854 /* if there are no candidate srcs or tgts, we're done */
855 if (!num_srcs || !num_tgts)
856 goto cleanup;
857
858 src2tgt = git__calloc(num_deltas, sizeof(diff_find_match));
859 GIT_ERROR_CHECK_ALLOC(src2tgt);
860 tgt2src = git__calloc(num_deltas, sizeof(diff_find_match));
861 GIT_ERROR_CHECK_ALLOC(tgt2src);
862
863 if (FLAG_SET(&opts, GIT_DIFF_FIND_COPIES)) {
864 tgt2src_copy = git__calloc(num_deltas, sizeof(diff_find_match));
865 GIT_ERROR_CHECK_ALLOC(tgt2src_copy);
866 }
867
868 /*
869 * Find best-fit matches for rename / copy candidates
870 */
871
872 find_best_matches:
873 tried_tgts = num_bumped = 0;
874
875 git_vector_foreach(&diff->deltas, t, tgt) {
876 /* skip things that are not rename targets */
877 if ((tgt->flags & GIT_DIFF_FLAG__IS_RENAME_TARGET) == 0)
878 continue;
879
880 tried_srcs = 0;
881
882 git_vector_foreach(&diff->deltas, s, src) {
883 /* skip things that are not rename sources */
884 if ((src->flags & GIT_DIFF_FLAG__IS_RENAME_SOURCE) == 0)
885 continue;
886
887 /* calculate similarity for this pair and find best match */
888 if (s == t)
889 result = -1; /* don't measure self-similarity here */
890 else if ((error = similarity_measure(
891 &result, diff, &opts, sigcache, 2 * s, 2 * t + 1)) < 0)
892 goto cleanup;
893
894 if (result < 0)
895 continue;
896 similarity = (uint16_t)result;
897
898 /* is this a better rename? */
899 if (tgt2src[t].similarity < similarity &&
900 src2tgt[s].similarity < similarity)
901 {
902 /* eject old mapping */
903 if (src2tgt[s].similarity > 0) {
904 tgt2src[src2tgt[s].idx].similarity = 0;
905 num_bumped++;
906 }
907 if (tgt2src[t].similarity > 0) {
908 src2tgt[tgt2src[t].idx].similarity = 0;
909 num_bumped++;
910 }
911
912 /* write new mapping */
913 tgt2src[t].idx = s;
914 tgt2src[t].similarity = similarity;
915 src2tgt[s].idx = t;
916 src2tgt[s].similarity = similarity;
917 }
918
919 /* keep best absolute match for copies */
920 if (tgt2src_copy != NULL &&
921 tgt2src_copy[t].similarity < similarity)
922 {
923 tgt2src_copy[t].idx = s;
924 tgt2src_copy[t].similarity = similarity;
925 }
926
927 if (++tried_srcs >= num_srcs)
928 break;
929
930 /* cap on maximum targets we'll examine (per "tgt" file) */
931 if (tried_srcs > opts.rename_limit)
932 break;
933 }
934
935 if (++tried_tgts >= num_tgts)
936 break;
937 }
938
939 if (num_bumped > 0) /* try again if we bumped some items */
940 goto find_best_matches;
941
942 /*
943 * Rewrite the diffs with renames / copies
944 */
945
946 git_vector_foreach(&diff->deltas, t, tgt) {
947 /* skip things that are not rename targets */
948 if ((tgt->flags & GIT_DIFF_FLAG__IS_RENAME_TARGET) == 0)
949 continue;
950
951 /* check if this delta was the target of a similarity */
952 if (tgt2src[t].similarity)
953 best_match = &tgt2src[t];
954 else if (tgt2src_copy && tgt2src_copy[t].similarity)
955 best_match = &tgt2src_copy[t];
956 else
957 continue;
958
959 s = best_match->idx;
960 src = GIT_VECTOR_GET(&diff->deltas, s);
961
962 /* possible scenarios:
963 * 1. from DELETE to ADD/UNTRACK/IGNORE = RENAME
964 * 2. from DELETE to SPLIT/TYPECHANGE = RENAME + DELETE
965 * 3. from SPLIT/TYPECHANGE to ADD/UNTRACK/IGNORE = ADD + RENAME
966 * 4. from SPLIT/TYPECHANGE to SPLIT/TYPECHANGE = RENAME + SPLIT
967 * 5. from OTHER to ADD/UNTRACK/IGNORE = OTHER + COPY
968 */
969
970 if (src->status == GIT_DELTA_DELETED) {
971
972 if (delta_is_new_only(tgt)) {
973
974 if (best_match->similarity < opts.rename_threshold)
975 continue;
976
977 delta_make_rename(tgt, src, best_match->similarity);
978
979 src->flags |= GIT_DIFF_FLAG__TO_DELETE;
980 num_rewrites++;
981 } else {
982 assert(delta_is_split(tgt));
983
984 if (best_match->similarity < opts.rename_from_rewrite_threshold)
985 continue;
986
987 memcpy(&swap, &tgt->old_file, sizeof(swap));
988
989 delta_make_rename(tgt, src, best_match->similarity);
990 num_rewrites--;
991
992 assert(src->status == GIT_DELTA_DELETED);
993 memcpy(&src->old_file, &swap, sizeof(src->old_file));
994 memset(&src->new_file, 0, sizeof(src->new_file));
995 src->new_file.path = src->old_file.path;
996 src->new_file.flags |= GIT_DIFF_FLAG_VALID_ID;
997
998 num_updates++;
999
1000 if (src2tgt[t].similarity > 0 && src2tgt[t].idx > t) {
1001 /* what used to be at src t is now at src s */
1002 tgt2src[src2tgt[t].idx].idx = s;
1003 }
1004 }
1005 }
1006
1007 else if (delta_is_split(src)) {
1008
1009 if (delta_is_new_only(tgt)) {
1010
1011 if (best_match->similarity < opts.rename_threshold)
1012 continue;
1013
1014 delta_make_rename(tgt, src, best_match->similarity);
1015
1016 src->status = (diff->new_src == GIT_ITERATOR_TYPE_WORKDIR) ?
1017 GIT_DELTA_UNTRACKED : GIT_DELTA_ADDED;
1018 src->nfiles = 1;
1019 memset(&src->old_file, 0, sizeof(src->old_file));
1020 src->old_file.path = src->new_file.path;
1021 src->old_file.flags |= GIT_DIFF_FLAG_VALID_ID;
1022
1023 src->flags &= ~GIT_DIFF_FLAG__TO_SPLIT;
1024 num_rewrites--;
1025
1026 num_updates++;
1027 } else {
1028 assert(delta_is_split(src));
1029
1030 if (best_match->similarity < opts.rename_from_rewrite_threshold)
1031 continue;
1032
1033 memcpy(&swap, &tgt->old_file, sizeof(swap));
1034
1035 delta_make_rename(tgt, src, best_match->similarity);
1036 num_rewrites--;
1037 num_updates++;
1038
1039 memcpy(&src->old_file, &swap, sizeof(src->old_file));
1040
1041 /* if we've just swapped the new element into the correct
1042 * place, clear the SPLIT flag
1043 */
1044 if (tgt2src[s].idx == t &&
1045 tgt2src[s].similarity >
1046 opts.rename_from_rewrite_threshold) {
1047 src->status = GIT_DELTA_RENAMED;
1048 src->similarity = tgt2src[s].similarity;
1049 tgt2src[s].similarity = 0;
1050 src->flags &= ~GIT_DIFF_FLAG__TO_SPLIT;
1051 num_rewrites--;
1052 }
1053 /* otherwise, if we just overwrote a source, update mapping */
1054 else if (src2tgt[t].similarity > 0 && src2tgt[t].idx > t) {
1055 /* what used to be at src t is now at src s */
1056 tgt2src[src2tgt[t].idx].idx = s;
1057 }
1058
1059 num_updates++;
1060 }
1061 }
1062
1063 else if (FLAG_SET(&opts, GIT_DIFF_FIND_COPIES)) {
1064 if (tgt2src_copy[t].similarity < opts.copy_threshold)
1065 continue;
1066
1067 /* always use best possible source for copy */
1068 best_match = &tgt2src_copy[t];
1069 src = GIT_VECTOR_GET(&diff->deltas, best_match->idx);
1070
1071 if (delta_is_split(tgt)) {
1072 error = insert_delete_side_of_split(diff, &diff->deltas, tgt);
1073 if (error < 0)
1074 goto cleanup;
1075 num_rewrites--;
1076 }
1077
1078 if (!delta_is_split(tgt) && !delta_is_new_only(tgt))
1079 continue;
1080
1081 tgt->status = GIT_DELTA_COPIED;
1082 tgt->similarity = best_match->similarity;
1083 tgt->nfiles = 2;
1084 memcpy(&tgt->old_file, &src->old_file, sizeof(tgt->old_file));
1085 tgt->flags &= ~GIT_DIFF_FLAG__TO_SPLIT;
1086
1087 num_updates++;
1088 }
1089 }
1090
1091 /*
1092 * Actually split and delete entries as needed
1093 */
1094
1095 if (num_rewrites > 0 || num_updates > 0)
1096 error = apply_splits_and_deletes(
1097 diff, diff->deltas.length - num_rewrites,
1098 FLAG_SET(&opts, GIT_DIFF_BREAK_REWRITES) &&
1099 !FLAG_SET(&opts, GIT_DIFF_BREAK_REWRITES_FOR_RENAMES_ONLY));
1100
1101 cleanup:
1102 git__free(tgt2src);
1103 git__free(src2tgt);
1104 git__free(tgt2src_copy);
1105
1106 if (sigcache) {
1107 for (t = 0; t < num_deltas * 2; ++t) {
1108 if (sigcache[t] != NULL)
1109 opts.metric->free_signature(sigcache[t], opts.metric->payload);
1110 }
1111 git__free(sigcache);
1112 }
1113
1114 if (!given_opts || !given_opts->metric)
1115 git__free(opts.metric);
1116
1117 return error;
1118 }
1119
1120 #undef FLAG_SET