]> git.proxmox.com Git - libgit2.git/blame - src/diff_tform.c
Fix some warnings
[libgit2.git] / src / diff_tform.c
CommitLineData
db106d01 1/*
359fc2d2 2 * Copyright (C) the libgit2 contributors. All rights reserved.
db106d01
RB
3 *
4 * This file is part of libgit2, distributed under the GNU GPL v2 with
5 * a Linking Exception. For full terms see the included COPYING file.
6 */
7#include "common.h"
114f5a6c 8
db106d01 9#include "git2/config.h"
960a04dd 10#include "git2/blob.h"
114f5a6c
RB
11
12#include "diff.h"
5e5848eb 13#include "hashsig.h"
114f5a6c
RB
14#include "path.h"
15#include "fileops.h"
db106d01
RB
16
17static git_diff_delta *diff_delta__dup(
18 const git_diff_delta *d, git_pool *pool)
19{
20 git_diff_delta *delta = git__malloc(sizeof(git_diff_delta));
21 if (!delta)
22 return NULL;
23
24 memcpy(delta, d, sizeof(git_diff_delta));
c68b09dc 25 GIT_DIFF_FLAG__CLEAR_INTERNAL(delta->flags);
db106d01 26
d958e37a
RB
27 if (d->old_file.path != NULL) {
28 delta->old_file.path = git_pool_strdup(pool, d->old_file.path);
29 if (delta->old_file.path == NULL)
30 goto fail;
31 }
db106d01 32
d958e37a 33 if (d->new_file.path != d->old_file.path && d->new_file.path != NULL) {
db106d01
RB
34 delta->new_file.path = git_pool_strdup(pool, d->new_file.path);
35 if (delta->new_file.path == NULL)
36 goto fail;
37 } else {
38 delta->new_file.path = delta->old_file.path;
39 }
40
41 return delta;
42
43fail:
44 git__free(delta);
45 return NULL;
46}
47
48static git_diff_delta *diff_delta__merge_like_cgit(
49 const git_diff_delta *a, const git_diff_delta *b, git_pool *pool)
50{
51 git_diff_delta *dup;
52
53 /* Emulate C git for merging two diffs (a la 'git diff <sha>').
54 *
55 * When C git does a diff between the work dir and a tree, it actually
56 * diffs with the index but uses the workdir contents. This emulates
57 * those choices so we can emulate the type of diff.
58 *
59 * We have three file descriptions here, let's call them:
60 * f1 = a->old_file
61 * f2 = a->new_file AND b->old_file
62 * f3 = b->new_file
63 */
64
65 /* if f2 == f3 or f2 is deleted, then just dup the 'a' diff */
66 if (b->status == GIT_DELTA_UNMODIFIED || a->status == GIT_DELTA_DELETED)
67 return diff_delta__dup(a, pool);
68
69 /* otherwise, base this diff on the 'b' diff */
70 if ((dup = diff_delta__dup(b, pool)) == NULL)
71 return NULL;
72
73 /* If 'a' status is uninteresting, then we're done */
74 if (a->status == GIT_DELTA_UNMODIFIED)
75 return dup;
76
77 assert(a->status != GIT_DELTA_UNMODIFIED);
78 assert(b->status != GIT_DELTA_UNMODIFIED);
79
80 /* A cgit exception is that the diff of a file that is only in the
81 * index (i.e. not in HEAD nor workdir) is given as empty.
82 */
83 if (dup->status == GIT_DELTA_DELETED) {
84 if (a->status == GIT_DELTA_ADDED)
85 dup->status = GIT_DELTA_UNMODIFIED;
86 /* else don't overwrite DELETE status */
87 } else {
88 dup->status = a->status;
89 }
90
91 git_oid_cpy(&dup->old_file.oid, &a->old_file.oid);
92 dup->old_file.mode = a->old_file.mode;
93 dup->old_file.size = a->old_file.size;
94 dup->old_file.flags = a->old_file.flags;
95
96 return dup;
97}
98
99int git_diff_merge(
100 git_diff_list *onto,
101 const git_diff_list *from)
102{
103 int error = 0;
104 git_pool onto_pool;
105 git_vector onto_new;
106 git_diff_delta *delta;
107 bool ignore_case = false;
108 unsigned int i, j;
109
110 assert(onto && from);
111
112 if (!from->deltas.length)
113 return 0;
114
115 if (git_vector_init(
116 &onto_new, onto->deltas.length, git_diff_delta__cmp) < 0 ||
117 git_pool_init(&onto_pool, 1, 0) < 0)
118 return -1;
119
120 if ((onto->opts.flags & GIT_DIFF_DELTAS_ARE_ICASE) != 0 ||
121 (from->opts.flags & GIT_DIFF_DELTAS_ARE_ICASE) != 0)
122 {
123 ignore_case = true;
124
125 /* This function currently only supports merging diff lists that
126 * are sorted identically. */
127 assert((onto->opts.flags & GIT_DIFF_DELTAS_ARE_ICASE) != 0 &&
128 (from->opts.flags & GIT_DIFF_DELTAS_ARE_ICASE) != 0);
129 }
130
131 for (i = 0, j = 0; i < onto->deltas.length || j < from->deltas.length; ) {
132 git_diff_delta *o = GIT_VECTOR_GET(&onto->deltas, i);
133 const git_diff_delta *f = GIT_VECTOR_GET(&from->deltas, j);
134 int cmp = !f ? -1 : !o ? 1 : STRCMP_CASESELECT(ignore_case, o->old_file.path, f->old_file.path);
135
136 if (cmp < 0) {
137 delta = diff_delta__dup(o, &onto_pool);
138 i++;
139 } else if (cmp > 0) {
140 delta = diff_delta__dup(f, &onto_pool);
141 j++;
142 } else {
143 delta = diff_delta__merge_like_cgit(o, f, &onto_pool);
144 i++;
145 j++;
146 }
147
148 /* the ignore rules for the target may not match the source
149 * or the result of a merged delta could be skippable...
150 */
151 if (git_diff_delta__should_skip(&onto->opts, delta)) {
152 git__free(delta);
153 continue;
154 }
155
156 if ((error = !delta ? -1 : git_vector_insert(&onto_new, delta)) < 0)
157 break;
158 }
159
160 if (!error) {
161 git_vector_swap(&onto->deltas, &onto_new);
162 git_pool_swap(&onto->pool, &onto_pool);
163 onto->new_src = from->new_src;
164
165 /* prefix strings also come from old pool, so recreate those.*/
166 onto->opts.old_prefix =
167 git_pool_strdup_safe(&onto->pool, onto->opts.old_prefix);
168 onto->opts.new_prefix =
169 git_pool_strdup_safe(&onto->pool, onto->opts.new_prefix);
170 }
171
172 git_vector_foreach(&onto_new, i, delta)
173 git__free(delta);
174 git_vector_free(&onto_new);
175 git_pool_clear(&onto_pool);
176
177 return error;
178}
179
0462fba5 180int git_diff_find_similar__hashsig_for_file(
f8275890
RB
181 void **out, const git_diff_file *f, const char *path, void *p)
182{
183 git_hashsig_option_t opt = (git_hashsig_option_t)p;
aa408cbf
ET
184 int error = 0;
185
f8275890 186 GIT_UNUSED(f);
aa408cbf 187 error = git_hashsig_create_fromfile((git_hashsig **)out, path, opt);
1fed6b07 188
aa408cbf
ET
189 if (error == GIT_EBUFS) {
190 error = 0;
191 giterr_clear();
192 }
193
194 return error;
f8275890 195}
9bc8be3d 196
0462fba5 197int git_diff_find_similar__hashsig_for_buf(
f8275890
RB
198 void **out, const git_diff_file *f, const char *buf, size_t len, void *p)
199{
200 git_hashsig_option_t opt = (git_hashsig_option_t)p;
aa408cbf 201 int error = 0;
0462fba5 202
f8275890 203 GIT_UNUSED(f);
aa408cbf 204 error = git_hashsig_create((git_hashsig **)out, buf, len, opt);
1fed6b07 205
aa408cbf
ET
206 if (error == GIT_EBUFS) {
207 error = 0;
208 giterr_clear();
209 }
210
211 return error;
f8275890 212}
9bc8be3d 213
0462fba5 214void git_diff_find_similar__hashsig_free(void *sig, void *payload)
9bc8be3d
RB
215{
216 GIT_UNUSED(payload);
217 git_hashsig_free(sig);
218}
219
/*
 * Default `similarity` callback of the similarity metric: store the
 * result of git_hashsig_compare on the two signatures in `*score`.
 * `payload` is unused.  Always returns 0.
 */
int git_diff_find_similar__calc_similarity(
	int *score, void *siga, void *sigb, void *payload)
{
	int result = git_hashsig_compare(siga, sigb);

	GIT_UNUSED(payload);

	*score = result;
	return 0;
}
227
db106d01
RB
228#define DEFAULT_THRESHOLD 50
229#define DEFAULT_BREAK_REWRITE_THRESHOLD 60
a21cbb12 230#define DEFAULT_RENAME_LIMIT 200
db106d01
RB
231
232static int normalize_find_opts(
233 git_diff_list *diff,
234 git_diff_find_options *opts,
235 git_diff_find_options *given)
236{
237 git_config *cfg = NULL;
db106d01
RB
238
239 if (diff->repo != NULL &&
240 git_repository_config__weakptr(&cfg, diff->repo) < 0)
241 return -1;
242
243 if (given != NULL)
244 memcpy(opts, given, sizeof(*opts));
245 else {
0a008913
RB
246 const char *val = NULL;
247
248 GIT_INIT_STRUCTURE(opts, GIT_DIFF_FIND_OPTIONS_VERSION);
db106d01
RB
249
250 opts->flags = GIT_DIFF_FIND_RENAMES;
251
252 if (git_config_get_string(&val, cfg, "diff.renames") < 0)
253 giterr_clear();
254 else if (val &&
255 (!strcasecmp(val, "copies") || !strcasecmp(val, "copy")))
256 opts->flags = GIT_DIFF_FIND_RENAMES | GIT_DIFF_FIND_COPIES;
257 }
258
c7231c45 259 GITERR_CHECK_VERSION(opts, GIT_DIFF_FIND_OPTIONS_VERSION, "git_diff_find_options");
ca901e7b 260
db106d01
RB
261 /* some flags imply others */
262
9be5be47
RB
263 if (opts->flags & GIT_DIFF_FIND_EXACT_MATCH_ONLY) {
264 /* if we are only looking for exact matches, then don't turn
265 * MODIFIED items into ADD/DELETE pairs because it's too picky
266 */
267 opts->flags &= ~(GIT_DIFF_FIND_REWRITES | GIT_DIFF_BREAK_REWRITES);
268
269 /* similarly, don't look for self-rewrites to split */
270 opts->flags &= ~GIT_DIFF_FIND_RENAMES_FROM_REWRITES;
271 }
272
db106d01
RB
273 if (opts->flags & GIT_DIFF_FIND_RENAMES_FROM_REWRITES)
274 opts->flags |= GIT_DIFF_FIND_RENAMES;
275
276 if (opts->flags & GIT_DIFF_FIND_COPIES_FROM_UNMODIFIED)
277 opts->flags |= GIT_DIFF_FIND_COPIES;
278
d958e37a
RB
279 if (opts->flags & GIT_DIFF_BREAK_REWRITES)
280 opts->flags |= GIT_DIFF_FIND_REWRITES;
281
db106d01
RB
282#define USE_DEFAULT(X) ((X) == 0 || (X) > 100)
283
284 if (USE_DEFAULT(opts->rename_threshold))
285 opts->rename_threshold = DEFAULT_THRESHOLD;
286
287 if (USE_DEFAULT(opts->rename_from_rewrite_threshold))
288 opts->rename_from_rewrite_threshold = DEFAULT_THRESHOLD;
289
290 if (USE_DEFAULT(opts->copy_threshold))
291 opts->copy_threshold = DEFAULT_THRESHOLD;
292
293 if (USE_DEFAULT(opts->break_rewrite_threshold))
294 opts->break_rewrite_threshold = DEFAULT_BREAK_REWRITE_THRESHOLD;
295
296#undef USE_DEFAULT
297
a21cbb12 298 if (!opts->rename_limit) {
db106d01
RB
299 int32_t limit = 0;
300
a21cbb12 301 opts->rename_limit = DEFAULT_RENAME_LIMIT;
db106d01
RB
302
303 if (git_config_get_int32(&limit, cfg, "diff.renameLimit") < 0)
304 giterr_clear();
305 else if (limit > 0)
a21cbb12 306 opts->rename_limit = limit;
db106d01
RB
307 }
308
f8275890 309 /* assign the internal metric with whitespace flag as payload */
9bc8be3d 310 if (!opts->metric) {
f8275890
RB
311 opts->metric = git__malloc(sizeof(git_diff_similarity_metric));
312 GITERR_CHECK_ALLOC(opts->metric);
313
0462fba5
ET
314 opts->metric->file_signature = git_diff_find_similar__hashsig_for_file;
315 opts->metric->buffer_signature = git_diff_find_similar__hashsig_for_buf;
316 opts->metric->free_signature = git_diff_find_similar__hashsig_free;
317 opts->metric->similarity = git_diff_find_similar__calc_similarity;
f8275890 318
9bc8be3d 319 if (opts->flags & GIT_DIFF_FIND_IGNORE_WHITESPACE)
f8275890 320 opts->metric->payload = (void *)GIT_HASHSIG_IGNORE_WHITESPACE;
9bc8be3d 321 else if (opts->flags & GIT_DIFF_FIND_DONT_IGNORE_WHITESPACE)
f8275890 322 opts->metric->payload = (void *)GIT_HASHSIG_NORMAL;
9bc8be3d 323 else
f8275890 324 opts->metric->payload = (void *)GIT_HASHSIG_SMART_WHITESPACE;
9bc8be3d
RB
325 }
326
db106d01
RB
327 return 0;
328}
329
d958e37a
RB
330static int apply_splits_and_deletes(
331 git_diff_list *diff, size_t expected_size, bool actually_split)
db106d01
RB
332{
333 git_vector onto = GIT_VECTOR_INIT;
334 size_t i;
d958e37a 335 git_diff_delta *delta, *deleted;
db106d01
RB
336
337 if (git_vector_init(&onto, expected_size, git_diff_delta__cmp) < 0)
338 return -1;
339
340 /* build new delta list without TO_DELETE and splitting TO_SPLIT */
341 git_vector_foreach(&diff->deltas, i, delta) {
71a3d27e 342 if ((delta->flags & GIT_DIFF_FLAG__TO_DELETE) != 0)
db106d01 343 continue;
db106d01 344
a21cbb12 345 if ((delta->flags & GIT_DIFF_FLAG__TO_SPLIT) != 0 && actually_split) {
d958e37a
RB
346 delta->similarity = 0;
347
348 /* make new record for DELETED side of split */
349 if (!(deleted = diff_delta__dup(delta, &diff->pool)))
11d9f6b3 350 goto on_error;
db106d01
RB
351
352 deleted->status = GIT_DELTA_DELETED;
353 memset(&deleted->new_file, 0, sizeof(deleted->new_file));
354 deleted->new_file.path = deleted->old_file.path;
71a3d27e 355 deleted->new_file.flags |= GIT_DIFF_FLAG_VALID_OID;
db106d01 356
11d9f6b3
PK
357 if (git_vector_insert(&onto, deleted) < 0)
358 goto on_error;
db106d01 359
9be5be47
RB
360 if (diff->new_src == GIT_ITERATOR_TYPE_WORKDIR)
361 delta->status = GIT_DELTA_UNTRACKED;
362 else
363 delta->status = GIT_DELTA_ADDED;
db106d01
RB
364 memset(&delta->old_file, 0, sizeof(delta->old_file));
365 delta->old_file.path = delta->new_file.path;
71a3d27e 366 delta->old_file.flags |= GIT_DIFF_FLAG_VALID_OID;
db106d01
RB
367 }
368
c68b09dc
RB
369 /* clean up delta before inserting into new list */
370 GIT_DIFF_FLAG__CLEAR_INTERNAL(delta->flags);
371
372 if (delta->status != GIT_DELTA_COPIED &&
373 delta->status != GIT_DELTA_RENAMED &&
374 (delta->status != GIT_DELTA_MODIFIED || actually_split))
375 delta->similarity = 0;
376
377 /* insert into new list */
11d9f6b3
PK
378 if (git_vector_insert(&onto, delta) < 0)
379 goto on_error;
db106d01
RB
380 }
381
11d9f6b3 382 /* cannot return an error past this point */
c68b09dc
RB
383
384 /* free deltas from old list that didn't make it to the new one */
a21cbb12 385 git_vector_foreach(&diff->deltas, i, delta) {
71a3d27e 386 if ((delta->flags & GIT_DIFF_FLAG__TO_DELETE) != 0)
11d9f6b3 387 git__free(delta);
a21cbb12
RB
388 }
389
db106d01 390 /* swap new delta list into place */
db106d01
RB
391 git_vector_swap(&diff->deltas, &onto);
392 git_vector_free(&onto);
a21cbb12 393 git_vector_sort(&diff->deltas);
db106d01
RB
394
395 return 0;
11d9f6b3
PK
396
397on_error:
398 git_vector_foreach(&onto, i, delta)
399 git__free(delta);
11d9f6b3
PK
400 git_vector_free(&onto);
401
402 return -1;
db106d01
RB
403}
404
960a04dd
RB
405GIT_INLINE(git_diff_file *) similarity_get_file(git_diff_list *diff, size_t idx)
406{
407 git_diff_delta *delta = git_vector_get(&diff->deltas, idx / 2);
408 return (idx & 1) ? &delta->new_file : &delta->old_file;
409}
99ba8f23 410
a5140f4d
RB
411typedef struct {
412 size_t idx;
413 git_iterator_type_t src;
414 git_repository *repo;
415 git_diff_file *file;
416 git_buf data;
effdbeb3 417 git_odb_object *odb_obj;
a5140f4d 418 git_blob *blob;
a5140f4d
RB
419} similarity_info;
420
effdbeb3 421static int similarity_init(
a5140f4d
RB
422 similarity_info *info, git_diff_list *diff, size_t file_idx)
423{
424 info->idx = file_idx;
425 info->src = (file_idx & 1) ? diff->new_src : diff->old_src;
426 info->repo = diff->repo;
427 info->file = similarity_get_file(diff, file_idx);
effdbeb3 428 info->odb_obj = NULL;
a5140f4d 429 info->blob = NULL;
a5140f4d 430 git_buf_init(&info->data, 0);
09fae31d 431
effdbeb3
RB
432 if (info->file->size > 0)
433 return 0;
5e5848eb 434
effdbeb3
RB
435 return git_diff_file__resolve_zero_size(
436 info->file, &info->odb_obj, info->repo);
a5140f4d 437}
960a04dd 438
a5140f4d
RB
439static int similarity_calc(
440 similarity_info *info,
441 const git_diff_find_options *opts,
442 void **cache)
443{
444 int error = 0;
effdbeb3 445 git_diff_file *file = info->file;
8cfd54f0 446
effdbeb3
RB
447 if (info->src == GIT_ITERATOR_TYPE_WORKDIR) {
448 if ((error = git_buf_joinpath(
449 &info->data, git_repository_workdir(info->repo), file->path)) < 0)
450 return error;
960a04dd 451
effdbeb3
RB
452 /* if path is not a regular file, just skip this item */
453 if (!git_path_isfile(info->data.ptr))
454 return 0;
a5140f4d 455
a5140f4d
RB
456 /* TODO: apply wd-to-odb filters to file data if necessary */
457
458 error = opts->metric->file_signature(
459 &cache[info->idx], info->file,
460 info->data.ptr, opts->metric->payload);
461 } else {
effdbeb3
RB
462 /* if we didn't initially know the size, we might have an odb_obj
463 * around from earlier, so convert that, otherwise load the blob now
464 */
465 if (info->odb_obj != NULL)
466 error = git_object__from_odb_object(
467 (git_object **)&info->blob, info->repo,
468 info->odb_obj, GIT_OBJ_BLOB);
469 else
470 error = git_blob_lookup(&info->blob, info->repo, &file->oid);
471
472 if (error < 0) {
473 /* if lookup fails, just skip this item in similarity calc */
474 giterr_clear();
475 } else {
a16e4172
RB
476 size_t sz;
477
478 /* index size may not be actual blob size if filtered */
479 if (file->size != git_blob_rawsize(info->blob))
480 file->size = git_blob_rawsize(info->blob);
481
482 sz = (size_t)(git__is_sizet(file->size) ? file->size : -1);
effdbeb3
RB
483
484 error = opts->metric->buffer_signature(
485 &cache[info->idx], info->file,
486 git_blob_rawcontent(info->blob), sz, opts->metric->payload);
487 }
960a04dd
RB
488 }
489
490 return error;
491}
492
effdbeb3
RB
493static void similarity_unload(similarity_info *info)
494{
495 if (info->odb_obj)
496 git_odb_object_free(info->odb_obj);
497
498 if (info->blob)
499 git_blob_free(info->blob);
500 else
501 git_buf_free(&info->data);
502}
503
/* True when any bit of `flag_name` is set in (opts)->flags.  The flag
 * argument is parenthesized so that compound expressions such as
 * (A | B) are tested as a whole.
 */
#define FLAG_SET(opts,flag_name) (((opts)->flags & (flag_name)) != 0)
9be5be47
RB
505
506/* - score < 0 means files cannot be compared
507 * - score >= 100 means files are exact match
508 * - score == 0 means files are completely different
509 */
960a04dd 510static int similarity_measure(
9be5be47 511 int *score,
960a04dd 512 git_diff_list *diff,
a21cbb12 513 const git_diff_find_options *opts,
960a04dd
RB
514 void **cache,
515 size_t a_idx,
516 size_t b_idx)
517{
960a04dd
RB
518 git_diff_file *a_file = similarity_get_file(diff, a_idx);
519 git_diff_file *b_file = similarity_get_file(diff, b_idx);
a21cbb12 520 bool exact_match = FLAG_SET(opts, GIT_DIFF_FIND_EXACT_MATCH_ONLY);
a5140f4d
RB
521 int error = 0;
522 similarity_info a_info, b_info;
9be5be47
RB
523
524 *score = -1;
960a04dd 525
9be5be47 526 /* don't try to compare files of different types */
960a04dd
RB
527 if (GIT_MODE_TYPE(a_file->mode) != GIT_MODE_TYPE(b_file->mode))
528 return 0;
529
a1683f28 530 /* if exact match is requested, force calculation of missing OIDs now */
9be5be47
RB
531 if (exact_match) {
532 if (git_oid_iszero(&a_file->oid) &&
533 diff->old_src == GIT_ITERATOR_TYPE_WORKDIR &&
534 !git_diff__oid_for_file(diff->repo, a_file->path,
535 a_file->mode, a_file->size, &a_file->oid))
536 a_file->flags |= GIT_DIFF_FLAG_VALID_OID;
537
538 if (git_oid_iszero(&b_file->oid) &&
539 diff->new_src == GIT_ITERATOR_TYPE_WORKDIR &&
540 !git_diff__oid_for_file(diff->repo, b_file->path,
541 b_file->mode, b_file->size, &b_file->oid))
542 b_file->flags |= GIT_DIFF_FLAG_VALID_OID;
543 }
544
545 /* check OID match as a quick test */
546 if (git_oid__cmp(&a_file->oid, &b_file->oid) == 0) {
547 *score = 100;
548 return 0;
549 }
550
551 /* don't calculate signatures if we are doing exact match */
552 if (exact_match) {
553 *score = 0;
554 return 0;
555 }
db106d01 556
effdbeb3
RB
557 memset(&a_info, 0, sizeof(a_info));
558 memset(&b_info, 0, sizeof(b_info));
a5140f4d 559
effdbeb3
RB
560 /* set up similarity data (will try to update missing file sizes) */
561 if (!cache[a_idx] && (error = similarity_init(&a_info, diff, a_idx)) < 0)
562 return error;
563 if (!cache[b_idx] && (error = similarity_init(&b_info, diff, b_idx)) < 0)
564 goto cleanup;
a5140f4d 565
f5c4d022 566 /* check if file sizes are nowhere near each other */
18e9efc4
RB
567 if (a_file->size > 127 &&
568 b_file->size > 127 &&
569 (a_file->size > (b_file->size << 4) ||
570 b_file->size > (a_file->size << 4)))
effdbeb3 571 goto cleanup;
18e9efc4 572
960a04dd 573 /* update signature cache if needed */
a5140f4d 574 if (!cache[a_idx] && (error = similarity_calc(&a_info, opts, cache)) < 0)
effdbeb3
RB
575 goto cleanup;
576
a5140f4d 577 if (!cache[b_idx] && (error = similarity_calc(&b_info, opts, cache)) < 0)
effdbeb3 578 goto cleanup;
1fed6b07 579
a5140f4d
RB
580 /* calculate similarity provided that the metric choose to process
581 * both the a and b files (some may not if file is too big, etc).
582 */
583 if (cache[a_idx] && cache[b_idx])
584 error = opts->metric->similarity(
585 score, cache[a_idx], cache[b_idx], opts->metric->payload);
db106d01 586
effdbeb3 587cleanup:
a5140f4d
RB
588 similarity_unload(&a_info);
589 similarity_unload(&b_info);
590
591 return error;
db106d01
RB
592}
593
a21cbb12 594static int calc_self_similarity(
9be5be47 595 git_diff_list *diff,
a21cbb12
RB
596 const git_diff_find_options *opts,
597 size_t delta_idx,
598 void **cache)
9be5be47 599{
a21cbb12
RB
600 int error, similarity = -1;
601 git_diff_delta *delta = GIT_VECTOR_GET(&diff->deltas, delta_idx);
602
603 if ((delta->flags & GIT_DIFF_FLAG__HAS_SELF_SIMILARITY) != 0)
604 return 0;
605
606 error = similarity_measure(
607 &similarity, diff, opts, cache, 2 * delta_idx, 2 * delta_idx + 1);
608 if (error < 0)
609 return error;
610
611 if (similarity >= 0) {
612 delta->similarity = (uint32_t)similarity;
613 delta->flags |= GIT_DIFF_FLAG__HAS_SELF_SIMILARITY;
614 }
615
616 return 0;
617}
618
619static bool is_rename_target(
620 git_diff_list *diff,
621 const git_diff_find_options *opts,
622 size_t delta_idx,
623 void **cache)
624{
625 git_diff_delta *delta = GIT_VECTOR_GET(&diff->deltas, delta_idx);
626
627 /* skip things that aren't plain blobs */
628 if (!GIT_MODE_ISBLOB(delta->new_file.mode))
629 return false;
630
631 /* only consider ADDED, RENAMED, COPIED, and split MODIFIED as
632 * targets; maybe include UNTRACKED and IGNORED if requested.
633 */
634 switch (delta->status) {
635 case GIT_DELTA_UNMODIFIED:
636 case GIT_DELTA_DELETED:
637 return false;
638
639 case GIT_DELTA_MODIFIED:
640 if (!FLAG_SET(opts, GIT_DIFF_FIND_REWRITES) &&
641 !FLAG_SET(opts, GIT_DIFF_FIND_RENAMES_FROM_REWRITES))
642 return false;
643
644 if (calc_self_similarity(diff, opts, delta_idx, cache) < 0)
645 return false;
646
647 if (FLAG_SET(opts, GIT_DIFF_BREAK_REWRITES) &&
648 delta->similarity < opts->break_rewrite_threshold) {
649 delta->flags |= GIT_DIFF_FLAG__TO_SPLIT;
650 break;
651 }
652 if (FLAG_SET(opts, GIT_DIFF_FIND_RENAMES_FROM_REWRITES) &&
653 delta->similarity < opts->rename_from_rewrite_threshold)
654 break;
655
656 return false;
657
658 case GIT_DELTA_UNTRACKED:
a21cbb12
RB
659 if (!FLAG_SET(opts, GIT_DIFF_FIND_FOR_UNTRACKED))
660 return false;
661 break;
662
d55bed1a
ET
663 case GIT_DELTA_IGNORED:
664 return false;
665
a21cbb12
RB
666 default: /* all other status values should be checked */
667 break;
668 }
669
670 delta->flags |= GIT_DIFF_FLAG__IS_RENAME_TARGET;
671 return true;
672}
673
674static bool is_rename_source(
675 git_diff_list *diff,
676 const git_diff_find_options *opts,
677 size_t delta_idx,
678 void **cache)
679{
680 git_diff_delta *delta = GIT_VECTOR_GET(&diff->deltas, delta_idx);
681
682 /* skip things that aren't blobs */
683 if (!GIT_MODE_ISBLOB(delta->old_file.mode))
684 return false;
685
686 switch (delta->status) {
687 case GIT_DELTA_ADDED:
688 case GIT_DELTA_UNTRACKED:
689 case GIT_DELTA_IGNORED:
690 return false;
691
692 case GIT_DELTA_DELETED:
693 case GIT_DELTA_TYPECHANGE:
694 break;
695
696 case GIT_DELTA_UNMODIFIED:
697 if (!FLAG_SET(opts, GIT_DIFF_FIND_COPIES_FROM_UNMODIFIED))
698 return false;
699 break;
700
701 default: /* MODIFIED, RENAMED, COPIED */
702 /* if we're finding copies, this could be a source */
703 if (FLAG_SET(opts, GIT_DIFF_FIND_COPIES))
704 break;
705
706 /* otherwise, this is only a source if we can split it */
707 if (!FLAG_SET(opts, GIT_DIFF_FIND_REWRITES) &&
708 !FLAG_SET(opts, GIT_DIFF_FIND_RENAMES_FROM_REWRITES))
709 return false;
710
711 if (calc_self_similarity(diff, opts, delta_idx, cache) < 0)
712 return false;
713
714 if (FLAG_SET(opts, GIT_DIFF_BREAK_REWRITES) &&
715 delta->similarity < opts->break_rewrite_threshold) {
716 delta->flags |= GIT_DIFF_FLAG__TO_SPLIT;
717 break;
718 }
719
720 if (FLAG_SET(opts, GIT_DIFF_FIND_RENAMES_FROM_REWRITES) &&
721 delta->similarity < opts->rename_from_rewrite_threshold)
722 break;
723
724 return false;
725 }
726
727 delta->flags |= GIT_DIFF_FLAG__IS_RENAME_SOURCE;
728 return true;
729}
730
731GIT_INLINE(bool) delta_is_split(git_diff_delta *delta)
732{
733 return (delta->status == GIT_DELTA_TYPECHANGE ||
734 (delta->flags & GIT_DIFF_FLAG__TO_SPLIT) != 0);
735}
736
737GIT_INLINE(bool) delta_is_new_only(git_diff_delta *delta)
738{
739 return (delta->status == GIT_DELTA_ADDED ||
740 delta->status == GIT_DELTA_UNTRACKED ||
741 delta->status == GIT_DELTA_IGNORED);
9be5be47 742}
db106d01 743
e4acc3ba
RB
744GIT_INLINE(void) delta_make_rename(
745 git_diff_delta *to, const git_diff_delta *from, uint32_t similarity)
746{
747 to->status = GIT_DELTA_RENAMED;
748 to->similarity = similarity;
749 memcpy(&to->old_file, &from->old_file, sizeof(to->old_file));
750 to->flags &= ~GIT_DIFF_FLAG__TO_SPLIT;
751}
752
d958e37a
RB
/* Best pairing found so far for one delta during rename detection */
typedef struct {
	uint32_t idx;         /* index of the delta at the other end of the pairing */
	uint32_t similarity;  /* score of the pairing; 0 means no pairing */
} diff_find_match;
757
db106d01
RB
758int git_diff_find_similar(
759 git_diff_list *diff,
760 git_diff_find_options *given_opts)
761{
e4acc3ba 762 size_t i, j, sigcache_size;
960a04dd 763 int error = 0, similarity;
db106d01
RB
764 git_diff_delta *from, *to;
765 git_diff_find_options opts;
e4acc3ba
RB
766 size_t num_srcs = 0, num_tgts = 0, tried_srcs = 0, tried_tgts = 0;
767 size_t num_rewrites = 0, num_updates = 0, num_bumped = 0;
768 void **sigcache; /* cache of similarity metric file signatures */
769 diff_find_match *match_srcs = NULL, *match_tgts = NULL, *best_match;
770 git_diff_file swap;
db106d01 771
960a04dd
RB
772 if ((error = normalize_find_opts(diff, &opts, given_opts)) < 0)
773 return error;
db106d01 774
a21cbb12 775 /* TODO: maybe abort if deltas.length > rename_limit ??? */
d958e37a
RB
776 if (!git__is_uint32(diff->deltas.length))
777 return 0;
960a04dd 778
e4acc3ba
RB
779 sigcache_size = diff->deltas.length * 2; /* keep size b/c diff may change */
780 sigcache = git__calloc(sigcache_size, sizeof(void *));
781 GITERR_CHECK_ALLOC(sigcache);
782
783 /* Label rename sources and targets
784 *
785 * This will also set self-similarity scores for MODIFIED files and
786 * mark them for splitting if break-rewrites is enabled
787 */
788 git_vector_foreach(&diff->deltas, i, to) {
789 if (is_rename_source(diff, &opts, i, sigcache))
790 ++num_srcs;
791
792 if (is_rename_target(diff, &opts, i, sigcache))
793 ++num_tgts;
794 }
960a04dd 795
e4acc3ba
RB
796 /* if there are no candidate srcs or tgts, we're done */
797 if (!num_srcs || !num_tgts)
798 goto cleanup;
960a04dd 799
e4acc3ba
RB
800 match_tgts = git__calloc(diff->deltas.length, sizeof(diff_find_match));
801 GITERR_CHECK_ALLOC(match_tgts);
802 match_srcs = git__calloc(diff->deltas.length, sizeof(diff_find_match));
803 GITERR_CHECK_ALLOC(match_srcs);
db106d01 804
e4acc3ba
RB
805 /*
806 * Find best-fit matches for rename / copy candidates
807 */
d958e37a 808
e4acc3ba
RB
809find_best_matches:
810 tried_tgts = num_bumped = 0;
db106d01 811
e4acc3ba 812 git_vector_foreach(&diff->deltas, i, to) {
a21cbb12 813 /* skip things that are not rename targets */
e4acc3ba 814 if ((to->flags & GIT_DIFF_FLAG__IS_RENAME_TARGET) == 0)
d958e37a
RB
815 continue;
816
e4acc3ba 817 tried_srcs = 0;
db106d01 818
e4acc3ba 819 git_vector_foreach(&diff->deltas, j, from) {
a21cbb12 820 /* skip things that are not rename sources */
e4acc3ba 821 if ((from->flags & GIT_DIFF_FLAG__IS_RENAME_SOURCE) == 0)
960a04dd
RB
822 continue;
823
d958e37a 824 /* calculate similarity for this pair and find best match */
e4acc3ba
RB
825 if (i == j)
826 similarity = -1; /* don't measure self-similarity here */
827 else if ((error = similarity_measure(
828 &similarity, diff, &opts, sigcache, 2 * j, 2 * i + 1)) < 0)
960a04dd 829 goto cleanup;
a21cbb12 830
e4acc3ba
RB
831 /* if this pairing is better for the src and the tgt, keep it */
832 if (similarity > 0 &&
833 match_tgts[i].similarity < (uint32_t)similarity &&
834 match_srcs[j].similarity < (uint32_t)similarity)
835 {
836 if (match_tgts[i].similarity > 0) {
837 match_tgts[match_srcs[j].idx].similarity = 0;
838 match_srcs[match_tgts[i].idx].similarity = 0;
839 ++num_bumped;
840 }
841
842 match_tgts[i].similarity = (uint32_t)similarity;
843 match_tgts[i].idx = (uint32_t)j;
a21cbb12 844
e4acc3ba
RB
845 match_srcs[j].similarity = (uint32_t)similarity;
846 match_srcs[j].idx = (uint32_t)i;
db106d01 847 }
e4acc3ba
RB
848
849 if (++tried_srcs >= num_srcs)
850 break;
851
852 /* cap on maximum targets we'll examine (per "to" file) */
853 if (tried_srcs > opts.rename_limit)
854 break;
db106d01 855 }
e4acc3ba
RB
856
857 if (++tried_tgts >= num_tgts)
858 break;
db106d01
RB
859 }
860
e4acc3ba
RB
861 if (num_bumped > 0) /* try again if we bumped some items */
862 goto find_best_matches;
863
864 /*
865 * Rewrite the diffs with renames / copies
866 */
867
868 tried_tgts = 0;
db106d01 869
a21cbb12 870 git_vector_foreach(&diff->deltas, i, to) {
e4acc3ba
RB
871 /* skip things that are not rename targets */
872 if ((to->flags & GIT_DIFF_FLAG__IS_RENAME_TARGET) == 0)
a21cbb12 873 continue;
690bf41c 874
e4acc3ba
RB
875 /* check if this delta was the target of a similarity */
876 best_match = &match_tgts[i];
877 if (!best_match->similarity)
878 continue;
d958e37a 879
e4acc3ba
RB
880 j = best_match->idx;
881 from = GIT_VECTOR_GET(&diff->deltas, j);
d958e37a 882
a21cbb12
RB
883 /* possible scenarios:
884 * 1. from DELETE to ADD/UNTRACK/IGNORE = RENAME
885 * 2. from DELETE to SPLIT/TYPECHANGE = RENAME + DELETE
886 * 3. from SPLIT/TYPECHANGE to ADD/UNTRACK/IGNORE = ADD + RENAME
887 * 4. from SPLIT/TYPECHANGE to SPLIT/TYPECHANGE = RENAME + SPLIT
888 * 5. from OTHER to ADD/UNTRACK/IGNORE = OTHER + COPY
db106d01
RB
889 */
890
891 if (from->status == GIT_DELTA_DELETED) {
db106d01 892
a21cbb12 893 if (delta_is_new_only(to)) {
db106d01 894
e4acc3ba 895 if (best_match->similarity < opts.rename_threshold)
a21cbb12 896 continue;
960a04dd 897
e4acc3ba 898 delta_make_rename(to, from, best_match->similarity);
d958e37a 899
e4acc3ba 900 from->flags |= GIT_DIFF_FLAG__TO_DELETE;
a21cbb12
RB
901 num_rewrites++;
902 } else {
49f70f2c 903 assert(delta_is_split(to));
960a04dd 904
e4acc3ba 905 if (best_match->similarity < opts.rename_from_rewrite_threshold)
a21cbb12 906 continue;
db106d01 907
e4acc3ba 908 memcpy(&swap, &to->old_file, sizeof(swap));
db106d01 909
e4acc3ba
RB
910 delta_make_rename(to, from, best_match->similarity);
911 num_rewrites--;
912
913 from->status = GIT_DELTA_DELETED;
914 memcpy(&from->old_file, &swap, sizeof(from->old_file));
915 memset(&from->new_file, 0, sizeof(from->new_file));
916 from->new_file.path = from->old_file.path;
917 from->new_file.flags |= GIT_DIFF_FLAG_VALID_OID;
db106d01 918
d958e37a 919 num_updates++;
db106d01
RB
920 }
921 }
922
a21cbb12 923 else if (delta_is_split(from)) {
a21cbb12
RB
924
925 if (delta_is_new_only(to)) {
db106d01 926
e4acc3ba 927 if (best_match->similarity < opts.rename_threshold)
a21cbb12 928 continue;
d958e37a 929
e4acc3ba 930 delta_make_rename(to, from, best_match->similarity);
a21cbb12 931
e4acc3ba 932 from->status = (diff->new_src == GIT_ITERATOR_TYPE_WORKDIR) ?
a21cbb12 933 GIT_DELTA_UNTRACKED : GIT_DELTA_ADDED;
e4acc3ba
RB
934 memset(&from->old_file, 0, sizeof(from->old_file));
935 from->old_file.path = from->new_file.path;
936 from->old_file.flags |= GIT_DIFF_FLAG_VALID_OID;
937
938 from->flags &= ~GIT_DIFF_FLAG__TO_SPLIT;
939 num_rewrites--;
a21cbb12
RB
940
941 num_updates++;
942 } else {
943 assert(delta_is_split(from));
944
e4acc3ba 945 if (best_match->similarity < opts.rename_from_rewrite_threshold)
a21cbb12
RB
946 continue;
947
e4acc3ba 948 memcpy(&swap, &to->old_file, sizeof(swap));
a21cbb12 949
e4acc3ba
RB
950 delta_make_rename(to, from, best_match->similarity);
951 num_rewrites--;
952 num_updates++;
a21cbb12 953
e4acc3ba 954 memcpy(&from->old_file, &swap, sizeof(from->old_file));
a21cbb12 955
e4acc3ba
RB
956 /* if we've just swapped the new element into the correct
957 * place, clear the SPLIT flag
67db583d 958 */
e4acc3ba
RB
959 if (match_tgts[j].idx == i &&
960 match_tgts[j].similarity >
67db583d
RB
961 opts.rename_from_rewrite_threshold) {
962
e4acc3ba
RB
963 from->status = GIT_DELTA_RENAMED;
964 from->similarity = match_tgts[j].similarity;
965 match_tgts[j].similarity = 0;
67db583d
RB
966 from->flags &= ~GIT_DIFF_FLAG__TO_SPLIT;
967 num_rewrites--;
968 }
e4acc3ba
RB
969 /* otherwise, if we just overwrote a source, update mapping */
970 else if (j > i && match_srcs[i].similarity > 0) {
c4ac556e 971 match_tgts[match_srcs[i].idx].idx = (uint32_t)j;
e4acc3ba 972 }
67db583d 973
a21cbb12
RB
974 num_updates++;
975 }
976 }
977
978 else if (delta_is_new_only(to)) {
979 if (!FLAG_SET(&opts, GIT_DIFF_FIND_COPIES) ||
e4acc3ba 980 best_match->similarity < opts.copy_threshold)
a21cbb12
RB
981 continue;
982
e4acc3ba
RB
983 to->status = GIT_DELTA_COPIED;
984 to->similarity = best_match->similarity;
a21cbb12
RB
985 memcpy(&to->old_file, &from->old_file, sizeof(to->old_file));
986
987 num_updates++;
988 }
db106d01
RB
989 }
990
e4acc3ba
RB
991 /*
992 * Actually split and delete entries as needed
993 */
994
a21cbb12 995 if (num_rewrites > 0 || num_updates > 0)
960a04dd 996 error = apply_splits_and_deletes(
d958e37a 997 diff, diff->deltas.length - num_rewrites,
a21cbb12 998 FLAG_SET(&opts, GIT_DIFF_BREAK_REWRITES));
d958e37a 999
960a04dd 1000cleanup:
e4acc3ba
RB
1001 git__free(match_srcs);
1002 git__free(match_tgts);
db106d01 1003
e4acc3ba
RB
1004 for (i = 0; i < sigcache_size; ++i) {
1005 if (sigcache[i] != NULL)
1006 opts.metric->free_signature(sigcache[i], opts.metric->payload);
db106d01 1007 }
e4acc3ba 1008 git__free(sigcache);
db106d01 1009
f8275890
RB
1010 if (!given_opts || !given_opts->metric)
1011 git__free(opts.metric);
1012
960a04dd 1013 return error;
db106d01
RB
1014}
1015
1016#undef FLAG_SET