]> git.proxmox.com Git - libgit2.git/blame - src/diff_tform.c
tree: remove legacy 'oid' naming
[libgit2.git] / src / diff_tform.c
CommitLineData
db106d01 1/*
359fc2d2 2 * Copyright (C) the libgit2 contributors. All rights reserved.
db106d01
RB
3 *
4 * This file is part of libgit2, distributed under the GNU GPL v2 with
5 * a Linking Exception. For full terms see the included COPYING file.
6 */
7#include "common.h"
114f5a6c 8
db106d01 9#include "git2/config.h"
960a04dd 10#include "git2/blob.h"
114f5a6c
RB
11
12#include "diff.h"
5e5848eb 13#include "hashsig.h"
114f5a6c
RB
14#include "path.h"
15#include "fileops.h"
9f77b3f6 16#include "config.h"
db106d01
RB
17
18static git_diff_delta *diff_delta__dup(
19 const git_diff_delta *d, git_pool *pool)
20{
21 git_diff_delta *delta = git__malloc(sizeof(git_diff_delta));
22 if (!delta)
23 return NULL;
24
25 memcpy(delta, d, sizeof(git_diff_delta));
c68b09dc 26 GIT_DIFF_FLAG__CLEAR_INTERNAL(delta->flags);
db106d01 27
d958e37a
RB
28 if (d->old_file.path != NULL) {
29 delta->old_file.path = git_pool_strdup(pool, d->old_file.path);
30 if (delta->old_file.path == NULL)
31 goto fail;
32 }
db106d01 33
d958e37a 34 if (d->new_file.path != d->old_file.path && d->new_file.path != NULL) {
db106d01
RB
35 delta->new_file.path = git_pool_strdup(pool, d->new_file.path);
36 if (delta->new_file.path == NULL)
37 goto fail;
38 } else {
39 delta->new_file.path = delta->old_file.path;
40 }
41
42 return delta;
43
44fail:
45 git__free(delta);
46 return NULL;
47}
48
49static git_diff_delta *diff_delta__merge_like_cgit(
3940310e
RB
50 const git_diff_delta *a,
51 const git_diff_delta *b,
52 git_pool *pool)
db106d01
RB
53{
54 git_diff_delta *dup;
55
56 /* Emulate C git for merging two diffs (a la 'git diff <sha>').
57 *
58 * When C git does a diff between the work dir and a tree, it actually
59 * diffs with the index but uses the workdir contents. This emulates
60 * those choices so we can emulate the type of diff.
61 *
62 * We have three file descriptions here, let's call them:
63 * f1 = a->old_file
64 * f2 = a->new_file AND b->old_file
65 * f3 = b->new_file
66 */
67
68 /* if f2 == f3 or f2 is deleted, then just dup the 'a' diff */
69 if (b->status == GIT_DELTA_UNMODIFIED || a->status == GIT_DELTA_DELETED)
70 return diff_delta__dup(a, pool);
71
72 /* otherwise, base this diff on the 'b' diff */
73 if ((dup = diff_delta__dup(b, pool)) == NULL)
74 return NULL;
75
76 /* If 'a' status is uninteresting, then we're done */
77 if (a->status == GIT_DELTA_UNMODIFIED)
78 return dup;
79
80 assert(a->status != GIT_DELTA_UNMODIFIED);
81 assert(b->status != GIT_DELTA_UNMODIFIED);
82
83 /* A cgit exception is that the diff of a file that is only in the
84 * index (i.e. not in HEAD nor workdir) is given as empty.
85 */
86 if (dup->status == GIT_DELTA_DELETED) {
87 if (a->status == GIT_DELTA_ADDED)
88 dup->status = GIT_DELTA_UNMODIFIED;
89 /* else don't overwrite DELETE status */
90 } else {
91 dup->status = a->status;
92 }
93
94 git_oid_cpy(&dup->old_file.oid, &a->old_file.oid);
95 dup->old_file.mode = a->old_file.mode;
96 dup->old_file.size = a->old_file.size;
97 dup->old_file.flags = a->old_file.flags;
98
99 return dup;
100}
101
e7c85120
RB
102static git_diff_delta *diff_delta__merge_like_cgit_reversed(
103 const git_diff_delta *a,
104 const git_diff_delta *b,
105 git_pool *pool)
106{
107 git_diff_delta *dup;
108
109 /* reversed version of above logic */
110
111 if (a->status == GIT_DELTA_UNMODIFIED)
112 return diff_delta__dup(b, pool);
113
114 if ((dup = diff_delta__dup(a, pool)) == NULL)
115 return NULL;
116
117 if (b->status == GIT_DELTA_UNMODIFIED || b->status == GIT_DELTA_UNTRACKED)
118 return dup;
119
120 if (dup->status == GIT_DELTA_DELETED) {
121 if (b->status == GIT_DELTA_ADDED)
122 dup->status = GIT_DELTA_UNMODIFIED;
123 } else {
124 dup->status = b->status;
125 }
126
127 git_oid_cpy(&dup->old_file.oid, &b->old_file.oid);
128 dup->old_file.mode = b->old_file.mode;
129 dup->old_file.size = b->old_file.size;
130 dup->old_file.flags = b->old_file.flags;
131
132 return dup;
133}
134
135int git_diff_merge(git_diff *onto, const git_diff *from)
db106d01
RB
136{
137 int error = 0;
138 git_pool onto_pool;
139 git_vector onto_new;
140 git_diff_delta *delta;
e7c85120 141 bool ignore_case, reversed;
db106d01
RB
142 unsigned int i, j;
143
144 assert(onto && from);
145
146 if (!from->deltas.length)
147 return 0;
148
e7c85120
RB
149 ignore_case = ((onto->opts.flags & GIT_DIFF_IGNORE_CASE) != 0);
150 reversed = ((onto->opts.flags & GIT_DIFF_REVERSE) != 0);
151
152 if (ignore_case != ((from->opts.flags & GIT_DIFF_IGNORE_CASE) != 0) ||
153 reversed != ((from->opts.flags & GIT_DIFF_REVERSE) != 0)) {
3940310e
RB
154 giterr_set(GITERR_INVALID,
155 "Attempt to merge diffs created with conflicting options");
156 return -1;
157 }
158
db106d01
RB
159 if (git_vector_init(
160 &onto_new, onto->deltas.length, git_diff_delta__cmp) < 0 ||
161 git_pool_init(&onto_pool, 1, 0) < 0)
162 return -1;
163
db106d01
RB
164 for (i = 0, j = 0; i < onto->deltas.length || j < from->deltas.length; ) {
165 git_diff_delta *o = GIT_VECTOR_GET(&onto->deltas, i);
166 const git_diff_delta *f = GIT_VECTOR_GET(&from->deltas, j);
3940310e
RB
167 int cmp = !f ? -1 : !o ? 1 :
168 STRCMP_CASESELECT(ignore_case, o->old_file.path, f->old_file.path);
db106d01
RB
169
170 if (cmp < 0) {
171 delta = diff_delta__dup(o, &onto_pool);
172 i++;
173 } else if (cmp > 0) {
174 delta = diff_delta__dup(f, &onto_pool);
175 j++;
176 } else {
e7c85120
RB
177 delta = reversed ?
178 diff_delta__merge_like_cgit_reversed(o, f, &onto_pool) :
179 diff_delta__merge_like_cgit(o, f, &onto_pool);
db106d01
RB
180 i++;
181 j++;
182 }
183
184 /* the ignore rules for the target may not match the source
185 * or the result of a merged delta could be skippable...
186 */
187 if (git_diff_delta__should_skip(&onto->opts, delta)) {
188 git__free(delta);
189 continue;
190 }
191
192 if ((error = !delta ? -1 : git_vector_insert(&onto_new, delta)) < 0)
193 break;
194 }
195
196 if (!error) {
197 git_vector_swap(&onto->deltas, &onto_new);
198 git_pool_swap(&onto->pool, &onto_pool);
3940310e
RB
199
200 if ((onto->opts.flags & GIT_DIFF_REVERSE) != 0)
201 onto->old_src = from->old_src;
202 else
203 onto->new_src = from->new_src;
db106d01
RB
204
205 /* prefix strings also come from old pool, so recreate those.*/
206 onto->opts.old_prefix =
207 git_pool_strdup_safe(&onto->pool, onto->opts.old_prefix);
208 onto->opts.new_prefix =
209 git_pool_strdup_safe(&onto->pool, onto->opts.new_prefix);
210 }
211
9cfce273 212 git_vector_free_deep(&onto_new);
db106d01
RB
213 git_pool_clear(&onto_pool);
214
215 return error;
216}
217
0462fba5 218int git_diff_find_similar__hashsig_for_file(
f8275890
RB
219 void **out, const git_diff_file *f, const char *path, void *p)
220{
221 git_hashsig_option_t opt = (git_hashsig_option_t)p;
aa408cbf
ET
222 int error = 0;
223
f8275890 224 GIT_UNUSED(f);
aa408cbf 225 error = git_hashsig_create_fromfile((git_hashsig **)out, path, opt);
1fed6b07 226
aa408cbf
ET
227 if (error == GIT_EBUFS) {
228 error = 0;
229 giterr_clear();
230 }
231
232 return error;
f8275890 233}
9bc8be3d 234
0462fba5 235int git_diff_find_similar__hashsig_for_buf(
f8275890
RB
236 void **out, const git_diff_file *f, const char *buf, size_t len, void *p)
237{
238 git_hashsig_option_t opt = (git_hashsig_option_t)p;
aa408cbf 239 int error = 0;
0462fba5 240
f8275890 241 GIT_UNUSED(f);
aa408cbf 242 error = git_hashsig_create((git_hashsig **)out, buf, len, opt);
1fed6b07 243
aa408cbf
ET
244 if (error == GIT_EBUFS) {
245 error = 0;
246 giterr_clear();
247 }
248
249 return error;
f8275890 250}
9bc8be3d 251
0462fba5 252void git_diff_find_similar__hashsig_free(void *sig, void *payload)
9bc8be3d
RB
253{
254 GIT_UNUSED(payload);
255 git_hashsig_free(sig);
256}
257
0462fba5 258int git_diff_find_similar__calc_similarity(
9bc8be3d
RB
259 int *score, void *siga, void *sigb, void *payload)
260{
261 GIT_UNUSED(payload);
262 *score = git_hashsig_compare(siga, sigb);
263 return 0;
264}
265
db106d01
RB
266#define DEFAULT_THRESHOLD 50
267#define DEFAULT_BREAK_REWRITE_THRESHOLD 60
a21cbb12 268#define DEFAULT_RENAME_LIMIT 200
db106d01
RB
269
270static int normalize_find_opts(
3ff1d123 271 git_diff *diff,
db106d01 272 git_diff_find_options *opts,
10672e3e 273 const git_diff_find_options *given)
db106d01
RB
274{
275 git_config *cfg = NULL;
db106d01 276
5a52d6be
BS
277 GITERR_CHECK_VERSION(given, GIT_DIFF_FIND_OPTIONS_VERSION, "git_diff_find_options");
278
db106d01
RB
279 if (diff->repo != NULL &&
280 git_repository_config__weakptr(&cfg, diff->repo) < 0)
281 return -1;
282
7e3ed419 283 if (given)
db106d01 284 memcpy(opts, given, sizeof(*opts));
db106d01 285
c56c6d69
BS
286 if (!given ||
287 (given->flags & GIT_DIFF_FIND_ALL) == GIT_DIFF_FIND_BY_CONFIG)
288 {
9f77b3f6
RB
289 const char *rule =
290 git_config__get_string_force(cfg, "diff.renames", "true");
291 int boolval;
292
293 if (!git__parse_bool(&boolval, rule) && !boolval)
294 /* don't set FIND_RENAMES if bool value is false */;
295 else if (!strcasecmp(rule, "copies") || !strcasecmp(rule, "copy"))
296 opts->flags |= GIT_DIFF_FIND_RENAMES | GIT_DIFF_FIND_COPIES;
297 else
298 opts->flags |= GIT_DIFF_FIND_RENAMES;
db106d01 299 }
ca901e7b 300
db106d01
RB
301 /* some flags imply others */
302
9be5be47
RB
303 if (opts->flags & GIT_DIFF_FIND_EXACT_MATCH_ONLY) {
304 /* if we are only looking for exact matches, then don't turn
305 * MODIFIED items into ADD/DELETE pairs because it's too picky
306 */
307 opts->flags &= ~(GIT_DIFF_FIND_REWRITES | GIT_DIFF_BREAK_REWRITES);
308
309 /* similarly, don't look for self-rewrites to split */
310 opts->flags &= ~GIT_DIFF_FIND_RENAMES_FROM_REWRITES;
311 }
312
db106d01
RB
313 if (opts->flags & GIT_DIFF_FIND_RENAMES_FROM_REWRITES)
314 opts->flags |= GIT_DIFF_FIND_RENAMES;
315
316 if (opts->flags & GIT_DIFF_FIND_COPIES_FROM_UNMODIFIED)
317 opts->flags |= GIT_DIFF_FIND_COPIES;
318
d958e37a
RB
319 if (opts->flags & GIT_DIFF_BREAK_REWRITES)
320 opts->flags |= GIT_DIFF_FIND_REWRITES;
321
db106d01
RB
322#define USE_DEFAULT(X) ((X) == 0 || (X) > 100)
323
324 if (USE_DEFAULT(opts->rename_threshold))
325 opts->rename_threshold = DEFAULT_THRESHOLD;
326
327 if (USE_DEFAULT(opts->rename_from_rewrite_threshold))
328 opts->rename_from_rewrite_threshold = DEFAULT_THRESHOLD;
329
330 if (USE_DEFAULT(opts->copy_threshold))
331 opts->copy_threshold = DEFAULT_THRESHOLD;
332
333 if (USE_DEFAULT(opts->break_rewrite_threshold))
334 opts->break_rewrite_threshold = DEFAULT_BREAK_REWRITE_THRESHOLD;
335
336#undef USE_DEFAULT
337
a21cbb12 338 if (!opts->rename_limit) {
9f77b3f6
RB
339 opts->rename_limit = git_config__get_int_force(
340 cfg, "diff.renamelimit", DEFAULT_RENAME_LIMIT);
db106d01 341
9f77b3f6
RB
342 if (opts->rename_limit <= 0)
343 opts->rename_limit = DEFAULT_RENAME_LIMIT;
db106d01
RB
344 }
345
f8275890 346 /* assign the internal metric with whitespace flag as payload */
9bc8be3d 347 if (!opts->metric) {
f8275890
RB
348 opts->metric = git__malloc(sizeof(git_diff_similarity_metric));
349 GITERR_CHECK_ALLOC(opts->metric);
350
0462fba5
ET
351 opts->metric->file_signature = git_diff_find_similar__hashsig_for_file;
352 opts->metric->buffer_signature = git_diff_find_similar__hashsig_for_buf;
353 opts->metric->free_signature = git_diff_find_similar__hashsig_free;
354 opts->metric->similarity = git_diff_find_similar__calc_similarity;
f8275890 355
9bc8be3d 356 if (opts->flags & GIT_DIFF_FIND_IGNORE_WHITESPACE)
f8275890 357 opts->metric->payload = (void *)GIT_HASHSIG_IGNORE_WHITESPACE;
9bc8be3d 358 else if (opts->flags & GIT_DIFF_FIND_DONT_IGNORE_WHITESPACE)
f8275890 359 opts->metric->payload = (void *)GIT_HASHSIG_NORMAL;
9bc8be3d 360 else
f8275890 361 opts->metric->payload = (void *)GIT_HASHSIG_SMART_WHITESPACE;
9bc8be3d
RB
362 }
363
db106d01
RB
364 return 0;
365}
366
2123a17f
RB
367static int insert_delete_side_of_split(
368 git_diff *diff, git_vector *onto, const git_diff_delta *delta)
369{
370 /* make new record for DELETED side of split */
371 git_diff_delta *deleted = diff_delta__dup(delta, &diff->pool);
372 GITERR_CHECK_ALLOC(deleted);
373
374 deleted->status = GIT_DELTA_DELETED;
375 deleted->nfiles = 1;
376 memset(&deleted->new_file, 0, sizeof(deleted->new_file));
377 deleted->new_file.path = deleted->old_file.path;
378 deleted->new_file.flags |= GIT_DIFF_FLAG_VALID_OID;
379
380 return git_vector_insert(onto, deleted);
381}
382
d958e37a 383static int apply_splits_and_deletes(
3ff1d123 384 git_diff *diff, size_t expected_size, bool actually_split)
db106d01
RB
385{
386 git_vector onto = GIT_VECTOR_INIT;
387 size_t i;
2123a17f 388 git_diff_delta *delta;
db106d01
RB
389
390 if (git_vector_init(&onto, expected_size, git_diff_delta__cmp) < 0)
391 return -1;
392
393 /* build new delta list without TO_DELETE and splitting TO_SPLIT */
394 git_vector_foreach(&diff->deltas, i, delta) {
71a3d27e 395 if ((delta->flags & GIT_DIFF_FLAG__TO_DELETE) != 0)
db106d01 396 continue;
db106d01 397
a21cbb12 398 if ((delta->flags & GIT_DIFF_FLAG__TO_SPLIT) != 0 && actually_split) {
d958e37a
RB
399 delta->similarity = 0;
400
2123a17f 401 if (insert_delete_side_of_split(diff, &onto, delta) < 0)
11d9f6b3 402 goto on_error;
db106d01 403
9be5be47
RB
404 if (diff->new_src == GIT_ITERATOR_TYPE_WORKDIR)
405 delta->status = GIT_DELTA_UNTRACKED;
406 else
407 delta->status = GIT_DELTA_ADDED;
74a627f0 408 delta->nfiles = 1;
db106d01
RB
409 memset(&delta->old_file, 0, sizeof(delta->old_file));
410 delta->old_file.path = delta->new_file.path;
71a3d27e 411 delta->old_file.flags |= GIT_DIFF_FLAG_VALID_OID;
db106d01
RB
412 }
413
c68b09dc
RB
414 /* clean up delta before inserting into new list */
415 GIT_DIFF_FLAG__CLEAR_INTERNAL(delta->flags);
416
417 if (delta->status != GIT_DELTA_COPIED &&
418 delta->status != GIT_DELTA_RENAMED &&
419 (delta->status != GIT_DELTA_MODIFIED || actually_split))
420 delta->similarity = 0;
421
422 /* insert into new list */
11d9f6b3
PK
423 if (git_vector_insert(&onto, delta) < 0)
424 goto on_error;
db106d01
RB
425 }
426
11d9f6b3 427 /* cannot return an error past this point */
c68b09dc
RB
428
429 /* free deltas from old list that didn't make it to the new one */
a21cbb12 430 git_vector_foreach(&diff->deltas, i, delta) {
71a3d27e 431 if ((delta->flags & GIT_DIFF_FLAG__TO_DELETE) != 0)
11d9f6b3 432 git__free(delta);
a21cbb12
RB
433 }
434
db106d01 435 /* swap new delta list into place */
db106d01
RB
436 git_vector_swap(&diff->deltas, &onto);
437 git_vector_free(&onto);
a21cbb12 438 git_vector_sort(&diff->deltas);
db106d01
RB
439
440 return 0;
11d9f6b3
PK
441
442on_error:
9cfce273 443 git_vector_free_deep(&onto);
11d9f6b3
PK
444
445 return -1;
db106d01
RB
446}
447
3ff1d123 448GIT_INLINE(git_diff_file *) similarity_get_file(git_diff *diff, size_t idx)
960a04dd
RB
449{
450 git_diff_delta *delta = git_vector_get(&diff->deltas, idx / 2);
451 return (idx & 1) ? &delta->new_file : &delta->old_file;
452}
99ba8f23 453
a5140f4d
RB
454typedef struct {
455 size_t idx;
456 git_iterator_type_t src;
457 git_repository *repo;
458 git_diff_file *file;
459 git_buf data;
effdbeb3 460 git_odb_object *odb_obj;
a5140f4d 461 git_blob *blob;
a5140f4d
RB
462} similarity_info;
463
effdbeb3 464static int similarity_init(
3ff1d123 465 similarity_info *info, git_diff *diff, size_t file_idx)
a5140f4d
RB
466{
467 info->idx = file_idx;
468 info->src = (file_idx & 1) ? diff->new_src : diff->old_src;
469 info->repo = diff->repo;
470 info->file = similarity_get_file(diff, file_idx);
effdbeb3 471 info->odb_obj = NULL;
a5140f4d 472 info->blob = NULL;
a5140f4d 473 git_buf_init(&info->data, 0);
09fae31d 474
410a8e6f 475 if (info->file->size > 0 || info->src == GIT_ITERATOR_TYPE_WORKDIR)
effdbeb3 476 return 0;
5e5848eb 477
effdbeb3
RB
478 return git_diff_file__resolve_zero_size(
479 info->file, &info->odb_obj, info->repo);
a5140f4d 480}
960a04dd 481
d730d3f4 482static int similarity_sig(
a5140f4d
RB
483 similarity_info *info,
484 const git_diff_find_options *opts,
485 void **cache)
486{
487 int error = 0;
effdbeb3 488 git_diff_file *file = info->file;
8cfd54f0 489
effdbeb3
RB
490 if (info->src == GIT_ITERATOR_TYPE_WORKDIR) {
491 if ((error = git_buf_joinpath(
492 &info->data, git_repository_workdir(info->repo), file->path)) < 0)
493 return error;
960a04dd 494
effdbeb3
RB
495 /* if path is not a regular file, just skip this item */
496 if (!git_path_isfile(info->data.ptr))
497 return 0;
a5140f4d 498
a5140f4d
RB
499 /* TODO: apply wd-to-odb filters to file data if necessary */
500
501 error = opts->metric->file_signature(
502 &cache[info->idx], info->file,
503 info->data.ptr, opts->metric->payload);
504 } else {
effdbeb3
RB
505 /* if we didn't initially know the size, we might have an odb_obj
506 * around from earlier, so convert that, otherwise load the blob now
507 */
508 if (info->odb_obj != NULL)
509 error = git_object__from_odb_object(
510 (git_object **)&info->blob, info->repo,
511 info->odb_obj, GIT_OBJ_BLOB);
512 else
513 error = git_blob_lookup(&info->blob, info->repo, &file->oid);
514
515 if (error < 0) {
516 /* if lookup fails, just skip this item in similarity calc */
517 giterr_clear();
518 } else {
a16e4172
RB
519 size_t sz;
520
521 /* index size may not be actual blob size if filtered */
522 if (file->size != git_blob_rawsize(info->blob))
523 file->size = git_blob_rawsize(info->blob);
524
525 sz = (size_t)(git__is_sizet(file->size) ? file->size : -1);
effdbeb3
RB
526
527 error = opts->metric->buffer_signature(
528 &cache[info->idx], info->file,
529 git_blob_rawcontent(info->blob), sz, opts->metric->payload);
530 }
960a04dd
RB
531 }
532
533 return error;
534}
535
effdbeb3
RB
536static void similarity_unload(similarity_info *info)
537{
538 if (info->odb_obj)
539 git_odb_object_free(info->odb_obj);
540
541 if (info->blob)
542 git_blob_free(info->blob);
543 else
544 git_buf_free(&info->data);
545}
546
a21cbb12 547#define FLAG_SET(opts,flag_name) (((opts)->flags & flag_name) != 0)
9be5be47
RB
548
549/* - score < 0 means files cannot be compared
550 * - score >= 100 means files are exact match
551 * - score == 0 means files are completely different
552 */
960a04dd 553static int similarity_measure(
9be5be47 554 int *score,
3ff1d123 555 git_diff *diff,
a21cbb12 556 const git_diff_find_options *opts,
960a04dd
RB
557 void **cache,
558 size_t a_idx,
559 size_t b_idx)
560{
960a04dd
RB
561 git_diff_file *a_file = similarity_get_file(diff, a_idx);
562 git_diff_file *b_file = similarity_get_file(diff, b_idx);
a21cbb12 563 bool exact_match = FLAG_SET(opts, GIT_DIFF_FIND_EXACT_MATCH_ONLY);
a5140f4d
RB
564 int error = 0;
565 similarity_info a_info, b_info;
9be5be47
RB
566
567 *score = -1;
960a04dd 568
9be5be47 569 /* don't try to compare files of different types */
960a04dd
RB
570 if (GIT_MODE_TYPE(a_file->mode) != GIT_MODE_TYPE(b_file->mode))
571 return 0;
572
a1683f28 573 /* if exact match is requested, force calculation of missing OIDs now */
9be5be47
RB
574 if (exact_match) {
575 if (git_oid_iszero(&a_file->oid) &&
576 diff->old_src == GIT_ITERATOR_TYPE_WORKDIR &&
577 !git_diff__oid_for_file(diff->repo, a_file->path,
578 a_file->mode, a_file->size, &a_file->oid))
579 a_file->flags |= GIT_DIFF_FLAG_VALID_OID;
580
581 if (git_oid_iszero(&b_file->oid) &&
582 diff->new_src == GIT_ITERATOR_TYPE_WORKDIR &&
583 !git_diff__oid_for_file(diff->repo, b_file->path,
584 b_file->mode, b_file->size, &b_file->oid))
585 b_file->flags |= GIT_DIFF_FLAG_VALID_OID;
586 }
587
588 /* check OID match as a quick test */
589 if (git_oid__cmp(&a_file->oid, &b_file->oid) == 0) {
590 *score = 100;
591 return 0;
592 }
593
594 /* don't calculate signatures if we are doing exact match */
595 if (exact_match) {
596 *score = 0;
597 return 0;
598 }
db106d01 599
effdbeb3
RB
600 memset(&a_info, 0, sizeof(a_info));
601 memset(&b_info, 0, sizeof(b_info));
a5140f4d 602
effdbeb3
RB
603 /* set up similarity data (will try to update missing file sizes) */
604 if (!cache[a_idx] && (error = similarity_init(&a_info, diff, a_idx)) < 0)
605 return error;
606 if (!cache[b_idx] && (error = similarity_init(&b_info, diff, b_idx)) < 0)
607 goto cleanup;
a5140f4d 608
f5c4d022 609 /* check if file sizes are nowhere near each other */
18e9efc4
RB
610 if (a_file->size > 127 &&
611 b_file->size > 127 &&
d730d3f4
RB
612 (a_file->size > (b_file->size << 3) ||
613 b_file->size > (a_file->size << 3)))
effdbeb3 614 goto cleanup;
18e9efc4 615
960a04dd 616 /* update signature cache if needed */
d730d3f4
RB
617 if (!cache[a_idx]) {
618 if ((error = similarity_sig(&a_info, opts, cache)) < 0)
619 goto cleanup;
620 }
621 if (!cache[b_idx]) {
622 if ((error = similarity_sig(&b_info, opts, cache)) < 0)
623 goto cleanup;
624 }
1fed6b07 625
a5140f4d
RB
626 /* calculate similarity provided that the metric choose to process
627 * both the a and b files (some may not if file is too big, etc).
628 */
629 if (cache[a_idx] && cache[b_idx])
630 error = opts->metric->similarity(
631 score, cache[a_idx], cache[b_idx], opts->metric->payload);
db106d01 632
effdbeb3 633cleanup:
a5140f4d
RB
634 similarity_unload(&a_info);
635 similarity_unload(&b_info);
636
637 return error;
db106d01
RB
638}
639
a21cbb12 640static int calc_self_similarity(
3ff1d123 641 git_diff *diff,
a21cbb12
RB
642 const git_diff_find_options *opts,
643 size_t delta_idx,
644 void **cache)
9be5be47 645{
a21cbb12
RB
646 int error, similarity = -1;
647 git_diff_delta *delta = GIT_VECTOR_GET(&diff->deltas, delta_idx);
648
649 if ((delta->flags & GIT_DIFF_FLAG__HAS_SELF_SIMILARITY) != 0)
650 return 0;
651
652 error = similarity_measure(
653 &similarity, diff, opts, cache, 2 * delta_idx, 2 * delta_idx + 1);
654 if (error < 0)
655 return error;
656
657 if (similarity >= 0) {
74a627f0 658 delta->similarity = (uint16_t)similarity;
a21cbb12
RB
659 delta->flags |= GIT_DIFF_FLAG__HAS_SELF_SIMILARITY;
660 }
661
662 return 0;
663}
664
665static bool is_rename_target(
3ff1d123 666 git_diff *diff,
a21cbb12
RB
667 const git_diff_find_options *opts,
668 size_t delta_idx,
669 void **cache)
670{
671 git_diff_delta *delta = GIT_VECTOR_GET(&diff->deltas, delta_idx);
672
673 /* skip things that aren't plain blobs */
674 if (!GIT_MODE_ISBLOB(delta->new_file.mode))
675 return false;
676
677 /* only consider ADDED, RENAMED, COPIED, and split MODIFIED as
678 * targets; maybe include UNTRACKED and IGNORED if requested.
679 */
680 switch (delta->status) {
681 case GIT_DELTA_UNMODIFIED:
682 case GIT_DELTA_DELETED:
683 return false;
684
685 case GIT_DELTA_MODIFIED:
686 if (!FLAG_SET(opts, GIT_DIFF_FIND_REWRITES) &&
687 !FLAG_SET(opts, GIT_DIFF_FIND_RENAMES_FROM_REWRITES))
688 return false;
689
690 if (calc_self_similarity(diff, opts, delta_idx, cache) < 0)
691 return false;
692
693 if (FLAG_SET(opts, GIT_DIFF_BREAK_REWRITES) &&
694 delta->similarity < opts->break_rewrite_threshold) {
695 delta->flags |= GIT_DIFF_FLAG__TO_SPLIT;
696 break;
697 }
698 if (FLAG_SET(opts, GIT_DIFF_FIND_RENAMES_FROM_REWRITES) &&
699 delta->similarity < opts->rename_from_rewrite_threshold)
700 break;
701
702 return false;
703
704 case GIT_DELTA_UNTRACKED:
a21cbb12
RB
705 if (!FLAG_SET(opts, GIT_DIFF_FIND_FOR_UNTRACKED))
706 return false;
707 break;
708
d55bed1a
ET
709 case GIT_DELTA_IGNORED:
710 return false;
711
a21cbb12
RB
712 default: /* all other status values should be checked */
713 break;
714 }
715
716 delta->flags |= GIT_DIFF_FLAG__IS_RENAME_TARGET;
717 return true;
718}
719
720static bool is_rename_source(
3ff1d123 721 git_diff *diff,
a21cbb12
RB
722 const git_diff_find_options *opts,
723 size_t delta_idx,
724 void **cache)
725{
726 git_diff_delta *delta = GIT_VECTOR_GET(&diff->deltas, delta_idx);
727
728 /* skip things that aren't blobs */
729 if (!GIT_MODE_ISBLOB(delta->old_file.mode))
730 return false;
731
732 switch (delta->status) {
733 case GIT_DELTA_ADDED:
734 case GIT_DELTA_UNTRACKED:
735 case GIT_DELTA_IGNORED:
736 return false;
737
738 case GIT_DELTA_DELETED:
739 case GIT_DELTA_TYPECHANGE:
740 break;
741
742 case GIT_DELTA_UNMODIFIED:
743 if (!FLAG_SET(opts, GIT_DIFF_FIND_COPIES_FROM_UNMODIFIED))
744 return false;
f62c174d 745 if (FLAG_SET(opts, GIT_DIFF_FIND_REMOVE_UNMODIFIED))
97ad85b8 746 delta->flags |= GIT_DIFF_FLAG__TO_DELETE;
a21cbb12
RB
747 break;
748
749 default: /* MODIFIED, RENAMED, COPIED */
750 /* if we're finding copies, this could be a source */
751 if (FLAG_SET(opts, GIT_DIFF_FIND_COPIES))
752 break;
753
754 /* otherwise, this is only a source if we can split it */
755 if (!FLAG_SET(opts, GIT_DIFF_FIND_REWRITES) &&
756 !FLAG_SET(opts, GIT_DIFF_FIND_RENAMES_FROM_REWRITES))
757 return false;
758
759 if (calc_self_similarity(diff, opts, delta_idx, cache) < 0)
760 return false;
761
762 if (FLAG_SET(opts, GIT_DIFF_BREAK_REWRITES) &&
763 delta->similarity < opts->break_rewrite_threshold) {
764 delta->flags |= GIT_DIFF_FLAG__TO_SPLIT;
765 break;
766 }
767
768 if (FLAG_SET(opts, GIT_DIFF_FIND_RENAMES_FROM_REWRITES) &&
769 delta->similarity < opts->rename_from_rewrite_threshold)
770 break;
771
772 return false;
773 }
774
775 delta->flags |= GIT_DIFF_FLAG__IS_RENAME_SOURCE;
776 return true;
777}
778
779GIT_INLINE(bool) delta_is_split(git_diff_delta *delta)
780{
781 return (delta->status == GIT_DELTA_TYPECHANGE ||
782 (delta->flags & GIT_DIFF_FLAG__TO_SPLIT) != 0);
783}
784
785GIT_INLINE(bool) delta_is_new_only(git_diff_delta *delta)
786{
787 return (delta->status == GIT_DELTA_ADDED ||
788 delta->status == GIT_DELTA_UNTRACKED ||
789 delta->status == GIT_DELTA_IGNORED);
9be5be47 790}
db106d01 791
e4acc3ba 792GIT_INLINE(void) delta_make_rename(
74a627f0 793 git_diff_delta *to, const git_diff_delta *from, uint16_t similarity)
e4acc3ba
RB
794{
795 to->status = GIT_DELTA_RENAMED;
796 to->similarity = similarity;
74a627f0 797 to->nfiles = 2;
e4acc3ba
RB
798 memcpy(&to->old_file, &from->old_file, sizeof(to->old_file));
799 to->flags &= ~GIT_DIFF_FLAG__TO_SPLIT;
800}
801
d958e37a 802typedef struct {
74a627f0
RB
803 size_t idx;
804 uint16_t similarity;
d958e37a
RB
805} diff_find_match;
806
db106d01 807int git_diff_find_similar(
3ff1d123 808 git_diff *diff,
10672e3e 809 const git_diff_find_options *given_opts)
db106d01 810{
d730d3f4 811 size_t s, t;
74a627f0
RB
812 int error = 0, result;
813 uint16_t similarity;
d730d3f4 814 git_diff_delta *src, *tgt;
7e3ed419 815 git_diff_find_options opts = GIT_DIFF_FIND_OPTIONS_INIT;
d730d3f4
RB
816 size_t num_deltas, num_srcs = 0, num_tgts = 0;
817 size_t tried_srcs = 0, tried_tgts = 0;
e4acc3ba 818 size_t num_rewrites = 0, num_updates = 0, num_bumped = 0;
7e3ed419 819 void **sigcache = NULL; /* cache of similarity metric file signatures */
d730d3f4
RB
820 diff_find_match *tgt2src = NULL;
821 diff_find_match *src2tgt = NULL;
822 diff_find_match *tgt2src_copy = NULL;
823 diff_find_match *best_match;
e4acc3ba 824 git_diff_file swap;
db106d01 825
960a04dd 826 if ((error = normalize_find_opts(diff, &opts, given_opts)) < 0)
628e92cd
BS
827 return error;
828
d730d3f4
RB
829 num_deltas = diff->deltas.length;
830
a21cbb12 831 /* TODO: maybe abort if deltas.length > rename_limit ??? */
d730d3f4 832 if (!git__is_uint32(num_deltas))
7e3ed419
RB
833 goto cleanup;
834
835 /* No flags set; nothing to do */
836 if ((opts.flags & GIT_DIFF_FIND_ALL) == 0)
837 goto cleanup;
960a04dd 838
d730d3f4 839 sigcache = git__calloc(num_deltas * 2, sizeof(void *));
e4acc3ba
RB
840 GITERR_CHECK_ALLOC(sigcache);
841
842 /* Label rename sources and targets
843 *
844 * This will also set self-similarity scores for MODIFIED files and
845 * mark them for splitting if break-rewrites is enabled
846 */
d730d3f4
RB
847 git_vector_foreach(&diff->deltas, t, tgt) {
848 if (is_rename_source(diff, &opts, t, sigcache))
e4acc3ba
RB
849 ++num_srcs;
850
d730d3f4 851 if (is_rename_target(diff, &opts, t, sigcache))
e4acc3ba 852 ++num_tgts;
17c7fbf6
ET
853
854 if ((tgt->flags & GIT_DIFF_FLAG__TO_SPLIT) != 0)
855 num_rewrites++;
e4acc3ba 856 }
960a04dd 857
e4acc3ba
RB
858 /* if there are no candidate srcs or tgts, we're done */
859 if (!num_srcs || !num_tgts)
860 goto cleanup;
960a04dd 861
d730d3f4
RB
862 src2tgt = git__calloc(num_deltas, sizeof(diff_find_match));
863 GITERR_CHECK_ALLOC(src2tgt);
864 tgt2src = git__calloc(num_deltas, sizeof(diff_find_match));
865 GITERR_CHECK_ALLOC(tgt2src);
866
867 if (FLAG_SET(&opts, GIT_DIFF_FIND_COPIES)) {
868 tgt2src_copy = git__calloc(num_deltas, sizeof(diff_find_match));
869 GITERR_CHECK_ALLOC(tgt2src_copy);
870 }
db106d01 871
e4acc3ba
RB
872 /*
873 * Find best-fit matches for rename / copy candidates
874 */
d958e37a 875
e4acc3ba
RB
876find_best_matches:
877 tried_tgts = num_bumped = 0;
db106d01 878
d730d3f4 879 git_vector_foreach(&diff->deltas, t, tgt) {
a21cbb12 880 /* skip things that are not rename targets */
d730d3f4 881 if ((tgt->flags & GIT_DIFF_FLAG__IS_RENAME_TARGET) == 0)
d958e37a
RB
882 continue;
883
e4acc3ba 884 tried_srcs = 0;
db106d01 885
d730d3f4 886 git_vector_foreach(&diff->deltas, s, src) {
a21cbb12 887 /* skip things that are not rename sources */
d730d3f4 888 if ((src->flags & GIT_DIFF_FLAG__IS_RENAME_SOURCE) == 0)
960a04dd
RB
889 continue;
890
d958e37a 891 /* calculate similarity for this pair and find best match */
d730d3f4 892 if (s == t)
74a627f0 893 result = -1; /* don't measure self-similarity here */
e4acc3ba 894 else if ((error = similarity_measure(
74a627f0 895 &result, diff, &opts, sigcache, 2 * s, 2 * t + 1)) < 0)
960a04dd 896 goto cleanup;
a21cbb12 897
74a627f0 898 if (result < 0)
d730d3f4 899 continue;
74a627f0 900 similarity = (uint16_t)result;
d730d3f4
RB
901
902 /* is this a better rename? */
74a627f0
RB
903 if (tgt2src[t].similarity < similarity &&
904 src2tgt[s].similarity < similarity)
e4acc3ba 905 {
d730d3f4
RB
906 /* eject old mapping */
907 if (src2tgt[s].similarity > 0) {
908 tgt2src[src2tgt[s].idx].similarity = 0;
909 num_bumped++;
910 }
911 if (tgt2src[t].similarity > 0) {
912 src2tgt[tgt2src[t].idx].similarity = 0;
913 num_bumped++;
e4acc3ba
RB
914 }
915
d730d3f4 916 /* write new mapping */
74a627f0
RB
917 tgt2src[t].idx = s;
918 tgt2src[t].similarity = similarity;
919 src2tgt[s].idx = t;
920 src2tgt[s].similarity = similarity;
d730d3f4 921 }
a21cbb12 922
d730d3f4
RB
923 /* keep best absolute match for copies */
924 if (tgt2src_copy != NULL &&
74a627f0 925 tgt2src_copy[t].similarity < similarity)
d730d3f4 926 {
74a627f0
RB
927 tgt2src_copy[t].idx = s;
928 tgt2src_copy[t].similarity = similarity;
db106d01 929 }
e4acc3ba
RB
930
931 if (++tried_srcs >= num_srcs)
932 break;
933
d730d3f4 934 /* cap on maximum targets we'll examine (per "tgt" file) */
e4acc3ba
RB
935 if (tried_srcs > opts.rename_limit)
936 break;
db106d01 937 }
e4acc3ba
RB
938
939 if (++tried_tgts >= num_tgts)
940 break;
db106d01
RB
941 }
942
e4acc3ba
RB
943 if (num_bumped > 0) /* try again if we bumped some items */
944 goto find_best_matches;
945
946 /*
947 * Rewrite the diffs with renames / copies
948 */
949
950 tried_tgts = 0;
db106d01 951
d730d3f4 952 git_vector_foreach(&diff->deltas, t, tgt) {
e4acc3ba 953 /* skip things that are not rename targets */
d730d3f4 954 if ((tgt->flags & GIT_DIFF_FLAG__IS_RENAME_TARGET) == 0)
a21cbb12 955 continue;
690bf41c 956
e4acc3ba 957 /* check if this delta was the target of a similarity */
d730d3f4
RB
958 if (tgt2src[t].similarity)
959 best_match = &tgt2src[t];
960 else if (tgt2src_copy && tgt2src_copy[t].similarity)
961 best_match = &tgt2src_copy[t];
962 else
e4acc3ba 963 continue;
d958e37a 964
d730d3f4
RB
965 s = best_match->idx;
966 src = GIT_VECTOR_GET(&diff->deltas, s);
d958e37a 967
a21cbb12
RB
968 /* possible scenarios:
969 * 1. from DELETE to ADD/UNTRACK/IGNORE = RENAME
970 * 2. from DELETE to SPLIT/TYPECHANGE = RENAME + DELETE
971 * 3. from SPLIT/TYPECHANGE to ADD/UNTRACK/IGNORE = ADD + RENAME
972 * 4. from SPLIT/TYPECHANGE to SPLIT/TYPECHANGE = RENAME + SPLIT
973 * 5. from OTHER to ADD/UNTRACK/IGNORE = OTHER + COPY
db106d01
RB
974 */
975
d730d3f4 976 if (src->status == GIT_DELTA_DELETED) {
db106d01 977
d730d3f4 978 if (delta_is_new_only(tgt)) {
db106d01 979
e4acc3ba 980 if (best_match->similarity < opts.rename_threshold)
a21cbb12 981 continue;
960a04dd 982
d730d3f4 983 delta_make_rename(tgt, src, best_match->similarity);
d958e37a 984
d730d3f4 985 src->flags |= GIT_DIFF_FLAG__TO_DELETE;
a21cbb12
RB
986 num_rewrites++;
987 } else {
d730d3f4 988 assert(delta_is_split(tgt));
960a04dd 989
e4acc3ba 990 if (best_match->similarity < opts.rename_from_rewrite_threshold)
a21cbb12 991 continue;
db106d01 992
d730d3f4 993 memcpy(&swap, &tgt->old_file, sizeof(swap));
db106d01 994
d730d3f4 995 delta_make_rename(tgt, src, best_match->similarity);
e4acc3ba
RB
996 num_rewrites--;
997
74a627f0 998 assert(src->status == GIT_DELTA_DELETED);
d730d3f4
RB
999 memcpy(&src->old_file, &swap, sizeof(src->old_file));
1000 memset(&src->new_file, 0, sizeof(src->new_file));
1001 src->new_file.path = src->old_file.path;
1002 src->new_file.flags |= GIT_DIFF_FLAG_VALID_OID;
db106d01 1003
d958e37a 1004 num_updates++;
7edb74d3
RB
1005
1006 if (src2tgt[t].similarity > 0 && src2tgt[t].idx > t) {
1007 /* what used to be at src t is now at src s */
74a627f0 1008 tgt2src[src2tgt[t].idx].idx = s;
7edb74d3 1009 }
db106d01
RB
1010 }
1011 }
1012
d730d3f4 1013 else if (delta_is_split(src)) {
a21cbb12 1014
d730d3f4 1015 if (delta_is_new_only(tgt)) {
db106d01 1016
e4acc3ba 1017 if (best_match->similarity < opts.rename_threshold)
a21cbb12 1018 continue;
d958e37a 1019
d730d3f4 1020 delta_make_rename(tgt, src, best_match->similarity);
a21cbb12 1021
d730d3f4 1022 src->status = (diff->new_src == GIT_ITERATOR_TYPE_WORKDIR) ?
a21cbb12 1023 GIT_DELTA_UNTRACKED : GIT_DELTA_ADDED;
74a627f0 1024 src->nfiles = 1;
d730d3f4
RB
1025 memset(&src->old_file, 0, sizeof(src->old_file));
1026 src->old_file.path = src->new_file.path;
1027 src->old_file.flags |= GIT_DIFF_FLAG_VALID_OID;
e4acc3ba 1028
d730d3f4 1029 src->flags &= ~GIT_DIFF_FLAG__TO_SPLIT;
e4acc3ba 1030 num_rewrites--;
a21cbb12
RB
1031
1032 num_updates++;
1033 } else {
d730d3f4 1034 assert(delta_is_split(src));
a21cbb12 1035
e4acc3ba 1036 if (best_match->similarity < opts.rename_from_rewrite_threshold)
a21cbb12
RB
1037 continue;
1038
d730d3f4 1039 memcpy(&swap, &tgt->old_file, sizeof(swap));
a21cbb12 1040
d730d3f4 1041 delta_make_rename(tgt, src, best_match->similarity);
e4acc3ba
RB
1042 num_rewrites--;
1043 num_updates++;
a21cbb12 1044
d730d3f4 1045 memcpy(&src->old_file, &swap, sizeof(src->old_file));
a21cbb12 1046
e4acc3ba
RB
1047 /* if we've just swapped the new element into the correct
1048 * place, clear the SPLIT flag
67db583d 1049 */
d730d3f4
RB
1050 if (tgt2src[s].idx == t &&
1051 tgt2src[s].similarity >
67db583d 1052 opts.rename_from_rewrite_threshold) {
d730d3f4
RB
1053 src->status = GIT_DELTA_RENAMED;
1054 src->similarity = tgt2src[s].similarity;
1055 tgt2src[s].similarity = 0;
1056 src->flags &= ~GIT_DIFF_FLAG__TO_SPLIT;
67db583d
RB
1057 num_rewrites--;
1058 }
e4acc3ba 1059 /* otherwise, if we just overwrote a source, update mapping */
7edb74d3 1060 else if (src2tgt[t].similarity > 0 && src2tgt[t].idx > t) {
d730d3f4 1061 /* what used to be at src t is now at src s */
74a627f0 1062 tgt2src[src2tgt[t].idx].idx = s;
e4acc3ba 1063 }
67db583d 1064
a21cbb12
RB
1065 num_updates++;
1066 }
1067 }
1068
2123a17f 1069 else if (FLAG_SET(&opts, GIT_DIFF_FIND_COPIES)) {
d730d3f4 1070 if (tgt2src_copy[t].similarity < opts.copy_threshold)
a21cbb12
RB
1071 continue;
1072
d730d3f4
RB
1073 /* always use best possible source for copy */
1074 best_match = &tgt2src_copy[t];
1075 src = GIT_VECTOR_GET(&diff->deltas, best_match->idx);
1076
2123a17f
RB
1077 if (delta_is_split(tgt)) {
1078 error = insert_delete_side_of_split(diff, &diff->deltas, tgt);
1079 if (error < 0)
1080 goto cleanup;
1081 num_rewrites--;
1082 }
1083
1084 if (!delta_is_split(tgt) && !delta_is_new_only(tgt))
1085 continue;
1086
d730d3f4
RB
1087 tgt->status = GIT_DELTA_COPIED;
1088 tgt->similarity = best_match->similarity;
74a627f0 1089 tgt->nfiles = 2;
d730d3f4 1090 memcpy(&tgt->old_file, &src->old_file, sizeof(tgt->old_file));
2123a17f 1091 tgt->flags &= ~GIT_DIFF_FLAG__TO_SPLIT;
a21cbb12
RB
1092
1093 num_updates++;
1094 }
db106d01
RB
1095 }
1096
e4acc3ba
RB
1097 /*
1098 * Actually split and delete entries as needed
1099 */
1100
a21cbb12 1101 if (num_rewrites > 0 || num_updates > 0)
960a04dd 1102 error = apply_splits_and_deletes(
d958e37a 1103 diff, diff->deltas.length - num_rewrites,
17c7fbf6
ET
1104 FLAG_SET(&opts, GIT_DIFF_BREAK_REWRITES) &&
1105 !FLAG_SET(&opts, GIT_DIFF_BREAK_REWRITES_FOR_RENAMES_ONLY));
d958e37a 1106
960a04dd 1107cleanup:
d730d3f4
RB
1108 git__free(tgt2src);
1109 git__free(src2tgt);
1110 git__free(tgt2src_copy);
db106d01 1111
7e3ed419
RB
1112 if (sigcache) {
1113 for (t = 0; t < num_deltas * 2; ++t) {
1114 if (sigcache[t] != NULL)
1115 opts.metric->free_signature(sigcache[t], opts.metric->payload);
1116 }
1117 git__free(sigcache);
db106d01
RB
1118 }
1119
f8275890
RB
1120 if (!given_opts || !given_opts->metric)
1121 git__free(opts.metric);
1122
960a04dd 1123 return error;
db106d01
RB
1124}
1125
1126#undef FLAG_SET