]>
Commit | Line | Data |
---|---|---|
db106d01 RB |
1 | /* |
2 | * Copyright (C) 2012 the libgit2 contributors | |
3 | * | |
4 | * This file is part of libgit2, distributed under the GNU GPL v2 with | |
5 | * a Linking Exception. For full terms see the included COPYING file. | |
6 | */ | |
7 | #include "common.h" | |
8 | #include "diff.h" | |
9 | #include "git2/config.h" | |
10 | ||
11 | static git_diff_delta *diff_delta__dup( | |
12 | const git_diff_delta *d, git_pool *pool) | |
13 | { | |
14 | git_diff_delta *delta = git__malloc(sizeof(git_diff_delta)); | |
15 | if (!delta) | |
16 | return NULL; | |
17 | ||
18 | memcpy(delta, d, sizeof(git_diff_delta)); | |
19 | ||
20 | delta->old_file.path = git_pool_strdup(pool, d->old_file.path); | |
21 | if (delta->old_file.path == NULL) | |
22 | goto fail; | |
23 | ||
24 | if (d->new_file.path != d->old_file.path) { | |
25 | delta->new_file.path = git_pool_strdup(pool, d->new_file.path); | |
26 | if (delta->new_file.path == NULL) | |
27 | goto fail; | |
28 | } else { | |
29 | delta->new_file.path = delta->old_file.path; | |
30 | } | |
31 | ||
32 | return delta; | |
33 | ||
34 | fail: | |
35 | git__free(delta); | |
36 | return NULL; | |
37 | } | |
38 | ||
39 | static git_diff_delta *diff_delta__merge_like_cgit( | |
40 | const git_diff_delta *a, const git_diff_delta *b, git_pool *pool) | |
41 | { | |
42 | git_diff_delta *dup; | |
43 | ||
44 | /* Emulate C git for merging two diffs (a la 'git diff <sha>'). | |
45 | * | |
46 | * When C git does a diff between the work dir and a tree, it actually | |
47 | * diffs with the index but uses the workdir contents. This emulates | |
48 | * those choices so we can emulate the type of diff. | |
49 | * | |
50 | * We have three file descriptions here, let's call them: | |
51 | * f1 = a->old_file | |
52 | * f2 = a->new_file AND b->old_file | |
53 | * f3 = b->new_file | |
54 | */ | |
55 | ||
56 | /* if f2 == f3 or f2 is deleted, then just dup the 'a' diff */ | |
57 | if (b->status == GIT_DELTA_UNMODIFIED || a->status == GIT_DELTA_DELETED) | |
58 | return diff_delta__dup(a, pool); | |
59 | ||
60 | /* otherwise, base this diff on the 'b' diff */ | |
61 | if ((dup = diff_delta__dup(b, pool)) == NULL) | |
62 | return NULL; | |
63 | ||
64 | /* If 'a' status is uninteresting, then we're done */ | |
65 | if (a->status == GIT_DELTA_UNMODIFIED) | |
66 | return dup; | |
67 | ||
68 | assert(a->status != GIT_DELTA_UNMODIFIED); | |
69 | assert(b->status != GIT_DELTA_UNMODIFIED); | |
70 | ||
71 | /* A cgit exception is that the diff of a file that is only in the | |
72 | * index (i.e. not in HEAD nor workdir) is given as empty. | |
73 | */ | |
74 | if (dup->status == GIT_DELTA_DELETED) { | |
75 | if (a->status == GIT_DELTA_ADDED) | |
76 | dup->status = GIT_DELTA_UNMODIFIED; | |
77 | /* else don't overwrite DELETE status */ | |
78 | } else { | |
79 | dup->status = a->status; | |
80 | } | |
81 | ||
82 | git_oid_cpy(&dup->old_file.oid, &a->old_file.oid); | |
83 | dup->old_file.mode = a->old_file.mode; | |
84 | dup->old_file.size = a->old_file.size; | |
85 | dup->old_file.flags = a->old_file.flags; | |
86 | ||
87 | return dup; | |
88 | } | |
89 | ||
90 | int git_diff_merge( | |
91 | git_diff_list *onto, | |
92 | const git_diff_list *from) | |
93 | { | |
94 | int error = 0; | |
95 | git_pool onto_pool; | |
96 | git_vector onto_new; | |
97 | git_diff_delta *delta; | |
98 | bool ignore_case = false; | |
99 | unsigned int i, j; | |
100 | ||
101 | assert(onto && from); | |
102 | ||
103 | if (!from->deltas.length) | |
104 | return 0; | |
105 | ||
106 | if (git_vector_init( | |
107 | &onto_new, onto->deltas.length, git_diff_delta__cmp) < 0 || | |
108 | git_pool_init(&onto_pool, 1, 0) < 0) | |
109 | return -1; | |
110 | ||
111 | if ((onto->opts.flags & GIT_DIFF_DELTAS_ARE_ICASE) != 0 || | |
112 | (from->opts.flags & GIT_DIFF_DELTAS_ARE_ICASE) != 0) | |
113 | { | |
114 | ignore_case = true; | |
115 | ||
116 | /* This function currently only supports merging diff lists that | |
117 | * are sorted identically. */ | |
118 | assert((onto->opts.flags & GIT_DIFF_DELTAS_ARE_ICASE) != 0 && | |
119 | (from->opts.flags & GIT_DIFF_DELTAS_ARE_ICASE) != 0); | |
120 | } | |
121 | ||
122 | for (i = 0, j = 0; i < onto->deltas.length || j < from->deltas.length; ) { | |
123 | git_diff_delta *o = GIT_VECTOR_GET(&onto->deltas, i); | |
124 | const git_diff_delta *f = GIT_VECTOR_GET(&from->deltas, j); | |
125 | int cmp = !f ? -1 : !o ? 1 : STRCMP_CASESELECT(ignore_case, o->old_file.path, f->old_file.path); | |
126 | ||
127 | if (cmp < 0) { | |
128 | delta = diff_delta__dup(o, &onto_pool); | |
129 | i++; | |
130 | } else if (cmp > 0) { | |
131 | delta = diff_delta__dup(f, &onto_pool); | |
132 | j++; | |
133 | } else { | |
134 | delta = diff_delta__merge_like_cgit(o, f, &onto_pool); | |
135 | i++; | |
136 | j++; | |
137 | } | |
138 | ||
139 | /* the ignore rules for the target may not match the source | |
140 | * or the result of a merged delta could be skippable... | |
141 | */ | |
142 | if (git_diff_delta__should_skip(&onto->opts, delta)) { | |
143 | git__free(delta); | |
144 | continue; | |
145 | } | |
146 | ||
147 | if ((error = !delta ? -1 : git_vector_insert(&onto_new, delta)) < 0) | |
148 | break; | |
149 | } | |
150 | ||
151 | if (!error) { | |
152 | git_vector_swap(&onto->deltas, &onto_new); | |
153 | git_pool_swap(&onto->pool, &onto_pool); | |
154 | onto->new_src = from->new_src; | |
155 | ||
156 | /* prefix strings also come from old pool, so recreate those.*/ | |
157 | onto->opts.old_prefix = | |
158 | git_pool_strdup_safe(&onto->pool, onto->opts.old_prefix); | |
159 | onto->opts.new_prefix = | |
160 | git_pool_strdup_safe(&onto->pool, onto->opts.new_prefix); | |
161 | } | |
162 | ||
163 | git_vector_foreach(&onto_new, i, delta) | |
164 | git__free(delta); | |
165 | git_vector_free(&onto_new); | |
166 | git_pool_clear(&onto_pool); | |
167 | ||
168 | return error; | |
169 | } | |
170 | ||
171 | #define DEFAULT_THRESHOLD 50 | |
172 | #define DEFAULT_BREAK_REWRITE_THRESHOLD 60 | |
173 | #define DEFAULT_TARGET_LIMIT 200 | |
174 | ||
175 | static int normalize_find_opts( | |
176 | git_diff_list *diff, | |
177 | git_diff_find_options *opts, | |
178 | git_diff_find_options *given) | |
179 | { | |
180 | git_config *cfg = NULL; | |
181 | const char *val; | |
182 | ||
183 | if (diff->repo != NULL && | |
184 | git_repository_config__weakptr(&cfg, diff->repo) < 0) | |
185 | return -1; | |
186 | ||
187 | if (given != NULL) | |
188 | memcpy(opts, given, sizeof(*opts)); | |
189 | else { | |
ca901e7b BS |
190 | git_diff_find_options init = GIT_DIFF_FIND_OPTIONS_INIT; |
191 | memmove(opts, &init, sizeof(init)); | |
db106d01 RB |
192 | |
193 | opts->flags = GIT_DIFF_FIND_RENAMES; | |
194 | ||
195 | if (git_config_get_string(&val, cfg, "diff.renames") < 0) | |
196 | giterr_clear(); | |
197 | else if (val && | |
198 | (!strcasecmp(val, "copies") || !strcasecmp(val, "copy"))) | |
199 | opts->flags = GIT_DIFF_FIND_RENAMES | GIT_DIFF_FIND_COPIES; | |
200 | } | |
201 | ||
c7231c45 | 202 | GITERR_CHECK_VERSION(opts, GIT_DIFF_FIND_OPTIONS_VERSION, "git_diff_find_options"); |
ca901e7b | 203 | |
db106d01 RB |
204 | /* some flags imply others */ |
205 | ||
206 | if (opts->flags & GIT_DIFF_FIND_RENAMES_FROM_REWRITES) | |
207 | opts->flags |= GIT_DIFF_FIND_RENAMES; | |
208 | ||
209 | if (opts->flags & GIT_DIFF_FIND_COPIES_FROM_UNMODIFIED) | |
210 | opts->flags |= GIT_DIFF_FIND_COPIES; | |
211 | ||
212 | #define USE_DEFAULT(X) ((X) == 0 || (X) > 100) | |
213 | ||
214 | if (USE_DEFAULT(opts->rename_threshold)) | |
215 | opts->rename_threshold = DEFAULT_THRESHOLD; | |
216 | ||
217 | if (USE_DEFAULT(opts->rename_from_rewrite_threshold)) | |
218 | opts->rename_from_rewrite_threshold = DEFAULT_THRESHOLD; | |
219 | ||
220 | if (USE_DEFAULT(opts->copy_threshold)) | |
221 | opts->copy_threshold = DEFAULT_THRESHOLD; | |
222 | ||
223 | if (USE_DEFAULT(opts->break_rewrite_threshold)) | |
224 | opts->break_rewrite_threshold = DEFAULT_BREAK_REWRITE_THRESHOLD; | |
225 | ||
226 | #undef USE_DEFAULT | |
227 | ||
228 | if (!opts->target_limit) { | |
229 | int32_t limit = 0; | |
230 | ||
231 | opts->target_limit = DEFAULT_TARGET_LIMIT; | |
232 | ||
233 | if (git_config_get_int32(&limit, cfg, "diff.renameLimit") < 0) | |
234 | giterr_clear(); | |
235 | else if (limit > 0) | |
236 | opts->target_limit = limit; | |
237 | } | |
238 | ||
239 | return 0; | |
240 | } | |
241 | ||
242 | static int apply_splits_and_deletes(git_diff_list *diff, size_t expected_size) | |
243 | { | |
244 | git_vector onto = GIT_VECTOR_INIT; | |
245 | size_t i; | |
246 | git_diff_delta *delta; | |
247 | ||
248 | if (git_vector_init(&onto, expected_size, git_diff_delta__cmp) < 0) | |
249 | return -1; | |
250 | ||
251 | /* build new delta list without TO_DELETE and splitting TO_SPLIT */ | |
252 | git_vector_foreach(&diff->deltas, i, delta) { | |
253 | if (delta->status == GIT_DELTA__TO_DELETE) { | |
254 | git__free(delta); | |
255 | continue; | |
256 | } | |
257 | ||
258 | if (delta->status == GIT_DELTA__TO_SPLIT) { | |
259 | git_diff_delta *deleted = diff_delta__dup(delta, &diff->pool); | |
260 | if (!deleted) | |
261 | return -1; | |
262 | ||
263 | deleted->status = GIT_DELTA_DELETED; | |
264 | memset(&deleted->new_file, 0, sizeof(deleted->new_file)); | |
265 | deleted->new_file.path = deleted->old_file.path; | |
266 | deleted->new_file.flags |= GIT_DIFF_FILE_VALID_OID; | |
267 | ||
268 | git_vector_insert(&onto, deleted); | |
269 | ||
270 | delta->status = GIT_DELTA_ADDED; | |
271 | memset(&delta->old_file, 0, sizeof(delta->old_file)); | |
272 | delta->old_file.path = delta->new_file.path; | |
273 | delta->old_file.flags |= GIT_DIFF_FILE_VALID_OID; | |
274 | } | |
275 | ||
276 | git_vector_insert(&onto, delta); | |
277 | } | |
278 | ||
279 | /* swap new delta list into place */ | |
280 | git_vector_sort(&onto); | |
281 | git_vector_swap(&diff->deltas, &onto); | |
282 | git_vector_free(&onto); | |
283 | ||
284 | return 0; | |
285 | } | |
286 | ||
287 | static unsigned int calc_similarity( | |
288 | void *cache, git_diff_file *old_file, git_diff_file *new_file) | |
289 | { | |
290 | GIT_UNUSED(cache); | |
291 | ||
292 | if (git_oid_cmp(&old_file->oid, &new_file->oid) == 0) | |
293 | return 100; | |
294 | ||
295 | /* TODO: insert actual similarity algo here */ | |
296 | ||
297 | return 0; | |
298 | } | |
299 | ||
300 | #define FLAG_SET(opts,flag_name) ((opts.flags & flag_name) != 0) | |
301 | ||
302 | int git_diff_find_similar( | |
303 | git_diff_list *diff, | |
304 | git_diff_find_options *given_opts) | |
305 | { | |
306 | unsigned int i, j, similarity; | |
307 | git_diff_delta *from, *to; | |
308 | git_diff_find_options opts; | |
309 | unsigned int tried_targets, num_changes = 0; | |
310 | git_vector matches = GIT_VECTOR_INIT; | |
311 | ||
312 | if (normalize_find_opts(diff, &opts, given_opts) < 0) | |
313 | return -1; | |
314 | ||
315 | /* first do splits if requested */ | |
316 | ||
317 | if (FLAG_SET(opts, GIT_DIFF_FIND_AND_BREAK_REWRITES)) { | |
318 | git_vector_foreach(&diff->deltas, i, from) { | |
319 | if (from->status != GIT_DELTA_MODIFIED) | |
320 | continue; | |
321 | ||
322 | /* Right now, this doesn't work right because the similarity | |
323 | * algorithm isn't actually implemented... | |
324 | */ | |
325 | similarity = 100; | |
326 | /* calc_similarity(NULL, &from->old_file, from->new_file); */ | |
327 | ||
328 | if (similarity < opts.break_rewrite_threshold) { | |
329 | from->status = GIT_DELTA__TO_SPLIT; | |
330 | num_changes++; | |
331 | } | |
332 | } | |
333 | ||
334 | /* apply splits as needed */ | |
335 | if (num_changes > 0 && | |
336 | apply_splits_and_deletes( | |
337 | diff, diff->deltas.length + num_changes) < 0) | |
338 | return -1; | |
339 | } | |
340 | ||
341 | /* next find the most similar delta for each rename / copy candidate */ | |
342 | ||
343 | if (git_vector_init(&matches, diff->deltas.length, git_diff_delta__cmp) < 0) | |
344 | return -1; | |
345 | ||
346 | git_vector_foreach(&diff->deltas, i, from) { | |
347 | tried_targets = 0; | |
348 | ||
349 | git_vector_foreach(&diff->deltas, j, to) { | |
350 | if (i == j) | |
351 | continue; | |
352 | ||
353 | switch (to->status) { | |
354 | case GIT_DELTA_ADDED: | |
355 | case GIT_DELTA_UNTRACKED: | |
356 | case GIT_DELTA_RENAMED: | |
357 | case GIT_DELTA_COPIED: | |
358 | break; | |
359 | default: | |
360 | /* only the above status values should be checked */ | |
361 | continue; | |
362 | } | |
363 | ||
364 | /* skip all but DELETED files unless copy detection is on */ | |
365 | if (from->status != GIT_DELTA_DELETED && | |
366 | !FLAG_SET(opts, GIT_DIFF_FIND_COPIES)) | |
367 | continue; | |
368 | ||
369 | /* don't check UNMODIFIED files as source unless given option */ | |
370 | if (from->status == GIT_DELTA_UNMODIFIED && | |
371 | !FLAG_SET(opts, GIT_DIFF_FIND_COPIES_FROM_UNMODIFIED)) | |
372 | continue; | |
373 | ||
374 | /* cap on maximum files we'll examine */ | |
375 | if (++tried_targets > opts.target_limit) | |
376 | break; | |
377 | ||
378 | /* calculate similarity and see if this pair beats the | |
379 | * similarity score of the current best pair. | |
380 | */ | |
381 | similarity = calc_similarity(NULL, &from->old_file, &to->new_file); | |
382 | ||
383 | if (to->similarity < similarity) { | |
384 | to->similarity = similarity; | |
385 | if (git_vector_set(NULL, &matches, j, from) < 0) | |
386 | return -1; | |
387 | } | |
388 | } | |
389 | } | |
390 | ||
391 | /* next rewrite the diffs with renames / copies */ | |
392 | ||
393 | num_changes = 0; | |
394 | ||
395 | git_vector_foreach(&diff->deltas, j, to) { | |
396 | from = GIT_VECTOR_GET(&matches, j); | |
397 | if (!from) { | |
398 | assert(to->similarity == 0); | |
399 | continue; | |
400 | } | |
401 | ||
402 | /* three possible outcomes here: | |
403 | * 1. old DELETED and if over rename threshold, | |
404 | * new becomes RENAMED and old goes away | |
405 | * 2. old was MODIFIED but FIND_RENAMES_FROM_REWRITES is on and | |
406 | * old is more similar to new than it is to itself, in which | |
407 | * case, new becomes RENAMED and old becomed ADDED | |
408 | * 3. otherwise if over copy threshold, new becomes COPIED | |
409 | */ | |
410 | ||
411 | if (from->status == GIT_DELTA_DELETED) { | |
412 | if (to->similarity < opts.rename_threshold) { | |
413 | to->similarity = 0; | |
414 | continue; | |
415 | } | |
416 | ||
417 | to->status = GIT_DELTA_RENAMED; | |
418 | memcpy(&to->old_file, &from->old_file, sizeof(to->old_file)); | |
419 | ||
420 | from->status = GIT_DELTA__TO_DELETE; | |
421 | num_changes++; | |
422 | ||
423 | continue; | |
424 | } | |
425 | ||
426 | if (from->status == GIT_DELTA_MODIFIED && | |
427 | FLAG_SET(opts, GIT_DIFF_FIND_RENAMES_FROM_REWRITES) && | |
428 | to->similarity > opts.rename_threshold) | |
429 | { | |
430 | similarity = 100; | |
431 | /* calc_similarity(NULL, &from->old_file, from->new_file); */ | |
432 | ||
433 | if (similarity < opts.rename_from_rewrite_threshold) { | |
434 | to->status = GIT_DELTA_RENAMED; | |
435 | memcpy(&to->old_file, &from->old_file, sizeof(to->old_file)); | |
436 | ||
437 | from->status = GIT_DELTA_ADDED; | |
438 | memset(&from->old_file, 0, sizeof(from->old_file)); | |
439 | from->old_file.path = to->old_file.path; | |
440 | from->old_file.flags |= GIT_DIFF_FILE_VALID_OID; | |
441 | ||
442 | continue; | |
443 | } | |
444 | } | |
445 | ||
446 | if (to->similarity < opts.copy_threshold) { | |
447 | to->similarity = 0; | |
448 | continue; | |
449 | } | |
450 | ||
451 | /* convert "to" to a COPIED record */ | |
452 | to->status = GIT_DELTA_COPIED; | |
453 | memcpy(&to->old_file, &from->old_file, sizeof(to->old_file)); | |
454 | } | |
455 | ||
456 | git_vector_free(&matches); | |
457 | ||
458 | if (num_changes > 0) { | |
459 | assert(num_changes < diff->deltas.length); | |
460 | ||
461 | if (apply_splits_and_deletes( | |
462 | diff, diff->deltas.length - num_changes) < 0) | |
463 | return -1; | |
464 | } | |
465 | ||
466 | return 0; | |
467 | } | |
468 | ||
469 | #undef FLAG_SET |