]> git.proxmox.com Git - libgit2.git/blob - src/hashsig.c
Merge branch 'development'
[libgit2.git] / src / hashsig.c
1 /*
2 * Copyright (C) the libgit2 contributors. All rights reserved.
3 *
4 * This file is part of libgit2, distributed under the GNU GPL v2 with
5 * a Linking Exception. For full terms see the included COPYING file.
6 */
7 #include "hashsig.h"
8 #include "fileops.h"
9 #include "util.h"
10
11 typedef uint32_t hashsig_t;
12 typedef uint64_t hashsig_state;
13
14 #define HASHSIG_SCALE 100
15
16 #define HASHSIG_HASH_WINDOW 32
17 #define HASHSIG_HASH_START 0
18 #define HASHSIG_HASH_SHIFT 5
19 #define HASHSIG_HASH_MASK 0x7FFFFFFF
20
21 #define HASHSIG_HEAP_SIZE ((1 << 7) - 1)
22
23 typedef int (*hashsig_cmp)(const void *a, const void *b, void *);
24
25 typedef struct {
26 int size, asize;
27 hashsig_cmp cmp;
28 hashsig_t values[HASHSIG_HEAP_SIZE];
29 } hashsig_heap;
30
31 typedef struct {
32 hashsig_state state, shift_n;
33 char window[HASHSIG_HASH_WINDOW];
34 int win_len, win_pos, saw_lf;
35 } hashsig_in_progress;
36
37 #define HASHSIG_IN_PROGRESS_INIT { HASHSIG_HASH_START, 1, {0}, 0, 0, 1 }
38
39 struct git_hashsig {
40 hashsig_heap mins;
41 hashsig_heap maxs;
42 git_hashsig_option_t opt;
43 int considered;
44 };
45
46 #define HEAP_LCHILD_OF(I) (((I)*2)+1)
47 #define HEAP_RCHILD_OF(I) (((I)*2)+2)
48 #define HEAP_PARENT_OF(I) (((I)-1)>>1)
49
50 static void hashsig_heap_init(hashsig_heap *h, hashsig_cmp cmp)
51 {
52 h->size = 0;
53 h->asize = HASHSIG_HEAP_SIZE;
54 h->cmp = cmp;
55 }
56
57 static int hashsig_cmp_max(const void *a, const void *b, void *payload)
58 {
59 hashsig_t av = *(const hashsig_t *)a, bv = *(const hashsig_t *)b;
60 GIT_UNUSED(payload);
61 return (av < bv) ? -1 : (av > bv) ? 1 : 0;
62 }
63
64 static int hashsig_cmp_min(const void *a, const void *b, void *payload)
65 {
66 hashsig_t av = *(const hashsig_t *)a, bv = *(const hashsig_t *)b;
67 GIT_UNUSED(payload);
68 return (av > bv) ? -1 : (av < bv) ? 1 : 0;
69 }
70
71 static void hashsig_heap_up(hashsig_heap *h, int el)
72 {
73 int parent_el = HEAP_PARENT_OF(el);
74
75 while (el > 0 && h->cmp(&h->values[parent_el], &h->values[el], NULL) > 0) {
76 hashsig_t t = h->values[el];
77 h->values[el] = h->values[parent_el];
78 h->values[parent_el] = t;
79
80 el = parent_el;
81 parent_el = HEAP_PARENT_OF(el);
82 }
83 }
84
85 static void hashsig_heap_down(hashsig_heap *h, int el)
86 {
87 hashsig_t v, lv, rv;
88
89 /* 'el < h->size / 2' tests if el is bottom row of heap */
90
91 while (el < h->size / 2) {
92 int lel = HEAP_LCHILD_OF(el), rel = HEAP_RCHILD_OF(el), swapel;
93
94 v = h->values[el];
95 lv = h->values[lel];
96 rv = h->values[rel];
97
98 if (h->cmp(&v, &lv, NULL) < 0 && h->cmp(&v, &rv, NULL) < 0)
99 break;
100
101 swapel = (h->cmp(&lv, &rv, NULL) < 0) ? lel : rel;
102
103 h->values[el] = h->values[swapel];
104 h->values[swapel] = v;
105
106 el = swapel;
107 }
108 }
109
110 static void hashsig_heap_sort(hashsig_heap *h)
111 {
112 /* only need to do this at the end for signature comparison */
113 git__qsort_r(h->values, h->size, sizeof(hashsig_t), h->cmp, NULL);
114 }
115
116 static void hashsig_heap_insert(hashsig_heap *h, hashsig_t val)
117 {
118 /* if heap is full, pop top if new element should replace it */
119 if (h->size == h->asize && h->cmp(&val, &h->values[0], NULL) > 0) {
120 h->size--;
121 h->values[0] = h->values[h->size];
122 hashsig_heap_down(h, 0);
123 }
124
125 /* if heap is not full, insert new element */
126 if (h->size < h->asize) {
127 h->values[h->size++] = val;
128 hashsig_heap_up(h, h->size - 1);
129 }
130 }
131
132 GIT_INLINE(bool) hashsig_include_char(
133 char ch, git_hashsig_option_t opt, int *saw_lf)
134 {
135 if ((opt & GIT_HASHSIG_IGNORE_WHITESPACE) && git__isspace(ch))
136 return false;
137
138 if (opt & GIT_HASHSIG_SMART_WHITESPACE) {
139 if (ch == '\r' || (*saw_lf && git__isspace(ch)))
140 return false;
141
142 *saw_lf = (ch == '\n');
143 }
144
145 return true;
146 }
147
148 static void hashsig_initial_window(
149 git_hashsig *sig,
150 const char **data,
151 size_t size,
152 hashsig_in_progress *prog)
153 {
154 hashsig_state state, shift_n;
155 int win_len;
156 const char *scan, *end;
157
158 /* init until we have processed at least HASHSIG_HASH_WINDOW data */
159
160 if (prog->win_len >= HASHSIG_HASH_WINDOW)
161 return;
162
163 state = prog->state;
164 win_len = prog->win_len;
165 shift_n = prog->shift_n;
166
167 scan = *data;
168 end = scan + size;
169
170 while (scan < end && win_len < HASHSIG_HASH_WINDOW) {
171 char ch = *scan++;
172
173 if (!hashsig_include_char(ch, sig->opt, &prog->saw_lf))
174 continue;
175
176 state = (state * HASHSIG_HASH_SHIFT + ch) & HASHSIG_HASH_MASK;
177
178 if (!win_len)
179 shift_n = 1;
180 else
181 shift_n = (shift_n * HASHSIG_HASH_SHIFT) & HASHSIG_HASH_MASK;
182
183 prog->window[win_len++] = ch;
184 }
185
186 /* insert initial hash if we just finished */
187
188 if (win_len == HASHSIG_HASH_WINDOW) {
189 hashsig_heap_insert(&sig->mins, (hashsig_t)state);
190 hashsig_heap_insert(&sig->maxs, (hashsig_t)state);
191 sig->considered = 1;
192 }
193
194 prog->state = state;
195 prog->win_len = win_len;
196 prog->shift_n = shift_n;
197
198 *data = scan;
199 }
200
201 static int hashsig_add_hashes(
202 git_hashsig *sig,
203 const char *data,
204 size_t size,
205 hashsig_in_progress *prog)
206 {
207 const char *scan = data, *end = data + size;
208 hashsig_state state, shift_n, rmv;
209
210 if (prog->win_len < HASHSIG_HASH_WINDOW)
211 hashsig_initial_window(sig, &scan, size, prog);
212
213 state = prog->state;
214 shift_n = prog->shift_n;
215
216 /* advance window, adding new chars and removing old */
217
218 for (; scan < end; ++scan) {
219 char ch = *scan;
220
221 if (!hashsig_include_char(ch, sig->opt, &prog->saw_lf))
222 continue;
223
224 rmv = shift_n * prog->window[prog->win_pos];
225
226 state = (state - rmv) & HASHSIG_HASH_MASK;
227 state = (state * HASHSIG_HASH_SHIFT) & HASHSIG_HASH_MASK;
228 state = (state + ch) & HASHSIG_HASH_MASK;
229
230 hashsig_heap_insert(&sig->mins, (hashsig_t)state);
231 hashsig_heap_insert(&sig->maxs, (hashsig_t)state);
232 sig->considered++;
233
234 prog->window[prog->win_pos] = ch;
235 prog->win_pos = (prog->win_pos + 1) % HASHSIG_HASH_WINDOW;
236 }
237
238 prog->state = state;
239
240 return 0;
241 }
242
243 static int hashsig_finalize_hashes(git_hashsig *sig)
244 {
245 if (sig->mins.size < HASHSIG_HEAP_SIZE) {
246 giterr_set(GITERR_INVALID,
247 "File too small for similarity signature calculation");
248 return GIT_EBUFS;
249 }
250
251 hashsig_heap_sort(&sig->mins);
252 hashsig_heap_sort(&sig->maxs);
253
254 return 0;
255 }
256
257 static git_hashsig *hashsig_alloc(git_hashsig_option_t opts)
258 {
259 git_hashsig *sig = git__calloc(1, sizeof(git_hashsig));
260 if (!sig)
261 return NULL;
262
263 hashsig_heap_init(&sig->mins, hashsig_cmp_min);
264 hashsig_heap_init(&sig->maxs, hashsig_cmp_max);
265 sig->opt = opts;
266
267 return sig;
268 }
269
270 int git_hashsig_create(
271 git_hashsig **out,
272 const char *buf,
273 size_t buflen,
274 git_hashsig_option_t opts)
275 {
276 int error;
277 hashsig_in_progress prog = HASHSIG_IN_PROGRESS_INIT;
278 git_hashsig *sig = hashsig_alloc(opts);
279 GITERR_CHECK_ALLOC(sig);
280
281 error = hashsig_add_hashes(sig, buf, buflen, &prog);
282
283 if (!error)
284 error = hashsig_finalize_hashes(sig);
285
286 if (!error)
287 *out = sig;
288 else
289 git_hashsig_free(sig);
290
291 return error;
292 }
293
294 int git_hashsig_create_fromfile(
295 git_hashsig **out,
296 const char *path,
297 git_hashsig_option_t opts)
298 {
299 char buf[4096];
300 ssize_t buflen = 0;
301 int error = 0, fd;
302 hashsig_in_progress prog = HASHSIG_IN_PROGRESS_INIT;
303 git_hashsig *sig = hashsig_alloc(opts);
304 GITERR_CHECK_ALLOC(sig);
305
306 if ((fd = git_futils_open_ro(path)) < 0) {
307 git__free(sig);
308 return fd;
309 }
310
311 while (!error) {
312 if ((buflen = p_read(fd, buf, sizeof(buf))) <= 0) {
313 if ((error = (int)buflen) < 0)
314 giterr_set(GITERR_OS,
315 "Read error on '%s' calculating similarity hashes", path);
316 break;
317 }
318
319 error = hashsig_add_hashes(sig, buf, buflen, &prog);
320 }
321
322 p_close(fd);
323
324 if (!error)
325 error = hashsig_finalize_hashes(sig);
326
327 if (!error)
328 *out = sig;
329 else
330 git_hashsig_free(sig);
331
332 return error;
333 }
334
335 void git_hashsig_free(git_hashsig *sig)
336 {
337 git__free(sig);
338 }
339
340 static int hashsig_heap_compare(const hashsig_heap *a, const hashsig_heap *b)
341 {
342 int matches = 0, i, j, cmp;
343
344 assert(a->cmp == b->cmp);
345
346 /* hash heaps are sorted - just look for overlap vs total */
347
348 for (i = 0, j = 0; i < a->size && j < b->size; ) {
349 cmp = a->cmp(&a->values[i], &b->values[j], NULL);
350
351 if (cmp < 0)
352 ++i;
353 else if (cmp > 0)
354 ++j;
355 else {
356 ++i; ++j; ++matches;
357 }
358 }
359
360 return HASHSIG_SCALE * (matches * 2) / (a->size + b->size);
361 }
362
363 int git_hashsig_compare(const git_hashsig *a, const git_hashsig *b)
364 {
365 return (hashsig_heap_compare(&a->mins, &b->mins) +
366 hashsig_heap_compare(&a->maxs, &b->maxs)) / 2;
367 }