]> git.proxmox.com Git - libgit2.git/blob - src/hashsig.c
abebd7a54f16c1c210213e3b625d10f413fe3af5
[libgit2.git] / src / hashsig.c
1 /*
2 * Copyright (C) the libgit2 contributors. All rights reserved.
3 *
4 * This file is part of libgit2, distributed under the GNU GPL v2 with
5 * a Linking Exception. For full terms see the included COPYING file.
6 */
7
8 #include "common.h"
9
10 #include "git2/sys/hashsig.h"
11 #include "fileops.h"
12 #include "util.h"
13
14 typedef uint32_t hashsig_t;
15 typedef uint64_t hashsig_state;
16
17 #define HASHSIG_SCALE 100
18
19 #define HASHSIG_MAX_RUN 80
20 #define HASHSIG_HASH_START 0x012345678ABCDEF0LL
21 #define HASHSIG_HASH_SHIFT 5
22
23 #define HASHSIG_HASH_MIX(S,CH) \
24 (S) = ((S) << HASHSIG_HASH_SHIFT) - (S) + (hashsig_state)(CH)
25
26 #define HASHSIG_HEAP_SIZE ((1 << 7) - 1)
27 #define HASHSIG_HEAP_MIN_SIZE 4
28
29 typedef int (*hashsig_cmp)(const void *a, const void *b, void *);
30
31 typedef struct {
32 int size, asize;
33 hashsig_cmp cmp;
34 hashsig_t values[HASHSIG_HEAP_SIZE];
35 } hashsig_heap;
36
37 struct git_hashsig {
38 hashsig_heap mins;
39 hashsig_heap maxs;
40 size_t lines;
41 git_hashsig_option_t opt;
42 };
43
44 #define HEAP_LCHILD_OF(I) (((I)<<1)+1)
45 #define HEAP_RCHILD_OF(I) (((I)<<1)+2)
46 #define HEAP_PARENT_OF(I) (((I)-1)>>1)
47
48 static void hashsig_heap_init(hashsig_heap *h, hashsig_cmp cmp)
49 {
50 h->size = 0;
51 h->asize = HASHSIG_HEAP_SIZE;
52 h->cmp = cmp;
53 }
54
55 static int hashsig_cmp_max(const void *a, const void *b, void *payload)
56 {
57 hashsig_t av = *(const hashsig_t *)a, bv = *(const hashsig_t *)b;
58 GIT_UNUSED(payload);
59 return (av < bv) ? -1 : (av > bv) ? 1 : 0;
60 }
61
62 static int hashsig_cmp_min(const void *a, const void *b, void *payload)
63 {
64 hashsig_t av = *(const hashsig_t *)a, bv = *(const hashsig_t *)b;
65 GIT_UNUSED(payload);
66 return (av > bv) ? -1 : (av < bv) ? 1 : 0;
67 }
68
69 static void hashsig_heap_up(hashsig_heap *h, int el)
70 {
71 int parent_el = HEAP_PARENT_OF(el);
72
73 while (el > 0 && h->cmp(&h->values[parent_el], &h->values[el], NULL) > 0) {
74 hashsig_t t = h->values[el];
75 h->values[el] = h->values[parent_el];
76 h->values[parent_el] = t;
77
78 el = parent_el;
79 parent_el = HEAP_PARENT_OF(el);
80 }
81 }
82
83 static void hashsig_heap_down(hashsig_heap *h, int el)
84 {
85 hashsig_t v, lv, rv;
86
87 /* 'el < h->size / 2' tests if el is bottom row of heap */
88
89 while (el < h->size / 2) {
90 int lel = HEAP_LCHILD_OF(el), rel = HEAP_RCHILD_OF(el), swapel;
91
92 v = h->values[el];
93 lv = h->values[lel];
94 rv = h->values[rel];
95
96 if (h->cmp(&v, &lv, NULL) < 0 && h->cmp(&v, &rv, NULL) < 0)
97 break;
98
99 swapel = (h->cmp(&lv, &rv, NULL) < 0) ? lel : rel;
100
101 h->values[el] = h->values[swapel];
102 h->values[swapel] = v;
103
104 el = swapel;
105 }
106 }
107
108 static void hashsig_heap_sort(hashsig_heap *h)
109 {
110 /* only need to do this at the end for signature comparison */
111 git__qsort_r(h->values, h->size, sizeof(hashsig_t), h->cmp, NULL);
112 }
113
114 static void hashsig_heap_insert(hashsig_heap *h, hashsig_t val)
115 {
116 /* if heap is not full, insert new element */
117 if (h->size < h->asize) {
118 h->values[h->size++] = val;
119 hashsig_heap_up(h, h->size - 1);
120 }
121
122 /* if heap is full, pop top if new element should replace it */
123 else if (h->cmp(&val, &h->values[0], NULL) > 0) {
124 h->size--;
125 h->values[0] = h->values[h->size];
126 hashsig_heap_down(h, 0);
127 }
128
129 }
130
131 typedef struct {
132 int use_ignores;
133 uint8_t ignore_ch[256];
134 } hashsig_in_progress;
135
136 static void hashsig_in_progress_init(
137 hashsig_in_progress *prog, git_hashsig *sig)
138 {
139 int i;
140
141 /* no more than one can be set */
142 assert(!(sig->opt & GIT_HASHSIG_IGNORE_WHITESPACE) ||
143 !(sig->opt & GIT_HASHSIG_SMART_WHITESPACE));
144
145 if (sig->opt & GIT_HASHSIG_IGNORE_WHITESPACE) {
146 for (i = 0; i < 256; ++i)
147 prog->ignore_ch[i] = git__isspace_nonlf(i);
148 prog->use_ignores = 1;
149 } else if (sig->opt & GIT_HASHSIG_SMART_WHITESPACE) {
150 for (i = 0; i < 256; ++i)
151 prog->ignore_ch[i] = git__isspace(i);
152 prog->use_ignores = 1;
153 } else {
154 memset(prog, 0, sizeof(*prog));
155 }
156 }
157
158 static int hashsig_add_hashes(
159 git_hashsig *sig,
160 const uint8_t *data,
161 size_t size,
162 hashsig_in_progress *prog)
163 {
164 const uint8_t *scan = data, *end = data + size;
165 hashsig_state state = HASHSIG_HASH_START;
166 int use_ignores = prog->use_ignores, len;
167 uint8_t ch;
168
169 while (scan < end) {
170 state = HASHSIG_HASH_START;
171
172 for (len = 0; scan < end && len < HASHSIG_MAX_RUN; ) {
173 ch = *scan;
174
175 if (use_ignores)
176 for (; scan < end && git__isspace_nonlf(ch); ch = *scan)
177 ++scan;
178 else if (sig->opt &
179 (GIT_HASHSIG_IGNORE_WHITESPACE | GIT_HASHSIG_SMART_WHITESPACE))
180 for (; scan < end && ch == '\r'; ch = *scan)
181 ++scan;
182
183 /* peek at next character to decide what to do next */
184 if (sig->opt & GIT_HASHSIG_SMART_WHITESPACE)
185 use_ignores = (ch == '\n');
186
187 if (scan >= end)
188 break;
189 ++scan;
190
191 /* check run terminator */
192 if (ch == '\n' || ch == '\0') {
193 sig->lines++;
194 break;
195 }
196
197 ++len;
198 HASHSIG_HASH_MIX(state, ch);
199 }
200
201 if (len > 0) {
202 hashsig_heap_insert(&sig->mins, (hashsig_t)state);
203 hashsig_heap_insert(&sig->maxs, (hashsig_t)state);
204
205 while (scan < end && (*scan == '\n' || !*scan))
206 ++scan;
207 }
208 }
209
210 prog->use_ignores = use_ignores;
211
212 return 0;
213 }
214
215 static int hashsig_finalize_hashes(git_hashsig *sig)
216 {
217 if (sig->mins.size < HASHSIG_HEAP_MIN_SIZE &&
218 !(sig->opt & GIT_HASHSIG_ALLOW_SMALL_FILES)) {
219 git_error_set(GIT_ERROR_INVALID,
220 "file too small for similarity signature calculation");
221 return GIT_EBUFS;
222 }
223
224 hashsig_heap_sort(&sig->mins);
225 hashsig_heap_sort(&sig->maxs);
226
227 return 0;
228 }
229
230 static git_hashsig *hashsig_alloc(git_hashsig_option_t opts)
231 {
232 git_hashsig *sig = git__calloc(1, sizeof(git_hashsig));
233 if (!sig)
234 return NULL;
235
236 hashsig_heap_init(&sig->mins, hashsig_cmp_min);
237 hashsig_heap_init(&sig->maxs, hashsig_cmp_max);
238 sig->opt = opts;
239
240 return sig;
241 }
242
243 int git_hashsig_create(
244 git_hashsig **out,
245 const char *buf,
246 size_t buflen,
247 git_hashsig_option_t opts)
248 {
249 int error;
250 hashsig_in_progress prog;
251 git_hashsig *sig = hashsig_alloc(opts);
252 GIT_ERROR_CHECK_ALLOC(sig);
253
254 hashsig_in_progress_init(&prog, sig);
255
256 error = hashsig_add_hashes(sig, (const uint8_t *)buf, buflen, &prog);
257
258 if (!error)
259 error = hashsig_finalize_hashes(sig);
260
261 if (!error)
262 *out = sig;
263 else
264 git_hashsig_free(sig);
265
266 return error;
267 }
268
269 int git_hashsig_create_fromfile(
270 git_hashsig **out,
271 const char *path,
272 git_hashsig_option_t opts)
273 {
274 uint8_t buf[0x1000];
275 ssize_t buflen = 0;
276 int error = 0, fd;
277 hashsig_in_progress prog;
278 git_hashsig *sig = hashsig_alloc(opts);
279 GIT_ERROR_CHECK_ALLOC(sig);
280
281 if ((fd = git_futils_open_ro(path)) < 0) {
282 git__free(sig);
283 return fd;
284 }
285
286 hashsig_in_progress_init(&prog, sig);
287
288 while (!error) {
289 if ((buflen = p_read(fd, buf, sizeof(buf))) <= 0) {
290 if ((error = (int)buflen) < 0)
291 git_error_set(GIT_ERROR_OS,
292 "read error on '%s' calculating similarity hashes", path);
293 break;
294 }
295
296 error = hashsig_add_hashes(sig, buf, buflen, &prog);
297 }
298
299 p_close(fd);
300
301 if (!error)
302 error = hashsig_finalize_hashes(sig);
303
304 if (!error)
305 *out = sig;
306 else
307 git_hashsig_free(sig);
308
309 return error;
310 }
311
312 void git_hashsig_free(git_hashsig *sig)
313 {
314 git__free(sig);
315 }
316
317 static int hashsig_heap_compare(const hashsig_heap *a, const hashsig_heap *b)
318 {
319 int matches = 0, i, j, cmp;
320
321 assert(a->cmp == b->cmp);
322
323 /* hash heaps are sorted - just look for overlap vs total */
324
325 for (i = 0, j = 0; i < a->size && j < b->size; ) {
326 cmp = a->cmp(&a->values[i], &b->values[j], NULL);
327
328 if (cmp < 0)
329 ++i;
330 else if (cmp > 0)
331 ++j;
332 else {
333 ++i; ++j; ++matches;
334 }
335 }
336
337 return HASHSIG_SCALE * (matches * 2) / (a->size + b->size);
338 }
339
340 int git_hashsig_compare(const git_hashsig *a, const git_hashsig *b)
341 {
342 /* if we have no elements in either file then each file is either
343 * empty or blank. if we're ignoring whitespace then the files are
344 * similar, otherwise they're dissimilar.
345 */
346 if (a->mins.size == 0 && b->mins.size == 0) {
347 if ((!a->lines && !b->lines) ||
348 (a->opt & GIT_HASHSIG_IGNORE_WHITESPACE))
349 return HASHSIG_SCALE;
350 else
351 return 0;
352 }
353
354 /* if we have fewer than the maximum number of elements, then just use
355 * one array since the two arrays will be the same
356 */
357 if (a->mins.size < HASHSIG_HEAP_SIZE)
358 return hashsig_heap_compare(&a->mins, &b->mins);
359 else
360 return (hashsig_heap_compare(&a->mins, &b->mins) +
361 hashsig_heap_compare(&a->maxs, &b->maxs)) / 2;
362 }