]> git.proxmox.com Git - libgit2.git/blob - src/hashsig.c
Merge pull request #4049 from libgit2/ethomson/error_msgs
[libgit2.git] / src / hashsig.c
1 /*
2 * Copyright (C) the libgit2 contributors. All rights reserved.
3 *
4 * This file is part of libgit2, distributed under the GNU GPL v2 with
5 * a Linking Exception. For full terms see the included COPYING file.
6 */
7 #include "git2/sys/hashsig.h"
8 #include "fileops.h"
9 #include "util.h"
10
11 typedef uint32_t hashsig_t;
12 typedef uint64_t hashsig_state;
13
14 #define HASHSIG_SCALE 100
15
16 #define HASHSIG_MAX_RUN 80
17 #define HASHSIG_HASH_START 0x012345678ABCDEF0LL
18 #define HASHSIG_HASH_SHIFT 5
19
20 #define HASHSIG_HASH_MIX(S,CH) \
21 (S) = ((S) << HASHSIG_HASH_SHIFT) - (S) + (hashsig_state)(CH)
22
23 #define HASHSIG_HEAP_SIZE ((1 << 7) - 1)
24 #define HASHSIG_HEAP_MIN_SIZE 4
25
26 typedef int (*hashsig_cmp)(const void *a, const void *b, void *);
27
28 typedef struct {
29 int size, asize;
30 hashsig_cmp cmp;
31 hashsig_t values[HASHSIG_HEAP_SIZE];
32 } hashsig_heap;
33
34 struct git_hashsig {
35 hashsig_heap mins;
36 hashsig_heap maxs;
37 size_t lines;
38 git_hashsig_option_t opt;
39 };
40
41 #define HEAP_LCHILD_OF(I) (((I)<<1)+1)
42 #define HEAP_RCHILD_OF(I) (((I)<<1)+2)
43 #define HEAP_PARENT_OF(I) (((I)-1)>>1)
44
45 static void hashsig_heap_init(hashsig_heap *h, hashsig_cmp cmp)
46 {
47 h->size = 0;
48 h->asize = HASHSIG_HEAP_SIZE;
49 h->cmp = cmp;
50 }
51
52 static int hashsig_cmp_max(const void *a, const void *b, void *payload)
53 {
54 hashsig_t av = *(const hashsig_t *)a, bv = *(const hashsig_t *)b;
55 GIT_UNUSED(payload);
56 return (av < bv) ? -1 : (av > bv) ? 1 : 0;
57 }
58
59 static int hashsig_cmp_min(const void *a, const void *b, void *payload)
60 {
61 hashsig_t av = *(const hashsig_t *)a, bv = *(const hashsig_t *)b;
62 GIT_UNUSED(payload);
63 return (av > bv) ? -1 : (av < bv) ? 1 : 0;
64 }
65
66 static void hashsig_heap_up(hashsig_heap *h, int el)
67 {
68 int parent_el = HEAP_PARENT_OF(el);
69
70 while (el > 0 && h->cmp(&h->values[parent_el], &h->values[el], NULL) > 0) {
71 hashsig_t t = h->values[el];
72 h->values[el] = h->values[parent_el];
73 h->values[parent_el] = t;
74
75 el = parent_el;
76 parent_el = HEAP_PARENT_OF(el);
77 }
78 }
79
80 static void hashsig_heap_down(hashsig_heap *h, int el)
81 {
82 hashsig_t v, lv, rv;
83
84 /* 'el < h->size / 2' tests if el is bottom row of heap */
85
86 while (el < h->size / 2) {
87 int lel = HEAP_LCHILD_OF(el), rel = HEAP_RCHILD_OF(el), swapel;
88
89 v = h->values[el];
90 lv = h->values[lel];
91 rv = h->values[rel];
92
93 if (h->cmp(&v, &lv, NULL) < 0 && h->cmp(&v, &rv, NULL) < 0)
94 break;
95
96 swapel = (h->cmp(&lv, &rv, NULL) < 0) ? lel : rel;
97
98 h->values[el] = h->values[swapel];
99 h->values[swapel] = v;
100
101 el = swapel;
102 }
103 }
104
105 static void hashsig_heap_sort(hashsig_heap *h)
106 {
107 /* only need to do this at the end for signature comparison */
108 git__qsort_r(h->values, h->size, sizeof(hashsig_t), h->cmp, NULL);
109 }
110
111 static void hashsig_heap_insert(hashsig_heap *h, hashsig_t val)
112 {
113 /* if heap is not full, insert new element */
114 if (h->size < h->asize) {
115 h->values[h->size++] = val;
116 hashsig_heap_up(h, h->size - 1);
117 }
118
119 /* if heap is full, pop top if new element should replace it */
120 else if (h->cmp(&val, &h->values[0], NULL) > 0) {
121 h->size--;
122 h->values[0] = h->values[h->size];
123 hashsig_heap_down(h, 0);
124 }
125
126 }
127
128 typedef struct {
129 int use_ignores;
130 uint8_t ignore_ch[256];
131 } hashsig_in_progress;
132
133 static void hashsig_in_progress_init(
134 hashsig_in_progress *prog, git_hashsig *sig)
135 {
136 int i;
137
138 /* no more than one can be set */
139 assert(!(sig->opt & GIT_HASHSIG_IGNORE_WHITESPACE) ||
140 !(sig->opt & GIT_HASHSIG_SMART_WHITESPACE));
141
142 if (sig->opt & GIT_HASHSIG_IGNORE_WHITESPACE) {
143 for (i = 0; i < 256; ++i)
144 prog->ignore_ch[i] = git__isspace_nonlf(i);
145 prog->use_ignores = 1;
146 } else if (sig->opt & GIT_HASHSIG_SMART_WHITESPACE) {
147 for (i = 0; i < 256; ++i)
148 prog->ignore_ch[i] = git__isspace(i);
149 prog->use_ignores = 1;
150 } else {
151 memset(prog, 0, sizeof(*prog));
152 }
153 }
154
155 static int hashsig_add_hashes(
156 git_hashsig *sig,
157 const uint8_t *data,
158 size_t size,
159 hashsig_in_progress *prog)
160 {
161 const uint8_t *scan = data, *end = data + size;
162 hashsig_state state = HASHSIG_HASH_START;
163 int use_ignores = prog->use_ignores, len;
164 uint8_t ch;
165
166 while (scan < end) {
167 state = HASHSIG_HASH_START;
168
169 for (len = 0; scan < end && len < HASHSIG_MAX_RUN; ) {
170 ch = *scan;
171
172 if (use_ignores)
173 for (; scan < end && git__isspace_nonlf(ch); ch = *scan)
174 ++scan;
175 else if (sig->opt &
176 (GIT_HASHSIG_IGNORE_WHITESPACE | GIT_HASHSIG_SMART_WHITESPACE))
177 for (; scan < end && ch == '\r'; ch = *scan)
178 ++scan;
179
180 /* peek at next character to decide what to do next */
181 if (sig->opt & GIT_HASHSIG_SMART_WHITESPACE)
182 use_ignores = (ch == '\n');
183
184 if (scan >= end)
185 break;
186 ++scan;
187
188 /* check run terminator */
189 if (ch == '\n' || ch == '\0') {
190 sig->lines++;
191 break;
192 }
193
194 ++len;
195 HASHSIG_HASH_MIX(state, ch);
196 }
197
198 if (len > 0) {
199 hashsig_heap_insert(&sig->mins, (hashsig_t)state);
200 hashsig_heap_insert(&sig->maxs, (hashsig_t)state);
201
202 while (scan < end && (*scan == '\n' || !*scan))
203 ++scan;
204 }
205 }
206
207 prog->use_ignores = use_ignores;
208
209 return 0;
210 }
211
212 static int hashsig_finalize_hashes(git_hashsig *sig)
213 {
214 if (sig->mins.size < HASHSIG_HEAP_MIN_SIZE &&
215 !(sig->opt & GIT_HASHSIG_ALLOW_SMALL_FILES)) {
216 giterr_set(GITERR_INVALID,
217 "file too small for similarity signature calculation");
218 return GIT_EBUFS;
219 }
220
221 hashsig_heap_sort(&sig->mins);
222 hashsig_heap_sort(&sig->maxs);
223
224 return 0;
225 }
226
227 static git_hashsig *hashsig_alloc(git_hashsig_option_t opts)
228 {
229 git_hashsig *sig = git__calloc(1, sizeof(git_hashsig));
230 if (!sig)
231 return NULL;
232
233 hashsig_heap_init(&sig->mins, hashsig_cmp_min);
234 hashsig_heap_init(&sig->maxs, hashsig_cmp_max);
235 sig->opt = opts;
236
237 return sig;
238 }
239
240 int git_hashsig_create(
241 git_hashsig **out,
242 const char *buf,
243 size_t buflen,
244 git_hashsig_option_t opts)
245 {
246 int error;
247 hashsig_in_progress prog;
248 git_hashsig *sig = hashsig_alloc(opts);
249 GITERR_CHECK_ALLOC(sig);
250
251 hashsig_in_progress_init(&prog, sig);
252
253 error = hashsig_add_hashes(sig, (const uint8_t *)buf, buflen, &prog);
254
255 if (!error)
256 error = hashsig_finalize_hashes(sig);
257
258 if (!error)
259 *out = sig;
260 else
261 git_hashsig_free(sig);
262
263 return error;
264 }
265
266 int git_hashsig_create_fromfile(
267 git_hashsig **out,
268 const char *path,
269 git_hashsig_option_t opts)
270 {
271 uint8_t buf[0x1000];
272 ssize_t buflen = 0;
273 int error = 0, fd;
274 hashsig_in_progress prog;
275 git_hashsig *sig = hashsig_alloc(opts);
276 GITERR_CHECK_ALLOC(sig);
277
278 if ((fd = git_futils_open_ro(path)) < 0) {
279 git__free(sig);
280 return fd;
281 }
282
283 hashsig_in_progress_init(&prog, sig);
284
285 while (!error) {
286 if ((buflen = p_read(fd, buf, sizeof(buf))) <= 0) {
287 if ((error = (int)buflen) < 0)
288 giterr_set(GITERR_OS,
289 "read error on '%s' calculating similarity hashes", path);
290 break;
291 }
292
293 error = hashsig_add_hashes(sig, buf, buflen, &prog);
294 }
295
296 p_close(fd);
297
298 if (!error)
299 error = hashsig_finalize_hashes(sig);
300
301 if (!error)
302 *out = sig;
303 else
304 git_hashsig_free(sig);
305
306 return error;
307 }
308
309 void git_hashsig_free(git_hashsig *sig)
310 {
311 git__free(sig);
312 }
313
314 static int hashsig_heap_compare(const hashsig_heap *a, const hashsig_heap *b)
315 {
316 int matches = 0, i, j, cmp;
317
318 assert(a->cmp == b->cmp);
319
320 /* hash heaps are sorted - just look for overlap vs total */
321
322 for (i = 0, j = 0; i < a->size && j < b->size; ) {
323 cmp = a->cmp(&a->values[i], &b->values[j], NULL);
324
325 if (cmp < 0)
326 ++i;
327 else if (cmp > 0)
328 ++j;
329 else {
330 ++i; ++j; ++matches;
331 }
332 }
333
334 return HASHSIG_SCALE * (matches * 2) / (a->size + b->size);
335 }
336
337 int git_hashsig_compare(const git_hashsig *a, const git_hashsig *b)
338 {
339 /* if we have no elements in either file then each file is either
340 * empty or blank. if we're ignoring whitespace then the files are
341 * similar, otherwise they're dissimilar.
342 */
343 if (a->mins.size == 0 && b->mins.size == 0) {
344 if ((!a->lines && !b->lines) ||
345 (a->opt & GIT_HASHSIG_IGNORE_WHITESPACE))
346 return HASHSIG_SCALE;
347 else
348 return 0;
349 }
350
351 /* if we have fewer than the maximum number of elements, then just use
352 * one array since the two arrays will be the same
353 */
354 if (a->mins.size < HASHSIG_HEAP_SIZE)
355 return hashsig_heap_compare(&a->mins, &b->mins);
356 else
357 return (hashsig_heap_compare(&a->mins, &b->mins) +
358 hashsig_heap_compare(&a->maxs, &b->maxs)) / 2;
359 }