2 * Copyright (C) the libgit2 contributors. All rights reserved.
4 * This file is part of libgit2, distributed under the GNU GPL v2 with
5 * a Linking Exception. For full terms see the included COPYING file.
10 #include "git2/sys/hashsig.h"
14 typedef uint32_t hashsig_t
;
15 typedef uint64_t hashsig_state
;
17 #define HASHSIG_SCALE 100
19 #define HASHSIG_MAX_RUN 80
20 #define HASHSIG_HASH_START 0x012345678ABCDEF0LL
21 #define HASHSIG_HASH_SHIFT 5
23 #define HASHSIG_HASH_MIX(S,CH) \
24 (S) = ((S) << HASHSIG_HASH_SHIFT) - (S) + (hashsig_state)(CH)
26 #define HASHSIG_HEAP_SIZE ((1 << 7) - 1)
27 #define HASHSIG_HEAP_MIN_SIZE 4
29 typedef int (*hashsig_cmp
)(const void *a
, const void *b
, void *);
34 hashsig_t values
[HASHSIG_HEAP_SIZE
];
41 git_hashsig_option_t opt
;
44 #define HEAP_LCHILD_OF(I) (((I)<<1)+1)
45 #define HEAP_RCHILD_OF(I) (((I)<<1)+2)
46 #define HEAP_PARENT_OF(I) (((I)-1)>>1)
48 static void hashsig_heap_init(hashsig_heap
*h
, hashsig_cmp cmp
)
51 h
->asize
= HASHSIG_HEAP_SIZE
;
55 static int hashsig_cmp_max(const void *a
, const void *b
, void *payload
)
57 hashsig_t av
= *(const hashsig_t
*)a
, bv
= *(const hashsig_t
*)b
;
59 return (av
< bv
) ? -1 : (av
> bv
) ? 1 : 0;
62 static int hashsig_cmp_min(const void *a
, const void *b
, void *payload
)
64 hashsig_t av
= *(const hashsig_t
*)a
, bv
= *(const hashsig_t
*)b
;
66 return (av
> bv
) ? -1 : (av
< bv
) ? 1 : 0;
69 static void hashsig_heap_up(hashsig_heap
*h
, int el
)
71 int parent_el
= HEAP_PARENT_OF(el
);
73 while (el
> 0 && h
->cmp(&h
->values
[parent_el
], &h
->values
[el
], NULL
) > 0) {
74 hashsig_t t
= h
->values
[el
];
75 h
->values
[el
] = h
->values
[parent_el
];
76 h
->values
[parent_el
] = t
;
79 parent_el
= HEAP_PARENT_OF(el
);
83 static void hashsig_heap_down(hashsig_heap
*h
, int el
)
87 /* 'el < h->size / 2' tests if el is bottom row of heap */
89 while (el
< h
->size
/ 2) {
90 int lel
= HEAP_LCHILD_OF(el
), rel
= HEAP_RCHILD_OF(el
), swapel
;
96 if (h
->cmp(&v
, &lv
, NULL
) < 0 && h
->cmp(&v
, &rv
, NULL
) < 0)
99 swapel
= (h
->cmp(&lv
, &rv
, NULL
) < 0) ? lel
: rel
;
101 h
->values
[el
] = h
->values
[swapel
];
102 h
->values
[swapel
] = v
;
108 static void hashsig_heap_sort(hashsig_heap
*h
)
110 /* only need to do this at the end for signature comparison */
111 git__qsort_r(h
->values
, h
->size
, sizeof(hashsig_t
), h
->cmp
, NULL
);
114 static void hashsig_heap_insert(hashsig_heap
*h
, hashsig_t val
)
116 /* if heap is not full, insert new element */
117 if (h
->size
< h
->asize
) {
118 h
->values
[h
->size
++] = val
;
119 hashsig_heap_up(h
, h
->size
- 1);
122 /* if heap is full, pop top if new element should replace it */
123 else if (h
->cmp(&val
, &h
->values
[0], NULL
) > 0) {
125 h
->values
[0] = h
->values
[h
->size
];
126 hashsig_heap_down(h
, 0);
133 uint8_t ignore_ch
[256];
134 } hashsig_in_progress
;
136 static void hashsig_in_progress_init(
137 hashsig_in_progress
*prog
, git_hashsig
*sig
)
141 /* no more than one can be set */
142 assert(!(sig
->opt
& GIT_HASHSIG_IGNORE_WHITESPACE
) ||
143 !(sig
->opt
& GIT_HASHSIG_SMART_WHITESPACE
));
145 if (sig
->opt
& GIT_HASHSIG_IGNORE_WHITESPACE
) {
146 for (i
= 0; i
< 256; ++i
)
147 prog
->ignore_ch
[i
] = git__isspace_nonlf(i
);
148 prog
->use_ignores
= 1;
149 } else if (sig
->opt
& GIT_HASHSIG_SMART_WHITESPACE
) {
150 for (i
= 0; i
< 256; ++i
)
151 prog
->ignore_ch
[i
] = git__isspace(i
);
152 prog
->use_ignores
= 1;
154 memset(prog
, 0, sizeof(*prog
));
158 static int hashsig_add_hashes(
162 hashsig_in_progress
*prog
)
164 const uint8_t *scan
= data
, *end
= data
+ size
;
165 hashsig_state state
= HASHSIG_HASH_START
;
166 int use_ignores
= prog
->use_ignores
, len
;
170 state
= HASHSIG_HASH_START
;
172 for (len
= 0; scan
< end
&& len
< HASHSIG_MAX_RUN
; ) {
176 for (; scan
< end
&& git__isspace_nonlf(ch
); ch
= *scan
)
179 (GIT_HASHSIG_IGNORE_WHITESPACE
| GIT_HASHSIG_SMART_WHITESPACE
))
180 for (; scan
< end
&& ch
== '\r'; ch
= *scan
)
183 /* peek at next character to decide what to do next */
184 if (sig
->opt
& GIT_HASHSIG_SMART_WHITESPACE
)
185 use_ignores
= (ch
== '\n');
191 /* check run terminator */
192 if (ch
== '\n' || ch
== '\0') {
198 HASHSIG_HASH_MIX(state
, ch
);
202 hashsig_heap_insert(&sig
->mins
, (hashsig_t
)state
);
203 hashsig_heap_insert(&sig
->maxs
, (hashsig_t
)state
);
205 while (scan
< end
&& (*scan
== '\n' || !*scan
))
210 prog
->use_ignores
= use_ignores
;
215 static int hashsig_finalize_hashes(git_hashsig
*sig
)
217 if (sig
->mins
.size
< HASHSIG_HEAP_MIN_SIZE
&&
218 !(sig
->opt
& GIT_HASHSIG_ALLOW_SMALL_FILES
)) {
219 git_error_set(GIT_ERROR_INVALID
,
220 "file too small for similarity signature calculation");
224 hashsig_heap_sort(&sig
->mins
);
225 hashsig_heap_sort(&sig
->maxs
);
230 static git_hashsig
*hashsig_alloc(git_hashsig_option_t opts
)
232 git_hashsig
*sig
= git__calloc(1, sizeof(git_hashsig
));
236 hashsig_heap_init(&sig
->mins
, hashsig_cmp_min
);
237 hashsig_heap_init(&sig
->maxs
, hashsig_cmp_max
);
243 int git_hashsig_create(
247 git_hashsig_option_t opts
)
250 hashsig_in_progress prog
;
251 git_hashsig
*sig
= hashsig_alloc(opts
);
252 GIT_ERROR_CHECK_ALLOC(sig
);
254 hashsig_in_progress_init(&prog
, sig
);
256 error
= hashsig_add_hashes(sig
, (const uint8_t *)buf
, buflen
, &prog
);
259 error
= hashsig_finalize_hashes(sig
);
264 git_hashsig_free(sig
);
269 int git_hashsig_create_fromfile(
272 git_hashsig_option_t opts
)
277 hashsig_in_progress prog
;
278 git_hashsig
*sig
= hashsig_alloc(opts
);
279 GIT_ERROR_CHECK_ALLOC(sig
);
281 if ((fd
= git_futils_open_ro(path
)) < 0) {
286 hashsig_in_progress_init(&prog
, sig
);
289 if ((buflen
= p_read(fd
, buf
, sizeof(buf
))) <= 0) {
290 if ((error
= (int)buflen
) < 0)
291 git_error_set(GIT_ERROR_OS
,
292 "read error on '%s' calculating similarity hashes", path
);
296 error
= hashsig_add_hashes(sig
, buf
, buflen
, &prog
);
302 error
= hashsig_finalize_hashes(sig
);
307 git_hashsig_free(sig
);
312 void git_hashsig_free(git_hashsig
*sig
)
317 static int hashsig_heap_compare(const hashsig_heap
*a
, const hashsig_heap
*b
)
319 int matches
= 0, i
, j
, cmp
;
321 assert(a
->cmp
== b
->cmp
);
323 /* hash heaps are sorted - just look for overlap vs total */
325 for (i
= 0, j
= 0; i
< a
->size
&& j
< b
->size
; ) {
326 cmp
= a
->cmp(&a
->values
[i
], &b
->values
[j
], NULL
);
337 return HASHSIG_SCALE
* (matches
* 2) / (a
->size
+ b
->size
);
340 int git_hashsig_compare(const git_hashsig
*a
, const git_hashsig
*b
)
342 /* if we have no elements in either file then each file is either
343 * empty or blank. if we're ignoring whitespace then the files are
344 * similar, otherwise they're dissimilar.
346 if (a
->mins
.size
== 0 && b
->mins
.size
== 0) {
347 if ((!a
->lines
&& !b
->lines
) ||
348 (a
->opt
& GIT_HASHSIG_IGNORE_WHITESPACE
))
349 return HASHSIG_SCALE
;
354 /* if we have fewer than the maximum number of elements, then just use
355 * one array since the two arrays will be the same
357 if (a
->mins
.size
< HASHSIG_HEAP_SIZE
)
358 return hashsig_heap_compare(&a
->mins
, &b
->mins
);
360 return (hashsig_heap_compare(&a
->mins
, &b
->mins
) +
361 hashsig_heap_compare(&a
->maxs
, &b
->maxs
)) / 2;