2 * Copyright (C) the libgit2 contributors. All rights reserved.
4 * This file is part of libgit2, distributed under the GNU GPL v2 with
5 * a Linking Exception. For full terms see the included COPYING file.
7 #include "git2/sys/hashsig.h"
11 typedef uint32_t hashsig_t
;
12 typedef uint64_t hashsig_state
;
14 #define HASHSIG_SCALE 100
16 #define HASHSIG_MAX_RUN 80
17 #define HASHSIG_HASH_START 0x012345678ABCDEF0LL
18 #define HASHSIG_HASH_SHIFT 5
20 #define HASHSIG_HASH_MIX(S,CH) \
21 (S) = ((S) << HASHSIG_HASH_SHIFT) - (S) + (hashsig_state)(CH)
23 #define HASHSIG_HEAP_SIZE ((1 << 7) - 1)
24 #define HASHSIG_HEAP_MIN_SIZE 4
26 typedef int (*hashsig_cmp
)(const void *a
, const void *b
, void *);
31 hashsig_t values
[HASHSIG_HEAP_SIZE
];
37 git_hashsig_option_t opt
;
41 #define HEAP_LCHILD_OF(I) (((I)<<1)+1)
42 #define HEAP_RCHILD_OF(I) (((I)<<1)+2)
43 #define HEAP_PARENT_OF(I) (((I)-1)>>1)
45 static void hashsig_heap_init(hashsig_heap
*h
, hashsig_cmp cmp
)
48 h
->asize
= HASHSIG_HEAP_SIZE
;
52 static int hashsig_cmp_max(const void *a
, const void *b
, void *payload
)
54 hashsig_t av
= *(const hashsig_t
*)a
, bv
= *(const hashsig_t
*)b
;
56 return (av
< bv
) ? -1 : (av
> bv
) ? 1 : 0;
59 static int hashsig_cmp_min(const void *a
, const void *b
, void *payload
)
61 hashsig_t av
= *(const hashsig_t
*)a
, bv
= *(const hashsig_t
*)b
;
63 return (av
> bv
) ? -1 : (av
< bv
) ? 1 : 0;
66 static void hashsig_heap_up(hashsig_heap
*h
, int el
)
68 int parent_el
= HEAP_PARENT_OF(el
);
70 while (el
> 0 && h
->cmp(&h
->values
[parent_el
], &h
->values
[el
], NULL
) > 0) {
71 hashsig_t t
= h
->values
[el
];
72 h
->values
[el
] = h
->values
[parent_el
];
73 h
->values
[parent_el
] = t
;
76 parent_el
= HEAP_PARENT_OF(el
);
80 static void hashsig_heap_down(hashsig_heap
*h
, int el
)
84 /* 'el < h->size / 2' tests if el is bottom row of heap */
86 while (el
< h
->size
/ 2) {
87 int lel
= HEAP_LCHILD_OF(el
), rel
= HEAP_RCHILD_OF(el
), swapel
;
93 if (h
->cmp(&v
, &lv
, NULL
) < 0 && h
->cmp(&v
, &rv
, NULL
) < 0)
96 swapel
= (h
->cmp(&lv
, &rv
, NULL
) < 0) ? lel
: rel
;
98 h
->values
[el
] = h
->values
[swapel
];
99 h
->values
[swapel
] = v
;
105 static void hashsig_heap_sort(hashsig_heap
*h
)
107 /* only need to do this at the end for signature comparison */
108 git__qsort_r(h
->values
, h
->size
, sizeof(hashsig_t
), h
->cmp
, NULL
);
111 static void hashsig_heap_insert(hashsig_heap
*h
, hashsig_t val
)
113 /* if heap is not full, insert new element */
114 if (h
->size
< h
->asize
) {
115 h
->values
[h
->size
++] = val
;
116 hashsig_heap_up(h
, h
->size
- 1);
119 /* if heap is full, pop top if new element should replace it */
120 else if (h
->cmp(&val
, &h
->values
[0], NULL
) > 0) {
122 h
->values
[0] = h
->values
[h
->size
];
123 hashsig_heap_down(h
, 0);
130 uint8_t ignore_ch
[256];
131 } hashsig_in_progress
;
133 static void hashsig_in_progress_init(
134 hashsig_in_progress
*prog
, git_hashsig
*sig
)
139 case GIT_HASHSIG_IGNORE_WHITESPACE
:
140 for (i
= 0; i
< 256; ++i
)
141 prog
->ignore_ch
[i
] = git__isspace_nonlf(i
);
142 prog
->use_ignores
= 1;
144 case GIT_HASHSIG_SMART_WHITESPACE
:
145 for (i
= 0; i
< 256; ++i
)
146 prog
->ignore_ch
[i
] = git__isspace(i
);
147 prog
->use_ignores
= 1;
150 memset(prog
, 0, sizeof(*prog
));
155 #define HASHSIG_IN_PROGRESS_INIT { 1 }
157 static int hashsig_add_hashes(
161 hashsig_in_progress
*prog
)
163 const uint8_t *scan
= data
, *end
= data
+ size
;
164 hashsig_state state
= HASHSIG_HASH_START
;
165 int use_ignores
= prog
->use_ignores
, len
;
169 state
= HASHSIG_HASH_START
;
171 for (len
= 0; scan
< end
&& len
< HASHSIG_MAX_RUN
; ) {
175 for (; scan
< end
&& git__isspace_nonlf(ch
); ch
= *scan
)
177 else if (sig
->opt
!= GIT_HASHSIG_NORMAL
)
178 for (; scan
< end
&& ch
== '\r'; ch
= *scan
)
181 /* peek at next character to decide what to do next */
182 if (sig
->opt
== GIT_HASHSIG_SMART_WHITESPACE
)
183 use_ignores
= (ch
== '\n');
189 /* check run terminator */
190 if (ch
== '\n' || ch
== '\0')
194 HASHSIG_HASH_MIX(state
, ch
);
198 hashsig_heap_insert(&sig
->mins
, (hashsig_t
)state
);
199 hashsig_heap_insert(&sig
->maxs
, (hashsig_t
)state
);
203 while (scan
< end
&& (*scan
== '\n' || !*scan
))
208 prog
->use_ignores
= use_ignores
;
213 static int hashsig_finalize_hashes(git_hashsig
*sig
)
215 if (sig
->mins
.size
< HASHSIG_HEAP_MIN_SIZE
) {
216 giterr_set(GITERR_INVALID
,
217 "File too small for similarity signature calculation");
221 hashsig_heap_sort(&sig
->mins
);
222 hashsig_heap_sort(&sig
->maxs
);
227 static git_hashsig
*hashsig_alloc(git_hashsig_option_t opts
)
229 git_hashsig
*sig
= git__calloc(1, sizeof(git_hashsig
));
233 hashsig_heap_init(&sig
->mins
, hashsig_cmp_min
);
234 hashsig_heap_init(&sig
->maxs
, hashsig_cmp_max
);
240 int git_hashsig_create(
244 git_hashsig_option_t opts
)
247 hashsig_in_progress prog
;
248 git_hashsig
*sig
= hashsig_alloc(opts
);
249 GITERR_CHECK_ALLOC(sig
);
251 hashsig_in_progress_init(&prog
, sig
);
253 error
= hashsig_add_hashes(sig
, (const uint8_t *)buf
, buflen
, &prog
);
256 error
= hashsig_finalize_hashes(sig
);
261 git_hashsig_free(sig
);
266 int git_hashsig_create_fromfile(
269 git_hashsig_option_t opts
)
274 hashsig_in_progress prog
;
275 git_hashsig
*sig
= hashsig_alloc(opts
);
276 GITERR_CHECK_ALLOC(sig
);
278 if ((fd
= git_futils_open_ro(path
)) < 0) {
283 hashsig_in_progress_init(&prog
, sig
);
286 if ((buflen
= p_read(fd
, buf
, sizeof(buf
))) <= 0) {
287 if ((error
= (int)buflen
) < 0)
288 giterr_set(GITERR_OS
,
289 "Read error on '%s' calculating similarity hashes", path
);
293 error
= hashsig_add_hashes(sig
, buf
, buflen
, &prog
);
299 error
= hashsig_finalize_hashes(sig
);
304 git_hashsig_free(sig
);
309 void git_hashsig_free(git_hashsig
*sig
)
314 static int hashsig_heap_compare(const hashsig_heap
*a
, const hashsig_heap
*b
)
316 int matches
= 0, i
, j
, cmp
;
318 assert(a
->cmp
== b
->cmp
);
320 /* hash heaps are sorted - just look for overlap vs total */
322 for (i
= 0, j
= 0; i
< a
->size
&& j
< b
->size
; ) {
323 cmp
= a
->cmp(&a
->values
[i
], &b
->values
[j
], NULL
);
334 return HASHSIG_SCALE
* (matches
* 2) / (a
->size
+ b
->size
);
337 int git_hashsig_compare(const git_hashsig
*a
, const git_hashsig
*b
)
339 /* if we have fewer than the maximum number of elements, then just use
340 * one array since the two arrays will be the same
342 if (a
->mins
.size
< HASHSIG_HEAP_SIZE
)
343 return hashsig_heap_compare(&a
->mins
, &b
->mins
);
345 return (hashsig_heap_compare(&a
->mins
, &b
->mins
) +
346 hashsig_heap_compare(&a
->maxs
, &b
->maxs
)) / 2;