2 * Copyright (C) the libgit2 contributors. All rights reserved.
4 * This file is part of libgit2, distributed under the GNU GPL v2 with
5 * a Linking Exception. For full terms see the included COPYING file.
11 typedef uint32_t hashsig_t
;
12 typedef uint64_t hashsig_state
;
14 #define HASHSIG_SCALE 100
16 #define HASHSIG_HASH_WINDOW 32
17 #define HASHSIG_HASH_START 0
18 #define HASHSIG_HASH_SHIFT 5
19 #define HASHSIG_HASH_MASK 0x7FFFFFFF
21 #define HASHSIG_HEAP_SIZE ((1 << 7) - 1)
23 typedef int (*hashsig_cmp
)(const void *a
, const void *b
, void *);
28 hashsig_t values
[HASHSIG_HEAP_SIZE
];
32 hashsig_state state
, shift_n
;
33 char window
[HASHSIG_HASH_WINDOW
];
34 int win_len
, win_pos
, saw_lf
;
35 } hashsig_in_progress
;
37 #define HASHSIG_IN_PROGRESS_INIT { HASHSIG_HASH_START, 1, {0}, 0, 0, 1 }
42 git_hashsig_option_t opt
;
46 #define HEAP_LCHILD_OF(I) (((I)*2)+1)
47 #define HEAP_RCHILD_OF(I) (((I)*2)+2)
48 #define HEAP_PARENT_OF(I) (((I)-1)>>1)
50 static void hashsig_heap_init(hashsig_heap
*h
, hashsig_cmp cmp
)
53 h
->asize
= HASHSIG_HEAP_SIZE
;
57 static int hashsig_cmp_max(const void *a
, const void *b
, void *payload
)
59 hashsig_t av
= *(const hashsig_t
*)a
, bv
= *(const hashsig_t
*)b
;
61 return (av
< bv
) ? -1 : (av
> bv
) ? 1 : 0;
64 static int hashsig_cmp_min(const void *a
, const void *b
, void *payload
)
66 hashsig_t av
= *(const hashsig_t
*)a
, bv
= *(const hashsig_t
*)b
;
68 return (av
> bv
) ? -1 : (av
< bv
) ? 1 : 0;
71 static void hashsig_heap_up(hashsig_heap
*h
, int el
)
73 int parent_el
= HEAP_PARENT_OF(el
);
75 while (el
> 0 && h
->cmp(&h
->values
[parent_el
], &h
->values
[el
], NULL
) > 0) {
76 hashsig_t t
= h
->values
[el
];
77 h
->values
[el
] = h
->values
[parent_el
];
78 h
->values
[parent_el
] = t
;
81 parent_el
= HEAP_PARENT_OF(el
);
85 static void hashsig_heap_down(hashsig_heap
*h
, int el
)
89 /* 'el < h->size / 2' tests if el is bottom row of heap */
91 while (el
< h
->size
/ 2) {
92 int lel
= HEAP_LCHILD_OF(el
), rel
= HEAP_RCHILD_OF(el
), swapel
;
98 if (h
->cmp(&v
, &lv
, NULL
) < 0 && h
->cmp(&v
, &rv
, NULL
) < 0)
101 swapel
= (h
->cmp(&lv
, &rv
, NULL
) < 0) ? lel
: rel
;
103 h
->values
[el
] = h
->values
[swapel
];
104 h
->values
[swapel
] = v
;
110 static void hashsig_heap_sort(hashsig_heap
*h
)
112 /* only need to do this at the end for signature comparison */
113 git__qsort_r(h
->values
, h
->size
, sizeof(hashsig_t
), h
->cmp
, NULL
);
116 static void hashsig_heap_insert(hashsig_heap
*h
, hashsig_t val
)
118 /* if heap is full, pop top if new element should replace it */
119 if (h
->size
== h
->asize
&& h
->cmp(&val
, &h
->values
[0], NULL
) > 0) {
121 h
->values
[0] = h
->values
[h
->size
];
122 hashsig_heap_down(h
, 0);
125 /* if heap is not full, insert new element */
126 if (h
->size
< h
->asize
) {
127 h
->values
[h
->size
++] = val
;
128 hashsig_heap_up(h
, h
->size
- 1);
132 GIT_INLINE(bool) hashsig_include_char(
133 char ch
, git_hashsig_option_t opt
, int *saw_lf
)
135 if ((opt
& GIT_HASHSIG_IGNORE_WHITESPACE
) && git__isspace(ch
))
138 if (opt
& GIT_HASHSIG_SMART_WHITESPACE
) {
139 if (ch
== '\r' || (*saw_lf
&& git__isspace(ch
)))
142 *saw_lf
= (ch
== '\n');
148 static void hashsig_initial_window(
152 hashsig_in_progress
*prog
)
154 hashsig_state state
, shift_n
;
156 const char *scan
, *end
;
158 /* init until we have processed at least HASHSIG_HASH_WINDOW data */
160 if (prog
->win_len
>= HASHSIG_HASH_WINDOW
)
164 win_len
= prog
->win_len
;
165 shift_n
= prog
->shift_n
;
170 while (scan
< end
&& win_len
< HASHSIG_HASH_WINDOW
) {
173 if (!hashsig_include_char(ch
, sig
->opt
, &prog
->saw_lf
))
176 state
= (state
* HASHSIG_HASH_SHIFT
+ ch
) & HASHSIG_HASH_MASK
;
181 shift_n
= (shift_n
* HASHSIG_HASH_SHIFT
) & HASHSIG_HASH_MASK
;
183 prog
->window
[win_len
++] = ch
;
186 /* insert initial hash if we just finished */
188 if (win_len
== HASHSIG_HASH_WINDOW
) {
189 hashsig_heap_insert(&sig
->mins
, (hashsig_t
)state
);
190 hashsig_heap_insert(&sig
->maxs
, (hashsig_t
)state
);
195 prog
->win_len
= win_len
;
196 prog
->shift_n
= shift_n
;
201 static int hashsig_add_hashes(
205 hashsig_in_progress
*prog
)
207 const char *scan
= data
, *end
= data
+ size
;
208 hashsig_state state
, shift_n
, rmv
;
210 if (prog
->win_len
< HASHSIG_HASH_WINDOW
)
211 hashsig_initial_window(sig
, &scan
, size
, prog
);
214 shift_n
= prog
->shift_n
;
216 /* advance window, adding new chars and removing old */
218 for (; scan
< end
; ++scan
) {
221 if (!hashsig_include_char(ch
, sig
->opt
, &prog
->saw_lf
))
224 rmv
= shift_n
* prog
->window
[prog
->win_pos
];
226 state
= (state
- rmv
) & HASHSIG_HASH_MASK
;
227 state
= (state
* HASHSIG_HASH_SHIFT
) & HASHSIG_HASH_MASK
;
228 state
= (state
+ ch
) & HASHSIG_HASH_MASK
;
230 hashsig_heap_insert(&sig
->mins
, (hashsig_t
)state
);
231 hashsig_heap_insert(&sig
->maxs
, (hashsig_t
)state
);
234 prog
->window
[prog
->win_pos
] = ch
;
235 prog
->win_pos
= (prog
->win_pos
+ 1) % HASHSIG_HASH_WINDOW
;
243 static int hashsig_finalize_hashes(git_hashsig
*sig
)
245 if (sig
->mins
.size
< HASHSIG_HEAP_SIZE
) {
246 giterr_set(GITERR_INVALID
,
247 "File too small for similarity signature calculation");
251 hashsig_heap_sort(&sig
->mins
);
252 hashsig_heap_sort(&sig
->maxs
);
257 static git_hashsig
*hashsig_alloc(git_hashsig_option_t opts
)
259 git_hashsig
*sig
= git__calloc(1, sizeof(git_hashsig
));
263 hashsig_heap_init(&sig
->mins
, hashsig_cmp_min
);
264 hashsig_heap_init(&sig
->maxs
, hashsig_cmp_max
);
270 int git_hashsig_create(
274 git_hashsig_option_t opts
)
277 hashsig_in_progress prog
= HASHSIG_IN_PROGRESS_INIT
;
278 git_hashsig
*sig
= hashsig_alloc(opts
);
279 GITERR_CHECK_ALLOC(sig
);
281 error
= hashsig_add_hashes(sig
, buf
, buflen
, &prog
);
284 error
= hashsig_finalize_hashes(sig
);
289 git_hashsig_free(sig
);
294 int git_hashsig_create_fromfile(
297 git_hashsig_option_t opts
)
302 hashsig_in_progress prog
= HASHSIG_IN_PROGRESS_INIT
;
303 git_hashsig
*sig
= hashsig_alloc(opts
);
304 GITERR_CHECK_ALLOC(sig
);
306 if ((fd
= git_futils_open_ro(path
)) < 0) {
312 if ((buflen
= p_read(fd
, buf
, sizeof(buf
))) <= 0) {
313 if ((error
= (int)buflen
) < 0)
314 giterr_set(GITERR_OS
,
315 "Read error on '%s' calculating similarity hashes", path
);
319 error
= hashsig_add_hashes(sig
, buf
, buflen
, &prog
);
325 error
= hashsig_finalize_hashes(sig
);
330 git_hashsig_free(sig
);
335 void git_hashsig_free(git_hashsig
*sig
)
340 static int hashsig_heap_compare(const hashsig_heap
*a
, const hashsig_heap
*b
)
342 int matches
= 0, i
, j
, cmp
;
344 assert(a
->cmp
== b
->cmp
);
346 /* hash heaps are sorted - just look for overlap vs total */
348 for (i
= 0, j
= 0; i
< a
->size
&& j
< b
->size
; ) {
349 cmp
= a
->cmp(&a
->values
[i
], &b
->values
[j
], NULL
);
360 return HASHSIG_SCALE
* (matches
* 2) / (a
->size
+ b
->size
);
363 int git_hashsig_compare(const git_hashsig
*a
, const git_hashsig
*b
)
365 return (hashsig_heap_compare(&a
->mins
, &b
->mins
) +
366 hashsig_heap_compare(&a
->maxs
, &b
->maxs
)) / 2;