]>
Commit | Line | Data |
---|---|---|
dd453c4d | 1 | /* |
359fc2d2 | 2 | * Copyright (C) the libgit2 contributors. All rights reserved. |
dd453c4d | 3 | * |
bb742ede VM |
4 | * This file is part of libgit2, distributed under the GNU GPL v2 with |
5 | * a Linking Exception. For full terms see the included COPYING file. | |
dd453c4d MP |
6 | */ |
7 | ||
8 | #include <stdio.h> | |
9 | ||
10 | #include "sha1_lookup.h" | |
11 | #include "common.h" | |
67591c8c | 12 | #include "oid.h" |
dd453c4d MP |
13 | |
14 | /* | |
15 | * Conventional binary search loop looks like this: | |
16 | * | |
17 | * unsigned lo, hi; | |
87d9869f VM |
18 | * do { |
19 | * unsigned mi = (lo + hi) / 2; | |
20 | * int cmp = "entry pointed at by mi" minus "target"; | |
21 | * if (!cmp) | |
22 | * return (mi is the wanted one) | |
23 | * if (cmp > 0) | |
24 | * hi = mi; "mi is larger than target" | |
25 | * else | |
26 | * lo = mi+1; "mi is smaller than target" | |
27 | * } while (lo < hi); | |
dd453c4d MP |
28 | * |
29 | * The invariants are: | |
30 | * | |
31 | * - When entering the loop, lo points at a slot that is never | |
87d9869f VM |
32 | * above the target (it could be at the target), hi points at a |
33 | * slot that is guaranteed to be above the target (it can never | |
34 | * be at the target). | |
dd453c4d MP |
35 | * |
36 | * - We find a point 'mi' between lo and hi (mi could be the same | |
87d9869f VM |
37 | * as lo, but never can be as same as hi), and check if it hits |
38 | * the target. There are three cases: | |
dd453c4d | 39 | * |
87d9869f | 40 | * - if it is a hit, we are happy. |
dd453c4d | 41 | * |
87d9869f VM |
42 | * - if it is strictly higher than the target, we set it to hi, |
43 | * and repeat the search. | |
dd453c4d | 44 | * |
87d9869f VM |
45 | * - if it is strictly lower than the target, we update lo to |
46 | * one slot after it, because we allow lo to be at the target. | |
dd453c4d | 47 | * |
87d9869f | 48 | * If the loop exits, there is no matching entry. |
dd453c4d MP |
49 | * |
50 | * When choosing 'mi', we do not have to take the "middle" but | |
51 | * anywhere in between lo and hi, as long as lo <= mi < hi is | |
87d9869f | 52 | * satisfied. When we somehow know that the distance between the |
dd453c4d MP |
53 | * target and lo is much shorter than the target and hi, we could |
54 | * pick mi that is much closer to lo than the midway. | |
55 | * | |
56 | * Now, we can take advantage of the fact that SHA-1 is a good hash | |
57 | * function, and as long as there are enough entries in the table, we | |
87d9869f | 58 | * can expect uniform distribution. An entry that begins with for |
dd453c4d | 59 | * example "deadbeef..." is much likely to appear much later than in |
87d9869f | 60 | * the midway of the table. It can reasonably be expected to be near |
dd453c4d MP |
61 | * 87% (222/256) from the top of the table. |
62 | * | |
87d9869f | 63 | * However, we do not want to pick "mi" too precisely. If the entry at |
dd453c4d MP |
64 | * the 87% in the above example turns out to be higher than the target |
65 | * we are looking for, we would end up narrowing the search space down | |
66 | * only by 13%, instead of 50% we would get if we did a simple binary | |
87d9869f | 67 | * search. So we would want to hedge our bets by being less aggressive. |
dd453c4d MP |
68 | * |
69 | * The table at "table" holds at least "nr" entries of "elem_size" | |
87d9869f VM |
70 | * bytes each. Each entry has the SHA-1 key at "key_offset". The |
71 | * table is sorted by the SHA-1 key of the entries. The caller wants | |
dd453c4d MP |
72 | * to find the entry with "key", and knows that the entry at "lo" is |
73 | * not higher than the entry it is looking for, and that the entry at | |
74 | * "hi" is higher than the entry it is looking for. | |
75 | */ | |
76 | int sha1_entry_pos(const void *table, | |
87d9869f VM |
77 | size_t elem_size, |
78 | size_t key_offset, | |
79 | unsigned lo, unsigned hi, unsigned nr, | |
80 | const unsigned char *key) | |
dd453c4d | 81 | { |
bb88da7f | 82 | const unsigned char *base = (const unsigned char*)table; |
dd453c4d MP |
83 | const unsigned char *hi_key, *lo_key; |
84 | unsigned ofs_0; | |
85 | ||
86 | if (!nr || lo >= hi) | |
87 | return -1; | |
88 | ||
89 | if (nr == hi) | |
90 | hi_key = NULL; | |
91 | else | |
92 | hi_key = base + elem_size * hi + key_offset; | |
93 | lo_key = base + elem_size * lo + key_offset; | |
94 | ||
95 | ofs_0 = 0; | |
96 | do { | |
97 | int cmp; | |
98 | unsigned ofs, mi, range; | |
99 | unsigned lov, hiv, kyv; | |
100 | const unsigned char *mi_key; | |
101 | ||
102 | range = hi - lo; | |
103 | if (hi_key) { | |
104 | for (ofs = ofs_0; ofs < 20; ofs++) | |
105 | if (lo_key[ofs] != hi_key[ofs]) | |
106 | break; | |
107 | ofs_0 = ofs; | |
108 | /* | |
109 | * byte 0 thru (ofs-1) are the same between | |
110 | * lo and hi; ofs is the first byte that is | |
111 | * different. | |
74b38d19 VM |
112 | * |
113 | * If ofs==20, then no bytes are different, | |
114 | * meaning we have entries with duplicate | |
115 | * keys. We know that we are in a solid run | |
116 | * of this entry (because the entries are | |
117 | * sorted, and our lo and hi are the same, | |
118 | * there can be nothing but this single key | |
119 | * in between). So we can stop the search. | |
120 | * Either one of these entries is it (and | |
121 | * we do not care which), or we do not have | |
122 | * it. | |
123 | * | |
124 | * Furthermore, we know that one of our | |
125 | * endpoints must be the edge of the run of | |
126 | * duplicates. For example, given this | |
127 | * sequence: | |
128 | * | |
129 | * idx 0 1 2 3 4 5 | |
130 | * key A C C C C D | |
131 | * | |
132 | * If we are searching for "B", we might | |
133 | * hit the duplicate run at lo=1, hi=3 | |
134 | * (e.g., by first mi=3, then mi=0). But we | |
135 | * can never have lo > 1, because B < C. | |
136 | * That is, if our key is less than the | |
137 | * run, we know that "lo" is the edge, but | |
138 | * we can say nothing of "hi". Similarly, | |
139 | * if our key is greater than the run, we | |
140 | * know that "hi" is the edge, but we can | |
141 | * say nothing of "lo". | |
142 | * | |
143 | * Therefore if we do not find it, we also | |
144 | * know where it would go if it did exist: | |
145 | * just on the far side of the edge that we | |
146 | * know about. | |
dd453c4d | 147 | */ |
74b38d19 VM |
148 | if (ofs == 20) { |
149 | mi = lo; | |
150 | mi_key = base + elem_size * mi + key_offset; | |
151 | cmp = memcmp(mi_key, key, 20); | |
152 | if (!cmp) | |
153 | return mi; | |
154 | if (cmp < 0) | |
155 | return -1 - hi; | |
156 | else | |
157 | return -1 - lo; | |
158 | } | |
159 | ||
dd453c4d MP |
160 | hiv = hi_key[ofs_0]; |
161 | if (ofs_0 < 19) | |
162 | hiv = (hiv << 8) | hi_key[ofs_0+1]; | |
163 | } else { | |
164 | hiv = 256; | |
165 | if (ofs_0 < 19) | |
166 | hiv <<= 8; | |
167 | } | |
168 | lov = lo_key[ofs_0]; | |
169 | kyv = key[ofs_0]; | |
170 | if (ofs_0 < 19) { | |
171 | lov = (lov << 8) | lo_key[ofs_0+1]; | |
172 | kyv = (kyv << 8) | key[ofs_0+1]; | |
173 | } | |
1e94df08 | 174 | assert(lov < hiv); |
dd453c4d MP |
175 | |
176 | if (kyv < lov) | |
177 | return -1 - lo; | |
178 | if (hiv < kyv) | |
179 | return -1 - hi; | |
180 | ||
181 | /* | |
182 | * Even if we know the target is much closer to 'hi' | |
183 | * than 'lo', if we pick too precisely and overshoot | |
184 | * (e.g. when we know 'mi' is closer to 'hi' than to | |
185 | * 'lo', pick 'mi' that is higher than the target), we | |
186 | * end up narrowing the search space by a smaller | |
187 | * amount (i.e. the distance between 'mi' and 'hi') | |
188 | * than what we would have (i.e. about half of 'lo' | |
87d9869f | 189 | * and 'hi'). Hedge our bets to pick 'mi' less |
dd453c4d MP |
190 | * aggressively, i.e. make 'mi' a bit closer to the |
191 | * middle than we would otherwise pick. | |
192 | */ | |
193 | kyv = (kyv * 6 + lov + hiv) / 8; | |
194 | if (lov < hiv - 1) { | |
195 | if (kyv == lov) | |
196 | kyv++; | |
197 | else if (kyv == hiv) | |
198 | kyv--; | |
199 | } | |
200 | mi = (range - 1) * (kyv - lov) / (hiv - lov) + lo; | |
201 | ||
202 | #ifdef INDEX_DEBUG_LOOKUP | |
203 | printf("lo %u hi %u rg %u mi %u ", lo, hi, range, mi); | |
204 | printf("ofs %u lov %x, hiv %x, kyv %x\n", | |
87d9869f | 205 | ofs_0, lov, hiv, kyv); |
dd453c4d MP |
206 | #endif |
207 | ||
208 | if (!(lo <= mi && mi < hi)) { | |
4aa7de15 RB |
209 | giterr_set(GITERR_INVALID, "Assertion failure. Binary search invariant is false"); |
210 | return -1; | |
dd453c4d MP |
211 | } |
212 | ||
213 | mi_key = base + elem_size * mi + key_offset; | |
214 | cmp = memcmp(mi_key + ofs_0, key + ofs_0, 20 - ofs_0); | |
215 | if (!cmp) | |
216 | return mi; | |
217 | if (cmp > 0) { | |
218 | hi = mi; | |
219 | hi_key = mi_key; | |
220 | } else { | |
221 | lo = mi + 1; | |
222 | lo_key = mi_key + elem_size; | |
223 | } | |
224 | } while (lo < hi); | |
bb88da7f | 225 | return -((int)lo)-1; |
dd453c4d | 226 | } |
67591c8c VM |
227 | |
228 | int sha1_position(const void *table, | |
229 | size_t stride, | |
230 | unsigned lo, unsigned hi, | |
231 | const unsigned char *key) | |
232 | { | |
e2164da5 VM |
233 | const unsigned char *base = table; |
234 | ||
67591c8c VM |
235 | do { |
236 | unsigned mi = (lo + hi) / 2; | |
59547ce7 | 237 | int cmp = git_oid__hashcmp(base + mi * stride, key); |
67591c8c VM |
238 | |
239 | if (!cmp) | |
240 | return mi; | |
241 | ||
242 | if (cmp > 0) | |
243 | hi = mi; | |
244 | else | |
245 | lo = mi+1; | |
246 | } while (lo < hi); | |
247 | ||
248 | return -((int)lo)-1; | |
249 | } |