]>
Commit | Line | Data |
---|---|---|
dd453c4d | 1 | /* |
bb742ede | 2 | * Copyright (C) 2009-2011 the libgit2 contributors |
dd453c4d | 3 | * |
bb742ede VM |
4 | * This file is part of libgit2, distributed under the GNU GPL v2 with |
5 | * a Linking Exception. For full terms see the included COPYING file. | |
dd453c4d MP |
6 | */ |
7 | ||
8 | #include <stdio.h> | |
9 | ||
10 | #include "sha1_lookup.h" | |
11 | #include "common.h" | |
12 | ||
13 | /* | |
14 | * Conventional binary search loop looks like this: | |
15 | * | |
16 | * unsigned lo, hi; | |
87d9869f VM |
17 | * do { |
18 | * unsigned mi = (lo + hi) / 2; | |
19 | * int cmp = "entry pointed at by mi" minus "target"; | |
20 | * if (!cmp) | |
21 | * return (mi is the wanted one) | |
22 | * if (cmp > 0) | |
23 | * hi = mi; "mi is larger than target" | |
24 | * else | |
25 | * lo = mi+1; "mi is smaller than target" | |
26 | * } while (lo < hi); | |
dd453c4d MP |
27 | * |
28 | * The invariants are: | |
29 | * | |
30 | * - When entering the loop, lo points at a slot that is never | |
87d9869f VM |
31 | * above the target (it could be at the target), hi points at a |
32 | * slot that is guaranteed to be above the target (it can never | |
33 | * be at the target). | |
dd453c4d MP |
34 | * |
35 | * - We find a point 'mi' between lo and hi (mi could be the same | |
87d9869f VM |
36 | * as lo, but never can be as same as hi), and check if it hits |
37 | * the target. There are three cases: | |
dd453c4d | 38 | * |
87d9869f | 39 | * - if it is a hit, we are happy. |
dd453c4d | 40 | * |
87d9869f VM |
41 | * - if it is strictly higher than the target, we set it to hi, |
42 | * and repeat the search. | |
dd453c4d | 43 | * |
87d9869f VM |
44 | * - if it is strictly lower than the target, we update lo to |
45 | * one slot after it, because we allow lo to be at the target. | |
dd453c4d | 46 | * |
87d9869f | 47 | * If the loop exits, there is no matching entry. |
dd453c4d MP |
48 | * |
49 | * When choosing 'mi', we do not have to take the "middle" but | |
50 | * anywhere in between lo and hi, as long as lo <= mi < hi is | |
87d9869f | 51 | * satisfied. When we somehow know that the distance between the |
dd453c4d MP |
52 | * target and lo is much shorter than the target and hi, we could |
53 | * pick mi that is much closer to lo than the midway. | |
54 | * | |
55 | * Now, we can take advantage of the fact that SHA-1 is a good hash | |
56 | * function, and as long as there are enough entries in the table, we | |
87d9869f | 57 | * can expect uniform distribution. An entry that begins with for |
dd453c4d | 58 | * example "deadbeef..." is much likely to appear much later than in |
87d9869f | 59 | * the midway of the table. It can reasonably be expected to be near |
dd453c4d MP |
60 | * 87% (222/256) from the top of the table. |
61 | * | |
87d9869f | 62 | * However, we do not want to pick "mi" too precisely. If the entry at |
dd453c4d MP |
63 | * the 87% in the above example turns out to be higher than the target |
64 | * we are looking for, we would end up narrowing the search space down | |
65 | * only by 13%, instead of 50% we would get if we did a simple binary | |
87d9869f | 66 | * search. So we would want to hedge our bets by being less aggressive. |
dd453c4d MP |
67 | * |
68 | * The table at "table" holds at least "nr" entries of "elem_size" | |
87d9869f VM |
69 | * bytes each. Each entry has the SHA-1 key at "key_offset". The |
70 | * table is sorted by the SHA-1 key of the entries. The caller wants | |
dd453c4d MP |
71 | * to find the entry with "key", and knows that the entry at "lo" is |
72 | * not higher than the entry it is looking for, and that the entry at | |
73 | * "hi" is higher than the entry it is looking for. | |
74 | */ | |
75 | int sha1_entry_pos(const void *table, | |
87d9869f VM |
76 | size_t elem_size, |
77 | size_t key_offset, | |
78 | unsigned lo, unsigned hi, unsigned nr, | |
79 | const unsigned char *key) | |
dd453c4d | 80 | { |
bb88da7f | 81 | const unsigned char *base = (const unsigned char*)table; |
dd453c4d MP |
82 | const unsigned char *hi_key, *lo_key; |
83 | unsigned ofs_0; | |
84 | ||
85 | if (!nr || lo >= hi) | |
86 | return -1; | |
87 | ||
88 | if (nr == hi) | |
89 | hi_key = NULL; | |
90 | else | |
91 | hi_key = base + elem_size * hi + key_offset; | |
92 | lo_key = base + elem_size * lo + key_offset; | |
93 | ||
94 | ofs_0 = 0; | |
95 | do { | |
96 | int cmp; | |
97 | unsigned ofs, mi, range; | |
98 | unsigned lov, hiv, kyv; | |
99 | const unsigned char *mi_key; | |
100 | ||
101 | range = hi - lo; | |
102 | if (hi_key) { | |
103 | for (ofs = ofs_0; ofs < 20; ofs++) | |
104 | if (lo_key[ofs] != hi_key[ofs]) | |
105 | break; | |
106 | ofs_0 = ofs; | |
107 | /* | |
108 | * byte 0 thru (ofs-1) are the same between | |
109 | * lo and hi; ofs is the first byte that is | |
110 | * different. | |
111 | */ | |
112 | hiv = hi_key[ofs_0]; | |
113 | if (ofs_0 < 19) | |
114 | hiv = (hiv << 8) | hi_key[ofs_0+1]; | |
115 | } else { | |
116 | hiv = 256; | |
117 | if (ofs_0 < 19) | |
118 | hiv <<= 8; | |
119 | } | |
120 | lov = lo_key[ofs_0]; | |
121 | kyv = key[ofs_0]; | |
122 | if (ofs_0 < 19) { | |
123 | lov = (lov << 8) | lo_key[ofs_0+1]; | |
124 | kyv = (kyv << 8) | key[ofs_0+1]; | |
125 | } | |
126 | assert(lov < hiv); | |
127 | ||
128 | if (kyv < lov) | |
129 | return -1 - lo; | |
130 | if (hiv < kyv) | |
131 | return -1 - hi; | |
132 | ||
133 | /* | |
134 | * Even if we know the target is much closer to 'hi' | |
135 | * than 'lo', if we pick too precisely and overshoot | |
136 | * (e.g. when we know 'mi' is closer to 'hi' than to | |
137 | * 'lo', pick 'mi' that is higher than the target), we | |
138 | * end up narrowing the search space by a smaller | |
139 | * amount (i.e. the distance between 'mi' and 'hi') | |
140 | * than what we would have (i.e. about half of 'lo' | |
87d9869f | 141 | * and 'hi'). Hedge our bets to pick 'mi' less |
dd453c4d MP |
142 | * aggressively, i.e. make 'mi' a bit closer to the |
143 | * middle than we would otherwise pick. | |
144 | */ | |
145 | kyv = (kyv * 6 + lov + hiv) / 8; | |
146 | if (lov < hiv - 1) { | |
147 | if (kyv == lov) | |
148 | kyv++; | |
149 | else if (kyv == hiv) | |
150 | kyv--; | |
151 | } | |
152 | mi = (range - 1) * (kyv - lov) / (hiv - lov) + lo; | |
153 | ||
154 | #ifdef INDEX_DEBUG_LOOKUP | |
155 | printf("lo %u hi %u rg %u mi %u ", lo, hi, range, mi); | |
156 | printf("ofs %u lov %x, hiv %x, kyv %x\n", | |
87d9869f | 157 | ofs_0, lov, hiv, kyv); |
dd453c4d MP |
158 | #endif |
159 | ||
160 | if (!(lo <= mi && mi < hi)) { | |
161 | return git__throw(GIT_ERROR, "Assertion failure. Binary search invariant is false"); | |
162 | } | |
163 | ||
164 | mi_key = base + elem_size * mi + key_offset; | |
165 | cmp = memcmp(mi_key + ofs_0, key + ofs_0, 20 - ofs_0); | |
166 | if (!cmp) | |
167 | return mi; | |
168 | if (cmp > 0) { | |
169 | hi = mi; | |
170 | hi_key = mi_key; | |
171 | } else { | |
172 | lo = mi + 1; | |
173 | lo_key = mi_key + elem_size; | |
174 | } | |
175 | } while (lo < hi); | |
bb88da7f | 176 | return -((int)lo)-1; |
dd453c4d | 177 | } |