]>
Commit | Line | Data |
---|---|---|
20effc67 TL |
1 | // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. |
2 | // This source code is licensed under both the GPLv2 (found in the | |
3 | // COPYING file in the root directory) and Apache 2.0 License | |
4 | // (found in the LICENSE.Apache file in the root directory). | |
5 | ||
6 | #pragma once | |
7 | ||
8 | #include <array> | |
9 | #include <memory> | |
10 | ||
11 | #include "util/math128.h" | |
12 | ||
13 | namespace ROCKSDB_NAMESPACE { | |
14 | ||
15 | namespace ribbon { | |
16 | ||
17 | // RIBBON PHSF & RIBBON Filter (Rapid Incremental Boolean Banding ON-the-fly) | |
18 | // | |
19 | // ribbon_alg.h: generic versions of core algorithms. | |
20 | // | |
21 | // Ribbon is a Perfect Hash Static Function construction useful as a compact | |
22 | // static Bloom filter alternative. It combines (a) a boolean (GF(2)) linear | |
23 | // system construction that approximates a Band Matrix with hashing, | |
24 | // (b) an incremental, on-the-fly Gaussian Elimination algorithm that is | |
25 | // remarkably efficient and adaptable at constructing an upper-triangular | |
26 | // band matrix from a set of band-approximating inputs from (a), and | |
27 | // (c) a storage layout that is fast and adaptable as a filter. | |
28 | // | |
29 | // Footnotes: (a) "Efficient Gauss Elimination for Near-Quadratic Matrices | |
30 | // with One Short Random Block per Row, with Applications" by Stefan | |
31 | // Walzer and Martin Dietzfelbinger ("DW paper") | |
32 | // (b) developed by Peter C. Dillinger, though not the first on-the-fly | |
33 | // GE algorithm. See "On the fly Gaussian Elimination for LT codes" by | |
34 | // Bioglio, Grangetto, Gaeta, and Sereno. | |
35 | // (c) see "interleaved" solution storage below. | |
36 | // | |
37 | // See ribbon_impl.h for high-level behavioral summary. This file focuses | |
38 | // on the core design details. | |
39 | // | |
40 | // ###################################################################### | |
41 | // ################# PHSF -> static filter reduction #################### | |
42 | // | |
43 | // A Perfect Hash Static Function is a data structure representing a | |
44 | // map from anything hashable (a "key") to values of some fixed size. | |
45 | // Crucially, it is allowed to return garbage values for anything not in | |
46 | // the original set of map keys, and it is a "static" structure: entries | |
47 | // cannot be added or deleted after construction. PHSFs representing n | |
48 | // mappings to b-bit values (assume uniformly distributed) require at least | |
49 | // n * b bits to represent, or at least b bits per entry. We typically | |
50 | // describe the compactness of a PHSF by typical bits per entry as some | |
51 | // function of b. For example, the MWHC construction (k=3 "peeling") | |
52 | // requires about 1.0222*b and a variant called Xor+ requires about | |
53 | // 1.08*b + 0.5 bits per entry. | |
54 | // | |
55 | // With more hashing, a PHSF can over-approximate a set as a Bloom filter | |
56 | // does, with no FN queries and predictable false positive (FP) query | |
57 | // rate. Instead of the user providing a value to map each input key to, | |
58 | // a hash function provides the value. Keys in the original set will | |
59 | // return a positive membership query because the underlying PHSF returns | |
60 | // the same value as hashing the key. When a key is not in the original set, | |
61 | // the PHSF returns a "garbage" value, which is only equal to the key's | |
62 | // hash with (false positive) probability 1 in 2^b. | |
63 | // | |
64 | // For a matching false positive rate, standard Bloom filters require | |
65 | // 1.44*b bits per entry. Cache-local Bloom filters (like bloom_impl.h) | |
66 | // require a bit more, around 1.5*b bits per entry. Thus, a Bloom | |
67 | // alternative could save up to or nearly 1/3rd of memory and storage | |
68 | // that RocksDB uses for SST (static) Bloom filters. (Memtable Bloom filter | |
69 | // is dynamic.) | |
70 | // | |
71 | // Recommended reading: | |
72 | // "Xor Filters: Faster and Smaller Than Bloom and Cuckoo Filters" | |
73 | // by Graf and Lemire | |
74 | // First three sections of "Fast Scalable Construction of (Minimal | |
75 | // Perfect Hash) Functions" by Genuzio, Ottaviano, and Vigna | |
76 | // | |
77 | // ###################################################################### | |
78 | // ################## PHSF vs. hash table vs. Bloom ##################### | |
79 | // | |
80 | // You can think of traditional hash tables and related filter variants | |
81 | // such as Cuckoo filters as utilizing an "OR" construction: a hash | |
82 | // function associates a key with some slots and the data is returned if | |
83 | // the data is found in any one of those slots. The collision resolution | |
84 | // is visible in the final data structure and requires extra information. | |
85 | // For example, Cuckoo filter uses roughly 1.05b + 2 bits per entry, and | |
86 | // Golomb-Rice code (aka "GCS") as little as b + 1.5. When the data | |
87 | // structure associates each input key with data in one slot, the | |
88 | // structure implicitly constructs a (near-)minimal (near-)perfect hash | |
89 | // (MPH) of the keys, which requires at least 1.44 bits per key to | |
90 | // represent. This is why approaches with visible collision resolution | |
91 | // have a fixed + 1.5 or more in storage overhead per entry, often in | |
92 | // addition to an overhead multiplier on b. | |
93 | // | |
94 | // By contrast Bloom filters utilize an "AND" construction: a query only | |
95 | // returns true if all bit positions associated with a key are set to 1. | |
96 | // There is no collision resolution, so Bloom filters do not suffer a | |
97 | // fixed bits per entry overhead like the above structures. | |
98 | // | |
99 | // PHSFs typically use a bitwise XOR construction: the data you want is | |
100 | // not in a single slot, but in a linear combination of several slots. | |
101 | // For static data, this gives the best of "AND" and "OR" constructions: | |
102 | // avoids the +1.44 or more fixed overhead by not approximating a MPH and | |
103 | // can do much better than Bloom's 1.44 factor on b with collision | |
104 | // resolution, which here is done ahead of time and invisible at query | |
105 | // time. | |
106 | // | |
107 | // ###################################################################### | |
108 | // ######################## PHSF construction ########################### | |
109 | // | |
110 | // For a typical PHSF, construction is solving a linear system of | |
111 | // equations, typically in GF(2), which is to say that values are boolean | |
112 | // and XOR serves both as addition and subtraction. We can use matrices to | |
113 | // represent the problem: | |
114 | // | |
115 | // C * S = R | |
116 | // (n x m) (m x b) (n x b) | |
117 | // where C = coefficients, S = solution, R = results | |
118 | // and solving for S given C and R. | |
119 | // | |
120 | // Note that C and R each have n rows, one for each input entry for the | |
121 | // PHSF. A row in C is given by a hash function on the PHSF input key, | |
122 | // and the corresponding row in R is the b-bit value to associate with | |
123 | // that input key. (In a filter, rows of R are given by another hash | |
124 | // function on the input key.) | |
125 | // | |
126 | // On solving, the matrix S (solution) is the final PHSF data, as it | |
127 | // maps any row from the original C to its corresponding desired result | |
128 | // in R. We just have to hash our query inputs and compute a linear | |
129 | // combination of rows in S. | |
130 | // | |
131 | // In theory, we could choose m = n and let a hash function associate | |
132 | // each input key with random rows in C. A solution exists with high | |
133 | // probability, and uses essentially minimum space, b bits per entry | |
134 | // (because we set m = n) but this has terrible scaling, something | |
135 | // like O(n^2) space and O(n^3) time during construction (Gaussian | |
136 | // elimination) and O(n) query time. But computational efficiency is | |
137 | // key, and the core of this is avoiding scanning all of S to answer | |
138 | // each query. | |
139 | // | |
140 | // The traditional approach (MWHC, aka Xor filter) starts with setting | |
141 | // only some small fixed number of columns (typically k=3) to 1 for each | |
142 | // row of C, with remaining entries implicitly 0. This is implemented as | |
143 | // three hash functions over [0,m), and S can be implemented as a vector | |
144 | // of b-bit values. Now, a query only involves looking up k rows | |
145 | // (values) in S and computing their bitwise XOR. Additionally, this | |
146 | // construction can use a linear time algorithm called "peeling" for | |
147 | // finding a solution in many cases of one existing, but peeling | |
148 | // generally requires a larger space overhead factor in the solution | |
149 | // (m/n) than is required with Gaussian elimination. | |
150 | // | |
151 | // Recommended reading: | |
152 | // "Peeling Close to the Orientability Threshold – Spatial Coupling in | |
153 | // Hashing-Based Data Structures" by Stefan Walzer | |
154 | // | |
155 | // ###################################################################### | |
156 | // ##################### Ribbon PHSF construction ####################### | |
157 | // | |
158 | // Ribbon constructs coefficient rows essentially the same as in the | |
159 | // Walzer/Dietzfelbinger paper cited above: for some chosen fixed width | |
160 | // r (kCoeffBits in code), each key is hashed to a starting column in | |
161 | // [0, m - r] (GetStart() in code) and an r-bit sequence of boolean | |
162 | // coefficients (GetCoeffRow() in code). If you sort the rows by start, | |
163 | // the C matrix would look something like this: | |
164 | // | |
165 | // [####00000000000000000000] | |
166 | // [####00000000000000000000] | |
167 | // [000####00000000000000000] | |
168 | // [0000####0000000000000000] | |
169 | // [0000000####0000000000000] | |
170 | // [000000000####00000000000] | |
171 | // [000000000####00000000000] | |
172 | // [0000000000000####0000000] | |
173 | // [0000000000000000####0000] | |
174 | // [00000000000000000####000] | |
175 | // [00000000000000000000####] | |
176 | // | |
177 | // where each # could be a 0 or 1, chosen uniformly by a hash function. | |
178 | // (Except we typically set the start column value to 1.) This scheme | |
179 | // uses hashing to approximate a band matrix, and it has a solution iff | |
180 | // it reduces to an upper-triangular boolean r-band matrix, like this: | |
181 | // | |
182 | // [1###00000000000000000000] | |
183 | // [01##00000000000000000000] | |
184 | // [000000000000000000000000] | |
185 | // [0001###00000000000000000] | |
186 | // [000000000000000000000000] | |
187 | // [000001##0000000000000000] | |
188 | // [000000000000000000000000] | |
189 | // [00000001###0000000000000] | |
190 | // [000000001###000000000000] | |
191 | // [0000000001##000000000000] | |
192 | // ... | |
193 | // [00000000000000000000001#] | |
194 | // [000000000000000000000001] | |
195 | // | |
196 | // where we have expanded to an m x m matrix by filling with rows of | |
197 | // all zeros as needed. As in Gaussian elimination, this form is ready for | |
198 | // generating a solution through back-substitution. | |
199 | // | |
200 | // The awesome thing about the Ribbon construction (from the DW paper) is | |
201 | // how row reductions keep each row representable as a start column and | |
202 | // r coefficients, because row reductions are only needed when two rows | |
203 | // have the same number of leading zero columns. Thus, the combination | |
204 | // of those rows, the bitwise XOR of the r-bit coefficient rows, cancels | |
205 | // out the leading 1s, so starts (at least) one column later and only | |
206 | // needs (at most) r - 1 coefficients. | |
207 | // | |
208 | // ###################################################################### | |
209 | // ###################### Ribbon PHSF scalability ####################### | |
210 | // | |
211 | // Although more practical detail is in ribbon_impl.h, it's worth | |
212 | // understanding some of the overall benefits and limitations of the | |
213 | // Ribbon PHSFs. | |
214 | // | |
215 | // High-end scalability is a primary issue for Ribbon PHSFs, because in | |
216 | // a single Ribbon linear system with fixed r and fixed m/n ratio, the | |
217 | // solution probability approaches zero as n approaches infinity. | |
218 | // For a given n, solution probability improves with larger r and larger | |
219 | // m/n. | |
220 | // | |
221 | // By contrast, peeling-based PHSFs have somewhat worse storage ratio | |
222 | // or solution probability for small n (less than ~1000). This is | |
223 | // especially true with spatial-coupling, where benefits are only | |
224 | // notable for n on the order of 100k or 1m or more. | |
225 | // | |
226 | // To make best use of current hardware, r=128 seems to be closest to | |
227 | // a "generally good" choice for Ribbon, at least in RocksDB where SST | |
228 | // Bloom filters typically hold around 10-100k keys, and almost always | |
229 | // less than 10m keys. r=128 ribbon has a high chance of encoding success | |
230 | // (with first hash seed) when storage overhead is around 5% (m/n ~ 1.05) | |
231 | // for roughly 10k - 10m keys in a single linear system. r=64 only scales | |
232 | // up to about 10k keys with the same storage overhead. Construction and | |
233 | // access times for r=128 are similar to r=64. r=128 tracks nearly | |
234 | // twice as much data during construction, but in most cases we expect | |
235 | // the scalability benefits of r=128 vs. r=64 to make it preferred. | |
236 | // | |
237 | // A natural approach to scaling Ribbon beyond ~10m keys is splitting | |
238 | // (or "sharding") the inputs into multiple linear systems with their | |
239 | // own hash seeds. This can also help to control peak memory consumption. | |
240 | // TODO: much more to come | |
241 | // | |
242 | // ###################################################################### | |
243 | // #################### Ribbon on-the-fly banding ####################### | |
244 | // | |
245 | // "Banding" is what we call the process of reducing the inputs to an | |
246 | // upper-triangular r-band matrix ready for finishing a solution with | |
247 | // back-substitution. Although the DW paper presents an algorithm for | |
248 | // this ("SGauss"), the awesome properties of their construction enable | |
249 | // an even simpler, faster, and more backtrackable algorithm. In simplest | |
250 | // terms, the SGauss algorithm requires sorting the inputs by start | |
251 | // columns, but it's possible to make Gaussian elimination resemble hash | |
252 | // table insertion! | |
253 | // | |
254 | // The enhanced algorithm is based on these observations: | |
255 | // - When processing a coefficient row with first 1 in column j, | |
256 | // - If it's the first at column j to be processed, it can be part of | |
257 | // the banding at row j. (And that decision never overwritten, with | |
258 | // no loss of generality!) | |
259 | // - Else, it can be combined with existing row j and re-processed, | |
260 | // which will look for a later "empty" row or reach "no solution". | |
261 | // | |
262 | // We call our banding algorithm "incremental" and "on-the-fly" because | |
263 | // (like hash table insertion) we are "finished" after each input | |
264 | // processed, with respect to all inputs processed so far. Although the | |
265 | // band matrix is an intermediate step to the solution structure, we have | |
266 | // eliminated intermediate steps and unnecessary data tracking for | |
267 | // banding. | |
268 | // | |
269 | // Building on "incremental" and "on-the-fly", the banding algorithm is | |
270 | // easily backtrackable because no (non-empty) rows are overwritten in | |
271 | // the banding. Thus, if we want to "try" adding an additional set of | |
272 | // inputs to the banding, we only have to record which rows were written | |
273 | // in order to efficiently backtrack to our state before considering | |
274 | // the additional set. (TODO: how this can mitigate scalability and | |
275 | // reach sub-1% overheads) | |
276 | // | |
277 | // Like in a linear-probed hash table, as the occupancy approaches and | |
278 | // surpasses 90-95%, collision resolution dominates the construction | |
279 | // time. (Ribbon doesn't usually pay at query time; see solution | |
280 | // storage below.) This means that we can speed up construction time | |
281 | // by using a higher m/n ratio, up to negative returns around 1.2. | |
282 | // At m/n ~= 1.2, which still saves memory substantially vs. Bloom | |
283 | // filter's 1.5, construction speed (including back-substitution) is not | |
284 | // far from sorting speed, but still a few times slower than cache-local | |
285 | // Bloom construction speed. | |
286 | // | |
287 | // Back-substitution from an upper-triangular boolean band matrix is | |
288 | // especially fast and easy. All the memory accesses are sequential or at | |
289 | // least local, no random. If the number of result bits (b) is a | |
290 | // compile-time constant, the back-substitution state can even be tracked | |
291 | // in CPU registers. Regardless of the solution representation, we prefer | |
292 | // column-major representation for tracking back-substitution state, as | |
293 | // r (the band width) will typically be much larger than b (result bits | |
294 | // or columns), so better to handle r-bit values b times (per solution | |
295 | // row) than b-bit values r times. | |
296 | // | |
297 | // ###################################################################### | |
298 | // ##################### Ribbon solution storage ######################## | |
299 | // | |
300 | // Row-major layout is typical for boolean (bit) matrices, including for | |
301 | // MWHC (Xor) filters where a query combines k b-bit values, and k is | |
302 | // typically smaller than b. Even for k=4 and b=2, at least k=4 random | |
303 | // look-ups are required regardless of layout. | |
304 | // | |
305 | // Ribbon PHSFs are quite different, however, because | |
306 | // (a) all of the solution rows relevant to a query are within a single | |
307 | // range of r rows, and | |
308 | // (b) the number of solution rows involved (r/2 on average, or r if | |
309 | // avoiding conditional accesses) is typically much greater than | |
310 | // b, the number of solution columns. | |
311 | // | |
312 | // Row-major for Ribbon PHSFs therefore tends to incur undue CPU overhead | |
313 | // by processing (up to) r entries of b bits each, where b is typically | |
314 | // less than 10 for filter applications. | |
315 | // | |
316 | // Column-major layout has poor locality because of accessing up to b | |
317 | // memory locations in different pages (and obviously cache lines). Note | |
318 | // that negative filter queries do not typically need to access all | |
319 | // solution columns, as they can return when a mismatch is found in any | |
320 | // result/solution column. This optimization doesn't always pay off on | |
321 | // recent hardware, where the penalty for unpredictable conditional | |
322 | // branching can exceed the penalty for unnecessary work, but the | |
323 | // optimization is essentially unavailable with row-major layout. | |
324 | // | |
325 | // The best compromise seems to be interleaving column-major on the small | |
326 | // scale with row-major on the large scale. For example, let a solution | |
327 | // "block" be r rows column-major encoded as b r-bit values in sequence. | |
328 | // Each query accesses (up to) 2 adjacent blocks, which will typically | |
329 | // span 1-3 cache lines in adjacent memory. We get very close to the same | |
330 | // locality as row-major, but with much faster reconstruction of each | |
331 | // result column, at least for filter applications where b is relatively | |
332 | // small and negative queries can return early. | |
333 | // | |
334 | // ###################################################################### | |
335 | // ###################### Fractional result bits ######################## | |
336 | // | |
337 | // Bloom filters have great flexibility that alternatives mostly do not | |
338 | // have. One of those flexibilities is in utilizing any ratio of data | |
339 | // structure bits per key. With a typical memory allocator like jemalloc, | |
340 | // this flexibility can save roughly 10% of the filters' footprint in | |
341 | // DRAM by rounding up and down filter sizes to minimize memory internal | |
342 | // fragmentation (see optimize_filters_for_memory RocksDB option). | |
343 | // | |
344 | // At first glance, PHSFs only offer a whole number of bits per "slot" | |
345 | // (m rather than number of keys n), but coefficient locality in the | |
346 | // Ribbon construction makes fractional bits/key quite possible and | |
347 | // attractive for filter applications. This works by a prefix of the | |
348 | // structure using b-1 solution columns and the rest using b solution | |
349 | // columns. See InterleavedSolutionStorage below for more detail. | |
350 | // | |
351 | // Because false positive rates are non-linear in bits/key, this approach | |
352 | // is not quite optimal in terms of information theory. In common cases, | |
353 | // we see additional space overhead up to about 1.5% vs. theoretical | |
354 | // optimal to achieve the same FP rate. We consider this a quite acceptable | |
355 | // overhead for very efficiently utilizing space that might otherwise be | |
356 | // wasted. | |
357 | // | |
358 | // This property of Ribbon even makes it "elastic." A Ribbon filter and | |
359 | // its small metadata for answering queries can be adapted into another | |
360 | // Ribbon filter filling any smaller multiple of r bits (plus small | |
361 | // metadata), with a correspondingly higher FP rate. None of the data | |
362 | // thrown away during construction needs to be recalled for this reduction. | |
363 | // Similarly a single Ribbon construction can be separated (by solution | |
364 | // column) into two or more structures (or "layers" or "levels") with | |
365 | // independent filtering ability (no FP correlation, just as solution or | |
366 | // result columns in a single structure) despite being constructed as part | |
367 | // of a single linear system. (TODO: implement) | |
368 | // See also "ElasticBF: Fine-grained and Elastic Bloom Filter Towards | |
369 | // Efficient Read for LSM-tree-based KV Stores." | |
370 | // | |
371 | ||
372 | // ###################################################################### | |
373 | // ################### CODE: Ribbon core algorithms ##################### | |
374 | // ###################################################################### | |
375 | // | |
376 | // These algorithms are templatized for genericity but near-maximum | |
377 | // performance in a given application. The template parameters | |
378 | // adhere to informal class/struct type concepts outlined below. (This | |
379 | // code is written for C++11 so does not use formal C++ concepts.) | |
380 | ||
381 | // Rough architecture for these algorithms: | |
382 | // | |
383 | // +-----------+ +---+ +-----------------+ | |
384 | // | AddInputs | --> | H | --> | BandingStorage | | |
385 | // +-----------+ | a | +-----------------+ | |
386 | // | s | | | |
387 | // | h | Back substitution | |
388 | // | e | V | |
389 | // +-----------+ | r | +-----------------+ | |
390 | // | Query Key | --> | | >+< | SolutionStorage | | |
391 | // +-----------+ +---+ | +-----------------+ | |
392 | // V | |
393 | // Query result | |
394 | ||
395 | // Common to other concepts | |
396 | // concept RibbonTypes { | |
397 | // // An unsigned integer type for an r-bit subsequence of coefficients. | |
398 | // // r (or kCoeffBits) is taken to be sizeof(CoeffRow) * 8, as it would | |
399 | // // generally only hurt scalability to leave bits of CoeffRow unused. | |
400 | // typename CoeffRow; | |
401 | // // An unsigned integer type big enough to hold a result row (b bits, | |
402 | // // or number of solution/result columns). | |
403 | // // In many applications, especially filters, the number of result | |
404 | // // columns is decided at run time, so ResultRow simply needs to be | |
405 | // // big enough for the largest number of columns allowed. | |
406 | // typename ResultRow; | |
407 | // // An unsigned integer type sufficient for representing the number of | |
408 | // // rows in the solution structure, and at least the arithmetic | |
409 | // // promotion size (usually 32 bits). uint32_t recommended because a | |
410 | // // single Ribbon construction doesn't really scale to billions of | |
411 | // // entries. | |
412 | // typename Index; | |
413 | // }; | |
414 | ||
415 | // ###################################################################### | |
416 | // ######################## Hashers and Banding ######################### | |
417 | ||
418 | // Hasher concepts abstract out hashing details. | |
419 | ||
420 | // concept PhsfQueryHasher extends RibbonTypes { | |
421 | // // Type for a lookup key, which is hashable. | |
422 | // typename Key; | |
423 | // | |
424 | // // Type for hashed summary of a Key. uint64_t is recommended. | |
425 | // typename Hash; | |
426 | // | |
427 | // // Compute a hash value summarizing a Key | |
428 | // Hash GetHash(const Key &) const; | |
429 | // | |
430 | // // Given a hash value and a number of columns that can start an | |
431 | // // r-sequence of coefficients (== m - r + 1), return the start | |
432 | // // column to associate with that hash value. (Starts can be chosen | |
433 | // // uniformly or "smash" extra entries into the beginning and end for | |
434 | // // better utilization at those extremes of the structure. Details in | |
435 | // // ribbon_impl.h) | |
436 | // Index GetStart(Hash, Index num_starts) const; | |
437 | // | |
438 | // // Given a hash value, return the r-bit sequence of coefficients to | |
439 | // // associate with it. It's generally OK if | |
440 | // // sizeof(CoeffRow) > sizeof(Hash) | |
441 | // // as long as the hash itself is not too prone to collisions for the | |
442 | // // applications and the CoeffRow is generated uniformly from | |
443 | // // available hash data, but relatively independent of the start. | |
444 | // // | |
445 | // // Must be non-zero, because that's required for a solution to exist | |
446 | // // when mapping to non-zero result row. (Note: BandingAdd could be | |
447 | // // modified to allow 0 coeff row if that only occurs with 0 result | |
448 | // // row, which really only makes sense for filter implementation, | |
449 | // // where both values are hash-derived. Or BandingAdd could reject 0 | |
450 | // // coeff row, forcing next seed, but that has potential problems with | |
451 | // // generality/scalability.) | |
452 | // CoeffRow GetCoeffRow(Hash) const; | |
453 | // }; | |
454 | ||
455 | // concept FilterQueryHasher extends PhsfQueryHasher { | |
456 | // // For building or querying a filter, this returns the expected | |
457 | // // result row associated with a hashed input. For general PHSF, | |
458 | // // this must return 0. | |
459 | // // | |
460 | // // Although not strictly required, there's a slightly better chance of | |
461 | // // solver success if result row is masked down here to only the bits | |
462 | // // actually needed. | |
463 | // ResultRow GetResultRowFromHash(Hash) const; | |
464 | // } | |
465 | ||
466 | // concept BandingHasher extends FilterQueryHasher { | |
467 | // // For a filter, this will generally be the same as Key. | |
468 | // // For a general PHSF, it must either | |
469 | // // (a) include a key and a result it maps to (e.g. in a std::pair), or | |
470 | // // (b) GetResultRowFromInput looks up the result somewhere rather than | |
471 | // // extracting it. | |
472 | // typename AddInput; | |
473 | // | |
474 | // // Instead of requiring a way to extract a Key from an | |
475 | // // AddInput, we require getting the hash of the Key part | |
476 | // // of an AddInput, which is trivial if AddInput == Key. | |
477 | // Hash GetHash(const AddInput &) const; | |
478 | // | |
479 | // // For building a non-filter PHSF, this extracts or looks up the result | |
480 | // // row to associate with an input. For filter PHSF, this must return 0. | |
481 | // ResultRow GetResultRowFromInput(const AddInput &) const; | |
482 | // | |
483 | // // Whether the solver can assume the lowest bit of GetCoeffRow is | |
484 | // // always 1. When true, it should improve solver efficiency slightly. | |
485 | // static bool kFirstCoeffAlwaysOne; | |
486 | // } | |
487 | ||
488 | // Abstract storage for the result of "banding" the inputs (Gaussian | |
489 | // elimination to an upper-triangular boolean band matrix). Because the | |
490 | // banding is an incremental / on-the-fly algorithm, this also represents | |
491 | // all the intermediate state between input entries. | |
492 | // | |
493 | // concept BandingStorage extends RibbonTypes { | |
494 | // // Tells the banding algorithm to prefetch memory associated with | |
495 | // // the next input before processing the current input. Generally | |
496 | // // recommended iff the BandingStorage doesn't easily fit in CPU | |
497 | // // cache. | |
498 | // bool UsePrefetch() const; | |
499 | // | |
500 | // // Prefetches (e.g. __builtin_prefetch) memory associated with a | |
501 | // // slot index i. | |
502 | // void Prefetch(Index i) const; | |
503 | // | |
504 | // // Returns a pointer to CoeffRow for slot index i. | |
505 | // CoeffRow* CoeffRowPtr(Index i); | |
506 | // | |
507 | // // Returns a pointer to ResultRow for slot index i. (Gaussian row | |
508 | // // operations involve both sides of the equation.) | |
509 | // ResultRow* ResultRowPtr(Index i); | |
510 | // | |
511 | // // Returns the number of columns that can start an r-sequence of | |
512 | // // coefficients, which is the number of slots minus r (kCoeffBits) | |
513 | // // plus one. (m - r + 1) | |
514 | // Index GetNumStarts() const; | |
515 | // }; | |
516 | ||
517 | // Optional storage for backtracking data in banding a set of input | |
518 | // entries. It exposes an array structure which will generally be | |
519 | // used as a stack. It must be able to accommodate as many entries | |
520 | // as are passed in as inputs to `BandingAddRange`. | |
521 | // | |
522 | // concept BacktrackStorage extends RibbonTypes { | |
523 | // // If false, backtracking support will be disabled in the algorithm. | |
524 | // // This should preferably be an inline compile-time constant function. | |
525 | // bool UseBacktrack() const; | |
526 | // | |
527 | // // Records `to_save` as the `i`th backtrack entry | |
528 | // void BacktrackPut(Index i, Index to_save); | |
529 | // | |
530 | // // Recalls the `i`th backtrack entry | |
531 | // Index BacktrackGet(Index i) const; | |
532 | // } | |
533 | ||
// Adds a single entry to BandingStorage (and optionally, BacktrackStorage),
// returning true if successful or false if solution is impossible with
// current hasher (and presumably its seed) and number of "slots" (solution
// or banding rows). (A solution is impossible when there is a linear
// dependence among the inputs that doesn't "cancel out".)
//
// Pre- and post-condition: the BandingStorage represents a band matrix
// ready for back substitution (row echelon form except for zero rows),
// augmented with result values such that back substitution would give a
// solution satisfying all the cr@start -> rr entries added.
//
// If kFirstCoeffAlwaysOne, the caller guarantees the low bit of `cr` is
// already set on entry, so the initial scan for the first set coefficient
// bit is skipped.
template <bool kFirstCoeffAlwaysOne, typename BandingStorage,
          typename BacktrackStorage>
bool BandingAdd(BandingStorage *bs, typename BandingStorage::Index start,
                typename BandingStorage::ResultRow rr,
                typename BandingStorage::CoeffRow cr, BacktrackStorage *bts,
                typename BandingStorage::Index *backtrack_pos) {
  using CoeffRow = typename BandingStorage::CoeffRow;
  using Index = typename BandingStorage::Index;

  Index i = start;

  if (!kFirstCoeffAlwaysOne) {
    // Requires/asserts that cr != 0
    // Align the first set coefficient bit with its "pivot" slot.
    int tz = CountTrailingZeroBits(cr);
    i += static_cast<Index>(tz);
    cr >>= tz;
  }

  // Loop invariant: low bit of cr is set, i.e. slot i is the leading
  // (pivot) coefficient of the remaining equation.
  for (;;) {
    assert((cr & 1) == 1);
    CoeffRow other = *(bs->CoeffRowPtr(i));
    if (other == 0) {
      // Slot i is free: store the equation here, pivot at i.
      *(bs->CoeffRowPtr(i)) = cr;
      *(bs->ResultRowPtr(i)) = rr;
      // Record the slot used so a failed batch can be rolled back.
      bts->BacktrackPut(*backtrack_pos, i);
      ++*backtrack_pos;
      return true;
    }
    // Stored rows also keep their pivot bit at their own slot.
    assert((other & 1) == 1);
    // Gaussian row reduction (XOR is addition in GF(2)); this clears
    // the current pivot bit of cr.
    cr ^= other;
    rr ^= *(bs->ResultRowPtr(i));
    if (cr == 0) {
      // Inconsistency or (less likely) redundancy
      break;
    }
    // Find relative offset of next non-zero coefficient.
    int tz = CountTrailingZeroBits(cr);
    i += static_cast<Index>(tz);
    cr >>= tz;
  }

  // Failed, unless result row == 0 because e.g. a duplicate input or a
  // stock hash collision, with same result row. (For filter, stock hash
  // collision implies same result row.) Or we could have a full equation
  // equal to sum of other equations, which is very possible with
  // small range of values for result row.
  return rr == 0;
}
593 | ||
// Adds a range of entries to BandingStorage returning true if successful
// or false if solution is impossible with current hasher (and presumably
// its seed) and number of "slots" (solution or banding rows). (A solution
// is impossible when there is a linear dependence among the inputs that
// doesn't "cancel out".) Here "InputIterator" is an iterator over AddInputs.
//
// If UseBacktrack in the BacktrackStorage, this function call rolls back
// to prior state on failure. If !UseBacktrack, some subset of the entries
// will have been added to the BandingStorage, so best considered to be in
// an indeterminate state.
//
template <typename BandingStorage, typename BacktrackStorage,
          typename BandingHasher, typename InputIterator>
bool BandingAddRange(BandingStorage *bs, BacktrackStorage *bts,
                     const BandingHasher &bh, InputIterator begin,
                     InputIterator end) {
  using CoeffRow = typename BandingStorage::CoeffRow;
  using Index = typename BandingStorage::Index;
  using ResultRow = typename BandingStorage::ResultRow;
  using Hash = typename BandingHasher::Hash;

  static_assert(IsUnsignedUpTo128<CoeffRow>::value, "must be unsigned");
  static_assert(IsUnsignedUpTo128<Index>::value, "must be unsigned");
  static_assert(IsUnsignedUpTo128<ResultRow>::value, "must be unsigned");

  constexpr bool kFCA1 = BandingHasher::kFirstCoeffAlwaysOne;

  if (begin == end) {
    // trivial
    return true;
  }

  const Index num_starts = bs->GetNumStarts();

  InputIterator cur = begin;
  // Number of successfully-added entries, and next free backtrack slot.
  Index backtrack_pos = 0;
  if (!bs->UsePrefetch()) {
    // Simple version, no prefetch
    for (;;) {
      Hash h = bh.GetHash(*cur);
      Index start = bh.GetStart(h, num_starts);
      ResultRow rr =
          bh.GetResultRowFromInput(*cur) | bh.GetResultRowFromHash(h);
      CoeffRow cr = bh.GetCoeffRow(h);

      if (!BandingAdd<kFCA1>(bs, start, rr, cr, bts, &backtrack_pos)) {
        break;
      }
      if ((++cur) == end) {
        return true;
      }
    }
  } else {
    // Pipelined w/prefetch: hash and prefetch the *next* entry's banding
    // rows while adding the current one, to hide memory latency.
    // Prime the pipeline
    Hash h = bh.GetHash(*cur);
    Index start = bh.GetStart(h, num_starts);
    ResultRow rr = bh.GetResultRowFromInput(*cur);
    bs->Prefetch(start);

    // Pipeline
    for (;;) {
      rr |= bh.GetResultRowFromHash(h);
      CoeffRow cr = bh.GetCoeffRow(h);
      if ((++cur) == end) {
        // Last entry: nothing left to prefetch.
        if (!BandingAdd<kFCA1>(bs, start, rr, cr, bts, &backtrack_pos)) {
          break;
        }
        return true;
      }
      Hash next_h = bh.GetHash(*cur);
      Index next_start = bh.GetStart(next_h, num_starts);
      ResultRow next_rr = bh.GetResultRowFromInput(*cur);
      bs->Prefetch(next_start);
      if (!BandingAdd<kFCA1>(bs, start, rr, cr, bts, &backtrack_pos)) {
        break;
      }
      // Hand the prefetched entry to the next iteration.
      h = next_h;
      start = next_start;
      rr = next_rr;
    }
  }
  // failed; backtrack (if implemented)
  if (bts->UseBacktrack()) {
    while (backtrack_pos > 0) {
      --backtrack_pos;
      Index i = bts->BacktrackGet(backtrack_pos);
      *(bs->CoeffRowPtr(i)) = 0;
      // Not strictly required, but is required for good FP rate on
      // inputs that might have been backtracked out. (We don't want
      // anything we've backtracked on to leak into final result, as
      // that might not be "harmless".)
      *(bs->ResultRowPtr(i)) = 0;
    }
  }
  return false;
}
691 | ||
// Adds a range of entries to BandingStorage returning true if successful
// or false if solution is impossible with current hasher (and presumably
// its seed) and number of "slots" (solution or banding rows). (A solution
// is impossible when there is a linear dependence among the inputs that
// doesn't "cancel out".) Here "InputIterator" is an iterator over AddInputs.
//
// On failure, some subset of the entries will have been added to the
// BandingStorage, so best considered to be in an indeterminate state.
//
template <typename BandingStorage, typename BandingHasher,
          typename InputIterator>
bool BandingAddRange(BandingStorage *bs, const BandingHasher &bh,
                     InputIterator begin, InputIterator end) {
  using Index = typename BandingStorage::Index;
  // Trivial BacktrackStorage that disables backtracking entirely.
  // Member functions are const to match the documented BacktrackStorage
  // concept, and UseBacktrack() is an inline compile-time constant so the
  // backtracking code path can be optimized away.
  struct NoopBacktrackStorage {
    bool UseBacktrack() const { return false; }
    void BacktrackPut(Index, Index) {}
    Index BacktrackGet(Index) const {
      // Never called, because UseBacktrack() is false.
      assert(false);
      return 0;
    }
  } nbts;
  return BandingAddRange(bs, &nbts, bh, begin, end);
}
716 | ||
717 | // ###################################################################### | |
718 | // ######################### Solution Storage ########################### | |
719 | ||
720 | // Back-substitution and query algorithms unfortunately depend on some | |
721 | // details of data layout in the final data structure ("solution"). Thus, | |
722 | // there is no common SolutionStorage covering all the reasonable | |
723 | // possibilities. | |
724 | ||
725 | // ###################### SimpleSolutionStorage ######################### | |
726 | ||
727 | // SimpleSolutionStorage is for a row-major storage, typically with no | |
728 | // unused bits in each ResultRow. This is mostly for demonstration | |
729 | // purposes as the simplest solution storage scheme. It is relatively slow | |
730 | // for filter queries. | |
731 | ||
732 | // concept SimpleSolutionStorage extends RibbonTypes { | |
733 | // // This is called at the beginning of back-substitution for the | |
734 | // // solution storage to do any remaining configuration before data | |
735 | // // is stored to it. If configuration is previously finalized, this | |
736 | // // could be a simple assertion or even no-op. Ribbon algorithms | |
737 | // // only call this from back-substitution, and only once per call, | |
738 | // // before other functions here. | |
739 | // void PrepareForNumStarts(Index num_starts) const; | |
740 | // // Must return num_starts passed to PrepareForNumStarts, or the most | |
741 | // // recent call to PrepareForNumStarts if this storage object can be | |
742 | // // reused. Note that num_starts == num_slots - kCoeffBits + 1 because | |
743 | // // there must be a run of kCoeffBits slots starting from each start. | |
744 | // Index GetNumStarts() const; | |
745 | // // Load the solution row (type ResultRow) for a slot | |
746 | // ResultRow Load(Index slot_num) const; | |
747 | // // Store the solution row (type ResultRow) for a slot | |
748 | // void Store(Index slot_num, ResultRow data); | |
749 | // }; | |
750 | ||
// Back-substitution for generating a solution from BandingStorage to
// SimpleSolutionStorage: processes slots from last to first, computing
// one solution row (all kResultBits columns) per slot.
template <typename SimpleSolutionStorage, typename BandingStorage>
void SimpleBackSubst(SimpleSolutionStorage *sss, const BandingStorage &bs) {
  using CoeffRow = typename BandingStorage::CoeffRow;
  using Index = typename BandingStorage::Index;
  using ResultRow = typename BandingStorage::ResultRow;

  // Banding and solution storage must agree on the width of these types.
  static_assert(sizeof(Index) == sizeof(typename SimpleSolutionStorage::Index),
                "must be same");
  static_assert(
      sizeof(CoeffRow) == sizeof(typename SimpleSolutionStorage::CoeffRow),
      "must be same");
  static_assert(
      sizeof(ResultRow) == sizeof(typename SimpleSolutionStorage::ResultRow),
      "must be same");

  constexpr auto kCoeffBits = static_cast<Index>(sizeof(CoeffRow) * 8U);
  constexpr auto kResultBits = static_cast<Index>(sizeof(ResultRow) * 8U);

  // A column-major buffer of the solution matrix, containing enough
  // recently-computed solution data to compute the next solution row
  // (based also on banding data).
  std::array<CoeffRow, kResultBits> state;
  state.fill(0);

  const Index num_starts = bs.GetNumStarts();
  sss->PrepareForNumStarts(num_starts);
  const Index num_slots = num_starts + kCoeffBits - 1;

  for (Index i = num_slots; i > 0;) {
    --i;
    // const_cast is read-only here; the BandingStorage concept only
    // exposes non-const pointer accessors.
    CoeffRow cr = *const_cast<BandingStorage &>(bs).CoeffRowPtr(i);
    ResultRow rr = *const_cast<BandingStorage &>(bs).ResultRowPtr(i);
    // solution row
    ResultRow sr = 0;
    for (Index j = 0; j < kResultBits; ++j) {
      // Compute next solution bit at row i, column j (see derivation below)
      CoeffRow tmp = state[j] << 1;
      bool bit = (BitParity(tmp & cr) ^ ((rr >> j) & 1)) != 0;
      tmp |= bit ? CoeffRow{1} : CoeffRow{0};

      // Now tmp is solution at column j from row i for next kCoeffBits
      // more rows. Thus, for valid solution, the dot product of the
      // solution column with the coefficient row has to equal the result
      // at that column,
      //   BitParity(tmp & cr) == ((rr >> j) & 1)

      // Update state.
      state[j] = tmp;
      // add to solution row
      sr |= (bit ? ResultRow{1} : ResultRow{0}) << j;
    }
    sss->Store(i, sr);
  }
}
807 | ||
// Common functionality for querying a key (already hashed) in
// SimpleSolutionStorage: XORs together the solution rows selected by the
// set bits of the coefficient row, i.e. the GF(2) dot product of the
// coefficient row with every solution column, all columns in parallel.
template <typename SimpleSolutionStorage>
typename SimpleSolutionStorage::ResultRow SimpleQueryHelper(
    typename SimpleSolutionStorage::Index start_slot,
    typename SimpleSolutionStorage::CoeffRow cr,
    const SimpleSolutionStorage &sss) {
  using CoeffRow = typename SimpleSolutionStorage::CoeffRow;
  using ResultRow = typename SimpleSolutionStorage::ResultRow;

  constexpr unsigned kCoeffBits = static_cast<unsigned>(sizeof(CoeffRow) * 8U);

  ResultRow xor_sum = 0;
  for (unsigned bit = 0; bit < kCoeffBits; ++bit) {
    // All-ones mask when the coefficient bit is set, all-zeros otherwise.
    // Branch-free masking of the whole value is generally faster than 'if'.
    const ResultRow coeff = static_cast<ResultRow>(cr >> bit) & ResultRow{1};
    const ResultRow mask = static_cast<ResultRow>(ResultRow{0} - coeff);
    xor_sum ^= static_cast<ResultRow>(sss.Load(start_slot + bit) & mask);
  }
  return xor_sum;
}
828 | ||
// General PHSF query: returns the ResultRow associated with `key` in a
// SimpleSolutionStorage built with a compatible hasher (and seed).
template <typename SimpleSolutionStorage, typename PhsfQueryHasher>
typename SimpleSolutionStorage::ResultRow SimplePhsfQuery(
    const typename PhsfQueryHasher::Key &key, const PhsfQueryHasher &hasher,
    const SimpleSolutionStorage &sss) {
  using Hash = typename PhsfQueryHasher::Hash;

  // Hasher and solution storage must agree on the width of these types.
  static_assert(sizeof(typename SimpleSolutionStorage::Index) ==
                    sizeof(typename PhsfQueryHasher::Index),
                "must be same");
  static_assert(sizeof(typename SimpleSolutionStorage::CoeffRow) ==
                    sizeof(typename PhsfQueryHasher::CoeffRow),
                "must be same");

  const Hash hash = hasher.GetHash(key);
  const auto start = hasher.GetStart(hash, sss.GetNumStarts());
  const auto cr = hasher.GetCoeffRow(hash);
  return SimpleQueryHelper(start, cr, sss);
}
846 | ||
// Filter query: returns true iff the solution's answer for `key` matches
// the result row the hasher derives from the key's hash (always true for
// added keys; false with high probability for others).
template <typename SimpleSolutionStorage, typename FilterQueryHasher>
bool SimpleFilterQuery(const typename FilterQueryHasher::Key &key,
                       const FilterQueryHasher &hasher,
                       const SimpleSolutionStorage &sss) {
  using Hash = typename FilterQueryHasher::Hash;
  using ResultRow = typename SimpleSolutionStorage::ResultRow;

  // Hasher and solution storage must agree on the width of these types.
  static_assert(sizeof(typename SimpleSolutionStorage::Index) ==
                    sizeof(typename FilterQueryHasher::Index),
                "must be same");
  static_assert(sizeof(typename SimpleSolutionStorage::CoeffRow) ==
                    sizeof(typename FilterQueryHasher::CoeffRow),
                "must be same");
  static_assert(sizeof(typename SimpleSolutionStorage::ResultRow) ==
                    sizeof(typename FilterQueryHasher::ResultRow),
                "must be same");

  const Hash hash = hasher.GetHash(key);
  const ResultRow expected = hasher.GetResultRowFromHash(hash);
  const ResultRow actual = SimpleQueryHelper(
      hasher.GetStart(hash, sss.GetNumStarts()), hasher.GetCoeffRow(hash), sss);
  return actual == expected;
}
870 | ||
871 | // #################### InterleavedSolutionStorage ###################### | |
872 | ||
873 | // InterleavedSolutionStorage is row-major at a high level, for good | |
874 | // locality, and column-major at a low level, for CPU efficiency | |
875 | // especially in filter queries or relatively small number of result bits | |
876 | // (== solution columns). The storage is a sequence of "blocks" where a | |
877 | // block has one CoeffRow-sized segment for each solution column. Each | |
878 | // query spans at most two blocks; the starting solution row is typically | |
879 | // in the row-logical middle of a block and spans to the middle of the | |
880 | // next block. (See diagram below.) | |
881 | // | |
882 | // InterleavedSolutionStorage supports choosing b (number of result or | |
883 | // solution columns) at run time, and even supports mixing b and b-1 solution | |
884 | // columns in a single linear system solution, for filters that can | |
885 | // effectively utilize any size space (multiple of CoeffRow) for minimizing | |
886 | // FP rate for any number of added keys. To simplify query implementation | |
887 | // (with lower-index columns first), the b-bit portion comes after the b-1 | |
888 | // portion of the structure. | |
889 | // | |
890 | // Diagram (=== marks logical block boundary; b=4; ### is data used by a | |
891 | // query crossing the b-1 to b boundary, each Segment has type CoeffRow): | |
892 | // ... | |
893 | // +======================+ | |
894 | // | S e g m e n t col=0 | | |
895 | // +----------------------+ | |
896 | // | S e g m e n t col=1 | | |
897 | // +----------------------+ | |
898 | // | S e g m e n t col=2 | | |
899 | // +======================+ | |
900 | // | S e g m e n #########| | |
901 | // +----------------------+ | |
902 | // | S e g m e n #########| | |
903 | // +----------------------+ | |
904 | // | S e g m e n #########| | |
905 | // +======================+ Result/solution columns: above = 3, below = 4 | |
906 | // |#############t col=0 | | |
907 | // +----------------------+ | |
908 | // |#############t col=1 | | |
909 | // +----------------------+ | |
910 | // |#############t col=2 | | |
911 | // +----------------------+ | |
912 | // | S e g m e n t col=3 | | |
913 | // +======================+ | |
914 | // | S e g m e n t col=0 | | |
915 | // +----------------------+ | |
916 | // | S e g m e n t col=1 | | |
917 | // +----------------------+ | |
918 | // | S e g m e n t col=2 | | |
919 | // +----------------------+ | |
920 | // | S e g m e n t col=3 | | |
921 | // +======================+ | |
922 | // ... | |
923 | // | |
924 | // InterleavedSolutionStorage will be adapted by the algorithms from | |
925 | // simple array-like segment storage. That array-like storage is templatized | |
926 | // in part so that an implementation may choose to handle byte ordering | |
927 | // at access time. | |
928 | // | |
929 | // concept InterleavedSolutionStorage extends RibbonTypes { | |
930 | // // This is called at the beginning of back-substitution for the | |
931 | // // solution storage to do any remaining configuration before data | |
932 | // // is stored to it. If configuration is previously finalized, this | |
933 | // // could be a simple assertion or even no-op. Ribbon algorithms | |
934 | // // only call this from back-substitution, and only once per call, | |
935 | // // before other functions here. | |
936 | // void PrepareForNumStarts(Index num_starts) const; | |
937 | // // Must return num_starts passed to PrepareForNumStarts, or the most | |
938 | // // recent call to PrepareForNumStarts if this storage object can be | |
939 | // // reused. Note that num_starts == num_slots - kCoeffBits + 1 because | |
940 | // // there must be a run of kCoeffBits slots starting from each start. | |
941 | // Index GetNumStarts() const; | |
942 | // // The larger number of solution columns used (called "b" above). | |
943 | // Index GetUpperNumColumns() const; | |
944 | // // If returns > 0, then block numbers below that use | |
945 | // // GetUpperNumColumns() - 1 columns per solution row, and the rest | |
946 | // // use GetUpperNumColumns(). A block represents kCoeffBits "slots", | |
947 | // // where all but the last kCoeffBits - 1 slots are also starts. And | |
948 | // // a block contains a segment for each solution column. | |
949 | // // An implementation may only support uniform columns per solution | |
950 | // // row and return constant 0 here. | |
951 | // Index GetUpperStartBlock() const; | |
952 | // | |
953 | // // ### "Array of segments" portion of API ### | |
954 | // // The number of values of type CoeffRow used in this solution | |
955 | // // representation. (This value can be inferred from the previous | |
956 | // // three functions, but is expected at least for sanity / assertion | |
957 | // // checking.) | |
958 | // Index GetNumSegments() const; | |
959 | // // Load an entry from the logical array of segments | |
960 | // CoeffRow LoadSegment(Index segment_num) const; | |
961 | // // Store an entry to the logical array of segments | |
962 | // void StoreSegment(Index segment_num, CoeffRow data); | |
963 | // }; | |
964 | ||
// A helper for InterleavedBackSubst: back-substitutes one block of
// kCoeffBits slots (from high slot to low), updating the column-major
// `state` buffer in place. `state` must hold `num_columns` entries.
template <typename BandingStorage>
inline void BackSubstBlock(typename BandingStorage::CoeffRow *state,
                           typename BandingStorage::Index num_columns,
                           const BandingStorage &bs,
                           typename BandingStorage::Index start_slot) {
  using CoeffRow = typename BandingStorage::CoeffRow;
  using Index = typename BandingStorage::Index;
  using ResultRow = typename BandingStorage::ResultRow;

  constexpr auto kCoeffBits = static_cast<Index>(sizeof(CoeffRow) * 8U);

  for (Index i = start_slot + kCoeffBits; i > start_slot;) {
    --i;
    // const_cast is read-only here; the BandingStorage concept only
    // exposes non-const pointer accessors.
    CoeffRow cr = *const_cast<BandingStorage &>(bs).CoeffRowPtr(i);
    ResultRow rr = *const_cast<BandingStorage &>(bs).ResultRowPtr(i);
    for (Index j = 0; j < num_columns; ++j) {
      // Compute next solution bit at row i, column j (see derivation below)
      CoeffRow tmp = state[j] << 1;
      int bit = BitParity(tmp & cr) ^ ((rr >> j) & 1);
      tmp |= static_cast<CoeffRow>(bit);

      // Now tmp is solution at column j from row i for next kCoeffBits
      // more rows. Thus, for valid solution, the dot product of the
      // solution column with the coefficient row has to equal the result
      // at that column,
      //   BitParity(tmp & cr) == ((rr >> j) & 1)

      // Update state.
      state[j] = tmp;
    }
  }
}
998 | ||
// Back-substitution for generating a solution from BandingStorage to
// InterleavedSolutionStorage. Processes blocks from last to first,
// storing each block's segments as soon as they are computed.
template <typename InterleavedSolutionStorage, typename BandingStorage>
void InterleavedBackSubst(InterleavedSolutionStorage *iss,
                          const BandingStorage &bs) {
  using CoeffRow = typename BandingStorage::CoeffRow;
  using Index = typename BandingStorage::Index;

  // Banding and solution storage must agree on the width of these types.
  static_assert(
      sizeof(Index) == sizeof(typename InterleavedSolutionStorage::Index),
      "must be same");
  static_assert(
      sizeof(CoeffRow) == sizeof(typename InterleavedSolutionStorage::CoeffRow),
      "must be same");

  constexpr auto kCoeffBits = static_cast<Index>(sizeof(CoeffRow) * 8U);

  const Index num_starts = bs.GetNumStarts();
  // Although it might be nice to have a filter that returns "always false"
  // when no key is added, we aren't specifically supporting that here
  // because it would require another condition branch in the query.
  assert(num_starts > 0);
  iss->PrepareForNumStarts(num_starts);

  const Index num_slots = num_starts + kCoeffBits - 1;
  // Interleaved layout requires whole blocks of kCoeffBits slots.
  assert(num_slots % kCoeffBits == 0);
  const Index num_blocks = num_slots / kCoeffBits;
  const Index num_segments = iss->GetNumSegments();

  // For now upper, then lower
  Index num_columns = iss->GetUpperNumColumns();
  const Index upper_start_block = iss->GetUpperStartBlock();

  if (num_columns == 0) {
    // Nothing to do, presumably because there's not enough space for even
    // a single segment.
    assert(num_segments == 0);
    // When num_columns == 0, a Ribbon filter query will always return true,
    // or a PHSF query always 0.
    return;
  }

  // We should be utilizing all available segments
  // (blocks below upper_start_block have one fewer column each).
  assert(num_segments == (upper_start_block * (num_columns - 1)) +
                             ((num_blocks - upper_start_block) * num_columns));

  // TODO: consider fixed-column specializations with stack-allocated state

  // A column-major buffer of the solution matrix, containing enough
  // recently-computed solution data to compute the next solution row
  // (based also on banding data). Value-initialized to all zero by the
  // trailing () on the array-new.
  std::unique_ptr<CoeffRow[]> state{new CoeffRow[num_columns]()};

  // Walk blocks and segments backward in lock-step.
  Index block = num_blocks;
  Index segment = num_segments;
  while (block > upper_start_block) {
    --block;
    BackSubstBlock(state.get(), num_columns, bs, block * kCoeffBits);
    segment -= num_columns;
    for (Index i = 0; i < num_columns; ++i) {
      iss->StoreSegment(segment + i, state[i]);
    }
  }
  // Now (if applicable), region using lower number of columns
  // (This should be optimized away if GetUpperStartBlock() returns
  // constant 0.)
  --num_columns;
  while (block > 0) {
    --block;
    BackSubstBlock(state.get(), num_columns, bs, block * kCoeffBits);
    segment -= num_columns;
    for (Index i = 0; i < num_columns; ++i) {
      iss->StoreSegment(segment + i, state[i]);
    }
  }
  // Verify everything processed
  assert(block == 0);
  assert(segment == 0);
}
1078 | ||
// General PHSF query a key from InterleavedSolutionStorage.
template <typename InterleavedSolutionStorage, typename PhsfQueryHasher>
typename InterleavedSolutionStorage::ResultRow InterleavedPhsfQuery(
    const typename PhsfQueryHasher::Key &key, const PhsfQueryHasher &hasher,
    const InterleavedSolutionStorage &iss) {
  using Hash = typename PhsfQueryHasher::Hash;

  using CoeffRow = typename InterleavedSolutionStorage::CoeffRow;
  using Index = typename InterleavedSolutionStorage::Index;
  using ResultRow = typename InterleavedSolutionStorage::ResultRow;

  // Hasher and solution storage must agree on the width of these types.
  static_assert(sizeof(Index) == sizeof(typename PhsfQueryHasher::Index),
                "must be same");
  static_assert(sizeof(CoeffRow) == sizeof(typename PhsfQueryHasher::CoeffRow),
                "must be same");

  constexpr auto kCoeffBits = static_cast<Index>(sizeof(CoeffRow) * 8U);

  const Hash hash = hasher.GetHash(key);
  const Index start_slot = hasher.GetStart(hash, iss.GetNumStarts());

  const Index upper_start_block = iss.GetUpperStartBlock();
  Index num_columns = iss.GetUpperNumColumns();
  Index start_block_num = start_slot / kCoeffBits;
  // First segment of the start block; blocks before upper_start_block
  // contribute one fewer segment each.
  Index segment = start_block_num * num_columns -
                  std::min(start_block_num, upper_start_block);
  // Change to lower num columns if applicable.
  // (This should not compile to a conditional branch.)
  num_columns -= (start_block_num < upper_start_block) ? 1 : 0;

  const CoeffRow cr = hasher.GetCoeffRow(hash);
  Index start_bit = start_slot % kCoeffBits;

  // GF(2) dot product, column by column. The kCoeffBits coefficients may
  // straddle two adjacent blocks: the first block sees cr shifted left by
  // start_bit, the next (below) sees the remaining high bits of cr.
  ResultRow sr = 0;
  const CoeffRow cr_left = cr << start_bit;
  for (Index i = 0; i < num_columns; ++i) {
    sr ^= BitParity(iss.LoadSegment(segment + i) & cr_left) << i;
  }

  if (start_bit > 0) {
    // Query continues into the next block. (When crossing the b-1 to b
    // column boundary, only the first num_columns segments of the next
    // block are read — see ### in the layout diagram above.)
    segment += num_columns;
    const CoeffRow cr_right = cr >> (kCoeffBits - start_bit);
    for (Index i = 0; i < num_columns; ++i) {
      sr ^= BitParity(iss.LoadSegment(segment + i) & cr_right) << i;
    }
  }

  return sr;
}
1128 | ||
// Filter query a key from InterleavedSolutionStorage: returns true iff
// the stored solution's answer for `key` matches the result row derived
// from the key's hash (always true for added keys).
template <typename InterleavedSolutionStorage, typename FilterQueryHasher>
bool InterleavedFilterQuery(const typename FilterQueryHasher::Key &key,
                            const FilterQueryHasher &hasher,
                            const InterleavedSolutionStorage &iss) {
  // BEGIN mostly copied from InterleavedPhsfQuery
  using Hash = typename FilterQueryHasher::Hash;

  using CoeffRow = typename InterleavedSolutionStorage::CoeffRow;
  using Index = typename InterleavedSolutionStorage::Index;
  using ResultRow = typename InterleavedSolutionStorage::ResultRow;

  // Hasher and solution storage must agree on the width of these types.
  static_assert(sizeof(Index) == sizeof(typename FilterQueryHasher::Index),
                "must be same");
  static_assert(
      sizeof(CoeffRow) == sizeof(typename FilterQueryHasher::CoeffRow),
      "must be same");
  static_assert(
      sizeof(ResultRow) == sizeof(typename FilterQueryHasher::ResultRow),
      "must be same");

  constexpr auto kCoeffBits = static_cast<Index>(sizeof(CoeffRow) * 8U);

  const Hash hash = hasher.GetHash(key);
  const Index start_slot = hasher.GetStart(hash, iss.GetNumStarts());

  const Index upper_start_block = iss.GetUpperStartBlock();
  Index num_columns = iss.GetUpperNumColumns();
  Index start_block_num = start_slot / kCoeffBits;
  Index segment = start_block_num * num_columns -
                  std::min(start_block_num, upper_start_block);
  // Change to lower num columns if applicable.
  // (This should not compile to a conditional branch.)
  num_columns -= (start_block_num < upper_start_block) ? 1 : 0;

  const CoeffRow cr = hasher.GetCoeffRow(hash);
  Index start_bit = start_slot % kCoeffBits;
  // END mostly copied from InterleavedPhsfQuery.

  const ResultRow expected = hasher.GetResultRowFromHash(hash);

  // TODO: consider optimizations such as
  // * mask fetched values and shift cr, rather than shifting fetched values
  // * get rid of start_bit == 0 condition with careful fetching & shifting
  if (start_bit == 0) {
    // Coefficients align exactly with one block: a single segment load
    // per column. Short-circuit on the first mismatched column bit.
    for (Index i = 0; i < num_columns; ++i) {
      if (BitParity(iss.LoadSegment(segment + i) & cr) !=
          (static_cast<int>(expected >> i) & 1)) {
        return false;
      }
    }
  } else {
    // Coefficients straddle two adjacent blocks: stitch each solution
    // column from the high bits of one segment and the low bits of the
    // corresponding segment in the next block.
    for (Index i = 0; i < num_columns; ++i) {
      CoeffRow soln_col =
          (iss.LoadSegment(segment + i) >> static_cast<unsigned>(start_bit)) |
          (iss.LoadSegment(segment + num_columns + i)
           << static_cast<unsigned>(kCoeffBits - start_bit));
      if (BitParity(soln_col & cr) != (static_cast<int>(expected >> i) & 1)) {
        return false;
      }
    }
  }
  // otherwise, all match
  return true;
}
1194 | ||
1195 | // TODO: refactor Interleaved*Query so that queries can be "prepared" by | |
1196 | // prefetching memory, to hide memory latency for multiple queries in a | |
1197 | // single thread. | |
1198 | ||
1199 | } // namespace ribbon | |
1200 | ||
1201 | } // namespace ROCKSDB_NAMESPACE |