// Copied from the ceph.git mirror: ceph/src/rocksdb/util/ribbon_alg.h
// (import quincy beta 17.1.0)
1 // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
2 // This source code is licensed under both the GPLv2 (found in the
3 // COPYING file in the root directory) and Apache 2.0 License
4 // (found in the LICENSE.Apache file in the root directory).
5
6 #pragma once
7
8 #include <array>
9 #include <memory>
10
11 #include "util/math128.h"
12
13 namespace ROCKSDB_NAMESPACE {
14
15 namespace ribbon {
16
17 // RIBBON PHSF & RIBBON Filter (Rapid Incremental Boolean Banding ON-the-fly)
18 //
19 // ribbon_alg.h: generic versions of core algorithms.
20 //
21 // Ribbon is a Perfect Hash Static Function construction useful as a compact
22 // static Bloom filter alternative. It combines (a) a boolean (GF(2)) linear
23 // system construction that approximates a Band Matrix with hashing,
24 // (b) an incremental, on-the-fly Gaussian Elimination algorithm that is
25 // remarkably efficient and adaptable at constructing an upper-triangular
26 // band matrix from a set of band-approximating inputs from (a), and
27 // (c) a storage layout that is fast and adaptable as a filter.
28 //
29 // Footnotes: (a) "Efficient Gauss Elimination for Near-Quadratic Matrices
30 // with One Short Random Block per Row, with Applications" by Stefan
31 // Walzer and Martin Dietzfelbinger ("DW paper")
32 // (b) developed by Peter C. Dillinger, though not the first on-the-fly
33 // GE algorithm. See "On the fly Gaussian Elimination for LT codes" by
34 // Bioglio, Grangetto, Gaeta, and Sereno.
35 // (c) see "interleaved" solution storage below.
36 //
37 // See ribbon_impl.h for high-level behavioral summary. This file focuses
38 // on the core design details.
39 //
40 // ######################################################################
41 // ################# PHSF -> static filter reduction ####################
42 //
43 // A Perfect Hash Static Function is a data structure representing a
44 // map from anything hashable (a "key") to values of some fixed size.
45 // Crucially, it is allowed to return garbage values for anything not in
46 // the original set of map keys, and it is a "static" structure: entries
47 // cannot be added or deleted after construction. PHSFs representing n
48 // mappings to b-bit values (assume uniformly distributed) require at least
49 // n * b bits to represent, or at least b bits per entry. We typically
50 // describe the compactness of a PHSF by typical bits per entry as some
51 // function of b. For example, the MWHC construction (k=3 "peeling")
52 // requires about 1.0222*b and a variant called Xor+ requires about
53 // 1.08*b + 0.5 bits per entry.
54 //
55 // With more hashing, a PHSF can over-approximate a set as a Bloom filter
56 // does, with no FN queries and predictable false positive (FP) query
57 // rate. Instead of the user providing a value to map each input key to,
58 // a hash function provides the value. Keys in the original set will
59 // return a positive membership query because the underlying PHSF returns
60 // the same value as hashing the key. When a key is not in the original set,
61 // the PHSF returns a "garbage" value, which is only equal to the key's
62 // hash with (false positive) probability 1 in 2^b.
63 //
64 // For a matching false positive rate, standard Bloom filters require
65 // 1.44*b bits per entry. Cache-local Bloom filters (like bloom_impl.h)
66 // require a bit more, around 1.5*b bits per entry. Thus, a Bloom
67 // alternative could save up to or nearly 1/3rd of memory and storage
68 // that RocksDB uses for SST (static) Bloom filters. (Memtable Bloom filter
69 // is dynamic.)
70 //
71 // Recommended reading:
72 // "Xor Filters: Faster and Smaller Than Bloom and Cuckoo Filters"
73 // by Graf and Lemire
74 // First three sections of "Fast Scalable Construction of (Minimal
75 // Perfect Hash) Functions" by Genuzio, Ottaviano, and Vigna
76 //
77 // ######################################################################
78 // ################## PHSF vs. hash table vs. Bloom #####################
79 //
80 // You can think of traditional hash tables and related filter variants
81 // such as Cuckoo filters as utilizing an "OR" construction: a hash
82 // function associates a key with some slots and the data is returned if
83 // the data is found in any one of those slots. The collision resolution
84 // is visible in the final data structure and requires extra information.
85 // For example, Cuckoo filter uses roughly 1.05b + 2 bits per entry, and
86 // Golomb-Rice code (aka "GCS") as little as b + 1.5. When the data
87 // structure associates each input key with data in one slot, the
88 // structure implicitly constructs a (near-)minimal (near-)perfect hash
89 // (MPH) of the keys, which requires at least 1.44 bits per key to
90 // represent. This is why approaches with visible collision resolution
91 // have a fixed + 1.5 or more in storage overhead per entry, often in
92 // addition to an overhead multiplier on b.
93 //
94 // By contrast Bloom filters utilize an "AND" construction: a query only
95 // returns true if all bit positions associated with a key are set to 1.
96 // There is no collision resolution, so Bloom filters do not suffer a
97 // fixed bits per entry overhead like the above structures.
98 //
99 // PHSFs typically use a bitwise XOR construction: the data you want is
100 // not in a single slot, but in a linear combination of several slots.
101 // For static data, this gives the best of "AND" and "OR" constructions:
102 // avoids the +1.44 or more fixed overhead by not approximating a MPH and
103 // can do much better than Bloom's 1.44 factor on b with collision
104 // resolution, which here is done ahead of time and invisible at query
105 // time.
106 //
107 // ######################################################################
108 // ######################## PHSF construction ###########################
109 //
110 // For a typical PHSF, construction is solving a linear system of
111 // equations, typically in GF(2), which is to say that values are boolean
112 // and XOR serves both as addition and subtraction. We can use matrices to
113 // represent the problem:
114 //
115 // C * S = R
116 // (n x m) (m x b) (n x b)
117 // where C = coefficients, S = solution, R = results
118 // and solving for S given C and R.
119 //
120 // Note that C and R each have n rows, one for each input entry for the
121 // PHSF. A row in C is given by a hash function on the PHSF input key,
122 // and the corresponding row in R is the b-bit value to associate with
123 // that input key. (In a filter, rows of R are given by another hash
124 // function on the input key.)
125 //
126 // On solving, the matrix S (solution) is the final PHSF data, as it
127 // maps any row from the original C to its corresponding desired result
128 // in R. We just have to hash our query inputs and compute a linear
129 // combination of rows in S.
130 //
131 // In theory, we could choose m = n and let a hash function associate
132 // each input key with random rows in C. A solution exists with high
133 // probability, and uses essentially minimum space, b bits per entry
134 // (because we set m = n) but this has terrible scaling, something
135 // like O(n^2) space and O(n^3) time during construction (Gaussian
136 // elimination) and O(n) query time. But computational efficiency is
137 // key, and the core of this is avoiding scanning all of S to answer
138 // each query.
139 //
140 // The traditional approach (MWHC, aka Xor filter) starts with setting
141 // only some small fixed number of columns (typically k=3) to 1 for each
142 // row of C, with remaining entries implicitly 0. This is implemented as
143 // three hash functions over [0,m), and S can be implemented as a
144 // vector of b-bit values. Now, a query only involves looking up k rows
145 // (values) in S and computing their bitwise XOR. Additionally, this
146 // construction can use a linear time algorithm called "peeling" for
147 // finding a solution in many cases of one existing, but peeling
148 // generally requires a larger space overhead factor in the solution
149 // (m/n) than is required with Gaussian elimination.
150 //
151 // Recommended reading:
152 // "Peeling Close to the Orientability Threshold – Spatial Coupling in
153 // Hashing-Based Data Structures" by Stefan Walzer
154 //
155 // ######################################################################
156 // ##################### Ribbon PHSF construction #######################
157 //
158 // Ribbon constructs coefficient rows essentially the same as in the
159 // Walzer/Dietzfelbinger paper cited above: for some chosen fixed width
160 // r (kCoeffBits in code), each key is hashed to a starting column in
161 // [0, m - r] (GetStart() in code) and an r-bit sequence of boolean
162 // coefficients (GetCoeffRow() in code). If you sort the rows by start,
163 // the C matrix would look something like this:
164 //
165 // [####00000000000000000000]
166 // [####00000000000000000000]
167 // [000####00000000000000000]
168 // [0000####0000000000000000]
169 // [0000000####0000000000000]
170 // [000000000####00000000000]
171 // [000000000####00000000000]
172 // [0000000000000####0000000]
173 // [0000000000000000####0000]
174 // [00000000000000000####000]
175 // [00000000000000000000####]
176 //
177 // where each # could be a 0 or 1, chosen uniformly by a hash function.
178 // (Except we typically set the start column value to 1.) This scheme
179 // uses hashing to approximate a band matrix, and it has a solution iff
180 // it reduces to an upper-triangular boolean r-band matrix, like this:
181 //
182 // [1###00000000000000000000]
183 // [01##00000000000000000000]
184 // [000000000000000000000000]
185 // [0001###00000000000000000]
186 // [000000000000000000000000]
187 // [000001##0000000000000000]
188 // [000000000000000000000000]
189 // [00000001###0000000000000]
190 // [000000001###000000000000]
191 // [0000000001##000000000000]
192 // ...
193 // [00000000000000000000001#]
194 // [000000000000000000000001]
195 //
196 // where we have expanded to an m x m matrix by filling with rows of
197 // all zeros as needed. As in Gaussian elimination, this form is ready for
198 // generating a solution through back-substitution.
199 //
200 // The awesome thing about the Ribbon construction (from the DW paper) is
201 // how row reductions keep each row representable as a start column and
202 // r coefficients, because row reductions are only needed when two rows
203 // have the same number of leading zero columns. Thus, the combination
204 // of those rows, the bitwise XOR of the r-bit coefficient rows, cancels
205 // out the leading 1s, so starts (at least) one column later and only
206 // needs (at most) r - 1 coefficients.
207 //
208 // ######################################################################
209 // ###################### Ribbon PHSF scalability #######################
210 //
211 // Although more practical detail is in ribbon_impl.h, it's worth
212 // understanding some of the overall benefits and limitations of the
213 // Ribbon PHSFs.
214 //
215 // High-end scalability is a primary issue for Ribbon PHSFs, because in
216 // a single Ribbon linear system with fixed r and fixed m/n ratio, the
217 // solution probability approaches zero as n approaches infinity.
218 // For a given n, solution probability improves with larger r and larger
219 // m/n.
220 //
221 // By contrast, peeling-based PHSFs have somewhat worse storage ratio
222 // or solution probability for small n (less than ~1000). This is
223 // especially true with spatial-coupling, where benefits are only
224 // notable for n on the order of 100k or 1m or more.
225 //
226 // To make best use of current hardware, r=128 seems to be closest to
227 // a "generally good" choice for Ribbon, at least in RocksDB where SST
228 // Bloom filters typically hold around 10-100k keys, and almost always
229 // less than 10m keys. r=128 ribbon has a high chance of encoding success
230 // (with first hash seed) when storage overhead is around 5% (m/n ~ 1.05)
231 // for roughly 10k - 10m keys in a single linear system. r=64 only scales
232 // up to about 10k keys with the same storage overhead. Construction and
233 // access times for r=128 are similar to r=64. r=128 tracks nearly
234 // twice as much data during construction, but in most cases we expect
235 // the scalability benefits of r=128 vs. r=64 to make it preferred.
236 //
237 // A natural approach to scaling Ribbon beyond ~10m keys is splitting
238 // (or "sharding") the inputs into multiple linear systems with their
239 // own hash seeds. This can also help to control peak memory consumption.
240 // TODO: much more to come
241 //
242 // ######################################################################
243 // #################### Ribbon on-the-fly banding #######################
244 //
245 // "Banding" is what we call the process of reducing the inputs to an
246 // upper-triangular r-band matrix ready for finishing a solution with
247 // back-substitution. Although the DW paper presents an algorithm for
248 // this ("SGauss"), the awesome properties of their construction enable
249 // an even simpler, faster, and more backtrackable algorithm. In simplest
250 // terms, the SGauss algorithm requires sorting the inputs by start
251 // columns, but it's possible to make Gaussian elimination resemble hash
252 // table insertion!
253 //
254 // The enhanced algorithm is based on these observations:
255 // - When processing a coefficient row with first 1 in column j,
256 // - If it's the first at column j to be processed, it can be part of
257 // the banding at row j. (And that decision never overwritten, with
258 // no loss of generality!)
259 // - Else, it can be combined with existing row j and re-processed,
260 // which will look for a later "empty" row or reach "no solution".
261 //
262 // We call our banding algorithm "incremental" and "on-the-fly" because
263 // (like hash table insertion) we are "finished" after each input
264 // processed, with respect to all inputs processed so far. Although the
265 // band matrix is an intermediate step to the solution structure, we have
266 // eliminated intermediate steps and unnecessary data tracking for
267 // banding.
268 //
269 // Building on "incremental" and "on-the-fly", the banding algorithm is
270 // easily backtrackable because no (non-empty) rows are overwritten in
271 // the banding. Thus, if we want to "try" adding an additional set of
272 // inputs to the banding, we only have to record which rows were written
273 // in order to efficiently backtrack to our state before considering
274 // the additional set. (TODO: how this can mitigate scalability and
275 // reach sub-1% overheads)
276 //
277 // Like in a linear-probed hash table, as the occupancy approaches and
278 // surpasses 90-95%, collision resolution dominates the construction
279 // time. (Ribbon doesn't usually pay at query time; see solution
280 // storage below.) This means that we can speed up construction time
281 // by using a higher m/n ratio, up to negative returns around 1.2.
282 // At m/n ~= 1.2, which still saves memory substantially vs. Bloom
283 // filter's 1.5, construction speed (including back-substitution) is not
284 // far from sorting speed, but still a few times slower than cache-local
285 // Bloom construction speed.
286 //
287 // Back-substitution from an upper-triangular boolean band matrix is
288 // especially fast and easy. All the memory accesses are sequential or at
289 // least local, no random. If the number of result bits (b) is a
290 // compile-time constant, the back-substitution state can even be tracked
291 // in CPU registers. Regardless of the solution representation, we prefer
292 // column-major representation for tracking back-substitution state, as
293 // r (the band width) will typically be much larger than b (result bits
294 // or columns), so better to handle r-bit values b times (per solution
295 // row) than b-bit values r times.
296 //
297 // ######################################################################
298 // ##################### Ribbon solution storage ########################
299 //
300 // Row-major layout is typical for boolean (bit) matrices, including for
301 // MWHC (Xor) filters where a query combines k b-bit values, and k is
302 // typically smaller than b. Even for k=4 and b=2, at least k=4 random
303 // look-ups are required regardless of layout.
304 //
305 // Ribbon PHSFs are quite different, however, because
306 // (a) all of the solution rows relevant to a query are within a single
307 // range of r rows, and
308 // (b) the number of solution rows involved (r/2 on average, or r if
309 // avoiding conditional accesses) is typically much greater than
310 // b, the number of solution columns.
311 //
312 // Row-major for Ribbon PHSFs therefore tends to incur undue CPU overhead
313 // by processing (up to) r entries of b bits each, where b is typically
314 // less than 10 for filter applications.
315 //
316 // Column-major layout has poor locality because of accessing up to b
317 // memory locations in different pages (and obviously cache lines). Note
318 // that negative filter queries do not typically need to access all
319 // solution columns, as they can return when a mismatch is found in any
320 // result/solution column. This optimization doesn't always pay off on
321 // recent hardware, where the penalty for unpredictable conditional
322 // branching can exceed the penalty for unnecessary work, but the
323 // optimization is essentially unavailable with row-major layout.
324 //
325 // The best compromise seems to be interleaving column-major on the small
326 // scale with row-major on the large scale. For example, let a solution
327 // "block" be r rows column-major encoded as b r-bit values in sequence.
328 // Each query accesses (up to) 2 adjacent blocks, which will typically
329 // span 1-3 cache lines in adjacent memory. We get very close to the same
330 // locality as row-major, but with much faster reconstruction of each
331 // result column, at least for filter applications where b is relatively
332 // small and negative queries can return early.
333 //
334 // ######################################################################
335 // ###################### Fractional result bits ########################
336 //
337 // Bloom filters have great flexibility that alternatives mostly do not
338 // have. One of those flexibilities is in utilizing any ratio of data
339 // structure bits per key. With a typical memory allocator like jemalloc,
340 // this flexibility can save roughly 10% of the filters' footprint in
341 // DRAM by rounding up and down filter sizes to minimize memory internal
342 // fragmentation (see optimize_filters_for_memory RocksDB option).
343 //
344 // At first glance, PHSFs only offer a whole number of bits per "slot"
345 // (m rather than number of keys n), but coefficient locality in the
346 // Ribbon construction makes fractional bits/key quite possible and
347 // attractive for filter applications. This works by a prefix of the
348 // structure using b-1 solution columns and the rest using b solution
349 // columns. See InterleavedSolutionStorage below for more detail.
350 //
351 // Because false positive rates are non-linear in bits/key, this approach
352 // is not quite optimal in terms of information theory. In common cases,
353 // we see additional space overhead up to about 1.5% vs. theoretical
354 // optimal to achieve the same FP rate. We consider this a quite acceptable
355 // overhead for very efficiently utilizing space that might otherwise be
356 // wasted.
357 //
358 // This property of Ribbon even makes it "elastic." A Ribbon filter and
359 // its small metadata for answering queries can be adapted into another
360 // Ribbon filter filling any smaller multiple of r bits (plus small
361 // metadata), with a correspondingly higher FP rate. None of the data
362 // thrown away during construction needs to be recalled for this reduction.
363 // Similarly a single Ribbon construction can be separated (by solution
364 // column) into two or more structures (or "layers" or "levels") with
365 // independent filtering ability (no FP correlation, just as solution or
366 // result columns in a single structure) despite being constructed as part
367 // of a single linear system. (TODO: implement)
368 // See also "ElasticBF: Fine-grained and Elastic Bloom Filter Towards
369 // Efficient Read for LSM-tree-based KV Stores."
370 //
371
372 // ######################################################################
373 // ################### CODE: Ribbon core algorithms #####################
374 // ######################################################################
375 //
376 // These algorithms are templatized for genericity but near-maximum
377 // performance in a given application. The template parameters
378 // adhere to informal class/struct type concepts outlined below. (This
379 // code is written for C++11 so does not use formal C++ concepts.)
380
381 // Rough architecture for these algorithms:
382 //
383 // +-----------+ +---+ +-----------------+
384 // | AddInputs | --> | H | --> | BandingStorage |
385 // +-----------+ | a | +-----------------+
386 // | s | |
387 // | h | Back substitution
388 // | e | V
389 // +-----------+ | r | +-----------------+
390 // | Query Key | --> | | >+< | SolutionStorage |
391 // +-----------+ +---+ | +-----------------+
392 // V
393 // Query result
394
395 // Common to other concepts
396 // concept RibbonTypes {
397 // // An unsigned integer type for an r-bit subsequence of coefficients.
398 // // r (or kCoeffBits) is taken to be sizeof(CoeffRow) * 8, as it would
399 // // generally only hurt scalability to leave bits of CoeffRow unused.
400 // typename CoeffRow;
401 // // An unsigned integer type big enough to hold a result row (b bits,
402 // // or number of solution/result columns).
403 // // In many applications, especially filters, the number of result
404 // // columns is decided at run time, so ResultRow simply needs to be
405 // // big enough for the largest number of columns allowed.
406 // typename ResultRow;
407 // // An unsigned integer type sufficient for representing the number of
408 // // rows in the solution structure, and at least the arithmetic
409 // // promotion size (usually 32 bits). uint32_t recommended because a
410 // // single Ribbon construction doesn't really scale to billions of
411 // // entries.
412 // typename Index;
413 // };
414
415 // ######################################################################
416 // ######################## Hashers and Banding #########################
417
418 // Hasher concepts abstract out hashing details.
419
420 // concept PhsfQueryHasher extends RibbonTypes {
421 // // Type for a lookup key, which is hashable.
422 // typename Key;
423 //
424 // // Type for hashed summary of a Key. uint64_t is recommended.
425 // typename Hash;
426 //
427 // // Compute a hash value summarizing a Key
428 // Hash GetHash(const Key &) const;
429 //
430 // // Given a hash value and a number of columns that can start an
431 // // r-sequence of coefficients (== m - r + 1), return the start
432 // // column to associate with that hash value. (Starts can be chosen
433 // // uniformly or "smash" extra entries into the beginning and end for
434 // // better utilization at those extremes of the structure. Details in
435 // // ribbon_impl.h)
436 // Index GetStart(Hash, Index num_starts) const;
437 //
438 // // Given a hash value, return the r-bit sequence of coefficients to
439 // // associate with it. It's generally OK if
440 // // sizeof(CoeffRow) > sizeof(Hash)
441 // // as long as the hash itself is not too prone to collisions for the
442 // // applications and the CoeffRow is generated uniformly from
443 // // available hash data, but relatively independent of the start.
444 // //
445 // // Must be non-zero, because that's required for a solution to exist
446 // // when mapping to non-zero result row. (Note: BandingAdd could be
447 // // modified to allow 0 coeff row if that only occurs with 0 result
448 // // row, which really only makes sense for filter implementation,
449 // // where both values are hash-derived. Or BandingAdd could reject 0
450 // // coeff row, forcing next seed, but that has potential problems with
451 // // generality/scalability.)
452 // CoeffRow GetCoeffRow(Hash) const;
453 // };
454
455 // concept FilterQueryHasher extends PhsfQueryHasher {
456 // // For building or querying a filter, this returns the expected
457 // // result row associated with a hashed input. For general PHSF,
458 // // this must return 0.
459 // //
460 // // Although not strictly required, there's a slightly better chance of
461 // // solver success if result row is masked down here to only the bits
462 // // actually needed.
463 // ResultRow GetResultRowFromHash(Hash) const;
464 // }
465
466 // concept BandingHasher extends FilterQueryHasher {
467 // // For a filter, this will generally be the same as Key.
468 // // For a general PHSF, it must either
469 // // (a) include a key and a result it maps to (e.g. in a std::pair), or
470 // // (b) GetResultRowFromInput looks up the result somewhere rather than
471 // // extracting it.
472 // typename AddInput;
473 //
474 // // Instead of requiring a way to extract a Key from an
475 // // AddInput, we require getting the hash of the Key part
476 // // of an AddInput, which is trivial if AddInput == Key.
477 // Hash GetHash(const AddInput &) const;
478 //
479 // // For building a non-filter PHSF, this extracts or looks up the result
480 // // row to associate with an input. For filter PHSF, this must return 0.
481 // ResultRow GetResultRowFromInput(const AddInput &) const;
482 //
483 // // Whether the solver can assume the lowest bit of GetCoeffRow is
484 // // always 1. When true, it should improve solver efficiency slightly.
485 // static bool kFirstCoeffAlwaysOne;
486 // }
487
488 // Abstract storage for the result of "banding" the inputs (Gaussian
489 // elimination to an upper-triangular boolean band matrix). Because the
490 // banding is an incremental / on-the-fly algorithm, this also represents
491 // all the intermediate state between input entries.
492 //
493 // concept BandingStorage extends RibbonTypes {
494 // // Tells the banding algorithm to prefetch memory associated with
495 // // the next input before processing the current input. Generally
496 // // recommended iff the BandingStorage doesn't easily fit in CPU
497 // // cache.
498 // bool UsePrefetch() const;
499 //
500 // // Prefetches (e.g. __builtin_prefetch) memory associated with a
501 // // slot index i.
502 // void Prefetch(Index i) const;
503 //
504 // // Returns a pointer to CoeffRow for slot index i.
505 // CoeffRow* CoeffRowPtr(Index i);
506 //
507 // // Returns a pointer to ResultRow for slot index i. (Gaussian row
508 // // operations involve both side of the equation.)
509 // ResultRow* ResultRowPtr(Index i);
510 //
511 // // Returns the number of columns that can start an r-sequence of
512 // // coefficients, which is the number of slots minus r (kCoeffBits)
513 // // plus one. (m - r + 1)
514 // Index GetNumStarts() const;
515 // };
516
517 // Optional storage for backtracking data in banding a set of input
518 // entries. It exposes an array structure which will generally be
519 // used as a stack. It must be able to accommodate as many entries
520 // as are passed in as inputs to `BandingAddRange`.
521 //
522 // concept BacktrackStorage extends RibbonTypes {
523 // // If false, backtracking support will be disabled in the algorithm.
524 // // This should preferably be an inline compile-time constant function.
525 // bool UseBacktrack() const;
526 //
527 // // Records `to_save` as the `i`th backtrack entry
528 // void BacktrackPut(Index i, Index to_save);
529 //
530 // // Recalls the `i`th backtrack entry
531 // Index BacktrackGet(Index i) const;
532 // }
533
// Attempts to add a single entry to BandingStorage (and optionally, to
// BacktrackStorage), via incremental GF(2) Gaussian elimination. Returns
// true on success, or false when no solution is possible with the current
// hasher (and presumably its seed) and number of "slots" (solution or
// banding rows) — i.e. when a linear dependence among the inputs fails to
// "cancel out".
//
// Pre- and post-condition: the BandingStorage represents a band matrix
// ready for back substitution (row echelon form except for zero rows),
// augmented with result values such that back substitution would give a
// solution satisfying all the cr@start -> rr entries added.
template <bool kFirstCoeffAlwaysOne, typename BandingStorage,
          typename BacktrackStorage>
bool BandingAdd(BandingStorage *bs, typename BandingStorage::Index start,
                typename BandingStorage::ResultRow rr,
                typename BandingStorage::CoeffRow cr, BacktrackStorage *bts,
                typename BandingStorage::Index *backtrack_pos) {
  using CoeffRow = typename BandingStorage::CoeffRow;
  using Index = typename BandingStorage::Index;

  Index row = start;

  if (!kFirstCoeffAlwaysOne) {
    // Normalize so that the lowest coefficient bit is 1.
    // (Requires/asserts that cr != 0.)
    int lead = CountTrailingZeroBits(cr);
    row += static_cast<Index>(lead);
    cr >>= lead;
  }

  while (true) {
    assert((cr & 1) == 1);
    CoeffRow existing = *(bs->CoeffRowPtr(row));
    if (existing == 0) {
      // Empty row: claim it for this (possibly reduced) equation. Because
      // non-empty rows are never overwritten, recording the row index is
      // all the state needed for backtracking.
      *(bs->CoeffRowPtr(row)) = cr;
      *(bs->ResultRowPtr(row)) = rr;
      bts->BacktrackPut(*backtrack_pos, row);
      ++*backtrack_pos;
      return true;
    }
    assert((existing & 1) == 1);
    // Row taken: Gaussian row reduction. XOR cancels the leading 1 (and
    // the corresponding result bits), shortening the incoming equation.
    cr ^= existing;
    rr ^= *(bs->ResultRowPtr(row));
    if (cr == 0) {
      // All coefficients canceled: inconsistency or (less likely) redundancy.
      break;
    }
    // Advance to the next non-zero coefficient and re-normalize.
    int lead = CountTrailingZeroBits(cr);
    row += static_cast<Index>(lead);
    cr >>= lead;
  }

  // The coefficients canceled to zero; the add still succeeds iff the
  // result side also canceled to zero — e.g. a duplicate input or a stock
  // hash collision with the same result row. (For filter, stock hash
  // collision implies same result row.) A full equation equal to a sum of
  // other equations is also very possible with a small range of values
  // for result row.
  return rr == 0;
}
593
594 // Adds a range of entries to BandingStorage returning true if successful
595 // or false if solution is impossible with current hasher (and presumably
596 // its seed) and number of "slots" (solution or banding rows). (A solution
597 // is impossible when there is a linear dependence among the inputs that
598 // doesn't "cancel out".) Here "InputIterator" is an iterator over AddInputs.
599 //
600 // If UseBacktrack in the BacktrackStorage, this function call rolls back
601 // to prior state on failure. If !UseBacktrack, some subset of the entries
602 // will have been added to the BandingStorage, so best considered to be in
603 // an indeterminate state.
604 //
template <typename BandingStorage, typename BacktrackStorage,
          typename BandingHasher, typename InputIterator>
bool BandingAddRange(BandingStorage *bs, BacktrackStorage *bts,
                     const BandingHasher &bh, InputIterator begin,
                     InputIterator end) {
  using CoeffRow = typename BandingStorage::CoeffRow;
  using Index = typename BandingStorage::Index;
  using ResultRow = typename BandingStorage::ResultRow;
  using Hash = typename BandingHasher::Hash;

  static_assert(IsUnsignedUpTo128<CoeffRow>::value, "must be unsigned");
  static_assert(IsUnsignedUpTo128<Index>::value, "must be unsigned");
  static_assert(IsUnsignedUpTo128<ResultRow>::value, "must be unsigned");

  // Whether the hasher guarantees the first coefficient bit is always one
  // (compile-time knob forwarded to BandingAdd's elimination loop).
  constexpr bool kFCA1 = BandingHasher::kFirstCoeffAlwaysOne;

  if (begin == end) {
    // trivial
    return true;
  }

  const Index num_starts = bs->GetNumStarts();

  // cur: next input to add; backtrack_pos: count of entries recorded in
  // *bts so far, so failure can roll back exactly what was added.
  InputIterator cur = begin;
  Index backtrack_pos = 0;
  if (!bs->UsePrefetch()) {
    // Simple version, no prefetch
    for (;;) {
      // Derive the full equation (start slot, result row, coefficient row)
      // for this input from its hash (and any input-embedded result bits).
      Hash h = bh.GetHash(*cur);
      Index start = bh.GetStart(h, num_starts);
      ResultRow rr =
          bh.GetResultRowFromInput(*cur) | bh.GetResultRowFromHash(h);
      CoeffRow cr = bh.GetCoeffRow(h);

      if (!BandingAdd<kFCA1>(bs, start, rr, cr, bts, &backtrack_pos)) {
        break;
      }
      if ((++cur) == end) {
        return true;
      }
    }
  } else {
    // Pipelined w/prefetch
    // Prime the pipeline
    Hash h = bh.GetHash(*cur);
    Index start = bh.GetStart(h, num_starts);
    ResultRow rr = bh.GetResultRowFromInput(*cur);
    bs->Prefetch(start);

    // Pipeline: while adding the current entry, the next entry's start
    // slot is computed and its banding memory prefetched, hiding memory
    // latency behind the hash/add work.
    for (;;) {
      rr |= bh.GetResultRowFromHash(h);
      CoeffRow cr = bh.GetCoeffRow(h);
      if ((++cur) == end) {
        // Last entry: nothing left to prefetch; add and finish.
        if (!BandingAdd<kFCA1>(bs, start, rr, cr, bts, &backtrack_pos)) {
          break;
        }
        return true;
      }
      // Look ahead: hash the next entry and prefetch its slots before
      // performing the (memory-bound) add of the current entry.
      Hash next_h = bh.GetHash(*cur);
      Index next_start = bh.GetStart(next_h, num_starts);
      ResultRow next_rr = bh.GetResultRowFromInput(*cur);
      bs->Prefetch(next_start);
      if (!BandingAdd<kFCA1>(bs, start, rr, cr, bts, &backtrack_pos)) {
        break;
      }
      // Shift pipeline state: next entry becomes current.
      h = next_h;
      start = next_start;
      rr = next_rr;
    }
  }
  // failed; backtrack (if implemented)
  if (bts->UseBacktrack()) {
    // Undo every successful add recorded in *bts, in reverse order.
    while (backtrack_pos > 0) {
      --backtrack_pos;
      Index i = bts->BacktrackGet(backtrack_pos);
      *(bs->CoeffRowPtr(i)) = 0;
      // Not strictly required, but is required for good FP rate on
      // inputs that might have been backtracked out. (We don't want
      // anything we've backtracked on to leak into final result, as
      // that might not be "harmless".)
      *(bs->ResultRowPtr(i)) = 0;
    }
  }
  return false;
}
691
692 // Adds a range of entries to BandingStorage returning true if successful
693 // or false if solution is impossible with current hasher (and presumably
694 // its seed) and number of "slots" (solution or banding rows). (A solution
695 // is impossible when there is a linear dependence among the inputs that
696 // doesn't "cancel out".) Here "InputIterator" is an iterator over AddInputs.
697 //
698 // On failure, some subset of the entries will have been added to the
699 // BandingStorage, so best considered to be in an indeterminate state.
700 //
template <typename BandingStorage, typename BandingHasher,
          typename InputIterator>
bool BandingAddRange(BandingStorage *bs, const BandingHasher &bh,
                     InputIterator begin, InputIterator end) {
  using Index = typename BandingStorage::Index;
  // A do-nothing BacktrackStorage. Because UseBacktrack() reports false,
  // the banding code never records entries, and BacktrackGet must be
  // unreachable.
  struct NoopBacktrackStorage {
    bool UseBacktrack() { return false; }
    void BacktrackPut(Index, Index) {}
    Index BacktrackGet(Index) {
      assert(false);
      return 0;
    }
  };
  NoopBacktrackStorage no_backtrack;
  // Delegate to the backtracking-capable overload with backtracking off.
  return BandingAddRange(bs, &no_backtrack, bh, begin, end);
}
716
717 // ######################################################################
718 // ######################### Solution Storage ###########################
719
720 // Back-substitution and query algorithms unfortunately depend on some
721 // details of data layout in the final data structure ("solution"). Thus,
722 // there is no common SolutionStorage covering all the reasonable
723 // possibilities.
724
725 // ###################### SimpleSolutionStorage #########################
726
727 // SimpleSolutionStorage is for a row-major storage, typically with no
728 // unused bits in each ResultRow. This is mostly for demonstration
729 // purposes as the simplest solution storage scheme. It is relatively slow
730 // for filter queries.
731
732 // concept SimpleSolutionStorage extends RibbonTypes {
733 // // This is called at the beginning of back-substitution for the
734 // // solution storage to do any remaining configuration before data
735 // // is stored to it. If configuration is previously finalized, this
736 // // could be a simple assertion or even no-op. Ribbon algorithms
737 // // only call this from back-substitution, and only once per call,
738 // // before other functions here.
739 // void PrepareForNumStarts(Index num_starts) const;
740 // // Must return num_starts passed to PrepareForNumStarts, or the most
741 // // recent call to PrepareForNumStarts if this storage object can be
742 // // reused. Note that num_starts == num_slots - kCoeffBits + 1 because
743 // // there must be a run of kCoeffBits slots starting from each start.
744 // Index GetNumStarts() const;
745 // // Load the solution row (type ResultRow) for a slot
746 // ResultRow Load(Index slot_num) const;
747 // // Store the solution row (type ResultRow) for a slot
748 // void Store(Index slot_num, ResultRow data);
749 // };
750
751 // Back-substitution for generating a solution from BandingStorage to
752 // SimpleSolutionStorage.
template <typename SimpleSolutionStorage, typename BandingStorage>
void SimpleBackSubst(SimpleSolutionStorage *sss, const BandingStorage &bs) {
  using CoeffRow = typename BandingStorage::CoeffRow;
  using Index = typename BandingStorage::Index;
  using ResultRow = typename BandingStorage::ResultRow;

  static_assert(sizeof(Index) == sizeof(typename SimpleSolutionStorage::Index),
                "must be same");
  static_assert(
      sizeof(CoeffRow) == sizeof(typename SimpleSolutionStorage::CoeffRow),
      "must be same");
  static_assert(
      sizeof(ResultRow) == sizeof(typename SimpleSolutionStorage::ResultRow),
      "must be same");

  constexpr auto kCoeffBits = static_cast<Index>(sizeof(CoeffRow) * 8U);
  constexpr auto kResultBits = static_cast<Index>(sizeof(ResultRow) * 8U);

  // Column-major rolling window of the solution matrix: state[col] holds,
  // in its low bits, the solution bits for that column from the (up to)
  // kCoeffBits most recently computed rows -- enough to back-substitute
  // the next row using the banding data.
  std::array<CoeffRow, kResultBits> state;
  state.fill(0);

  const Index num_starts = bs.GetNumStarts();
  sss->PrepareForNumStarts(num_starts);
  const Index num_slots = num_starts + kCoeffBits - 1;

  // Back-substitution runs from the last row to the first.
  for (Index slot = num_slots; slot > 0;) {
    --slot;
    CoeffRow cr = *const_cast<BandingStorage &>(bs).CoeffRowPtr(slot);
    ResultRow rr = *const_cast<BandingStorage &>(bs).ResultRowPtr(slot);
    // Accumulates this row's solution bits across all columns.
    ResultRow solution_row = 0;
    for (Index col = 0; col < kResultBits; ++col) {
      // Make room at the bottom of the window for this row's bit.
      CoeffRow window = state[col] << 1;
      // For a valid solution, the dot product of the solution column with
      // the coefficient row must equal the result bit at this column:
      //   BitParity(window & cr) == ((rr >> col) & 1)
      // Solve for the new bottom bit accordingly.
      bool bit = (BitParity(window & cr) ^ ((rr >> col) & 1)) != 0;
      if (bit) {
        window |= CoeffRow{1};
      }
      // Update the rolling window and record the bit in the output row.
      state[col] = window;
      solution_row |= (bit ? ResultRow{1} : ResultRow{0}) << col;
    }
    sss->Store(slot, solution_row);
  }
}
807
808 // Common functionality for querying a key (already hashed) in
809 // SimpleSolutionStorage.
template <typename SimpleSolutionStorage>
typename SimpleSolutionStorage::ResultRow SimpleQueryHelper(
    typename SimpleSolutionStorage::Index start_slot,
    typename SimpleSolutionStorage::CoeffRow cr,
    const SimpleSolutionStorage &sss) {
  using CoeffRow = typename SimpleSolutionStorage::CoeffRow;
  using ResultRow = typename SimpleSolutionStorage::ResultRow;

  constexpr unsigned kCoeffBits = static_cast<unsigned>(sizeof(CoeffRow) * 8U);

  // XOR together the solution rows selected by the set coefficient bits.
  ResultRow xor_sum = 0;
  for (unsigned bit = 0; bit < kCoeffBits; ++bit) {
    // All-ones mask when the coefficient bit is set, all-zeros otherwise.
    // Branch-free masking of the whole value is generally faster here
    // than an 'if'.
    const ResultRow mask = static_cast<ResultRow>(
        ResultRow{0} - (static_cast<ResultRow>(cr >> bit) & ResultRow{1}));
    xor_sum ^= static_cast<ResultRow>(sss.Load(start_slot + bit) & mask);
  }
  return xor_sum;
}
828
829 // General PHSF query a key from SimpleSolutionStorage.
template <typename SimpleSolutionStorage, typename PhsfQueryHasher>
typename SimpleSolutionStorage::ResultRow SimplePhsfQuery(
    const typename PhsfQueryHasher::Key &key, const PhsfQueryHasher &hasher,
    const SimpleSolutionStorage &sss) {
  static_assert(sizeof(typename SimpleSolutionStorage::Index) ==
                    sizeof(typename PhsfQueryHasher::Index),
                "must be same");
  static_assert(sizeof(typename SimpleSolutionStorage::CoeffRow) ==
                    sizeof(typename PhsfQueryHasher::CoeffRow),
                "must be same");

  // Hash the key once, then derive both the starting slot and the
  // coefficient row from that hash for the generic query helper.
  const typename PhsfQueryHasher::Hash h = hasher.GetHash(key);
  const auto start = hasher.GetStart(h, sss.GetNumStarts());
  const auto coeffs = hasher.GetCoeffRow(h);
  return SimpleQueryHelper(start, coeffs, sss);
}
846
847 // Filter query a key from SimpleSolutionStorage.
template <typename SimpleSolutionStorage, typename FilterQueryHasher>
bool SimpleFilterQuery(const typename FilterQueryHasher::Key &key,
                       const FilterQueryHasher &hasher,
                       const SimpleSolutionStorage &sss) {
  static_assert(sizeof(typename SimpleSolutionStorage::Index) ==
                    sizeof(typename FilterQueryHasher::Index),
                "must be same");
  static_assert(sizeof(typename SimpleSolutionStorage::CoeffRow) ==
                    sizeof(typename FilterQueryHasher::CoeffRow),
                "must be same");
  static_assert(sizeof(typename SimpleSolutionStorage::ResultRow) ==
                    sizeof(typename FilterQueryHasher::ResultRow),
                "must be same");

  const typename FilterQueryHasher::Hash h = hasher.GetHash(key);
  // The key is (probabilistically) a match iff the result row
  // reconstructed from the solution equals the result row implied by
  // the hash.
  const typename SimpleSolutionStorage::ResultRow expected =
      hasher.GetResultRowFromHash(h);
  const typename SimpleSolutionStorage::ResultRow actual = SimpleQueryHelper(
      hasher.GetStart(h, sss.GetNumStarts()), hasher.GetCoeffRow(h), sss);
  return actual == expected;
}
870
871 // #################### InterleavedSolutionStorage ######################
872
873 // InterleavedSolutionStorage is row-major at a high level, for good
874 // locality, and column-major at a low level, for CPU efficiency
875 // especially in filter queries or relatively small number of result bits
876 // (== solution columns). The storage is a sequence of "blocks" where a
877 // block has one CoeffRow-sized segment for each solution column. Each
878 // query spans at most two blocks; the starting solution row is typically
879 // in the row-logical middle of a block and spans to the middle of the
880 // next block. (See diagram below.)
881 //
882 // InterleavedSolutionStorage supports choosing b (number of result or
883 // solution columns) at run time, and even supports mixing b and b-1 solution
884 // columns in a single linear system solution, for filters that can
885 // effectively utilize any size space (multiple of CoeffRow) for minimizing
886 // FP rate for any number of added keys. To simplify query implementation
887 // (with lower-index columns first), the b-bit portion comes after the b-1
888 // portion of the structure.
889 //
890 // Diagram (=== marks logical block boundary; b=4; ### is data used by a
891 // query crossing the b-1 to b boundary, each Segment has type CoeffRow):
892 // ...
893 // +======================+
894 // | S e g m e n t col=0 |
895 // +----------------------+
896 // | S e g m e n t col=1 |
897 // +----------------------+
898 // | S e g m e n t col=2 |
899 // +======================+
900 // | S e g m e n #########|
901 // +----------------------+
902 // | S e g m e n #########|
903 // +----------------------+
904 // | S e g m e n #########|
905 // +======================+ Result/solution columns: above = 3, below = 4
906 // |#############t col=0 |
907 // +----------------------+
908 // |#############t col=1 |
909 // +----------------------+
910 // |#############t col=2 |
911 // +----------------------+
912 // | S e g m e n t col=3 |
913 // +======================+
914 // | S e g m e n t col=0 |
915 // +----------------------+
916 // | S e g m e n t col=1 |
917 // +----------------------+
918 // | S e g m e n t col=2 |
919 // +----------------------+
920 // | S e g m e n t col=3 |
921 // +======================+
922 // ...
923 //
924 // InterleavedSolutionStorage will be adapted by the algorithms from
925 // simple array-like segment storage. That array-like storage is templatized
926 // in part so that an implementation may choose to handle byte ordering
927 // at access time.
928 //
929 // concept InterleavedSolutionStorage extends RibbonTypes {
930 // // This is called at the beginning of back-substitution for the
931 // // solution storage to do any remaining configuration before data
932 // // is stored to it. If configuration is previously finalized, this
933 // // could be a simple assertion or even no-op. Ribbon algorithms
934 // // only call this from back-substitution, and only once per call,
935 // // before other functions here.
936 // void PrepareForNumStarts(Index num_starts) const;
937 // // Must return num_starts passed to PrepareForNumStarts, or the most
938 // // recent call to PrepareForNumStarts if this storage object can be
939 // // reused. Note that num_starts == num_slots - kCoeffBits + 1 because
940 // // there must be a run of kCoeffBits slots starting from each start.
941 // Index GetNumStarts() const;
942 // // The larger number of solution columns used (called "b" above).
943 // Index GetUpperNumColumns() const;
944 // // If returns > 0, then block numbers below that use
945 // // GetUpperNumColumns() - 1 columns per solution row, and the rest
946 // // use GetUpperNumColumns(). A block represents kCoeffBits "slots",
947 // // where all but the last kCoeffBits - 1 slots are also starts. And
948 // // a block contains a segment for each solution column.
949 // // An implementation may only support uniform columns per solution
950 // // row and return constant 0 here.
951 // Index GetUpperStartBlock() const;
952 //
953 // // ### "Array of segments" portion of API ###
954 // // The number of values of type CoeffRow used in this solution
955 // // representation. (This value can be inferred from the previous
956 // // three functions, but is expected at least for sanity / assertion
957 // // checking.)
958 // Index GetNumSegments() const;
959 // // Load an entry from the logical array of segments
960 // CoeffRow LoadSegment(Index segment_num) const;
961 // // Store an entry to the logical array of segments
962 // void StoreSegment(Index segment_num, CoeffRow data);
963 // };
964
965 // A helper for InterleavedBackSubst.
template <typename BandingStorage>
inline void BackSubstBlock(typename BandingStorage::CoeffRow *state,
                           typename BandingStorage::Index num_columns,
                           const BandingStorage &bs,
                           typename BandingStorage::Index start_slot) {
  using CoeffRow = typename BandingStorage::CoeffRow;
  using Index = typename BandingStorage::Index;
  using ResultRow = typename BandingStorage::ResultRow;

  constexpr auto kCoeffBits = static_cast<Index>(sizeof(CoeffRow) * 8U);

  // Process this block's kCoeffBits rows from last to first
  // (back-substitution order), updating the caller's per-column
  // rolling-window state in place.
  for (Index n = kCoeffBits; n > 0; --n) {
    const Index row = start_slot + n - 1;
    CoeffRow cr = *const_cast<BandingStorage &>(bs).CoeffRowPtr(row);
    ResultRow rr = *const_cast<BandingStorage &>(bs).ResultRowPtr(row);
    for (Index col = 0; col < num_columns; ++col) {
      // Make room at the bottom of the rolling window for this row's bit.
      CoeffRow window = state[col] << 1;
      // For a valid solution, the dot product of the solution column with
      // the coefficient row must equal the result bit at this column:
      //   BitParity(window & cr) == ((rr >> col) & 1)
      // Solve for the new bottom bit accordingly.
      int bit = BitParity(window & cr) ^ ((rr >> col) & 1);
      state[col] = window | static_cast<CoeffRow>(bit);
    }
  }
}
998
999 // Back-substitution for generating a solution from BandingStorage to
1000 // InterleavedSolutionStorage.
template <typename InterleavedSolutionStorage, typename BandingStorage>
void InterleavedBackSubst(InterleavedSolutionStorage *iss,
                          const BandingStorage &bs) {
  using CoeffRow = typename BandingStorage::CoeffRow;
  using Index = typename BandingStorage::Index;

  static_assert(
      sizeof(Index) == sizeof(typename InterleavedSolutionStorage::Index),
      "must be same");
  static_assert(
      sizeof(CoeffRow) == sizeof(typename InterleavedSolutionStorage::CoeffRow),
      "must be same");

  constexpr auto kCoeffBits = static_cast<Index>(sizeof(CoeffRow) * 8U);

  const Index num_starts = bs.GetNumStarts();
  // Although it might be nice to have a filter that returns "always false"
  // when no key is added, we aren't specifically supporting that here
  // because it would require another condition branch in the query.
  assert(num_starts > 0);
  iss->PrepareForNumStarts(num_starts);

  // Interleaved storage requires a whole number of kCoeffBits-slot blocks.
  const Index num_slots = num_starts + kCoeffBits - 1;
  assert(num_slots % kCoeffBits == 0);
  const Index num_blocks = num_slots / kCoeffBits;
  const Index num_segments = iss->GetNumSegments();

  // For now upper, then lower
  // (Blocks at index >= upper_start_block use num_columns segments each;
  // blocks before it use num_columns - 1.)
  Index num_columns = iss->GetUpperNumColumns();
  const Index upper_start_block = iss->GetUpperStartBlock();

  if (num_columns == 0) {
    // Nothing to do, presumably because there's not enough space for even
    // a single segment.
    assert(num_segments == 0);
    // When num_columns == 0, a Ribbon filter query will always return true,
    // or a PHSF query always 0.
    return;
  }

  // We should be utilizing all available segments
  assert(num_segments == (upper_start_block * (num_columns - 1)) +
                             ((num_blocks - upper_start_block) * num_columns));

  // TODO: consider fixed-column specializations with stack-allocated state

  // A column-major buffer of the solution matrix, containing enough
  // recently-computed solution data to compute the next solution row
  // (based also on banding data).
  // Value-initialized (zeroed) by the () in new CoeffRow[...]().
  std::unique_ptr<CoeffRow[]> state{new CoeffRow[num_columns]()};

  // Walk blocks from last to first (back-substitution order), writing each
  // block's finished column segments at decreasing segment indices.
  Index block = num_blocks;
  Index segment = num_segments;
  while (block > upper_start_block) {
    --block;
    BackSubstBlock(state.get(), num_columns, bs, block * kCoeffBits);
    segment -= num_columns;
    for (Index i = 0; i < num_columns; ++i) {
      iss->StoreSegment(segment + i, state[i]);
    }
  }
  // Now (if applicable), region using lower number of columns
  // (This should be optimized away if GetUpperStartBlock() returns
  // constant 0.)
  --num_columns;
  while (block > 0) {
    --block;
    BackSubstBlock(state.get(), num_columns, bs, block * kCoeffBits);
    segment -= num_columns;
    for (Index i = 0; i < num_columns; ++i) {
      iss->StoreSegment(segment + i, state[i]);
    }
  }
  // Verify everything processed
  assert(block == 0);
  assert(segment == 0);
}
1078
1079 // General PHSF query a key from InterleavedSolutionStorage.
template <typename InterleavedSolutionStorage, typename PhsfQueryHasher>
typename InterleavedSolutionStorage::ResultRow InterleavedPhsfQuery(
    const typename PhsfQueryHasher::Key &key, const PhsfQueryHasher &hasher,
    const InterleavedSolutionStorage &iss) {
  using Hash = typename PhsfQueryHasher::Hash;

  using CoeffRow = typename InterleavedSolutionStorage::CoeffRow;
  using Index = typename InterleavedSolutionStorage::Index;
  using ResultRow = typename InterleavedSolutionStorage::ResultRow;

  static_assert(sizeof(Index) == sizeof(typename PhsfQueryHasher::Index),
                "must be same");
  static_assert(sizeof(CoeffRow) == sizeof(typename PhsfQueryHasher::CoeffRow),
                "must be same");

  constexpr auto kCoeffBits = static_cast<Index>(sizeof(CoeffRow) * 8U);

  const Hash hash = hasher.GetHash(key);
  const Index start_slot = hasher.GetStart(hash, iss.GetNumStarts());

  // Locate the block containing the starting slot and the segment index
  // of that block's first column. Blocks before upper_start_block each
  // contribute one fewer segment, hence the std::min subtraction.
  const Index upper_start_block = iss.GetUpperStartBlock();
  Index num_columns = iss.GetUpperNumColumns();
  const Index block_num = start_slot / kCoeffBits;
  Index segment =
      block_num * num_columns - std::min(block_num, upper_start_block);
  // Use the lower column count if in the lower region.
  // (Branch-free; should not compile to a conditional branch.)
  num_columns -= static_cast<Index>(block_num < upper_start_block);

  const CoeffRow cr = hasher.GetCoeffRow(hash);
  const Index start_bit = start_slot % kCoeffBits;

  // The query's kCoeffBits coefficient bits overlap this block's segments
  // (left part) and, if misaligned, the next block's segments (right part).
  ResultRow result = 0;
  const CoeffRow cr_left = cr << start_bit;
  for (Index col = 0; col < num_columns; ++col) {
    result ^= BitParity(iss.LoadSegment(segment + col) & cr_left) << col;
  }

  if (start_bit > 0) {
    segment += num_columns;
    const CoeffRow cr_right = cr >> (kCoeffBits - start_bit);
    for (Index col = 0; col < num_columns; ++col) {
      result ^= BitParity(iss.LoadSegment(segment + col) & cr_right) << col;
    }
  }

  return result;
}
1128
// Filter query a key from InterleavedSolutionStorage.
template <typename InterleavedSolutionStorage, typename FilterQueryHasher>
bool InterleavedFilterQuery(const typename FilterQueryHasher::Key &key,
                            const FilterQueryHasher &hasher,
                            const InterleavedSolutionStorage &iss) {
  using Hash = typename FilterQueryHasher::Hash;

  using CoeffRow = typename InterleavedSolutionStorage::CoeffRow;
  using Index = typename InterleavedSolutionStorage::Index;
  using ResultRow = typename InterleavedSolutionStorage::ResultRow;

  static_assert(sizeof(Index) == sizeof(typename FilterQueryHasher::Index),
                "must be same");
  static_assert(
      sizeof(CoeffRow) == sizeof(typename FilterQueryHasher::CoeffRow),
      "must be same");
  static_assert(
      sizeof(ResultRow) == sizeof(typename FilterQueryHasher::ResultRow),
      "must be same");

  constexpr auto kCoeffBits = static_cast<Index>(sizeof(CoeffRow) * 8U);

  // Start/segment location logic matches InterleavedPhsfQuery: find the
  // block containing the starting slot and the segment of its first column.
  const Hash hash = hasher.GetHash(key);
  const Index start_slot = hasher.GetStart(hash, iss.GetNumStarts());

  const Index upper_start_block = iss.GetUpperStartBlock();
  Index num_columns = iss.GetUpperNumColumns();
  const Index block_num = start_slot / kCoeffBits;
  Index segment =
      block_num * num_columns - std::min(block_num, upper_start_block);
  // Branch-free switch to the lower column count when applicable.
  num_columns -= static_cast<Index>(block_num < upper_start_block);

  const CoeffRow cr = hasher.GetCoeffRow(hash);
  const Index start_bit = start_slot % kCoeffBits;

  // Filter returns true only if every reconstructed result bit matches
  // the bit expected from the hash; bail out on the first mismatch.
  const ResultRow expected = hasher.GetResultRowFromHash(hash);

  // TODO: consider optimizations such as
  // * mask fetched values and shift cr, rather than shifting fetched values
  // * get rid of start_bit == 0 condition with careful fetching & shifting
  if (start_bit == 0) {
    // Aligned: each solution column is exactly one segment.
    for (Index col = 0; col < num_columns; ++col) {
      const int expected_bit = static_cast<int>(expected >> col) & 1;
      if (BitParity(iss.LoadSegment(segment + col) & cr) != expected_bit) {
        return false;
      }
    }
  } else {
    // Misaligned: stitch each solution column from two adjacent blocks.
    for (Index col = 0; col < num_columns; ++col) {
      CoeffRow soln_col =
          (iss.LoadSegment(segment + col) >> static_cast<unsigned>(start_bit)) |
          (iss.LoadSegment(segment + num_columns + col)
           << static_cast<unsigned>(kCoeffBits - start_bit));
      const int expected_bit = static_cast<int>(expected >> col) & 1;
      if (BitParity(soln_col & cr) != expected_bit) {
        return false;
      }
    }
  }
  // otherwise, all match
  return true;
}
1194
1195 // TODO: refactor Interleaved*Query so that queries can be "prepared" by
1196 // prefetching memory, to hide memory latency for multiple queries in a
1197 // single thread.
1198
1199 } // namespace ribbon
1200
1201 } // namespace ROCKSDB_NAMESPACE