// Copied from the ceph.git mirror: ceph/src/rocksdb/util/ribbon_alg.h
// (import quincy beta 17.1.0)
1 // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
2 // This source code is licensed under both the GPLv2 (found in the
3 // COPYING file in the root directory) and Apache 2.0 License
4 // (found in the LICENSE.Apache file in the root directory).
5
6 #pragma once
7
8 #include <array>
9 #include <memory>
10
11 #include "util/math128.h"
12
13 namespace ROCKSDB_NAMESPACE {
14
15 namespace ribbon {
16
17 // RIBBON PHSF & RIBBON Filter (Rapid Incremental Boolean Banding ON-the-fly)
18 //
19 // ribbon_alg.h: generic versions of core algorithms.
20 //
21 // Ribbon is a Perfect Hash Static Function construction useful as a compact
22 // static Bloom filter alternative. It combines (a) a boolean (GF(2)) linear
23 // system construction that approximates a Band Matrix with hashing,
24 // (b) an incremental, on-the-fly Gaussian Elimination algorithm that is
25 // remarkably efficient and adaptable at constructing an upper-triangular
26 // band matrix from a set of band-approximating inputs from (a), and
27 // (c) a storage layout that is fast and adaptable as a filter.
28 //
29 // Footnotes: (a) "Efficient Gauss Elimination for Near-Quadratic Matrices
30 // with One Short Random Block per Row, with Applications" by Stefan
31 // Walzer and Martin Dietzfelbinger ("DW paper")
32 // (b) developed by Peter C. Dillinger, though not the first on-the-fly
33 // GE algorithm. See "On the fly Gaussian Elimination for LT codes" by
34 // Bioglio, Grangetto, Gaeta, and Sereno.
35 // (c) see "interleaved" solution storage below.
36 //
37 // See ribbon_impl.h for high-level behavioral summary. This file focuses
38 // on the core design details.
39 //
40 // ######################################################################
41 // ################# PHSF -> static filter reduction ####################
42 //
43 // A Perfect Hash Static Function is a data structure representing a
44 // map from anything hashable (a "key") to values of some fixed size.
45 // Crucially, it is allowed to return garbage values for anything not in
46 // the original set of map keys, and it is a "static" structure: entries
47 // cannot be added or deleted after construction. PHSFs representing n
48 // mappings to b-bit values (assume uniformly distributed) require at least
49 // n * b bits to represent, or at least b bits per entry. We typically
50 // describe the compactness of a PHSF by typical bits per entry as some
51 // function of b. For example, the MWHC construction (k=3 "peeling")
52 // requires about 1.0222*b and a variant called Xor+ requires about
53 // 1.08*b + 0.5 bits per entry.
54 //
55 // With more hashing, a PHSF can over-approximate a set as a Bloom filter
56 // does, with no FN queries and predictable false positive (FP) query
57 // rate. Instead of the user providing a value to map each input key to,
58 // a hash function provides the value. Keys in the original set will
59 // return a positive membership query because the underlying PHSF returns
60 // the same value as hashing the key. When a key is not in the original set,
61 // the PHSF returns a "garbage" value, which is only equal to the key's
62 // hash with (false positive) probability 1 in 2^b.
63 //
64 // For a matching false positive rate, standard Bloom filters require
65 // 1.44*b bits per entry. Cache-local Bloom filters (like bloom_impl.h)
66 // require a bit more, around 1.5*b bits per entry. Thus, a Bloom
67 // alternative could save up to or nearly 1/3rd of memory and storage
68 // that RocksDB uses for SST (static) Bloom filters. (Memtable Bloom filter
69 // is dynamic.)
70 //
71 // Recommended reading:
72 // "Xor Filters: Faster and Smaller Than Bloom and Cuckoo Filters"
73 // by Graf and Lemire
74 // First three sections of "Fast Scalable Construction of (Minimal
75 // Perfect Hash) Functions" by Genuzio, Ottaviano, and Vigna
76 //
77 // ######################################################################
78 // ################## PHSF vs. hash table vs. Bloom #####################
79 //
80 // You can think of traditional hash tables and related filter variants
81 // such as Cuckoo filters as utilizing an "OR" construction: a hash
82 // function associates a key with some slots and the data is returned if
83 // the data is found in any one of those slots. The collision resolution
84 // is visible in the final data structure and requires extra information.
85 // For example, Cuckoo filter uses roughly 1.05b + 2 bits per entry, and
86 // Golomb-Rice code (aka "GCS") as little as b + 1.5. When the data
87 // structure associates each input key with data in one slot, the
88 // structure implicitly constructs a (near-)minimal (near-)perfect hash
89 // (MPH) of the keys, which requires at least 1.44 bits per key to
90 // represent. This is why approaches with visible collision resolution
91 // have a fixed + 1.5 or more in storage overhead per entry, often in
92 // addition to an overhead multiplier on b.
93 //
94 // By contrast Bloom filters utilize an "AND" construction: a query only
95 // returns true if all bit positions associated with a key are set to 1.
96 // There is no collision resolution, so Bloom filters do not suffer a
97 // fixed bits per entry overhead like the above structures.
98 //
99 // PHSFs typically use a bitwise XOR construction: the data you want is
100 // not in a single slot, but in a linear combination of several slots.
101 // For static data, this gives the best of "AND" and "OR" constructions:
102 // avoids the +1.44 or more fixed overhead by not approximating a MPH and
103 // can do much better than Bloom's 1.44 factor on b with collision
104 // resolution, which here is done ahead of time and invisible at query
105 // time.
106 //
107 // ######################################################################
108 // ######################## PHSF construction ###########################
109 //
110 // For a typical PHSF, construction is solving a linear system of
111 // equations, typically in GF(2), which is to say that values are boolean
112 // and XOR serves both as addition and subtraction. We can use matrices to
113 // represent the problem:
114 //
115 // C * S = R
116 // (n x m) (m x b) (n x b)
117 // where C = coefficients, S = solution, R = results
118 // and solving for S given C and R.
119 //
120 // Note that C and R each have n rows, one for each input entry for the
121 // PHSF. A row in C is given by a hash function on the PHSF input key,
122 // and the corresponding row in R is the b-bit value to associate with
123 // that input key. (In a filter, rows of R are given by another hash
124 // function on the input key.)
125 //
126 // On solving, the matrix S (solution) is the final PHSF data, as it
127 // maps any row from the original C to its corresponding desired result
128 // in R. We just have to hash our query inputs and compute a linear
129 // combination of rows in S.
130 //
131 // In theory, we could choose m = n and let a hash function associate
132 // each input key with random rows in C. A solution exists with high
133 // probability, and uses essentially minimum space, b bits per entry
134 // (because we set m = n) but this has terrible scaling, something
135 // like O(n^2) space and O(n^3) time during construction (Gaussian
136 // elimination) and O(n) query time. But computational efficiency is
137 // key, and the core of this is avoiding scanning all of S to answer
138 // each query.
139 //
140 // The traditional approach (MWHC, aka Xor filter) starts with setting
141 // only some small fixed number of columns (typically k=3) to 1 for each
142 // row of C, with remaining entries implicitly 0. This is implemented as
143 // three hash functions over [0,m), and S can be implemented as a
144 // vector of b-bit values. Now, a query only involves looking up k rows
145 // (values) in S and computing their bitwise XOR. Additionally, this
146 // construction can use a linear time algorithm called "peeling" for
147 // finding a solution in many cases of one existing, but peeling
148 // generally requires a larger space overhead factor in the solution
149 // (m/n) than is required with Gaussian elimination.
150 //
151 // Recommended reading:
152 // "Peeling Close to the Orientability Threshold – Spatial Coupling in
153 // Hashing-Based Data Structures" by Stefan Walzer
154 //
155 // ######################################################################
156 // ##################### Ribbon PHSF construction #######################
157 //
158 // Ribbon constructs coefficient rows essentially the same as in the
159 // Walzer/Dietzfelbinger paper cited above: for some chosen fixed width
160 // r (kCoeffBits in code), each key is hashed to a starting column in
161 // [0, m - r] (GetStart() in code) and an r-bit sequence of boolean
162 // coefficients (GetCoeffRow() in code). If you sort the rows by start,
163 // the C matrix would look something like this:
164 //
165 // [####00000000000000000000]
166 // [####00000000000000000000]
167 // [000####00000000000000000]
168 // [0000####0000000000000000]
169 // [0000000####0000000000000]
170 // [000000000####00000000000]
171 // [000000000####00000000000]
172 // [0000000000000####0000000]
173 // [0000000000000000####0000]
174 // [00000000000000000####000]
175 // [00000000000000000000####]
176 //
177 // where each # could be a 0 or 1, chosen uniformly by a hash function.
178 // (Except we typically set the start column value to 1.) This scheme
179 // uses hashing to approximate a band matrix, and it has a solution iff
180 // it reduces to an upper-triangular boolean r-band matrix, like this:
181 //
182 // [1###00000000000000000000]
183 // [01##00000000000000000000]
184 // [000000000000000000000000]
185 // [0001###00000000000000000]
186 // [000000000000000000000000]
187 // [000001##0000000000000000]
188 // [000000000000000000000000]
189 // [00000001###0000000000000]
190 // [000000001###000000000000]
191 // [0000000001##000000000000]
192 // ...
193 // [00000000000000000000001#]
194 // [000000000000000000000001]
195 //
196 // where we have expanded to an m x m matrix by filling with rows of
197 // all zeros as needed. As in Gaussian elimination, this form is ready for
198 // generating a solution through back-substitution.
199 //
200 // The awesome thing about the Ribbon construction (from the DW paper) is
201 // how row reductions keep each row representable as a start column and
202 // r coefficients, because row reductions are only needed when two rows
203 // have the same number of leading zero columns. Thus, the combination
204 // of those rows, the bitwise XOR of the r-bit coefficient rows, cancels
205 // out the leading 1s, so starts (at least) one column later and only
206 // needs (at most) r - 1 coefficients.
207 //
208 // ######################################################################
209 // ###################### Ribbon PHSF scalability #######################
210 //
211 // Although more practical detail is in ribbon_impl.h, it's worth
212 // understanding some of the overall benefits and limitations of the
213 // Ribbon PHSFs.
214 //
215 // High-end scalability is a primary issue for Ribbon PHSFs, because in
216 // a single Ribbon linear system with fixed r and fixed m/n ratio, the
217 // solution probability approaches zero as n approaches infinity.
218 // For a given n, solution probability improves with larger r and larger
219 // m/n.
220 //
221 // By contrast, peeling-based PHSFs have somewhat worse storage ratio
222 // or solution probability for small n (less than ~1000). This is
223 // especially true with spatial-coupling, where benefits are only
224 // notable for n on the order of 100k or 1m or more.
225 //
226 // To make best use of current hardware, r=128 seems to be closest to
227 // a "generally good" choice for Ribbon, at least in RocksDB where SST
228 // Bloom filters typically hold around 10-100k keys, and almost always
229 // less than 10m keys. r=128 ribbon has a high chance of encoding success
230 // (with first hash seed) when storage overhead is around 5% (m/n ~ 1.05)
231 // for roughly 10k - 10m keys in a single linear system. r=64 only scales
232 // up to about 10k keys with the same storage overhead. Construction and
233 // access times for r=128 are similar to r=64. r=128 tracks nearly
234 // twice as much data during construction, but in most cases we expect
235 // the scalability benefits of r=128 vs. r=64 to make it preferred.
236 //
237 // A natural approach to scaling Ribbon beyond ~10m keys is splitting
238 // (or "sharding") the inputs into multiple linear systems with their
239 // own hash seeds. This can also help to control peak memory consumption.
240 // TODO: much more to come
241 //
242 // ######################################################################
243 // #################### Ribbon on-the-fly banding #######################
244 //
245 // "Banding" is what we call the process of reducing the inputs to an
246 // upper-triangular r-band matrix ready for finishing a solution with
247 // back-substitution. Although the DW paper presents an algorithm for
248 // this ("SGauss"), the awesome properties of their construction enable
249 // an even simpler, faster, and more backtrackable algorithm. In simplest
250 // terms, the SGauss algorithm requires sorting the inputs by start
251 // columns, but it's possible to make Gaussian elimination resemble hash
252 // table insertion!
253 //
254 // The enhanced algorithm is based on these observations:
255 // - When processing a coefficient row with first 1 in column j,
256 // - If it's the first at column j to be processed, it can be part of
257 // the banding at row j. (And that decision never overwritten, with
258 // no loss of generality!)
259 // - Else, it can be combined with existing row j and re-processed,
260 // which will look for a later "empty" row or reach "no solution".
261 //
262 // We call our banding algorithm "incremental" and "on-the-fly" because
263 // (like hash table insertion) we are "finished" after each input
264 // processed, with respect to all inputs processed so far. Although the
265 // band matrix is an intermediate step to the solution structure, we have
266 // eliminated intermediate steps and unnecessary data tracking for
267 // banding.
268 //
269 // Building on "incremental" and "on-the-fly", the banding algorithm is
270 // easily backtrackable because no (non-empty) rows are overwritten in
271 // the banding. Thus, if we want to "try" adding an additional set of
272 // inputs to the banding, we only have to record which rows were written
273 // in order to efficiently backtrack to our state before considering
274 // the additional set. (TODO: how this can mitigate scalability and
275 // reach sub-1% overheads)
276 //
277 // Like in a linear-probed hash table, as the occupancy approaches and
278 // surpasses 90-95%, collision resolution dominates the construction
279 // time. (Ribbon doesn't usually pay at query time; see solution
280 // storage below.) This means that we can speed up construction time
281 // by using a higher m/n ratio, up to negative returns around 1.2.
282 // At m/n ~= 1.2, which still saves memory substantially vs. Bloom
283 // filter's 1.5, construction speed (including back-substitution) is not
284 // far from sorting speed, but still a few times slower than cache-local
285 // Bloom construction speed.
286 //
287 // Back-substitution from an upper-triangular boolean band matrix is
288 // especially fast and easy. All the memory accesses are sequential or at
289 // least local, no random. If the number of result bits (b) is a
290 // compile-time constant, the back-substitution state can even be tracked
291 // in CPU registers. Regardless of the solution representation, we prefer
292 // column-major representation for tracking back-substitution state, as
293 // r (the band width) will typically be much larger than b (result bits
294 // or columns), so better to handle r-bit values b times (per solution
295 // row) than b-bit values r times.
296 //
297 // ######################################################################
298 // ##################### Ribbon solution storage ########################
299 //
300 // Row-major layout is typical for boolean (bit) matrices, including for
301 // MWHC (Xor) filters where a query combines k b-bit values, and k is
302 // typically smaller than b. Even for k=4 and b=2, at least k=4 random
303 // look-ups are required regardless of layout.
304 //
305 // Ribbon PHSFs are quite different, however, because
306 // (a) all of the solution rows relevant to a query are within a single
307 // range of r rows, and
308 // (b) the number of solution rows involved (r/2 on average, or r if
309 // avoiding conditional accesses) is typically much greater than
310 // b, the number of solution columns.
311 //
312 // Row-major for Ribbon PHSFs therefore tends to incur undue CPU overhead
313 // by processing (up to) r entries of b bits each, where b is typically
314 // less than 10 for filter applications.
315 //
316 // Column-major layout has poor locality because of accessing up to b
317 // memory locations in different pages (and obviously cache lines). Note
318 // that negative filter queries do not typically need to access all
319 // solution columns, as they can return when a mismatch is found in any
320 // result/solution column. This optimization doesn't always pay off on
321 // recent hardware, where the penalty for unpredictable conditional
322 // branching can exceed the penalty for unnecessary work, but the
323 // optimization is essentially unavailable with row-major layout.
324 //
325 // The best compromise seems to be interleaving column-major on the small
326 // scale with row-major on the large scale. For example, let a solution
327 // "block" be r rows column-major encoded as b r-bit values in sequence.
328 // Each query accesses (up to) 2 adjacent blocks, which will typically
329 // span 1-3 cache lines in adjacent memory. We get very close to the same
330 // locality as row-major, but with much faster reconstruction of each
331 // result column, at least for filter applications where b is relatively
332 // small and negative queries can return early.
333 //
334 // ######################################################################
335 // ###################### Fractional result bits ########################
336 //
337 // Bloom filters have great flexibility that alternatives mostly do not
338 // have. One of those flexibilities is in utilizing any ratio of data
339 // structure bits per key. With a typical memory allocator like jemalloc,
340 // this flexibility can save roughly 10% of the filters' footprint in
341 // DRAM by rounding up and down filter sizes to minimize memory internal
342 // fragmentation (see optimize_filters_for_memory RocksDB option).
343 //
344 // At first glance, PHSFs only offer a whole number of bits per "slot"
345 // (m rather than number of keys n), but coefficient locality in the
346 // Ribbon construction makes fractional bits/key quite possible and
347 // attractive for filter applications. This works by a prefix of the
348 // structure using b-1 solution columns and the rest using b solution
349 // columns. See InterleavedSolutionStorage below for more detail.
350 //
351 // Because false positive rates are non-linear in bits/key, this approach
352 // is not quite optimal in terms of information theory. In common cases,
353 // we see additional space overhead up to about 1.5% vs. theoretical
354 // optimal to achieve the same FP rate. We consider this a quite acceptable
355 // overhead for very efficiently utilizing space that might otherwise be
356 // wasted.
357 //
358 // This property of Ribbon even makes it "elastic." A Ribbon filter and
359 // its small metadata for answering queries can be adapted into another
360 // Ribbon filter filling any smaller multiple of r bits (plus small
361 // metadata), with a correspondingly higher FP rate. None of the data
362 // thrown away during construction needs to be recalled for this reduction.
363 // Similarly a single Ribbon construction can be separated (by solution
364 // column) into two or more structures (or "layers" or "levels") with
365 // independent filtering ability (no FP correlation, just as solution or
366 // result columns in a single structure) despite being constructed as part
367 // of a single linear system. (TODO: implement)
368 // See also "ElasticBF: Fine-grained and Elastic Bloom Filter Towards
369 // Efficient Read for LSM-tree-based KV Stores."
370 //
371
372 // ######################################################################
373 // ################### CODE: Ribbon core algorithms #####################
374 // ######################################################################
375 //
376 // These algorithms are templatized for genericity but near-maximum
377 // performance in a given application. The template parameters
378 // adhere to informal class/struct type concepts outlined below. (This
379 // code is written for C++11 so does not use formal C++ concepts.)
380
381 // Rough architecture for these algorithms:
382 //
383 // +-----------+ +---+ +-----------------+
384 // | AddInputs | --> | H | --> | BandingStorage |
385 // +-----------+ | a | +-----------------+
386 // | s | |
387 // | h | Back substitution
388 // | e | V
389 // +-----------+ | r | +-----------------+
390 // | Query Key | --> | | >+< | SolutionStorage |
391 // +-----------+ +---+ | +-----------------+
392 // V
393 // Query result
394
395 // Common to other concepts
396 // concept RibbonTypes {
397 // // An unsigned integer type for an r-bit subsequence of coefficients.
398 // // r (or kCoeffBits) is taken to be sizeof(CoeffRow) * 8, as it would
399 // // generally only hurt scalability to leave bits of CoeffRow unused.
400 // typename CoeffRow;
401 // // An unsigned integer type big enough to hold a result row (b bits,
402 // // or number of solution/result columns).
403 // // In many applications, especially filters, the number of result
404 // // columns is decided at run time, so ResultRow simply needs to be
405 // // big enough for the largest number of columns allowed.
406 // typename ResultRow;
407 // // An unsigned integer type sufficient for representing the number of
408 // // rows in the solution structure, and at least the arithmetic
409 // // promotion size (usually 32 bits). uint32_t recommended because a
410 // // single Ribbon construction doesn't really scale to billions of
411 // // entries.
412 // typename Index;
413 // };
414
415 // ######################################################################
416 // ######################## Hashers and Banding #########################
417
418 // Hasher concepts abstract out hashing details.
419
420 // concept PhsfQueryHasher extends RibbonTypes {
421 // // Type for a lookup key, which is hashable.
422 // typename Key;
423 //
424 // // Type for hashed summary of a Key. uint64_t is recommended.
425 // typename Hash;
426 //
427 // // Compute a hash value summarizing a Key
428 // Hash GetHash(const Key &) const;
429 //
430 // // Given a hash value and a number of columns that can start an
431 // // r-sequence of coefficients (== m - r + 1), return the start
432 // // column to associate with that hash value. (Starts can be chosen
433 // // uniformly or "smash" extra entries into the beginning and end for
434 // // better utilization at those extremes of the structure. Details in
435 // // ribbon_impl.h)
436 // Index GetStart(Hash, Index num_starts) const;
437 //
438 // // Given a hash value, return the r-bit sequence of coefficients to
439 // // associate with it. It's generally OK if
440 // // sizeof(CoeffRow) > sizeof(Hash)
441 // // as long as the hash itself is not too prone to collisions for the
442 // // applications and the CoeffRow is generated uniformly from
443 // // available hash data, but relatively independent of the start.
444 // //
445 // // Must be non-zero, because that's required for a solution to exist
446 // // when mapping to non-zero result row. (Note: BandingAdd could be
447 // // modified to allow 0 coeff row if that only occurs with 0 result
448 // // row, which really only makes sense for filter implementation,
449 // // where both values are hash-derived. Or BandingAdd could reject 0
450 // // coeff row, forcing next seed, but that has potential problems with
451 // // generality/scalability.)
452 // CoeffRow GetCoeffRow(Hash) const;
453 // };
454
455 // concept FilterQueryHasher extends PhsfQueryHasher {
456 // // For building or querying a filter, this returns the expected
457 // // result row associated with a hashed input. For general PHSF,
458 // // this must return 0.
459 // //
460 // // Although not strictly required, there's a slightly better chance of
461 // // solver success if result row is masked down here to only the bits
462 // // actually needed.
463 // ResultRow GetResultRowFromHash(Hash) const;
464 // }
465
466 // concept BandingHasher extends FilterQueryHasher {
467 // // For a filter, this will generally be the same as Key.
468 // // For a general PHSF, it must either
469 // // (a) include a key and a result it maps to (e.g. in a std::pair), or
470 // // (b) GetResultRowFromInput looks up the result somewhere rather than
471 // // extracting it.
472 // typename AddInput;
473 //
474 // // Instead of requiring a way to extract a Key from an
475 // // AddInput, we require getting the hash of the Key part
476 // // of an AddInput, which is trivial if AddInput == Key.
477 // Hash GetHash(const AddInput &) const;
478 //
479 // // For building a non-filter PHSF, this extracts or looks up the result
480 // // row to associate with an input. For filter PHSF, this must return 0.
481 // ResultRow GetResultRowFromInput(const AddInput &) const;
482 //
483 // // Whether the solver can assume the lowest bit of GetCoeffRow is
484 // // always 1. When true, it should improve solver efficiency slightly.
485 // static bool kFirstCoeffAlwaysOne;
486 // }
487
488 // Abstract storage for the result of "banding" the inputs (Gaussian
489 // elimination to an upper-triangular boolean band matrix). Because the
490 // banding is an incremental / on-the-fly algorithm, this also represents
491 // all the intermediate state between input entries.
492 //
493 // concept BandingStorage extends RibbonTypes {
494 // // Tells the banding algorithm to prefetch memory associated with
495 // // the next input before processing the current input. Generally
496 // // recommended iff the BandingStorage doesn't easily fit in CPU
497 // // cache.
498 // bool UsePrefetch() const;
499 //
500 // // Prefetches (e.g. __builtin_prefetch) memory associated with a
501 // // slot index i.
502 // void Prefetch(Index i) const;
503 //
504 // // Returns a pointer to CoeffRow for slot index i.
505 // CoeffRow* CoeffRowPtr(Index i);
506 //
507 // // Returns a pointer to ResultRow for slot index i. (Gaussian row
508 // // operations involve both side of the equation.)
509 // ResultRow* ResultRowPtr(Index i);
510 //
511 // // Returns the number of columns that can start an r-sequence of
512 // // coefficients, which is the number of slots minus r (kCoeffBits)
513 // // plus one. (m - r + 1)
514 // Index GetNumStarts() const;
515 // };
516
517 // Optional storage for backtracking data in banding a set of input
518 // entries. It exposes an array structure which will generally be
519 // used as a stack. It must be able to accommodate as many entries
520 // as are passed in as inputs to `BandingAddRange`.
521 //
522 // concept BacktrackStorage extends RibbonTypes {
523 // // If false, backtracking support will be disabled in the algorithm.
524 // // This should preferably be an inline compile-time constant function.
525 // bool UseBacktrack() const;
526 //
527 // // Records `to_save` as the `i`th backtrack entry
528 // void BacktrackPut(Index i, Index to_save);
529 //
530 // // Recalls the `i`th backtrack entry
531 // Index BacktrackGet(Index i) const;
532 // }
533
// Attempts to add a single entry to BandingStorage (and optionally, to
// BacktrackStorage), via incremental GF(2) Gaussian elimination. Returns
// true on success, or false when no solution is possible with the current
// hasher (and presumably its seed) and number of "slots" (solution or
// banding rows) — i.e. when a linear dependence among the inputs fails to
// "cancel out".
//
// Pre- and post-condition: the BandingStorage represents a band matrix
// ready for back substitution (row echelon form except for zero rows),
// augmented with result values such that back substitution would give a
// solution satisfying all the cr@start -> rr entries added.
template <bool kFirstCoeffAlwaysOne, typename BandingStorage,
          typename BacktrackStorage>
bool BandingAdd(BandingStorage *bs, typename BandingStorage::Index start,
                typename BandingStorage::ResultRow rr,
                typename BandingStorage::CoeffRow cr, BacktrackStorage *bts,
                typename BandingStorage::Index *backtrack_pos) {
  using CoeffRow = typename BandingStorage::CoeffRow;
  using Index = typename BandingStorage::Index;

  Index row = start;

  if (!kFirstCoeffAlwaysOne) {
    // Normalize so that the lowest coefficient bit is 1.
    // (Requires/asserts that cr != 0.)
    int lead = CountTrailingZeroBits(cr);
    row += static_cast<Index>(lead);
    cr >>= lead;
  }

  while (true) {
    assert((cr & 1) == 1);
    CoeffRow existing = *(bs->CoeffRowPtr(row));
    if (existing == 0) {
      // Empty row: claim it for this (possibly reduced) equation. Because
      // non-empty rows are never overwritten, recording the row index is
      // all the state needed for backtracking.
      *(bs->CoeffRowPtr(row)) = cr;
      *(bs->ResultRowPtr(row)) = rr;
      bts->BacktrackPut(*backtrack_pos, row);
      ++*backtrack_pos;
      return true;
    }
    assert((existing & 1) == 1);
    // Row taken: Gaussian row reduction. XOR cancels the leading 1 (and
    // the corresponding result bits), shortening the incoming equation.
    cr ^= existing;
    rr ^= *(bs->ResultRowPtr(row));
    if (cr == 0) {
      // All coefficients canceled: inconsistency or (less likely) redundancy.
      break;
    }
    // Advance to the next non-zero coefficient and re-normalize.
    int lead = CountTrailingZeroBits(cr);
    row += static_cast<Index>(lead);
    cr >>= lead;
  }

  // The coefficients canceled to zero; the add still succeeds iff the
  // result side also canceled to zero — e.g. a duplicate input or a stock
  // hash collision with the same result row. (For filter, stock hash
  // collision implies same result row.) A full equation equal to a sum of
  // other equations is also very possible with a small range of values
  // for result row.
  return rr == 0;
}
593
594 // Adds a range of entries to BandingStorage returning true if successful
595 // or false if solution is impossible with current hasher (and presumably
596 // its seed) and number of "slots" (solution or banding rows). (A solution
597 // is impossible when there is a linear dependence among the inputs that
598 // doesn't "cancel out".) Here "InputIterator" is an iterator over AddInputs.
599 //
600 // If UseBacktrack in the BacktrackStorage, this function call rolls back
601 // to prior state on failure. If !UseBacktrack, some subset of the entries
602 // will have been added to the BandingStorage, so best considered to be in
603 // an indeterminate state.
604 //
template <typename BandingStorage, typename BacktrackStorage,
          typename BandingHasher, typename InputIterator>
bool BandingAddRange(BandingStorage *bs, BacktrackStorage *bts,
                     const BandingHasher &bh, InputIterator begin,
                     InputIterator end) {
  using CoeffRow = typename BandingStorage::CoeffRow;
  using Index = typename BandingStorage::Index;
  using ResultRow = typename BandingStorage::ResultRow;
  using Hash = typename BandingHasher::Hash;

  static_assert(IsUnsignedUpTo128<CoeffRow>::value, "must be unsigned");
  static_assert(IsUnsignedUpTo128<Index>::value, "must be unsigned");
  static_assert(IsUnsignedUpTo128<ResultRow>::value, "must be unsigned");

  // Whether the hasher guarantees the first coefficient bit is always one
  // (compile-time knob forwarded to BandingAdd's elimination loop).
  constexpr bool kFCA1 = BandingHasher::kFirstCoeffAlwaysOne;

  if (begin == end) {
    // trivial
    return true;
  }

  const Index num_starts = bs->GetNumStarts();

  // cur: next input to add; backtrack_pos: count of entries recorded in
  // *bts so far, so failure can roll back exactly what was added.
  InputIterator cur = begin;
  Index backtrack_pos = 0;
  if (!bs->UsePrefetch()) {
    // Simple version, no prefetch
    for (;;) {
      // Derive the full equation (start slot, result row, coefficient row)
      // for this input from its hash (and any input-embedded result bits).
      Hash h = bh.GetHash(*cur);
      Index start = bh.GetStart(h, num_starts);
      ResultRow rr =
          bh.GetResultRowFromInput(*cur) | bh.GetResultRowFromHash(h);
      CoeffRow cr = bh.GetCoeffRow(h);

      if (!BandingAdd<kFCA1>(bs, start, rr, cr, bts, &backtrack_pos)) {
        break;
      }
      if ((++cur) == end) {
        return true;
      }
    }
  } else {
    // Pipelined w/prefetch
    // Prime the pipeline
    Hash h = bh.GetHash(*cur);
    Index start = bh.GetStart(h, num_starts);
    ResultRow rr = bh.GetResultRowFromInput(*cur);
    bs->Prefetch(start);

    // Pipeline: while adding the current entry, the next entry's start
    // slot is computed and its banding memory prefetched, hiding memory
    // latency behind the hash/add work.
    for (;;) {
      rr |= bh.GetResultRowFromHash(h);
      CoeffRow cr = bh.GetCoeffRow(h);
      if ((++cur) == end) {
        // Last entry: nothing left to prefetch; add and finish.
        if (!BandingAdd<kFCA1>(bs, start, rr, cr, bts, &backtrack_pos)) {
          break;
        }
        return true;
      }
      // Look ahead: hash the next entry and prefetch its slots before
      // performing the (memory-bound) add of the current entry.
      Hash next_h = bh.GetHash(*cur);
      Index next_start = bh.GetStart(next_h, num_starts);
      ResultRow next_rr = bh.GetResultRowFromInput(*cur);
      bs->Prefetch(next_start);
      if (!BandingAdd<kFCA1>(bs, start, rr, cr, bts, &backtrack_pos)) {
        break;
      }
      // Shift pipeline state: next entry becomes current.
      h = next_h;
      start = next_start;
      rr = next_rr;
    }
  }
  // failed; backtrack (if implemented)
  if (bts->UseBacktrack()) {
    // Undo every successful add recorded in *bts, in reverse order.
    while (backtrack_pos > 0) {
      --backtrack_pos;
      Index i = bts->BacktrackGet(backtrack_pos);
      *(bs->CoeffRowPtr(i)) = 0;
      // Not strictly required, but is required for good FP rate on
      // inputs that might have been backtracked out. (We don't want
      // anything we've backtracked on to leak into final result, as
      // that might not be "harmless".)
      *(bs->ResultRowPtr(i)) = 0;
    }
  }
  return false;
}
691
692 // Adds a range of entries to BandingStorage returning true if successful
693 // or false if solution is impossible with current hasher (and presumably
694 // its seed) and number of "slots" (solution or banding rows). (A solution
695 // is impossible when there is a linear dependence among the inputs that
696 // doesn't "cancel out".) Here "InputIterator" is an iterator over AddInputs.
697 //
698 // On failure, some subset of the entries will have been added to the
699 // BandingStorage, so best considered to be in an indeterminate state.
700 //
template <typename BandingStorage, typename BandingHasher,
          typename InputIterator>
bool BandingAddRange(BandingStorage *bs, const BandingHasher &bh,
                     InputIterator begin, InputIterator end) {
  using Index = typename BandingStorage::Index;
  // A do-nothing BacktrackStorage. Because UseBacktrack() reports false,
  // the banding code never records entries, and BacktrackGet must be
  // unreachable.
  struct NoopBacktrackStorage {
    bool UseBacktrack() { return false; }
    void BacktrackPut(Index, Index) {}
    Index BacktrackGet(Index) {
      assert(false);
      return 0;
    }
  };
  NoopBacktrackStorage no_backtrack;
  // Delegate to the backtracking-capable overload with backtracking off.
  return BandingAddRange(bs, &no_backtrack, bh, begin, end);
}
716
717 // ######################################################################
718 // ######################### Solution Storage ###########################
719
720 // Back-substitution and query algorithms unfortunately depend on some
721 // details of data layout in the final data structure ("solution"). Thus,
722 // there is no common SolutionStorage covering all the reasonable
723 // possibilities.
724
725 // ###################### SimpleSolutionStorage #########################
726
727 // SimpleSolutionStorage is for a row-major storage, typically with no
728 // unused bits in each ResultRow. This is mostly for demonstration
729 // purposes as the simplest solution storage scheme. It is relatively slow
730 // for filter queries.
731
732 // concept SimpleSolutionStorage extends RibbonTypes {
733 // // This is called at the beginning of back-substitution for the
734 // // solution storage to do any remaining configuration before data
735 // // is stored to it. If configuration is previously finalized, this
736 // // could be a simple assertion or even no-op. Ribbon algorithms
737 // // only call this from back-substitution, and only once per call,
738 // // before other functions here.
739 // void PrepareForNumStarts(Index num_starts) const;
740 // // Must return num_starts passed to PrepareForNumStarts, or the most
741 // // recent call to PrepareForNumStarts if this storage object can be
742 // // reused. Note that num_starts == num_slots - kCoeffBits + 1 because
743 // // there must be a run of kCoeffBits slots starting from each start.
744 // Index GetNumStarts() const;
745 // // Load the solution row (type ResultRow) for a slot
746 // ResultRow Load(Index slot_num) const;
747 // // Store the solution row (type ResultRow) for a slot
748 // void Store(Index slot_num, ResultRow data);
749 // };
750
751 // Back-substitution for generating a solution from BandingStorage to
752 // SimpleSolutionStorage.
template <typename SimpleSolutionStorage, typename BandingStorage>
void SimpleBackSubst(SimpleSolutionStorage *sss, const BandingStorage &bs) {
  using CoeffRow = typename BandingStorage::CoeffRow;
  using Index = typename BandingStorage::Index;
  using ResultRow = typename BandingStorage::ResultRow;

  static_assert(sizeof(Index) == sizeof(typename SimpleSolutionStorage::Index),
                "must be same");
  static_assert(
      sizeof(CoeffRow) == sizeof(typename SimpleSolutionStorage::CoeffRow),
      "must be same");
  static_assert(
      sizeof(ResultRow) == sizeof(typename SimpleSolutionStorage::ResultRow),
      "must be same");

  constexpr auto kCoeffBits = static_cast<Index>(sizeof(CoeffRow) * 8U);
  constexpr auto kResultBits = static_cast<Index>(sizeof(ResultRow) * 8U);

  // Column-major rolling window of the solution matrix: state[col] holds,
  // in its low bits, the solution bits for that column from the (up to)
  // kCoeffBits most recently computed rows -- enough to back-substitute
  // the next row using the banding data.
  std::array<CoeffRow, kResultBits> state;
  state.fill(0);

  const Index num_starts = bs.GetNumStarts();
  sss->PrepareForNumStarts(num_starts);
  const Index num_slots = num_starts + kCoeffBits - 1;

  // Back-substitution runs from the last row to the first.
  for (Index slot = num_slots; slot > 0;) {
    --slot;
    CoeffRow cr = *const_cast<BandingStorage &>(bs).CoeffRowPtr(slot);
    ResultRow rr = *const_cast<BandingStorage &>(bs).ResultRowPtr(slot);
    // Accumulates this row's solution bits across all columns.
    ResultRow solution_row = 0;
    for (Index col = 0; col < kResultBits; ++col) {
      // Make room at the bottom of the window for this row's bit.
      CoeffRow window = state[col] << 1;
      // For a valid solution, the dot product of the solution column with
      // the coefficient row must equal the result bit at this column:
      //   BitParity(window & cr) == ((rr >> col) & 1)
      // Solve for the new bottom bit accordingly.
      bool bit = (BitParity(window & cr) ^ ((rr >> col) & 1)) != 0;
      if (bit) {
        window |= CoeffRow{1};
      }
      // Update the rolling window and record the bit in the output row.
      state[col] = window;
      solution_row |= (bit ? ResultRow{1} : ResultRow{0}) << col;
    }
    sss->Store(slot, solution_row);
  }
}
807
808 // Common functionality for querying a key (already hashed) in
809 // SimpleSolutionStorage.
template <typename SimpleSolutionStorage>
typename SimpleSolutionStorage::ResultRow SimpleQueryHelper(
    typename SimpleSolutionStorage::Index start_slot,
    typename SimpleSolutionStorage::CoeffRow cr,
    const SimpleSolutionStorage &sss) {
  using CoeffRow = typename SimpleSolutionStorage::CoeffRow;
  using ResultRow = typename SimpleSolutionStorage::ResultRow;

  constexpr unsigned kCoeffBits = static_cast<unsigned>(sizeof(CoeffRow) * 8U);

  // XOR together the solution rows selected by the set coefficient bits.
  ResultRow xor_sum = 0;
  for (unsigned bit = 0; bit < kCoeffBits; ++bit) {
    // All-ones mask when the coefficient bit is set, all-zeros otherwise.
    // Branch-free masking of the whole value is generally faster here
    // than an 'if'.
    const ResultRow mask = static_cast<ResultRow>(
        ResultRow{0} - (static_cast<ResultRow>(cr >> bit) & ResultRow{1}));
    xor_sum ^= static_cast<ResultRow>(sss.Load(start_slot + bit) & mask);
  }
  return xor_sum;
}
828
829 // General PHSF query a key from SimpleSolutionStorage.
template <typename SimpleSolutionStorage, typename PhsfQueryHasher>
typename SimpleSolutionStorage::ResultRow SimplePhsfQuery(
    const typename PhsfQueryHasher::Key &key, const PhsfQueryHasher &hasher,
    const SimpleSolutionStorage &sss) {
  static_assert(sizeof(typename SimpleSolutionStorage::Index) ==
                    sizeof(typename PhsfQueryHasher::Index),
                "must be same");
  static_assert(sizeof(typename SimpleSolutionStorage::CoeffRow) ==
                    sizeof(typename PhsfQueryHasher::CoeffRow),
                "must be same");

  // Hash the key once, then derive both the starting slot and the
  // coefficient row from that hash for the generic query helper.
  const typename PhsfQueryHasher::Hash h = hasher.GetHash(key);
  const auto start = hasher.GetStart(h, sss.GetNumStarts());
  const auto coeffs = hasher.GetCoeffRow(h);
  return SimpleQueryHelper(start, coeffs, sss);
}
846
847 // Filter query a key from SimpleSolutionStorage.
template <typename SimpleSolutionStorage, typename FilterQueryHasher>
bool SimpleFilterQuery(const typename FilterQueryHasher::Key &key,
                       const FilterQueryHasher &hasher,
                       const SimpleSolutionStorage &sss) {
  static_assert(sizeof(typename SimpleSolutionStorage::Index) ==
                    sizeof(typename FilterQueryHasher::Index),
                "must be same");
  static_assert(sizeof(typename SimpleSolutionStorage::CoeffRow) ==
                    sizeof(typename FilterQueryHasher::CoeffRow),
                "must be same");
  static_assert(sizeof(typename SimpleSolutionStorage::ResultRow) ==
                    sizeof(typename FilterQueryHasher::ResultRow),
                "must be same");

  const typename FilterQueryHasher::Hash h = hasher.GetHash(key);
  // The key is (probabilistically) a match iff the result row
  // reconstructed from the solution equals the result row implied by
  // the hash.
  const typename SimpleSolutionStorage::ResultRow expected =
      hasher.GetResultRowFromHash(h);
  const typename SimpleSolutionStorage::ResultRow actual = SimpleQueryHelper(
      hasher.GetStart(h, sss.GetNumStarts()), hasher.GetCoeffRow(h), sss);
  return actual == expected;
}
870
871 // #################### InterleavedSolutionStorage ######################
872
873 // InterleavedSolutionStorage is row-major at a high level, for good
874 // locality, and column-major at a low level, for CPU efficiency
875 // especially in filter queries or relatively small number of result bits
876 // (== solution columns). The storage is a sequence of "blocks" where a
877 // block has one CoeffRow-sized segment for each solution column. Each
878 // query spans at most two blocks; the starting solution row is typically
879 // in the row-logical middle of a block and spans to the middle of the
880 // next block. (See diagram below.)
881 //
882 // InterleavedSolutionStorage supports choosing b (number of result or
883 // solution columns) at run time, and even supports mixing b and b-1 solution
884 // columns in a single linear system solution, for filters that can
885 // effectively utilize any size space (multiple of CoeffRow) for minimizing
886 // FP rate for any number of added keys. To simplify query implementation
887 // (with lower-index columns first), the b-bit portion comes after the b-1
888 // portion of the structure.
889 //
890 // Diagram (=== marks logical block boundary; b=4; ### is data used by a
891 // query crossing the b-1 to b boundary, each Segment has type CoeffRow):
892 // ...
893 // +======================+
894 // | S e g m e n t col=0 |
895 // +----------------------+
896 // | S e g m e n t col=1 |
897 // +----------------------+
898 // | S e g m e n t col=2 |
899 // +======================+
900 // | S e g m e n #########|
901 // +----------------------+
902 // | S e g m e n #########|
903 // +----------------------+
904 // | S e g m e n #########|
905 // +======================+ Result/solution columns: above = 3, below = 4
906 // |#############t col=0 |
907 // +----------------------+
908 // |#############t col=1 |
909 // +----------------------+
910 // |#############t col=2 |
911 // +----------------------+
912 // | S e g m e n t col=3 |
913 // +======================+
914 // | S e g m e n t col=0 |
915 // +----------------------+
916 // | S e g m e n t col=1 |
917 // +----------------------+
918 // | S e g m e n t col=2 |
919 // +----------------------+
920 // | S e g m e n t col=3 |
921 // +======================+
922 // ...
923 //
924 // InterleavedSolutionStorage will be adapted by the algorithms from
925 // simple array-like segment storage. That array-like storage is templatized
926 // in part so that an implementation may choose to handle byte ordering
927 // at access time.
928 //
929 // concept InterleavedSolutionStorage extends RibbonTypes {
930 // // This is called at the beginning of back-substitution for the
931 // // solution storage to do any remaining configuration before data
932 // // is stored to it. If configuration is previously finalized, this
933 // // could be a simple assertion or even no-op. Ribbon algorithms
934 // // only call this from back-substitution, and only once per call,
935 // // before other functions here.
936 // void PrepareForNumStarts(Index num_starts) const;
937 // // Must return num_starts passed to PrepareForNumStarts, or the most
938 // // recent call to PrepareForNumStarts if this storage object can be
939 // // reused. Note that num_starts == num_slots - kCoeffBits + 1 because
940 // // there must be a run of kCoeffBits slots starting from each start.
941 // Index GetNumStarts() const;
942 // // The larger number of solution columns used (called "b" above).
943 // Index GetUpperNumColumns() const;
944 // // If returns > 0, then block numbers below that use
945 // // GetUpperNumColumns() - 1 columns per solution row, and the rest
946 // // use GetUpperNumColumns(). A block represents kCoeffBits "slots",
947 // // where all but the last kCoeffBits - 1 slots are also starts. And
948 // // a block contains a segment for each solution column.
949 // // An implementation may only support uniform columns per solution
950 // // row and return constant 0 here.
951 // Index GetUpperStartBlock() const;
952 //
953 // // ### "Array of segments" portion of API ###
954 // // The number of values of type CoeffRow used in this solution
955 // // representation. (This value can be inferred from the previous
956 // // three functions, but is expected at least for sanity / assertion
957 // // checking.)
958 // Index GetNumSegments() const;
959 // // Load an entry from the logical array of segments
960 // CoeffRow LoadSegment(Index segment_num) const;
961 // // Store an entry to the logical array of segments
962 // void StoreSegment(Index segment_num, CoeffRow data);
963 // };
964
965 // A helper for InterleavedBackSubst.
template <typename BandingStorage>
inline void BackSubstBlock(typename BandingStorage::CoeffRow *state,
                           typename BandingStorage::Index num_columns,
                           const BandingStorage &bs,
                           typename BandingStorage::Index start_slot) {
  using CoeffRow = typename BandingStorage::CoeffRow;
  using Index = typename BandingStorage::Index;
  using ResultRow = typename BandingStorage::ResultRow;

  constexpr auto kCoeffBits = static_cast<Index>(sizeof(CoeffRow) * 8U);

  // Process this block's kCoeffBits rows from last to first
  // (back-substitution order), updating the caller's per-column
  // rolling-window state in place.
  for (Index n = kCoeffBits; n > 0; --n) {
    const Index row = start_slot + n - 1;
    CoeffRow cr = *const_cast<BandingStorage &>(bs).CoeffRowPtr(row);
    ResultRow rr = *const_cast<BandingStorage &>(bs).ResultRowPtr(row);
    for (Index col = 0; col < num_columns; ++col) {
      // Make room at the bottom of the rolling window for this row's bit.
      CoeffRow window = state[col] << 1;
      // For a valid solution, the dot product of the solution column with
      // the coefficient row must equal the result bit at this column:
      //   BitParity(window & cr) == ((rr >> col) & 1)
      // Solve for the new bottom bit accordingly.
      int bit = BitParity(window & cr) ^ ((rr >> col) & 1);
      state[col] = window | static_cast<CoeffRow>(bit);
    }
  }
}
998
999 // Back-substitution for generating a solution from BandingStorage to
1000 // InterleavedSolutionStorage.
template <typename InterleavedSolutionStorage, typename BandingStorage>
void InterleavedBackSubst(InterleavedSolutionStorage *iss,
                          const BandingStorage &bs) {
  using CoeffRow = typename BandingStorage::CoeffRow;
  using Index = typename BandingStorage::Index;

  static_assert(
      sizeof(Index) == sizeof(typename InterleavedSolutionStorage::Index),
      "must be same");
  static_assert(
      sizeof(CoeffRow) == sizeof(typename InterleavedSolutionStorage::CoeffRow),
      "must be same");

  constexpr auto kCoeffBits = static_cast<Index>(sizeof(CoeffRow) * 8U);

  const Index num_starts = bs.GetNumStarts();
  // Although it might be nice to have a filter that returns "always false"
  // when no key is added, we aren't specifically supporting that here
  // because it would require another condition branch in the query.
  assert(num_starts > 0);
  iss->PrepareForNumStarts(num_starts);

  // Interleaved storage requires a whole number of kCoeffBits-slot blocks.
  const Index num_slots = num_starts + kCoeffBits - 1;
  assert(num_slots % kCoeffBits == 0);
  const Index num_blocks = num_slots / kCoeffBits;
  const Index num_segments = iss->GetNumSegments();

  // For now upper, then lower
  // (Blocks at index >= upper_start_block use num_columns segments each;
  // blocks before it use num_columns - 1.)
  Index num_columns = iss->GetUpperNumColumns();
  const Index upper_start_block = iss->GetUpperStartBlock();

  if (num_columns == 0) {
    // Nothing to do, presumably because there's not enough space for even
    // a single segment.
    assert(num_segments == 0);
    // When num_columns == 0, a Ribbon filter query will always return true,
    // or a PHSF query always 0.
    return;
  }

  // We should be utilizing all available segments
  assert(num_segments == (upper_start_block * (num_columns - 1)) +
                             ((num_blocks - upper_start_block) * num_columns));

  // TODO: consider fixed-column specializations with stack-allocated state

  // A column-major buffer of the solution matrix, containing enough
  // recently-computed solution data to compute the next solution row
  // (based also on banding data).
  // Value-initialized (zeroed) by the () in new CoeffRow[...]().
  std::unique_ptr<CoeffRow[]> state{new CoeffRow[num_columns]()};

  // Walk blocks from last to first (back-substitution order), writing each
  // block's finished column segments at decreasing segment indices.
  Index block = num_blocks;
  Index segment = num_segments;
  while (block > upper_start_block) {
    --block;
    BackSubstBlock(state.get(), num_columns, bs, block * kCoeffBits);
    segment -= num_columns;
    for (Index i = 0; i < num_columns; ++i) {
      iss->StoreSegment(segment + i, state[i]);
    }
  }
  // Now (if applicable), region using lower number of columns
  // (This should be optimized away if GetUpperStartBlock() returns
  // constant 0.)
  --num_columns;
  while (block > 0) {
    --block;
    BackSubstBlock(state.get(), num_columns, bs, block * kCoeffBits);
    segment -= num_columns;
    for (Index i = 0; i < num_columns; ++i) {
      iss->StoreSegment(segment + i, state[i]);
    }
  }
  // Verify everything processed
  assert(block == 0);
  assert(segment == 0);
}
1078
1079 // General PHSF query a key from InterleavedSolutionStorage.
template <typename InterleavedSolutionStorage, typename PhsfQueryHasher>
typename InterleavedSolutionStorage::ResultRow InterleavedPhsfQuery(
    const typename PhsfQueryHasher::Key &key, const PhsfQueryHasher &hasher,
    const InterleavedSolutionStorage &iss) {
  using Hash = typename PhsfQueryHasher::Hash;

  using CoeffRow = typename InterleavedSolutionStorage::CoeffRow;
  using Index = typename InterleavedSolutionStorage::Index;
  using ResultRow = typename InterleavedSolutionStorage::ResultRow;

  static_assert(sizeof(Index) == sizeof(typename PhsfQueryHasher::Index),
                "must be same");
  static_assert(sizeof(CoeffRow) == sizeof(typename PhsfQueryHasher::CoeffRow),
                "must be same");

  constexpr auto kCoeffBits = static_cast<Index>(sizeof(CoeffRow) * 8U);

  const Hash hash = hasher.GetHash(key);
  const Index start_slot = hasher.GetStart(hash, iss.GetNumStarts());

  // Locate the block containing the starting slot and the segment index
  // of that block's first column. Blocks before upper_start_block each
  // contribute one fewer segment, hence the std::min subtraction.
  const Index upper_start_block = iss.GetUpperStartBlock();
  Index num_columns = iss.GetUpperNumColumns();
  const Index block_num = start_slot / kCoeffBits;
  Index segment =
      block_num * num_columns - std::min(block_num, upper_start_block);
  // Use the lower column count if in the lower region.
  // (Branch-free; should not compile to a conditional branch.)
  num_columns -= static_cast<Index>(block_num < upper_start_block);

  const CoeffRow cr = hasher.GetCoeffRow(hash);
  const Index start_bit = start_slot % kCoeffBits;

  // The query's kCoeffBits coefficient bits overlap this block's segments
  // (left part) and, if misaligned, the next block's segments (right part).
  ResultRow result = 0;
  const CoeffRow cr_left = cr << start_bit;
  for (Index col = 0; col < num_columns; ++col) {
    result ^= BitParity(iss.LoadSegment(segment + col) & cr_left) << col;
  }

  if (start_bit > 0) {
    segment += num_columns;
    const CoeffRow cr_right = cr >> (kCoeffBits - start_bit);
    for (Index col = 0; col < num_columns; ++col) {
      result ^= BitParity(iss.LoadSegment(segment + col) & cr_right) << col;
    }
  }

  return result;
}
1128
// Filter query a key from InterleavedSolutionStorage.
template <typename InterleavedSolutionStorage, typename FilterQueryHasher>
bool InterleavedFilterQuery(const typename FilterQueryHasher::Key &key,
                            const FilterQueryHasher &hasher,
                            const InterleavedSolutionStorage &iss) {
  using Hash = typename FilterQueryHasher::Hash;

  using CoeffRow = typename InterleavedSolutionStorage::CoeffRow;
  using Index = typename InterleavedSolutionStorage::Index;
  using ResultRow = typename InterleavedSolutionStorage::ResultRow;

  static_assert(sizeof(Index) == sizeof(typename FilterQueryHasher::Index),
                "must be same");
  static_assert(
      sizeof(CoeffRow) == sizeof(typename FilterQueryHasher::CoeffRow),
      "must be same");
  static_assert(
      sizeof(ResultRow) == sizeof(typename FilterQueryHasher::ResultRow),
      "must be same");

  constexpr auto kCoeffBits = static_cast<Index>(sizeof(CoeffRow) * 8U);

  // Start/segment location logic matches InterleavedPhsfQuery: find the
  // block containing the starting slot and the segment of its first column.
  const Hash hash = hasher.GetHash(key);
  const Index start_slot = hasher.GetStart(hash, iss.GetNumStarts());

  const Index upper_start_block = iss.GetUpperStartBlock();
  Index num_columns = iss.GetUpperNumColumns();
  const Index block_num = start_slot / kCoeffBits;
  Index segment =
      block_num * num_columns - std::min(block_num, upper_start_block);
  // Branch-free switch to the lower column count when applicable.
  num_columns -= static_cast<Index>(block_num < upper_start_block);

  const CoeffRow cr = hasher.GetCoeffRow(hash);
  const Index start_bit = start_slot % kCoeffBits;

  // Filter returns true only if every reconstructed result bit matches
  // the bit expected from the hash; bail out on the first mismatch.
  const ResultRow expected = hasher.GetResultRowFromHash(hash);

  // TODO: consider optimizations such as
  // * mask fetched values and shift cr, rather than shifting fetched values
  // * get rid of start_bit == 0 condition with careful fetching & shifting
  if (start_bit == 0) {
    // Aligned: each solution column is exactly one segment.
    for (Index col = 0; col < num_columns; ++col) {
      const int expected_bit = static_cast<int>(expected >> col) & 1;
      if (BitParity(iss.LoadSegment(segment + col) & cr) != expected_bit) {
        return false;
      }
    }
  } else {
    // Misaligned: stitch each solution column from two adjacent blocks.
    for (Index col = 0; col < num_columns; ++col) {
      CoeffRow soln_col =
          (iss.LoadSegment(segment + col) >> static_cast<unsigned>(start_bit)) |
          (iss.LoadSegment(segment + num_columns + col)
           << static_cast<unsigned>(kCoeffBits - start_bit));
      const int expected_bit = static_cast<int>(expected >> col) & 1;
      if (BitParity(soln_col & cr) != expected_bit) {
        return false;
      }
    }
  }
  // otherwise, all match
  return true;
}
1194
1195 // TODO: refactor Interleaved*Query so that queries can be "prepared" by
1196 // prefetching memory, to hide memory latency for multiple queries in a
1197 // single thread.
1198
1199 } // namespace ribbon
1200
1201 } // namespace ROCKSDB_NAMESPACE