lib/cmap.c

   1 /*
   2  * Copyright (c) 2014, 2016 Nicira, Inc.
   3  *
   4  * Licensed under the Apache License, Version 2.0 (the "License");
   5  * you may not use this file except in compliance with the License.
   6  * You may obtain a copy of the License at:
   7  *
   8  *     http://www.apache.org/licenses/LICENSE-2.0
   9  *
  10  * Unless required by applicable law or agreed to in writing, software
  11  * distributed under the License is distributed on an "AS IS" BASIS,
  12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13  * See the License for the specific language governing permissions and
  14  * limitations under the License.
  15  */
  16
  17 #include <config.h>
  18 #include "cmap.h"
  19 #include "coverage.h"
  20 #include "bitmap.h"
  21 #include "hash.h"
  22 #include "ovs-rcu.h"
  23 #include "random.h"
  24 #include "util.h"
  25
  26 COVERAGE_DEFINE(cmap_expand);
  27 COVERAGE_DEFINE(cmap_shrink);
  28
  29 /* Optimistic Concurrent Cuckoo Hash
  30  * =================================
  31  *
  32  * A "cuckoo hash" is an open addressing hash table schema, designed such that
  33  * a given element can be in one of only a small number of buckets 'd', each of
  34  * which holds up to a small number 'k' elements.  Thus, the expected and
  35  * worst-case lookup times are O(1) because they require comparing no more than
  36  * a fixed number of elements (k * d).  Inserting a new element can require
  37  * moving around existing elements, but it is also O(1) amortized expected
  38  * time.
  39  *
  40  * An optimistic concurrent hash table goes one step further, making it
  41  * possible for a single writer to execute concurrently with any number of
  42  * readers without requiring the readers to take any locks.
  43  *
  44  * This cuckoo hash implementation uses:
  45  *
  46  *    - Two hash functions (d=2).  More hash functions allow for a higher load
  47  *      factor, but increasing 'k' is easier and the benefits of increasing 'd'
  48  *      quickly fall off with the 'k' values used here.  Also, the method of
  49  *      generating hashes used in this implementation is hard to reasonably
  50  *      extend beyond d=2.  Finally, each additional hash function means that a
  51  *      lookup has to look at least one extra cache line.
  52  *
  53  *    - 5 or 7 elements per bucket (k=5 or k=7), chosen to make buckets
  54  *      exactly one cache line in size.
  55  *
  56  * According to Erlingsson [4], these parameters suggest a maximum load factor
  57  * of about 93%.  The current implementation is conservative, expanding the
  58  * hash table when it is over 85% full.
  59  *
  60  * When the load factor is below 20%, the hash table will be shrinked by half.
  61  * This is to reduce the memory utilization of the hash table and to avoid
  62  * the hash table occupying the top of heap chunk which prevents the trimming
  63  * of heap.
  64  *
  65  * Hash Functions
  66  * ==============
  67  *
  68  * A cuckoo hash requires multiple hash functions.  When reorganizing the hash
  69  * becomes too difficult, it also requires the ability to change the hash
  70  * functions.  Requiring the client to provide multiple hashes and to be able
  71  * to change them to new hashes upon insertion is inconvenient.
  72  *
  73  * This implementation takes another approach.  The client provides a single,
  74  * fixed hash.  The cuckoo hash internally "rehashes" this hash against a
  75  * randomly selected basis value (see rehash()).  This rehashed value is one of
  76  * the two hashes.  The other hash is computed by 16-bit circular rotation of
  77  * the rehashed value.  Updating the basis changes the hash functions.
  78  *
  79  * To work properly, the hash functions used by a cuckoo hash must be
  80  * independent.  If one hash function is a function of the other (e.g. h2(x) =
  81  * h1(x) + 1, or h2(x) = hash(h1(x))), then insertion will eventually fail
  82  * catastrophically (loop forever) because of collisions.  With this rehashing
  83  * technique, the two hashes are completely independent for masks up to 16 bits
  84  * wide.  For masks wider than 16 bits, only 32-n bits are independent between
  85  * the two hashes.  Thus, it becomes risky to grow a cuckoo hash table beyond
  86  * about 2**24 buckets (about 71 million elements with k=5 and maximum load
  87  * 85%).  Fortunately, Open vSwitch does not normally deal with hash tables
  88  * this large.
  89  *
  90  *
  91  * Handling Duplicates
  92  * ===================
  93  *
  94  * This cuckoo hash table implementation deals with duplicate client-provided
  95  * hash values by chaining: the second and subsequent cmap_nodes with a given
  96  * hash are chained off the initially inserted node's 'next' member.  The hash
  97  * table maintains the invariant that a single client-provided hash value
  98  * exists in only a single chain in a single bucket (even though that hash
  99  * could be stored in two buckets).
 100  *
 101  *
 102  * References
 103  * ==========
 104  *
 105  * [1] D. Zhou, B. Fan, H. Lim, M. Kaminsky, D. G. Andersen, "Scalable, High
 106  *     Performance Ethernet Forwarding with CuckooSwitch".  In Proc. 9th
 107  *     CoNEXT, Dec. 2013.
 108  *
 109  * [2] B. Fan, D. G. Andersen, and M. Kaminsky. "MemC3: Compact and concurrent
 110  *     memcache with dumber caching and smarter hashing".  In Proc. 10th USENIX
 111  *     NSDI, Apr. 2013
 112  *
 113  * [3] R. Pagh and F. Rodler. "Cuckoo hashing". Journal of Algorithms, 51(2):
 114  *     122-144, May 2004.
 115  *
 116  * [4] U. Erlingsson, M. Manasse, F. McSherry, "A Cool and Practical
 117  *     Alternative to Traditional Hash Tables".  In Proc. 7th Workshop on
 118  *     Distributed Data and Structures (WDAS'06), 2006.
 119  */
 120 /* An entry is an int and a pointer: 8 bytes on 32-bit, 12 bytes on 64-bit. */
 121 #define CMAP_ENTRY_SIZE (4 + (UINTPTR_MAX == UINT32_MAX ? 4 : 8))
 122
 123 /* Number of entries per bucket: 7 on 32-bit, 5 on 64-bit for 64B cacheline. */
 124 #define CMAP_K ((CACHE_LINE_SIZE - 4) / CMAP_ENTRY_SIZE)
 125
 126 /* A cuckoo hash bucket.  Designed to be cache-aligned and exactly one cache
 127  * line long. */
 128 struct cmap_bucket {
 129     /* Padding to make cmap_bucket exactly one cache line long. */
 130     PADDED_MEMBERS(CACHE_LINE_SIZE,
 131         /* Allows readers to track in-progress changes.  Initially zero, each
 132          * writer increments this value just before and just after each change
 133          * (see cmap_set_bucket()).  Thus, a reader can ensure that it gets a
 134          * consistent snapshot by waiting for the counter to become even (see
 135          * read_even_counter()), then checking that its value does not change
 136          * while examining the bucket (see cmap_find()). */
 137         atomic_uint32_t counter;
 138
 139         /* (hash, node) slots.  They are parallel arrays instead of an array of
 140          * structs to reduce the amount of space lost to padding.
 141          *
 142          * The slots are in no particular order.  A null pointer indicates that
 143          * a pair is unused.  In-use slots are not necessarily in the earliest
 144          * slots. */
 145         uint32_t hashes[CMAP_K];
 146         struct cmap_node nodes[CMAP_K];
 147     );
 148 };
 149 BUILD_ASSERT_DECL(sizeof(struct cmap_bucket) == CACHE_LINE_SIZE);
 150
 151 /* Default maximum load factor (as a fraction of UINT32_MAX + 1) before
 152  * enlarging a cmap.  Reasonable values lie between about 75% and 93%.  Smaller
 153  * values waste memory; larger values increase the average insertion time. */
 154 #define CMAP_MAX_LOAD ((uint32_t) (UINT32_MAX * .85))
 155
 156 /* Default minimum load factor (as a fraction of UINT32_MAX + 1) before
 157  * shrinking a cmap.  Currently, the value is chosen to be 20%, this
 158  * means cmap will have a 40% load factor after shrink. */
 159 #define CMAP_MIN_LOAD ((uint32_t) (UINT32_MAX * .20))
 160
 161 /* The implementation of a concurrent hash map. */
 162 struct cmap_impl {
 163     PADDED_MEMBERS_CACHELINE_MARKER(CACHE_LINE_SIZE, cacheline0,
 164         unsigned int n;             /* Number of in-use elements. */
 165         unsigned int max_n;         /* Max elements before enlarging. */
 166         unsigned int min_n;         /* Min elements before shrinking. */
 167         uint32_t mask;              /* Number of 'buckets', minus one. */
 168         uint32_t basis;             /* Basis for rehashing client's
 169                                        hash values. */
 170     );
 171
 172     PADDED_MEMBERS_CACHELINE_MARKER(CACHE_LINE_SIZE, cacheline1,
 173         struct cmap_bucket buckets[1];
 174     );
 175 };
 176 BUILD_ASSERT_DECL(sizeof(struct cmap_impl) == CACHE_LINE_SIZE * 2);
 177
 178 /* An empty cmap. */
 179 OVS_ALIGNED_VAR(CACHE_LINE_SIZE) const struct cmap_impl empty_cmap;
 180
 181 static struct cmap_impl *cmap_rehash(struct cmap *, uint32_t mask);
 182
 183 /* Explicit inline keywords in utility functions seem to be necessary
 184  * to prevent performance regression on cmap_find(). */
 185
 186 /* Given a rehashed value 'hash', returns the other hash for that rehashed
 187  * value.  This is symmetric: other_hash(other_hash(x)) == x.  (See also "Hash
 188  * Functions" at the top of this file.) */
 189 static inline uint32_t
 190 other_hash(uint32_t hash)
 191 {
 192     return (hash << 16) | (hash >> 16);
 193 }
 194
 195 /* Returns the rehashed value for 'hash' within 'impl'.  (See also "Hash
 196  * Functions" at the top of this file.) */
 197 static inline uint32_t
 198 rehash(const struct cmap_impl *impl, uint32_t hash)
 199 {
 200     return hash_finish(impl->basis, hash);
 201 }
 202
 203 /* Not always without the inline keyword. */
 204 static inline struct cmap_impl *
 205 cmap_get_impl(const struct cmap *cmap)
 206 {
 207     return ovsrcu_get(struct cmap_impl *, &cmap->impl);
 208 }
 209
 210 static uint32_t
 211 calc_max_n(uint32_t mask)
 212 {
 213     return ((uint64_t) (mask + 1) * CMAP_K * CMAP_MAX_LOAD) >> 32;
 214 }
 215
 216 static uint32_t
 217 calc_min_n(uint32_t mask)
 218 {
 219     return ((uint64_t) (mask + 1) * CMAP_K * CMAP_MIN_LOAD) >> 32;
 220 }
 221
 222 static struct cmap_impl *
 223 cmap_impl_create(uint32_t mask)
 224 {
 225     struct cmap_impl *impl;
 226
 227     ovs_assert(is_pow2(mask + 1));
 228
 229     /* There are 'mask + 1' buckets but struct cmap_impl has one bucket built
 230      * in, so we only need to add space for the extra 'mask' buckets. */
 231     impl = xzalloc_cacheline(sizeof *impl + mask * sizeof *impl->buckets);
 232     impl->n = 0;
 233     impl->max_n = calc_max_n(mask);
 234     impl->min_n = calc_min_n(mask);
 235     impl->mask = mask;
 236     impl->basis = random_uint32();
 237
 238     return impl;
 239 }
 240
 241 /* Initializes 'cmap' as an empty concurrent hash map. */
 242 void
 243 cmap_init(struct cmap *cmap)
 244 {
 245     ovsrcu_set(&cmap->impl, CONST_CAST(struct cmap_impl *, &empty_cmap));
 246 }
 247
 248 /* Destroys 'cmap'.
 249  *
 250  * The client is responsible for destroying any data previously held in
 251  * 'cmap'. */
 252 void
 253 cmap_destroy(struct cmap *cmap)
 254 {
 255     if (cmap) {
 256         struct cmap_impl *impl = cmap_get_impl(cmap);
 257         if (impl != &empty_cmap) {
 258             ovsrcu_postpone(free_cacheline, impl);
 259         }
 260     }
 261 }
 262
 263 /* Returns the number of elements in 'cmap'. */
 264 size_t
 265 cmap_count(const struct cmap *cmap)
 266 {
 267     return cmap_get_impl(cmap)->n;
 268 }
 269
 270 /* Returns true if 'cmap' is empty, false otherwise. */
 271 bool
 272 cmap_is_empty(const struct cmap *cmap)
 273 {
 274     return cmap_count(cmap) == 0;
 275 }
 276
 277 static inline uint32_t
 278 read_counter(const struct cmap_bucket *bucket_)
 279 {
 280     struct cmap_bucket *bucket = CONST_CAST(struct cmap_bucket *, bucket_);
 281     uint32_t counter;
 282
 283     atomic_read_explicit(&bucket->counter, &counter, memory_order_acquire);
 284
 285     return counter;
 286 }
 287
 288 static inline uint32_t
 289 read_even_counter(const struct cmap_bucket *bucket)
 290 {
 291     uint32_t counter;
 292
 293     do {
 294         counter = read_counter(bucket);
 295     } while (OVS_UNLIKELY(counter & 1));
 296
 297     return counter;
 298 }
 299
 300 static inline bool
 301 counter_changed(const struct cmap_bucket *b_, uint32_t c)
 302 {
 303     struct cmap_bucket *b = CONST_CAST(struct cmap_bucket *, b_);
 304     uint32_t counter;
 305
 306     /* Need to make sure the counter read is not moved up, before the hash and
 307      * cmap_node_next().  Using atomic_read_explicit with memory_order_acquire
 308      * would allow prior reads to be moved after the barrier.
 309      * atomic_thread_fence prevents all following memory accesses from moving
 310      * prior to preceding loads. */
 311     atomic_thread_fence(memory_order_acquire);
 312     atomic_read_relaxed(&b->counter, &counter);
 313
 314     return OVS_UNLIKELY(counter != c);
 315 }
 316
 317 static inline const struct cmap_node *
 318 cmap_find_in_bucket(const struct cmap_bucket *bucket, uint32_t hash)
 319 {
 320     for (int i = 0; i < CMAP_K; i++) {
 321         if (bucket->hashes[i] == hash) {
 322             return cmap_node_next(&bucket->nodes[i]);
 323         }
 324     }
 325     return NULL;
 326 }
 327
 328 static inline const struct cmap_node *
 329 cmap_find__(const struct cmap_bucket *b1, const struct cmap_bucket *b2,
 330             uint32_t hash)
 331 {
 332     uint32_t c1, c2;
 333     const struct cmap_node *node;
 334
 335     do {
 336         do {
 337             c1 = read_even_counter(b1);
 338             node = cmap_find_in_bucket(b1, hash);
 339         } while (OVS_UNLIKELY(counter_changed(b1, c1)));
 340         if (node) {
 341             break;
 342         }
 343         do {
 344             c2 = read_even_counter(b2);
 345             node = cmap_find_in_bucket(b2, hash);
 346         } while (OVS_UNLIKELY(counter_changed(b2, c2)));
 347         if (node) {
 348             break;
 349         }
 350     } while (OVS_UNLIKELY(counter_changed(b1, c1)));
 351
 352     return node;
 353 }
 354
 355 /* Searches 'cmap' for an element with the specified 'hash'.  If one or more is
 356  * found, returns a pointer to the first one, otherwise a null pointer.  All of
 357  * the nodes on the returned list are guaranteed to have exactly the given
 358  * 'hash'.
 359  *
 360  * This function works even if 'cmap' is changing concurrently.  If 'cmap' is
 361  * not changing, then cmap_find_protected() is slightly faster.
 362  *
 363  * CMAP_FOR_EACH_WITH_HASH is usually more convenient. */
 364 const struct cmap_node *
 365 cmap_find(const struct cmap *cmap, uint32_t hash)
 366 {
 367     const struct cmap_impl *impl = cmap_get_impl(cmap);
 368     uint32_t h1 = rehash(impl, hash);
 369     uint32_t h2 = other_hash(h1);
 370
 371     return cmap_find__(&impl->buckets[h1 & impl->mask],
 372                        &impl->buckets[h2 & impl->mask],
 373                        hash);
 374 }
 375
 376 /* Find a node by the index of the entry of cmap. Index N means the N/CMAP_K
 377  * bucket and N%CMAP_K entry in that bucket.
 378  * Notice that it is not protected by the optimistic lock (versioning) because
 379  * it does not compare the hashes. Currently it is only used by the datapath
 380  * SMC cache.
 381  *
 382  * Return node for the entry of index or NULL if the index beyond boundary */
 383 const struct cmap_node *
 384 cmap_find_by_index(const struct cmap *cmap, uint32_t index)
 385 {
 386     const struct cmap_impl *impl = cmap_get_impl(cmap);
 387
 388     uint32_t b = index / CMAP_K;
 389     uint32_t e = index % CMAP_K;
 390
 391     if (b > impl->mask) {
 392         return NULL;
 393     }
 394
 395     const struct cmap_bucket *bucket = &impl->buckets[b];
 396
 397     return cmap_node_next(&bucket->nodes[e]);
 398 }
 399
 400 /* Find the index of certain hash value. Currently only used by the datapath
 401  * SMC cache.
 402  *
 403  * Return the index of the entry if found, or UINT32_MAX if not found. The
 404  * function assumes entry index cannot be larger than UINT32_MAX. */
 405 uint32_t
 406 cmap_find_index(const struct cmap *cmap, uint32_t hash)
 407 {
 408     const struct cmap_impl *impl = cmap_get_impl(cmap);
 409     uint32_t h1 = rehash(impl, hash);
 410     uint32_t h2 = other_hash(h1);
 411
 412     uint32_t b_index1 = h1 & impl->mask;
 413     uint32_t b_index2 = h2 & impl->mask;
 414
 415     uint32_t c1, c2;
 416     uint32_t index = UINT32_MAX;
 417
 418     const struct cmap_bucket *b1 = &impl->buckets[b_index1];
 419     const struct cmap_bucket *b2 = &impl->buckets[b_index2];
 420
 421     do {
 422         do {
 423             c1 = read_even_counter(b1);
 424             for (int i = 0; i < CMAP_K; i++) {
 425                 if (b1->hashes[i] == hash) {
 426                     index = b_index1 * CMAP_K + i;
 427                  }
 428             }
 429         } while (OVS_UNLIKELY(counter_changed(b1, c1)));
 430         if (index != UINT32_MAX) {
 431             break;
 432         }
 433         do {
 434             c2 = read_even_counter(b2);
 435             for (int i = 0; i < CMAP_K; i++) {
 436                 if (b2->hashes[i] == hash) {
 437                     index = b_index2 * CMAP_K + i;
 438                 }
 439             }
 440         } while (OVS_UNLIKELY(counter_changed(b2, c2)));
 441
 442         if (index != UINT32_MAX) {
 443             break;
 444         }
 445     } while (OVS_UNLIKELY(counter_changed(b1, c1)));
 446
 447     return index;
 448 }
 449
 450 /* Looks up multiple 'hashes', when the corresponding bit in 'map' is 1,
 451  * and sets the corresponding pointer in 'nodes', if the hash value was
 452  * found from the 'cmap'.  In other cases the 'nodes' values are not changed,
 453  * i.e., no NULL pointers are stored there.
 454  * Returns a map where a bit is set to 1 if the corresponding 'nodes' pointer
 455  * was stored, 0 otherwise.
 456  * Generally, the caller wants to use CMAP_NODE_FOR_EACH to verify for
 457  * hash collisions. */
 458 unsigned long
 459 cmap_find_batch(const struct cmap *cmap, unsigned long map,
 460                 uint32_t hashes[], const struct cmap_node *nodes[])
 461 {
 462     const struct cmap_impl *impl = cmap_get_impl(cmap);
 463     unsigned long result = map;
 464     int i;
 465     uint32_t h1s[sizeof map * CHAR_BIT];
 466     const struct cmap_bucket *b1s[sizeof map * CHAR_BIT];
 467     const struct cmap_bucket *b2s[sizeof map * CHAR_BIT];
 468     uint32_t c1s[sizeof map * CHAR_BIT];
 469
 470     /* Compute hashes and prefetch 1st buckets. */
 471     ULLONG_FOR_EACH_1(i, map) {
 472         h1s[i] = rehash(impl, hashes[i]);
 473         b1s[i] = &impl->buckets[h1s[i] & impl->mask];
 474         OVS_PREFETCH(b1s[i]);
 475     }
 476     /* Lookups, Round 1. Only look up at the first bucket. */
 477     ULLONG_FOR_EACH_1(i, map) {
 478         uint32_t c1;
 479         const struct cmap_bucket *b1 = b1s[i];
 480         const struct cmap_node *node;
 481
 482         do {
 483             c1 = read_even_counter(b1);
 484             node = cmap_find_in_bucket(b1, hashes[i]);
 485         } while (OVS_UNLIKELY(counter_changed(b1, c1)));
 486
 487         if (!node) {
 488             /* Not found (yet); Prefetch the 2nd bucket. */
 489             b2s[i] = &impl->buckets[other_hash(h1s[i]) & impl->mask];
 490             OVS_PREFETCH(b2s[i]);
 491             c1s[i] = c1; /* We may need to check this after Round 2. */
 492             continue;
 493         }
 494         /* Found. */
 495         ULLONG_SET0(map, i); /* Ignore this on round 2. */
 496         OVS_PREFETCH(node);
 497         nodes[i] = node;
 498     }
 499     /* Round 2. Look into the 2nd bucket, if needed. */
 500     ULLONG_FOR_EACH_1(i, map) {
 501         uint32_t c2;
 502         const struct cmap_bucket *b2 = b2s[i];
 503         const struct cmap_node *node;
 504
 505         do {
 506             c2 = read_even_counter(b2);
 507             node = cmap_find_in_bucket(b2, hashes[i]);
 508         } while (OVS_UNLIKELY(counter_changed(b2, c2)));
 509
 510         if (!node) {
 511             /* Not found, but the node may have been moved from b2 to b1 right
 512              * after we finished with b1 earlier.  We just got a clean reading
 513              * of the 2nd bucket, so we check the counter of the 1st bucket
 514              * only.  However, we need to check both buckets again, as the
 515              * entry may be moved again to the 2nd bucket.  Basically, we
 516              * need to loop as long as it takes to get stable readings of
 517              * both buckets.  cmap_find__() does that, and now that we have
 518              * fetched both buckets we can just use it. */
 519             if (OVS_UNLIKELY(counter_changed(b1s[i], c1s[i]))) {
 520                 node = cmap_find__(b1s[i], b2s[i], hashes[i]);
 521                 if (node) {
 522                     goto found;
 523                 }
 524             }
 525             /* Not found. */
 526             ULLONG_SET0(result, i); /* Fix the result. */
 527             continue;
 528         }
 529 found:
 530         OVS_PREFETCH(node);
 531         nodes[i] = node;
 532     }
 533     return result;
 534 }
 535
 536 static int
 537 cmap_find_slot_protected(struct cmap_bucket *b, uint32_t hash)
 538 {
 539     int i;
 540
 541     for (i = 0; i < CMAP_K; i++) {
 542         if (b->hashes[i] == hash && cmap_node_next_protected(&b->nodes[i])) {
 543             return i;
 544         }
 545     }
 546     return -1;
 547 }
 548
 549 static struct cmap_node *
 550 cmap_find_bucket_protected(struct cmap_impl *impl, uint32_t hash, uint32_t h)
 551 {
 552     struct cmap_bucket *b = &impl->buckets[h & impl->mask];
 553     int i;
 554
 555     for (i = 0; i < CMAP_K; i++) {
 556         if (b->hashes[i] == hash) {
 557             return cmap_node_next_protected(&b->nodes[i]);
 558         }
 559     }
 560     return NULL;
 561 }
 562
 563 /* Like cmap_find(), but only for use if 'cmap' cannot change concurrently.
 564  *
 565  * CMAP_FOR_EACH_WITH_HASH_PROTECTED is usually more convenient. */
 566 struct cmap_node *
 567 cmap_find_protected(const struct cmap *cmap, uint32_t hash)
 568 {
 569     struct cmap_impl *impl = cmap_get_impl(cmap);
 570     uint32_t h1 = rehash(impl, hash);
 571     uint32_t h2 = other_hash(h1);
 572     struct cmap_node *node;
 573
 574     node = cmap_find_bucket_protected(impl, hash, h1);
 575     if (node) {
 576         return node;
 577     }
 578     return cmap_find_bucket_protected(impl, hash, h2);
 579 }
 580
 581 static int
 582 cmap_find_empty_slot_protected(const struct cmap_bucket *b)
 583 {
 584     int i;
 585
 586     for (i = 0; i < CMAP_K; i++) {
 587         if (!cmap_node_next_protected(&b->nodes[i])) {
 588             return i;
 589         }
 590     }
 591     return -1;
 592 }
 593
 594 static void
 595 cmap_set_bucket(struct cmap_bucket *b, int i,
 596                 struct cmap_node *node, uint32_t hash)
 597 {
 598     uint32_t c;
 599
 600     atomic_read_explicit(&b->counter, &c, memory_order_acquire);
 601     atomic_store_explicit(&b->counter, c + 1, memory_order_release);
 602     ovsrcu_set(&b->nodes[i].next, node); /* Also atomic. */
 603     b->hashes[i] = hash;
 604     atomic_store_explicit(&b->counter, c + 2, memory_order_release);
 605 }
 606
 607 /* Searches 'b' for a node with the given 'hash'.  If it finds one, adds
 608  * 'new_node' to the node's linked list and returns true.  If it does not find
 609  * one, returns false. */
 610 static bool
 611 cmap_insert_dup(struct cmap_node *new_node, uint32_t hash,
 612                 struct cmap_bucket *b)
 613 {
 614     int i;
 615
 616     for (i = 0; i < CMAP_K; i++) {
 617         if (b->hashes[i] == hash) {
 618             struct cmap_node *node = cmap_node_next_protected(&b->nodes[i]);
 619
 620             if (node) {
 621                 struct cmap_node *p;
 622
 623                 /* The common case is that 'new_node' is a singleton,
 624                  * with a null 'next' pointer.  Rehashing can add a
 625                  * longer chain, but due to our invariant of always
 626                  * having all nodes with the same (user) hash value at
 627                  * a single chain, rehashing will always insert the
 628                  * chain to an empty node.  The only way we can end up
 629                  * here is by the user inserting a chain of nodes at
 630                  * once.  Find the end of the chain starting at
 631                  * 'new_node', then splice 'node' to the end of that
 632                  * chain. */
 633                 p = new_node;
 634                 for (;;) {
 635                     struct cmap_node *next = cmap_node_next_protected(p);
 636
 637                     if (!next) {
 638                         break;
 639                     }
 640                     p = next;
 641                 }
 642                 ovsrcu_set_hidden(&p->next, node);
 643             } else {
 644                 /* The hash value is there from some previous insertion, but
 645                  * the associated node has been removed.  We're not really
 646                  * inserting a duplicate, but we can still reuse the slot.
 647                  * Carry on. */
 648             }
 649
 650             /* Change the bucket to point to 'new_node'.  This is a degenerate
 651              * form of cmap_set_bucket() that doesn't update the counter since
 652              * we're only touching one field and in a way that doesn't change
 653              * the bucket's meaning for readers. */
 654             ovsrcu_set(&b->nodes[i].next, new_node);
 655
 656             return true;
 657         }
 658     }
 659     return false;
 660 }
 661
 662 /* Searches 'b' for an empty slot.  If successful, stores 'node' and 'hash' in
 663  * the slot and returns true.  Otherwise, returns false. */
 664 static bool
 665 cmap_insert_bucket(struct cmap_node *node, uint32_t hash,
 666                    struct cmap_bucket *b)
 667 {
 668     int i;
 669
 670     for (i = 0; i < CMAP_K; i++) {
 671         if (!cmap_node_next_protected(&b->nodes[i])) {
 672             cmap_set_bucket(b, i, node, hash);
 673             return true;
 674         }
 675     }
 676     return false;
 677 }
 678
 679 /* Returns the other bucket that b->nodes[slot] could occupy in 'impl'.  (This
 680  * might be the same as 'b'.) */
 681 static struct cmap_bucket *
 682 other_bucket_protected(struct cmap_impl *impl, struct cmap_bucket *b, int slot)
 683 {
 684     uint32_t h1 = rehash(impl, b->hashes[slot]);
 685     uint32_t h2 = other_hash(h1);
 686     uint32_t b_idx = b - impl->buckets;
 687     uint32_t other_h = (h1 & impl->mask) == b_idx ? h2 : h1;
 688
 689     return &impl->buckets[other_h & impl->mask];
 690 }
 691
 692 /* 'new_node' is to be inserted into 'impl', but both candidate buckets 'b1'
 693  * and 'b2' are full.  This function attempts to rearrange buckets within
 694  * 'impl' to make room for 'new_node'.
 695  *
 696  * The implementation is a general-purpose breadth-first search.  At first
 697  * glance, this is more complex than a random walk through 'impl' (suggested by
 698  * some references), but random walks have a tendency to loop back through a
 699  * single bucket.  We have to move nodes backward along the path that we find,
 700  * so that no node actually disappears from the hash table, which means a
 701  * random walk would have to be careful to deal with loops.  By contrast, a
 702  * successful breadth-first search always finds a *shortest* path through the
 703  * hash table, and a shortest path will never contain loops, so it avoids that
 704  * problem entirely.
 705  */
 706 static bool
 707 cmap_insert_bfs(struct cmap_impl *impl, struct cmap_node *new_node,
 708                 uint32_t hash, struct cmap_bucket *b1, struct cmap_bucket *b2)
 709 {
 710     enum { MAX_DEPTH = 4 };
 711
 712     /* A path from 'start' to 'end' via the 'n' steps in 'slots[]'.
 713      *
 714      * One can follow the path via:
 715      *
 716      *     struct cmap_bucket *b;
 717      *     int i;
 718      *
 719      *     b = path->start;
 720      *     for (i = 0; i < path->n; i++) {
 721      *         b = other_bucket_protected(impl, b, path->slots[i]);
 722      *     }
 723      *     ovs_assert(b == path->end);
 724      */
 725     struct cmap_path {
 726         struct cmap_bucket *start; /* First bucket along the path. */
 727         struct cmap_bucket *end;   /* Last bucket on the path. */
 728         uint8_t slots[MAX_DEPTH];  /* Slots used for each hop. */
 729         int n;                     /* Number of slots[]. */
 730     };
 731
 732     /* We need to limit the amount of work we do trying to find a path.  It
 733      * might actually be impossible to rearrange the cmap, and after some time
 734      * it is likely to be easier to rehash the entire cmap.
 735      *
 736      * This value of MAX_QUEUE is an arbitrary limit suggested by one of the
 737      * references.  Empirically, it seems to work OK. */
 738     enum { MAX_QUEUE = 500 };
 739     struct cmap_path queue[MAX_QUEUE];
 740     int head = 0;
 741     int tail = 0;
 742
 743     /* Add 'b1' and 'b2' as starting points for the search. */
 744     queue[head].start = b1;
 745     queue[head].end = b1;
 746     queue[head].n = 0;
 747     head++;
 748     if (b1 != b2) {
 749         queue[head].start = b2;
 750         queue[head].end = b2;
 751         queue[head].n = 0;
 752         head++;
 753     }
 754
 755     while (tail < head) {
 756         const struct cmap_path *path = &queue[tail++];
 757         struct cmap_bucket *this = path->end;
 758         int i;
 759
 760         for (i = 0; i < CMAP_K; i++) {
 761             struct cmap_bucket *next = other_bucket_protected(impl, this, i);
 762             int j;
 763
 764             if (this == next) {
 765                 continue;
 766             }
 767
 768             j = cmap_find_empty_slot_protected(next);
 769             if (j >= 0) {
 770                 /* We've found a path along which we can rearrange the hash
 771                  * table:  Start at path->start, follow all the slots in
 772                  * path->slots[], then follow slot 'i', then the bucket you
 773                  * arrive at has slot 'j' empty. */
 774                 struct cmap_bucket *buckets[MAX_DEPTH + 2];
 775                 int slots[MAX_DEPTH + 2];
 776                 int k;
 777
 778                 /* Figure out the full sequence of slots. */
 779                 for (k = 0; k < path->n; k++) {
 780                     slots[k] = path->slots[k];
 781                 }
 782                 slots[path->n] = i;
 783                 slots[path->n + 1] = j;
 784
 785                 /* Figure out the full sequence of buckets. */
 786                 buckets[0] = path->start;
 787                 for (k = 0; k <= path->n; k++) {
 788                     buckets[k + 1] = other_bucket_protected(impl, buckets[k], slots[k]);
 789                 }
 790
 791                 /* Now the path is fully expressed.  One can start from
 792                  * buckets[0], go via slots[0] to buckets[1], via slots[1] to
 793                  * buckets[2], and so on.
 794                  *
 795                  * Move all the nodes across the path "backward".  After each
 796                  * step some node appears in two buckets.  Thus, every node is
 797                  * always visible to a concurrent search. */
 798                 for (k = path->n + 1; k > 0; k--) {
 799                     int slot = slots[k - 1];
 800
 801                     cmap_set_bucket(
 802                         buckets[k], slots[k],
 803                         cmap_node_next_protected(&buckets[k - 1]->nodes[slot]),
 804                         buckets[k - 1]->hashes[slot]);
 805                 }
 806
 807                 /* Finally, replace the first node on the path by
 808                  * 'new_node'. */
 809                 cmap_set_bucket(buckets[0], slots[0], new_node, hash);
 810
 811                 return true;
 812             }
 813
 814             if (path->n < MAX_DEPTH && head < MAX_QUEUE) {
 815                 struct cmap_path *new_path = &queue[head++];
 816
 817                 *new_path = *path;
 818                 new_path->end = next;
 819                 new_path->slots[new_path->n++] = i;
 820             }
 821         }
 822     }
 823
 824     return false;
 825 }
 826
 827 /* Adds 'node', with the given 'hash', to 'impl'.
 828  *
 829  * 'node' is ordinarily a single node, with a null 'next' pointer.  When
 830  * rehashing, however, it may be a longer chain of nodes. */
 831 static bool
 832 cmap_try_insert(struct cmap_impl *impl, struct cmap_node *node, uint32_t hash)
 833 {
 834     uint32_t h1 = rehash(impl, hash);
 835     uint32_t h2 = other_hash(h1);
 836     struct cmap_bucket *b1 = &impl->buckets[h1 & impl->mask];
 837     struct cmap_bucket *b2 = &impl->buckets[h2 & impl->mask];
 838
 839     return (OVS_UNLIKELY(cmap_insert_dup(node, hash, b1) ||
 840                          cmap_insert_dup(node, hash, b2)) ||
 841             OVS_LIKELY(cmap_insert_bucket(node, hash, b1) ||
 842                        cmap_insert_bucket(node, hash, b2)) ||
 843             cmap_insert_bfs(impl, node, hash, b1, b2));
 844 }
 845
 846 /* Inserts 'node', with the given 'hash', into 'cmap'.  The caller must ensure
 847  * that 'cmap' cannot change concurrently (from another thread).  If duplicates
 848  * are undesirable, the caller must have already verified that 'cmap' does not
 849  * contain a duplicate of 'node'.
 850  *
 851  * Returns the current number of nodes in the cmap after the insertion. */
 852 size_t
 853 cmap_insert(struct cmap *cmap, struct cmap_node *node, uint32_t hash)
 854 {
 855     struct cmap_impl *impl = cmap_get_impl(cmap);
 856
 857     ovsrcu_set_hidden(&node->next, NULL);
 858
 859     if (OVS_UNLIKELY(impl->n >= impl->max_n)) {
 860         COVERAGE_INC(cmap_expand);
 861         impl = cmap_rehash(cmap, (impl->mask << 1) | 1);
 862     }
 863
 864     while (OVS_UNLIKELY(!cmap_try_insert(impl, node, hash))) {
 865         impl = cmap_rehash(cmap, impl->mask);
 866     }
 867     return ++impl->n;
 868 }
 869
 870 static bool
 871 cmap_replace__(struct cmap_impl *impl, struct cmap_node *node,
 872                struct cmap_node *replacement, uint32_t hash, uint32_t h)
 873 {
 874     struct cmap_bucket *b = &impl->buckets[h & impl->mask];
 875     int slot;
 876
 877     slot = cmap_find_slot_protected(b, hash);
 878     if (slot < 0) {
 879         return false;
 880     }
 881
 882     /* The pointer to 'node' is changed to point to 'replacement',
 883      * which is the next node if no replacement node is given. */
 884     if (!replacement) {
 885         replacement = cmap_node_next_protected(node);
 886     } else {
 887         /* 'replacement' takes the position of 'node' in the list. */
 888         ovsrcu_set_hidden(&replacement->next, cmap_node_next_protected(node));
 889     }
 890
 891     struct cmap_node *iter = &b->nodes[slot];
 892     for (;;) {
 893         struct cmap_node *next = cmap_node_next_protected(iter);
 894
 895         if (next == node) {
 896             ovsrcu_set(&iter->next, replacement);
 897             return true;
 898         }
 899         iter = next;
 900     }
 901 }
 902
 903 /* Replaces 'old_node' in 'cmap' with 'new_node'.  The caller must
 904  * ensure that 'cmap' cannot change concurrently (from another thread).
 905  *
 906  * 'old_node' must not be destroyed or modified or inserted back into 'cmap' or
 907  * into any other concurrent hash map while any other thread might be accessing
 908  * it.  One correct way to do this is to free it from an RCU callback with
 909  * ovsrcu_postpone().
 910  *
 911  * Returns the current number of nodes in the cmap after the replacement.  The
 912  * number of nodes decreases by one if 'new_node' is NULL. */
 913 size_t
 914 cmap_replace(struct cmap *cmap, struct cmap_node *old_node,
 915              struct cmap_node *new_node, uint32_t hash)
 916 {
 917     struct cmap_impl *impl = cmap_get_impl(cmap);
 918     uint32_t h1 = rehash(impl, hash);
 919     uint32_t h2 = other_hash(h1);
 920
 921     ovs_assert(cmap_replace__(impl, old_node, new_node, hash, h1) ||
 922                cmap_replace__(impl, old_node, new_node, hash, h2));
 923
 924     if (!new_node) {
 925         impl->n--;
 926         if (OVS_UNLIKELY(impl->n < impl->min_n)) {
 927             COVERAGE_INC(cmap_shrink);
 928             impl = cmap_rehash(cmap, impl->mask >> 1);
 929         }
 930     }
 931     return impl->n;
 932 }
 933
 934 static bool
 935 cmap_try_rehash(const struct cmap_impl *old, struct cmap_impl *new)
 936 {
 937     const struct cmap_bucket *b;
 938
 939     for (b = old->buckets; b <= &old->buckets[old->mask]; b++) {
 940         int i;
 941
 942         for (i = 0; i < CMAP_K; i++) {
 943             /* possible optimization here because we know the hashes are
 944              * unique */
 945             struct cmap_node *node = cmap_node_next_protected(&b->nodes[i]);
 946
 947             if (node && !cmap_try_insert(new, node, b->hashes[i])) {
 948                 return false;
 949             }
 950         }
 951     }
 952     return true;
 953 }
 954
 955 static struct cmap_impl *
 956 cmap_rehash(struct cmap *cmap, uint32_t mask)
 957 {
 958     struct cmap_impl *old = cmap_get_impl(cmap);
 959     struct cmap_impl *new;
 960
 961     new = cmap_impl_create(mask);
 962     ovs_assert(old->n < new->max_n);
 963
 964     while (!cmap_try_rehash(old, new)) {
 965         memset(new->buckets, 0, (mask + 1) * sizeof *new->buckets);
 966         new->basis = random_uint32();
 967     }
 968
 969     new->n = old->n;
 970     ovsrcu_set(&cmap->impl, new);
 971     if (old != &empty_cmap) {
 972         ovsrcu_postpone(free_cacheline, old);
 973     }
 974
 975     return new;
 976 }
 977
 978 struct cmap_cursor
 979 cmap_cursor_start(const struct cmap *cmap)
 980 {
 981     struct cmap_cursor cursor;
 982
 983     cursor.impl = cmap_get_impl(cmap);
 984     cursor.bucket_idx = 0;
 985     cursor.entry_idx = 0;
 986     cursor.node = NULL;
 987     cmap_cursor_advance(&cursor);
 988
 989     return cursor;
 990 }
 991
 992 void
 993 cmap_cursor_advance(struct cmap_cursor *cursor)
 994 {
 995     const struct cmap_impl *impl = cursor->impl;
 996
 997     if (cursor->node) {
 998         cursor->node = cmap_node_next(cursor->node);
 999         if (cursor->node) {
1000             return;
1001         }
1002     }
1003
1004     while (cursor->bucket_idx <= impl->mask) {
1005         const struct cmap_bucket *b = &impl->buckets[cursor->bucket_idx];
1006
1007         while (cursor->entry_idx < CMAP_K) {
1008             cursor->node = cmap_node_next(&b->nodes[cursor->entry_idx++]);
1009             if (cursor->node) {
1010                 return;
1011             }
1012         }
1013
1014         cursor->bucket_idx++;
1015         cursor->entry_idx = 0;
1016     }
1017 }
1018
1019 /* Returns the next node in 'cmap' in hash order, or NULL if no nodes remain in
1020  * 'cmap'.  Uses '*pos' to determine where to begin iteration, and updates
1021  * '*pos' to pass on the next iteration into them before returning.
1022  *
1023  * It's better to use plain CMAP_FOR_EACH and related functions, since they are
1024  * faster and better at dealing with cmaps that change during iteration.
1025  *
1026  * Before beginning iteration, set '*pos' to all zeros. */
1027 struct cmap_node *
1028 cmap_next_position(const struct cmap *cmap,
1029                    struct cmap_position *pos)
1030 {
1031     struct cmap_impl *impl = cmap_get_impl(cmap);
1032     unsigned int bucket = pos->bucket;
1033     unsigned int entry = pos->entry;
1034     unsigned int offset = pos->offset;
1035
1036     while (bucket <= impl->mask) {
1037         const struct cmap_bucket *b = &impl->buckets[bucket];
1038
1039         while (entry < CMAP_K) {
1040             const struct cmap_node *node = cmap_node_next(&b->nodes[entry]);
1041             unsigned int i;
1042
1043             for (i = 0; node; i++, node = cmap_node_next(node)) {
1044                 if (i == offset) {
1045                     if (cmap_node_next(node)) {
1046                         offset++;
1047                     } else {
1048                         entry++;
1049                         offset = 0;
1050                     }
1051                     pos->bucket = bucket;
1052                     pos->entry = entry;
1053                     pos->offset = offset;
1054                     return CONST_CAST(struct cmap_node *, node);
1055                 }
1056             }
1057
1058             entry++;
1059             offset = 0;
1060         }
1061
1062         bucket++;
1063         entry = offset = 0;
1064     }
1065
1066     pos->bucket = pos->entry = pos->offset = 0;
1067     return NULL;
1068 }