/*
 * Copyright (c) 2014, 2016 Nicira, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <config.h>
#include "cmap.h"
#include "coverage.h"
#include "bitmap.h"
#include "hash.h"
#include "ovs-rcu.h"
#include "random.h"
#include "util.h"

COVERAGE_DEFINE(cmap_expand);
COVERAGE_DEFINE(cmap_shrink);

/* Optimistic Concurrent Cuckoo Hash
 * =================================
 *
 * A "cuckoo hash" is an open addressing hash table scheme, designed such that
 * a given element can be in one of only a small number 'd' of buckets, each of
 * which holds up to a small number 'k' of elements.  Thus, the expected and
 * worst-case lookup times are O(1) because they require comparing no more than
 * a fixed number of elements (k * d).  Inserting a new element can require
 * moving around existing elements, but it is also O(1) amortized expected
 * time.
 *
 * An optimistic concurrent hash table goes one step further, making it
 * possible for a single writer to execute concurrently with any number of
 * readers without requiring the readers to take any locks.
 *
 * This cuckoo hash implementation uses:
 *
 *    - Two hash functions (d=2).  More hash functions allow for a higher load
 *      factor, but increasing 'k' is easier and the benefits of increasing 'd'
 *      quickly fall off with the 'k' values used here.  Also, the method of
 *      generating hashes used in this implementation is hard to reasonably
 *      extend beyond d=2.  Finally, each additional hash function means that a
 *      lookup has to look at least one extra cache line.
 *
 *    - 5 or 7 elements per bucket (k=5 or k=7), chosen to make buckets
 *      exactly one cache line in size.
 *
 * According to Erlingsson [4], these parameters suggest a maximum load factor
 * of about 93%.  The current implementation is conservative, expanding the
 * hash table when it is over 85% full.
 *
 * When the load factor drops below 20%, the hash table is shrunk by half.
 * This reduces the memory utilization of the hash table and keeps the table
 * from occupying the top chunk of the heap, which would prevent the heap from
 * being trimmed.
 *
 * Hash Functions
 * ==============
 *
 * A cuckoo hash requires multiple hash functions.  When reorganizing the hash
 * becomes too difficult, it also requires the ability to change the hash
 * functions.  Requiring the client to provide multiple hashes and to be able
 * to change them to new hashes upon insertion is inconvenient.
 *
 * This implementation takes another approach.  The client provides a single,
 * fixed hash.  The cuckoo hash internally "rehashes" this hash against a
 * randomly selected basis value (see rehash()).  This rehashed value is one of
 * the two hashes.  The other hash is computed by 16-bit circular rotation of
 * the rehashed value.  Updating the basis changes the hash functions.
 *
 * To work properly, the hash functions used by a cuckoo hash must be
 * independent.  If one hash function is a function of the other (e.g. h2(x) =
 * h1(x) + 1, or h2(x) = hash(h1(x))), then insertion will eventually fail
 * catastrophically (loop forever) because of collisions.  With this rehashing
 * technique, the two hashes are completely independent for masks up to 16 bits
 * wide.  For masks wider than 16 bits, only 32 - n bits are independent
 * between the two hashes, where n is the number of bits in the mask.  Thus, it
 * becomes risky to grow a cuckoo hash table beyond about 2**24 buckets (about
 * 71 million elements with k=5 and maximum load 85%).  Fortunately, Open
 * vSwitch does not normally deal with hash tables this large.
 *
 *
 * Handling Duplicates
 * ===================
 *
 * This cuckoo hash table implementation deals with duplicate client-provided
 * hash values by chaining: the second and subsequent cmap_nodes with a given
 * hash are chained off the initially inserted node's 'next' member.  The hash
 * table maintains the invariant that a single client-provided hash value
 * exists in only a single chain in a single bucket (even though that hash
 * could be stored in two buckets).
 *
 *
 * References
 * ==========
 *
 * [1] D. Zhou, B. Fan, H. Lim, M. Kaminsky, D. G. Andersen, "Scalable, High
 *     Performance Ethernet Forwarding with CuckooSwitch".  In Proc. 9th
 *     CoNEXT, Dec. 2013.
 *
 * [2] B. Fan, D. G. Andersen, and M. Kaminsky.  "MemC3: Compact and concurrent
 *     memcache with dumber caching and smarter hashing".  In Proc. 10th USENIX
 *     NSDI, Apr. 2013.
 *
 * [3] R. Pagh and F. Rodler.  "Cuckoo hashing".  Journal of Algorithms, 51(2):
 *     122-144, May 2004.
 *
 * [4] U. Erlingsson, M. Manasse, F. McSherry, "A Cool and Practical
 *     Alternative to Traditional Hash Tables".  In Proc. 7th Workshop on
 *     Distributed Data and Structures (WDAS'06), 2006.
 */
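
/* Usage sketch (illustrative only, not part of the implementation): a client
 * structure embeds a 'struct cmap_node' and the client supplies its own hash
 * of the key.  The 'struct foo', 'foo_hash()' and 'key' names below are
 * hypothetical, not part of the cmap API.
 *
 *     struct foo {
 *         struct cmap_node node;
 *         int key;
 *     };
 *
 *     struct cmap map;
 *     struct foo *f = xmalloc(sizeof *f);
 *
 *     cmap_init(&map);
 *     f->key = 42;
 *     cmap_insert(&map, &f->node, foo_hash(f->key));
 *
 *     struct foo *it;
 *     CMAP_FOR_EACH_WITH_HASH (it, node, foo_hash(42), &map) {
 *         if (it->key == 42) {
 *             break;
 *         }
 *     }
 *
 * The key comparison inside the loop is what distinguishes a true match from
 * a mere hash collision. */
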
/* An entry is an int and a pointer: 8 bytes on 32-bit, 12 bytes on 64-bit. */
#define CMAP_ENTRY_SIZE (4 + (UINTPTR_MAX == UINT32_MAX ? 4 : 8))

/* Number of entries per bucket: 7 on 32-bit, 5 on 64-bit for 64B cacheline. */
#define CMAP_K ((CACHE_LINE_SIZE - 4) / CMAP_ENTRY_SIZE)

/* A cuckoo hash bucket.  Designed to be cache-aligned and exactly one cache
 * line long. */
struct cmap_bucket {
    /* Padding to make cmap_bucket exactly one cache line long. */
    PADDED_MEMBERS(CACHE_LINE_SIZE,
        /* Allows readers to track in-progress changes.  Initially zero, each
         * writer increments this value just before and just after each change
         * (see cmap_set_bucket()).  Thus, a reader can ensure that it gets a
         * consistent snapshot by waiting for the counter to become even (see
         * read_even_counter()), then checking that its value does not change
         * while examining the bucket (see cmap_find()). */
        atomic_uint32_t counter;

        /* (hash, node) slots.  They are parallel arrays instead of an array of
         * structs to reduce the amount of space lost to padding.
         *
         * The slots are in no particular order.  A null pointer indicates that
         * a pair is unused.  In-use slots are not necessarily in the earliest
         * slots. */
        uint32_t hashes[CMAP_K];
        struct cmap_node nodes[CMAP_K];
    );
};
BUILD_ASSERT_DECL(sizeof(struct cmap_bucket) == CACHE_LINE_SIZE);

/* Default maximum load factor (as a fraction of UINT32_MAX + 1) before
 * enlarging a cmap.  Reasonable values lie between about 75% and 93%.  Smaller
 * values waste memory; larger values increase the average insertion time. */
#define CMAP_MAX_LOAD ((uint32_t) (UINT32_MAX * .85))

/* Default minimum load factor (as a fraction of UINT32_MAX + 1) before
 * shrinking a cmap.  Currently the value is chosen to be 20%; this means that
 * the cmap will have a 40% load factor after shrinking. */
#define CMAP_MIN_LOAD ((uint32_t) (UINT32_MAX * .20))

/* The implementation of a concurrent hash map. */
struct cmap_impl {
    PADDED_MEMBERS_CACHELINE_MARKER(CACHE_LINE_SIZE, cacheline0,
        unsigned int n;             /* Number of in-use elements. */
        unsigned int max_n;         /* Max elements before enlarging. */
        unsigned int min_n;         /* Min elements before shrinking. */
        uint32_t mask;              /* Number of 'buckets', minus one. */
        uint32_t basis;             /* Basis for rehashing client's
                                       hash values. */
    );

    PADDED_MEMBERS_CACHELINE_MARKER(CACHE_LINE_SIZE, cacheline1,
        struct cmap_bucket buckets[1];
    );
};
BUILD_ASSERT_DECL(sizeof(struct cmap_impl) == CACHE_LINE_SIZE * 2);

/* An empty cmap. */
OVS_ALIGNED_VAR(CACHE_LINE_SIZE) const struct cmap_impl empty_cmap;

static struct cmap_impl *cmap_rehash(struct cmap *, uint32_t mask);

/* Explicit inline keywords in utility functions seem to be necessary
 * to prevent performance regression on cmap_find(). */

/* Given a rehashed value 'hash', returns the other hash for that rehashed
 * value.  This is symmetric: other_hash(other_hash(x)) == x.  (See also "Hash
 * Functions" at the top of this file.) */
static inline uint32_t
other_hash(uint32_t hash)
{
    return (hash << 16) | (hash >> 16);
}

/* Returns the rehashed value for 'hash' within 'impl'.  (See also "Hash
 * Functions" at the top of this file.) */
static inline uint32_t
rehash(const struct cmap_impl *impl, uint32_t hash)
{
    return hash_finish(impl->basis, hash);
}

/* Not always inlined without the explicit keyword. */
static inline struct cmap_impl *
cmap_get_impl(const struct cmap *cmap)
{
    return ovsrcu_get(struct cmap_impl *, &cmap->impl);
}

static uint32_t
calc_max_n(uint32_t mask)
{
    return ((uint64_t) (mask + 1) * CMAP_K * CMAP_MAX_LOAD) >> 32;
}

static uint32_t
calc_min_n(uint32_t mask)
{
    return ((uint64_t) (mask + 1) * CMAP_K * CMAP_MIN_LOAD) >> 32;
}
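
/* For illustration only (not part of the implementation): with CMAP_K == 5
 * and mask == 127 (128 buckets, so 640 slots), the fixed-point arithmetic
 * above yields max_n == 543 (roughly 85% of 640) and min_n == 127 (roughly
 * 20% of 640). */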

static struct cmap_impl *
cmap_impl_create(uint32_t mask)
{
    struct cmap_impl *impl;

    ovs_assert(is_pow2(mask + 1));

    /* There are 'mask + 1' buckets but struct cmap_impl has one bucket built
     * in, so we only need to add space for the extra 'mask' buckets. */
    impl = xzalloc_cacheline(sizeof *impl + mask * sizeof *impl->buckets);
    impl->n = 0;
    impl->max_n = calc_max_n(mask);
    impl->min_n = calc_min_n(mask);
    impl->mask = mask;
    impl->basis = random_uint32();

    return impl;
}

/* Initializes 'cmap' as an empty concurrent hash map. */
void
cmap_init(struct cmap *cmap)
{
    ovsrcu_set(&cmap->impl, CONST_CAST(struct cmap_impl *, &empty_cmap));
}

/* Destroys 'cmap'.
 *
 * The client is responsible for destroying any data previously held in
 * 'cmap'. */
void
cmap_destroy(struct cmap *cmap)
{
    if (cmap) {
        struct cmap_impl *impl = cmap_get_impl(cmap);
        if (impl != &empty_cmap) {
            ovsrcu_postpone(free_cacheline, impl);
        }
    }
}

/* Returns the number of elements in 'cmap'. */
size_t
cmap_count(const struct cmap *cmap)
{
    return cmap_get_impl(cmap)->n;
}

/* Returns true if 'cmap' is empty, false otherwise. */
bool
cmap_is_empty(const struct cmap *cmap)
{
    return cmap_count(cmap) == 0;
}

static inline uint32_t
read_counter(const struct cmap_bucket *bucket_)
{
    struct cmap_bucket *bucket = CONST_CAST(struct cmap_bucket *, bucket_);
    uint32_t counter;

    atomic_read_explicit(&bucket->counter, &counter, memory_order_acquire);

    return counter;
}

static inline uint32_t
read_even_counter(const struct cmap_bucket *bucket)
{
    uint32_t counter;

    do {
        counter = read_counter(bucket);
    } while (OVS_UNLIKELY(counter & 1));

    return counter;
}

static inline bool
counter_changed(const struct cmap_bucket *b_, uint32_t c)
{
    struct cmap_bucket *b = CONST_CAST(struct cmap_bucket *, b_);
    uint32_t counter;

    /* Make sure the counter read is not moved up, before the reads of the
     * hashes and cmap_node_next().  Using atomic_read_explicit with
     * memory_order_acquire would allow prior reads to be moved after the
     * barrier.  atomic_thread_fence prevents all following memory accesses
     * from moving prior to preceding loads. */
    atomic_thread_fence(memory_order_acquire);
    atomic_read_relaxed(&b->counter, &counter);

    return OVS_UNLIKELY(counter != c);
}

static inline const struct cmap_node *
cmap_find_in_bucket(const struct cmap_bucket *bucket, uint32_t hash)
{
    for (int i = 0; i < CMAP_K; i++) {
        if (bucket->hashes[i] == hash) {
            return cmap_node_next(&bucket->nodes[i]);
        }
    }
    return NULL;
}

static inline const struct cmap_node *
cmap_find__(const struct cmap_bucket *b1, const struct cmap_bucket *b2,
            uint32_t hash)
{
    uint32_t c1, c2;
    const struct cmap_node *node;

    do {
        do {
            c1 = read_even_counter(b1);
            node = cmap_find_in_bucket(b1, hash);
        } while (OVS_UNLIKELY(counter_changed(b1, c1)));
        if (node) {
            break;
        }
        do {
            c2 = read_even_counter(b2);
            node = cmap_find_in_bucket(b2, hash);
        } while (OVS_UNLIKELY(counter_changed(b2, c2)));
        if (node) {
            break;
        }
    } while (OVS_UNLIKELY(counter_changed(b1, c1)));

    return node;
}

/* Searches 'cmap' for an element with the specified 'hash'.  If one or more is
 * found, returns a pointer to the first one, otherwise a null pointer.  All of
 * the nodes on the returned list are guaranteed to have exactly the given
 * 'hash'.
 *
 * This function works even if 'cmap' is changing concurrently.  If 'cmap' is
 * not changing, then cmap_find_protected() is slightly faster.
 *
 * CMAP_FOR_EACH_WITH_HASH is usually more convenient. */
const struct cmap_node *
cmap_find(const struct cmap *cmap, uint32_t hash)
{
    const struct cmap_impl *impl = cmap_get_impl(cmap);
    uint32_t h1 = rehash(impl, hash);
    uint32_t h2 = other_hash(h1);

    return cmap_find__(&impl->buckets[h1 & impl->mask],
                       &impl->buckets[h2 & impl->mask],
                       hash);
}

/* Looks up each of the 'hashes' whose corresponding bit in 'map' is 1, and
 * sets the corresponding pointer in 'nodes' if the hash value was found in
 * 'cmap'.  Otherwise the 'nodes' values are not changed, i.e., no NULL
 * pointers are stored there.
 * Returns a map where a bit is set to 1 if the corresponding 'nodes' pointer
 * was stored, 0 otherwise.
 * Generally, the caller wants to use CMAP_NODE_FOR_EACH to check for hash
 * collisions. */
unsigned long
cmap_find_batch(const struct cmap *cmap, unsigned long map,
                uint32_t hashes[], const struct cmap_node *nodes[])
{
    const struct cmap_impl *impl = cmap_get_impl(cmap);
    unsigned long result = map;
    int i;
    uint32_t h1s[sizeof map * CHAR_BIT];
    const struct cmap_bucket *b1s[sizeof map * CHAR_BIT];
    const struct cmap_bucket *b2s[sizeof map * CHAR_BIT];
    uint32_t c1s[sizeof map * CHAR_BIT];

    /* Compute hashes and prefetch 1st buckets. */
    ULLONG_FOR_EACH_1(i, map) {
        h1s[i] = rehash(impl, hashes[i]);
        b1s[i] = &impl->buckets[h1s[i] & impl->mask];
        OVS_PREFETCH(b1s[i]);
    }
    /* Lookups, Round 1.  Only look up at the first bucket. */
    ULLONG_FOR_EACH_1(i, map) {
        uint32_t c1;
        const struct cmap_bucket *b1 = b1s[i];
        const struct cmap_node *node;

        do {
            c1 = read_even_counter(b1);
            node = cmap_find_in_bucket(b1, hashes[i]);
        } while (OVS_UNLIKELY(counter_changed(b1, c1)));

        if (!node) {
            /* Not found (yet); prefetch the 2nd bucket. */
            b2s[i] = &impl->buckets[other_hash(h1s[i]) & impl->mask];
            OVS_PREFETCH(b2s[i]);
            c1s[i] = c1; /* We may need to check this after Round 2. */
            continue;
        }
        /* Found. */
        ULLONG_SET0(map, i); /* Ignore this on round 2. */
        OVS_PREFETCH(node);
        nodes[i] = node;
    }
    /* Round 2.  Look into the 2nd bucket, if needed. */
    ULLONG_FOR_EACH_1(i, map) {
        uint32_t c2;
        const struct cmap_bucket *b2 = b2s[i];
        const struct cmap_node *node;

        do {
            c2 = read_even_counter(b2);
            node = cmap_find_in_bucket(b2, hashes[i]);
        } while (OVS_UNLIKELY(counter_changed(b2, c2)));

        if (!node) {
            /* Not found, but the node may have been moved from b2 to b1 right
             * after we finished with b1 earlier.  We just got a clean reading
             * of the 2nd bucket, so we check the counter of the 1st bucket
             * only.  However, we need to check both buckets again, as the
             * entry may be moved again to the 2nd bucket.  Basically, we
             * need to loop as long as it takes to get stable readings of
             * both buckets.  cmap_find__() does that, and now that we have
             * fetched both buckets we can just use it. */
            if (OVS_UNLIKELY(counter_changed(b1s[i], c1s[i]))) {
                node = cmap_find__(b1s[i], b2s[i], hashes[i]);
                if (node) {
                    goto found;
                }
            }
            /* Not found. */
            ULLONG_SET0(result, i); /* Fix the result. */
            continue;
        }
found:
        OVS_PREFETCH(node);
        nodes[i] = node;
    }
    return result;
}
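
/* Illustrative sketch (not part of the implementation): a batch lookup of two
 * hypothetical hash values 'hash_a' and 'hash_b'.
 *
 *     uint32_t hashes[2] = { hash_a, hash_b };
 *     const struct cmap_node *nodes[2];
 *     unsigned long found = cmap_find_batch(cmap, 0x3, hashes, nodes);
 *
 * Bit i of 'found' is set iff a node with hashes[i] was found; only those
 * nodes[i] entries have been written.  Each returned node still needs a
 * client key comparison (e.g. with CMAP_NODE_FOR_EACH) to rule out hash
 * collisions. */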

static int
cmap_find_slot_protected(struct cmap_bucket *b, uint32_t hash)
{
    int i;

    for (i = 0; i < CMAP_K; i++) {
        if (b->hashes[i] == hash && cmap_node_next_protected(&b->nodes[i])) {
            return i;
        }
    }
    return -1;
}

static struct cmap_node *
cmap_find_bucket_protected(struct cmap_impl *impl, uint32_t hash, uint32_t h)
{
    struct cmap_bucket *b = &impl->buckets[h & impl->mask];
    int i;

    for (i = 0; i < CMAP_K; i++) {
        if (b->hashes[i] == hash) {
            return cmap_node_next_protected(&b->nodes[i]);
        }
    }
    return NULL;
}

/* Like cmap_find(), but only for use if 'cmap' cannot change concurrently.
 *
 * CMAP_FOR_EACH_WITH_HASH_PROTECTED is usually more convenient. */
struct cmap_node *
cmap_find_protected(const struct cmap *cmap, uint32_t hash)
{
    struct cmap_impl *impl = cmap_get_impl(cmap);
    uint32_t h1 = rehash(impl, hash);
    uint32_t h2 = other_hash(h1);
    struct cmap_node *node;

    node = cmap_find_bucket_protected(impl, hash, h1);
    if (node) {
        return node;
    }
    return cmap_find_bucket_protected(impl, hash, h2);
}

static int
cmap_find_empty_slot_protected(const struct cmap_bucket *b)
{
    int i;

    for (i = 0; i < CMAP_K; i++) {
        if (!cmap_node_next_protected(&b->nodes[i])) {
            return i;
        }
    }
    return -1;
}

static void
cmap_set_bucket(struct cmap_bucket *b, int i,
                struct cmap_node *node, uint32_t hash)
{
    uint32_t c;

    atomic_read_explicit(&b->counter, &c, memory_order_acquire);
    atomic_store_explicit(&b->counter, c + 1, memory_order_release);
    ovsrcu_set(&b->nodes[i].next, node); /* Also atomic. */
    b->hashes[i] = hash;
    atomic_store_explicit(&b->counter, c + 2, memory_order_release);
}

/* Searches 'b' for a node with the given 'hash'.  If it finds one, adds
 * 'new_node' to the node's linked list and returns true.  If it does not find
 * one, returns false. */
static bool
cmap_insert_dup(struct cmap_node *new_node, uint32_t hash,
                struct cmap_bucket *b)
{
    int i;

    for (i = 0; i < CMAP_K; i++) {
        if (b->hashes[i] == hash) {
            struct cmap_node *node = cmap_node_next_protected(&b->nodes[i]);

            if (node) {
                struct cmap_node *p;

                /* The common case is that 'new_node' is a singleton,
                 * with a null 'next' pointer.  Rehashing can add a
                 * longer chain, but due to our invariant of always
                 * having all nodes with the same (user) hash value at
                 * a single chain, rehashing will always insert the
                 * chain to an empty node.  The only way we can end up
                 * here is by the user inserting a chain of nodes at
                 * once.  Find the end of the chain starting at
                 * 'new_node', then splice 'node' to the end of that
                 * chain. */
                p = new_node;
                for (;;) {
                    struct cmap_node *next = cmap_node_next_protected(p);

                    if (!next) {
                        break;
                    }
                    p = next;
                }
                ovsrcu_set_hidden(&p->next, node);
            } else {
                /* The hash value is there from some previous insertion, but
                 * the associated node has been removed.  We're not really
                 * inserting a duplicate, but we can still reuse the slot.
                 * Carry on. */
            }

            /* Change the bucket to point to 'new_node'.  This is a degenerate
             * form of cmap_set_bucket() that doesn't update the counter since
             * we're only touching one field and in a way that doesn't change
             * the bucket's meaning for readers. */
            ovsrcu_set(&b->nodes[i].next, new_node);

            return true;
        }
    }
    return false;
}

/* Searches 'b' for an empty slot.  If successful, stores 'node' and 'hash' in
 * the slot and returns true.  Otherwise, returns false. */
static bool
cmap_insert_bucket(struct cmap_node *node, uint32_t hash,
                   struct cmap_bucket *b)
{
    int i;

    for (i = 0; i < CMAP_K; i++) {
        if (!cmap_node_next_protected(&b->nodes[i])) {
            cmap_set_bucket(b, i, node, hash);
            return true;
        }
    }
    return false;
}

/* Returns the other bucket that b->nodes[slot] could occupy in 'impl'.  (This
 * might be the same as 'b'.) */
static struct cmap_bucket *
other_bucket_protected(struct cmap_impl *impl, struct cmap_bucket *b, int slot)
{
    uint32_t h1 = rehash(impl, b->hashes[slot]);
    uint32_t h2 = other_hash(h1);
    uint32_t b_idx = b - impl->buckets;
    uint32_t other_h = (h1 & impl->mask) == b_idx ? h2 : h1;

    return &impl->buckets[other_h & impl->mask];
}

/* 'new_node' is to be inserted into 'impl', but both candidate buckets 'b1'
 * and 'b2' are full.  This function attempts to rearrange buckets within
 * 'impl' to make room for 'new_node'.
 *
 * The implementation is a general-purpose breadth-first search.  At first
 * glance, this is more complex than a random walk through 'impl' (suggested by
 * some references), but random walks have a tendency to loop back through a
 * single bucket.  We have to move nodes backward along the path that we find,
 * so that no node actually disappears from the hash table, which means a
 * random walk would have to be careful to deal with loops.  By contrast, a
 * successful breadth-first search always finds a *shortest* path through the
 * hash table, and a shortest path will never contain loops, so it avoids that
 * problem entirely.
 */
static bool
cmap_insert_bfs(struct cmap_impl *impl, struct cmap_node *new_node,
                uint32_t hash, struct cmap_bucket *b1, struct cmap_bucket *b2)
{
    enum { MAX_DEPTH = 4 };

    /* A path from 'start' to 'end' via the 'n' steps in 'slots[]'.
     *
     * One can follow the path via:
     *
     *     struct cmap_bucket *b;
     *     int i;
     *
     *     b = path->start;
     *     for (i = 0; i < path->n; i++) {
     *         b = other_bucket_protected(impl, b, path->slots[i]);
     *     }
     *     ovs_assert(b == path->end);
     */
    struct cmap_path {
        struct cmap_bucket *start; /* First bucket along the path. */
        struct cmap_bucket *end;   /* Last bucket on the path. */
        uint8_t slots[MAX_DEPTH];  /* Slots used for each hop. */
        int n;                     /* Number of slots[]. */
    };

    /* We need to limit the amount of work we do trying to find a path.  It
     * might actually be impossible to rearrange the cmap, and after some time
     * it is likely to be easier to rehash the entire cmap.
     *
     * This value of MAX_QUEUE is an arbitrary limit suggested by one of the
     * references.  Empirically, it seems to work OK. */
    enum { MAX_QUEUE = 500 };
    struct cmap_path queue[MAX_QUEUE];
    int head = 0;
    int tail = 0;

    /* Add 'b1' and 'b2' as starting points for the search. */
    queue[head].start = b1;
    queue[head].end = b1;
    queue[head].n = 0;
    head++;
    if (b1 != b2) {
        queue[head].start = b2;
        queue[head].end = b2;
        queue[head].n = 0;
        head++;
    }

    while (tail < head) {
        const struct cmap_path *path = &queue[tail++];
        struct cmap_bucket *this = path->end;
        int i;

        for (i = 0; i < CMAP_K; i++) {
            struct cmap_bucket *next = other_bucket_protected(impl, this, i);
            int j;

            if (this == next) {
                continue;
            }

            j = cmap_find_empty_slot_protected(next);
            if (j >= 0) {
                /* We've found a path along which we can rearrange the hash
                 * table: Start at path->start, follow all the slots in
                 * path->slots[], then follow slot 'i', then the bucket you
                 * arrive at has slot 'j' empty. */
                struct cmap_bucket *buckets[MAX_DEPTH + 2];
                int slots[MAX_DEPTH + 2];
                int k;

                /* Figure out the full sequence of slots. */
                for (k = 0; k < path->n; k++) {
                    slots[k] = path->slots[k];
                }
                slots[path->n] = i;
                slots[path->n + 1] = j;

                /* Figure out the full sequence of buckets. */
                buckets[0] = path->start;
                for (k = 0; k <= path->n; k++) {
                    buckets[k + 1] = other_bucket_protected(impl, buckets[k],
                                                            slots[k]);
                }

                /* Now the path is fully expressed.  One can start from
                 * buckets[0], go via slots[0] to buckets[1], via slots[1] to
                 * buckets[2], and so on.
                 *
                 * Move all the nodes across the path "backward".  After each
                 * step some node appears in two buckets.  Thus, every node is
                 * always visible to a concurrent search. */
                for (k = path->n + 1; k > 0; k--) {
                    int slot = slots[k - 1];

                    cmap_set_bucket(
                        buckets[k], slots[k],
                        cmap_node_next_protected(&buckets[k - 1]->nodes[slot]),
                        buckets[k - 1]->hashes[slot]);
                }

                /* Finally, replace the first node on the path by
                 * 'new_node'. */
                cmap_set_bucket(buckets[0], slots[0], new_node, hash);

                return true;
            }

            if (path->n < MAX_DEPTH && head < MAX_QUEUE) {
                struct cmap_path *new_path = &queue[head++];

                *new_path = *path;
                new_path->end = next;
                new_path->slots[new_path->n++] = i;
            }
        }
    }

    return false;
}

/* Adds 'node', with the given 'hash', to 'impl'.
 *
 * 'node' is ordinarily a single node, with a null 'next' pointer.  When
 * rehashing, however, it may be a longer chain of nodes. */
static bool
cmap_try_insert(struct cmap_impl *impl, struct cmap_node *node, uint32_t hash)
{
    uint32_t h1 = rehash(impl, hash);
    uint32_t h2 = other_hash(h1);
    struct cmap_bucket *b1 = &impl->buckets[h1 & impl->mask];
    struct cmap_bucket *b2 = &impl->buckets[h2 & impl->mask];

    return (OVS_UNLIKELY(cmap_insert_dup(node, hash, b1) ||
                         cmap_insert_dup(node, hash, b2)) ||
            OVS_LIKELY(cmap_insert_bucket(node, hash, b1) ||
                       cmap_insert_bucket(node, hash, b2)) ||
            cmap_insert_bfs(impl, node, hash, b1, b2));
}

/* Inserts 'node', with the given 'hash', into 'cmap'.  The caller must ensure
 * that 'cmap' cannot change concurrently (from another thread).  If duplicates
 * are undesirable, the caller must have already verified that 'cmap' does not
 * contain a duplicate of 'node'.
 *
 * Returns the current number of nodes in the cmap after the insertion. */
size_t
cmap_insert(struct cmap *cmap, struct cmap_node *node, uint32_t hash)
{
    struct cmap_impl *impl = cmap_get_impl(cmap);

    ovsrcu_set_hidden(&node->next, NULL);

    if (OVS_UNLIKELY(impl->n >= impl->max_n)) {
        COVERAGE_INC(cmap_expand);
        impl = cmap_rehash(cmap, (impl->mask << 1) | 1);
    }

    while (OVS_UNLIKELY(!cmap_try_insert(impl, node, hash))) {
        impl = cmap_rehash(cmap, impl->mask);
    }
    return ++impl->n;
}

static bool
cmap_replace__(struct cmap_impl *impl, struct cmap_node *node,
               struct cmap_node *replacement, uint32_t hash, uint32_t h)
{
    struct cmap_bucket *b = &impl->buckets[h & impl->mask];
    int slot;

    slot = cmap_find_slot_protected(b, hash);
    if (slot < 0) {
        return false;
    }

    /* The pointer to 'node' is changed to point to 'replacement',
     * which is the next node if no replacement node is given. */
    if (!replacement) {
        replacement = cmap_node_next_protected(node);
    } else {
        /* 'replacement' takes the position of 'node' in the list. */
        ovsrcu_set_hidden(&replacement->next, cmap_node_next_protected(node));
    }

    struct cmap_node *iter = &b->nodes[slot];
    for (;;) {
        struct cmap_node *next = cmap_node_next_protected(iter);

        if (next == node) {
            ovsrcu_set(&iter->next, replacement);
            return true;
        }
        iter = next;
    }
}

/* Replaces 'old_node' in 'cmap' with 'new_node'.  The caller must
 * ensure that 'cmap' cannot change concurrently (from another thread).
 *
 * 'old_node' must not be destroyed or modified or inserted back into 'cmap' or
 * into any other concurrent hash map while any other thread might be accessing
 * it.  One correct way to do this is to free it from an RCU callback with
 * ovsrcu_postpone().
 *
 * Returns the current number of nodes in the cmap after the replacement.  The
 * number of nodes decreases by one if 'new_node' is NULL. */
size_t
cmap_replace(struct cmap *cmap, struct cmap_node *old_node,
             struct cmap_node *new_node, uint32_t hash)
{
    struct cmap_impl *impl = cmap_get_impl(cmap);
    uint32_t h1 = rehash(impl, hash);
    uint32_t h2 = other_hash(h1);
    bool ok;

    ok = cmap_replace__(impl, old_node, new_node, hash, h1)
        || cmap_replace__(impl, old_node, new_node, hash, h2);
    ovs_assert(ok);

    if (!new_node) {
        impl->n--;
        if (OVS_UNLIKELY(impl->n < impl->min_n)) {
            COVERAGE_INC(cmap_shrink);
            impl = cmap_rehash(cmap, impl->mask >> 1);
        }
    }
    return impl->n;
}
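
/* Illustrative sketch (not part of the implementation), reusing the
 * hypothetical 'struct foo' from the usage sketch near the top of this file:
 * removing a node and freeing it only after concurrent readers have quiesced.
 *
 *     cmap_replace(&map, &f->node, NULL, foo_hash(f->key));
 *     ovsrcu_postpone(free, f);
 */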

static bool
cmap_try_rehash(const struct cmap_impl *old, struct cmap_impl *new)
{
    const struct cmap_bucket *b;

    for (b = old->buckets; b <= &old->buckets[old->mask]; b++) {
        int i;

        for (i = 0; i < CMAP_K; i++) {
            /* Possible optimization here because we know the hashes are
             * unique. */
            struct cmap_node *node = cmap_node_next_protected(&b->nodes[i]);

            if (node && !cmap_try_insert(new, node, b->hashes[i])) {
                return false;
            }
        }
    }
    return true;
}

static struct cmap_impl *
cmap_rehash(struct cmap *cmap, uint32_t mask)
{
    struct cmap_impl *old = cmap_get_impl(cmap);
    struct cmap_impl *new;

    new = cmap_impl_create(mask);
    ovs_assert(old->n < new->max_n);

    while (!cmap_try_rehash(old, new)) {
        memset(new->buckets, 0, (mask + 1) * sizeof *new->buckets);
        new->basis = random_uint32();
    }

    new->n = old->n;
    ovsrcu_set(&cmap->impl, new);
    if (old != &empty_cmap) {
        ovsrcu_postpone(free_cacheline, old);
    }

    return new;
}

struct cmap_cursor
cmap_cursor_start(const struct cmap *cmap)
{
    struct cmap_cursor cursor;

    cursor.impl = cmap_get_impl(cmap);
    cursor.bucket_idx = 0;
    cursor.entry_idx = 0;
    cursor.node = NULL;
    cmap_cursor_advance(&cursor);

    return cursor;
}

void
cmap_cursor_advance(struct cmap_cursor *cursor)
{
    const struct cmap_impl *impl = cursor->impl;

    if (cursor->node) {
        cursor->node = cmap_node_next(cursor->node);
        if (cursor->node) {
            return;
        }
    }

    while (cursor->bucket_idx <= impl->mask) {
        const struct cmap_bucket *b = &impl->buckets[cursor->bucket_idx];

        while (cursor->entry_idx < CMAP_K) {
            cursor->node = cmap_node_next(&b->nodes[cursor->entry_idx++]);
            if (cursor->node) {
                return;
            }
        }

        cursor->bucket_idx++;
        cursor->entry_idx = 0;
    }
}

/* Returns the next node in 'cmap' in hash order, or NULL if no nodes remain in
 * 'cmap'.  Uses '*pos' to determine where to begin iteration, and updates
 * '*pos' before returning so that the next call continues from where this one
 * left off.
 *
 * It's better to use plain CMAP_FOR_EACH and related functions, since they are
 * faster and better at dealing with cmaps that change during iteration.
 *
 * Before beginning iteration, set '*pos' to all zeros. */
struct cmap_node *
cmap_next_position(const struct cmap *cmap,
                   struct cmap_position *pos)
{
    struct cmap_impl *impl = cmap_get_impl(cmap);
    unsigned int bucket = pos->bucket;
    unsigned int entry = pos->entry;
    unsigned int offset = pos->offset;

    while (bucket <= impl->mask) {
        const struct cmap_bucket *b = &impl->buckets[bucket];

        while (entry < CMAP_K) {
            const struct cmap_node *node = cmap_node_next(&b->nodes[entry]);
            unsigned int i;

            for (i = 0; node; i++, node = cmap_node_next(node)) {
                if (i == offset) {
                    if (cmap_node_next(node)) {
                        offset++;
                    } else {
                        entry++;
                        offset = 0;
                    }
                    pos->bucket = bucket;
                    pos->entry = entry;
                    pos->offset = offset;
                    return CONST_CAST(struct cmap_node *, node);
                }
            }

            entry++;
            offset = 0;
        }

        bucket++;
        entry = offset = 0;
    }

    pos->bucket = pos->entry = pos->offset = 0;
    return NULL;
}