/*
 * Copyright (c) 2014 Nicira, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <config.h>
#include "cmap.h"
#include "coverage.h"
#include "bitmap.h"
#include "hash.h"
#include "ovs-rcu.h"
#include "random.h"
#include "util.h"

COVERAGE_DEFINE(cmap_expand);
COVERAGE_DEFINE(cmap_shrink);

/* Optimistic Concurrent Cuckoo Hash
 * =================================
 *
 * A "cuckoo hash" is an open addressing hash table scheme, designed such that
 * a given element can be in one of only a small number of buckets 'd', each of
 * which holds up to a small number 'k' of elements. Thus, the expected and
 * worst-case lookup times are O(1) because they require comparing no more than
 * a fixed number of elements (k * d). Inserting a new element can require
 * moving around existing elements, but it is also O(1) amortized expected
 * time.
 *
 * An optimistic concurrent hash table goes one step further, making it
 * possible for a single writer to execute concurrently with any number of
 * readers without requiring the readers to take any locks.
 *
 * This cuckoo hash implementation uses:
 *
 *    - Two hash functions (d=2). More hash functions allow for a higher load
 *      factor, but increasing 'k' is easier and the benefits of increasing 'd'
 *      quickly fall off with the 'k' values used here. Also, the method of
 *      generating hashes used in this implementation is hard to reasonably
 *      extend beyond d=2. Finally, each additional hash function means that a
 *      lookup has to look at least one extra cache line.
 *
 *    - 5 or 7 elements per bucket (k=5 or k=7), chosen to make buckets
 *      exactly one cache line in size.
 *
 * According to Erlingsson [4], these parameters suggest a maximum load factor
 * of about 93%. The current implementation is conservative, expanding the
 * hash table when it is over 85% full.
 *
 * When the load factor drops below 20%, the hash table is shrunk to half its
 * size. This reduces the memory utilization of the hash table and keeps it
 * from occupying the chunk at the top of the heap, which would otherwise
 * prevent the heap from being trimmed.
 *
 * Hash Functions
 * ==============
 *
 * A cuckoo hash requires multiple hash functions. When reorganizing the hash
 * becomes too difficult, it also requires the ability to change the hash
 * functions. Requiring the client to provide multiple hashes and to be able
 * to change them to new hashes upon insertion is inconvenient.
 *
 * This implementation takes another approach. The client provides a single,
 * fixed hash. The cuckoo hash internally "rehashes" this hash against a
 * randomly selected basis value (see rehash()). This rehashed value is one of
 * the two hashes. The other hash is computed by 16-bit circular rotation of
 * the rehashed value. Updating the basis changes the hash functions.
 *
 * To work properly, the hash functions used by a cuckoo hash must be
 * independent. If one hash function is a function of the other (e.g. h2(x) =
 * h1(x) + 1, or h2(x) = hash(h1(x))), then insertion will eventually fail
 * catastrophically (loop forever) because of collisions. With this rehashing
 * technique, the two hashes are completely independent for masks up to 16 bits
 * wide. For masks wider than 16 bits, only 32-n bits are independent between
 * the two hashes. Thus, it becomes risky to grow a cuckoo hash table beyond
 * about 2**24 buckets (about 71 million elements with k=5 and maximum load
 * 85%). Fortunately, Open vSwitch does not normally deal with hash tables
 * this large.
 *
 *
 * Handling Duplicates
 * ===================
 *
 * This cuckoo hash table implementation deals with duplicate client-provided
 * hash values by chaining: the second and subsequent cmap_nodes with a given
 * hash are chained off the initially inserted node's 'next' member. The hash
 * table maintains the invariant that a single client-provided hash value
 * exists in only a single chain in a single bucket (even though that hash
 * could be stored in two buckets).
 *
 *
 * References
 * ==========
 *
 * [1] D. Zhou, B. Fan, H. Lim, M. Kaminsky, D. G. Andersen, "Scalable, High
 *     Performance Ethernet Forwarding with CuckooSwitch". In Proc. 9th
 *     CoNEXT, Dec. 2013.
 *
 * [2] B. Fan, D. G. Andersen, and M. Kaminsky. "MemC3: Compact and concurrent
 *     memcache with dumber caching and smarter hashing". In Proc. 10th USENIX
 *     NSDI, Apr. 2013.
 *
 * [3] R. Pagh and F. Rodler. "Cuckoo hashing". Journal of Algorithms, 51(2):
 *     122-144, May 2004.
 *
 * [4] U. Erlingsson, M. Manasse, F. McSherry, "A Cool and Practical
 *     Alternative to Traditional Hash Tables". In Proc. 7th Workshop on
 *     Distributed Data and Structures (WDAS'06), 2006.
 */
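
/* Example (illustrative sketch)
 * =============================
 *
 * A minimal sketch of typical client usage, assuming a hypothetical
 * 'struct flow_entry' with an integer key; the struct and 'my_map' below are
 * not part of this file:
 *
 *     struct flow_entry {
 *         struct cmap_node node;   // Embedded linkage for the cmap.
 *         int key;
 *         int value;
 *     };
 *
 *     struct cmap my_map;
 *     cmap_init(&my_map);
 *
 *     // Writer side (a single writer, or externally synchronized writers):
 *     struct flow_entry *e = xmalloc(sizeof *e);
 *     e->key = 42;
 *     e->value = 1;
 *     cmap_insert(&my_map, &e->node, hash_int(e->key, 0));
 *
 *     // Reader side (may run concurrently with the writer, no locks taken);
 *     // the hash may collide, so the key is still compared explicitly:
 *     struct flow_entry *found;
 *     CMAP_FOR_EACH_WITH_HASH (found, node, hash_int(42, 0), &my_map) {
 *         if (found->key == 42) {
 *             // Use found->value.
 *         }
 *     }
 */
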
/* An entry is an int and a pointer: 8 bytes on 32-bit, 12 bytes on 64-bit. */
#define CMAP_ENTRY_SIZE (4 + (UINTPTR_MAX == UINT32_MAX ? 4 : 8))

/* Number of entries per bucket: 7 on 32-bit, 5 on 64-bit. */
#define CMAP_K ((CACHE_LINE_SIZE - 4) / CMAP_ENTRY_SIZE)

/* Pad to make a bucket a full cache line in size: 4 on 32-bit, 0 on 64-bit. */
#define CMAP_PADDING ((CACHE_LINE_SIZE - 4) - (CMAP_K * CMAP_ENTRY_SIZE))
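
/* For illustration only: the arithmetic above, worked out with the usual
 * 64-byte CACHE_LINE_SIZE (a sketch of the consequences, not a definition):
 *
 *     64-bit: CMAP_ENTRY_SIZE = 4 + 8 = 12
 *             CMAP_K          = (64 - 4) / 12 = 5
 *             CMAP_PADDING    = (64 - 4) - 5 * 12 = 0
 *
 *     32-bit: CMAP_ENTRY_SIZE = 4 + 4 = 8
 *             CMAP_K          = (64 - 4) / 8 = 7
 *             CMAP_PADDING    = (64 - 4) - 7 * 8 = 4
 *
 * The 4 bytes subtracted in each case are taken by the bucket's 'counter'
 * member, so a bucket always totals exactly one cache line. */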

/* A cuckoo hash bucket. Designed to be cache-aligned and exactly one cache
 * line long. */
struct cmap_bucket {
    /* Allows readers to track in-progress changes. Initially zero, each
     * writer increments this value just before and just after each change (see
     * cmap_set_bucket()). Thus, a reader can ensure that it gets a consistent
     * snapshot by waiting for the counter to become even (see
     * read_even_counter()), then checking that its value does not change while
     * examining the bucket (see cmap_find()). */
    atomic_uint32_t counter;

    /* (hash, node) slots. They are parallel arrays instead of an array of
     * structs to reduce the amount of space lost to padding.
     *
     * The slots are in no particular order. A null pointer indicates that a
     * pair is unused. In-use slots are not necessarily in the earliest
     * slots. */
    uint32_t hashes[CMAP_K];
    struct cmap_node nodes[CMAP_K];

    /* Padding to make cmap_bucket exactly one cache line long. */
#if CMAP_PADDING > 0
    uint8_t pad[CMAP_PADDING];
#endif
};
BUILD_ASSERT_DECL(sizeof(struct cmap_bucket) == CACHE_LINE_SIZE);

/* Default maximum load factor (as a fraction of UINT32_MAX + 1) before
 * enlarging a cmap. Reasonable values lie between about 75% and 93%. Smaller
 * values waste memory; larger values increase the average insertion time. */
#define CMAP_MAX_LOAD ((uint32_t) (UINT32_MAX * .85))

/* Default minimum load factor (as a fraction of UINT32_MAX + 1) before
 * shrinking a cmap. The value is currently 20%, which means that after a
 * shrink halves the number of buckets, the cmap ends up at a 40% load
 * factor. */
#define CMAP_MIN_LOAD ((uint32_t) (UINT32_MAX * .20))

/* The implementation of a concurrent hash map. */
struct cmap_impl {
    unsigned int n;             /* Number of in-use elements. */
    unsigned int max_n;         /* Max elements before enlarging. */
    unsigned int min_n;         /* Min elements before shrinking. */
    uint32_t mask;              /* Number of 'buckets', minus one. */
    uint32_t basis;             /* Basis for rehashing client's hash values. */

    /* Padding to make cmap_impl exactly one cache line long. */
    uint8_t pad[CACHE_LINE_SIZE - sizeof(unsigned int) * 5];

    struct cmap_bucket buckets[];
};
BUILD_ASSERT_DECL(sizeof(struct cmap_impl) == CACHE_LINE_SIZE);

static struct cmap_impl *cmap_rehash(struct cmap *, uint32_t mask);

/* Explicit inline keywords in utility functions seem to be necessary
 * to prevent performance regression on cmap_find(). */

/* Given a rehashed value 'hash', returns the other hash for that rehashed
 * value. This is symmetric: other_hash(other_hash(x)) == x. (See also "Hash
 * Functions" at the top of this file.) */
static inline uint32_t
other_hash(uint32_t hash)
{
    return (hash << 16) | (hash >> 16);
}
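
/* For example (illustrative only): other_hash(0x1234abcd) == 0xabcd1234, and
 * rotating by 16 bits again restores the original value, which is why the
 * function is its own inverse. */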

/* Returns the rehashed value for 'hash' within 'impl'. (See also "Hash
 * Functions" at the top of this file.) */
static inline uint32_t
rehash(const struct cmap_impl *impl, uint32_t hash)
{
    return hash_finish(impl->basis, hash);
}

/* Not always inlined without the explicit 'inline' keyword. */
static inline struct cmap_impl *
cmap_get_impl(const struct cmap *cmap)
{
    return ovsrcu_get(struct cmap_impl *, &cmap->impl);
}

static uint32_t
calc_max_n(uint32_t mask)
{
    return ((uint64_t) (mask + 1) * CMAP_K * CMAP_MAX_LOAD) >> 32;
}

static uint32_t
calc_min_n(uint32_t mask)
{
    return ((uint64_t) (mask + 1) * CMAP_K * CMAP_MIN_LOAD) >> 32;
}
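
/* A worked reading of the fixed-point arithmetic above (illustrative only):
 * CMAP_MAX_LOAD and CMAP_MIN_LOAD are load factors scaled by 2**32, so
 * multiplying by the total slot count and shifting right by 32 yields
 * approximately (mask + 1) * CMAP_K * 0.85 and (mask + 1) * CMAP_K * 0.20,
 * respectively. For instance, with mask = 3 (4 buckets) and CMAP_K = 5 there
 * are 20 slots, so max_n comes out near 20 * 0.85 = 17 and min_n near
 * 20 * 0.20 = 4 (each may be one lower because the arithmetic truncates). */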

static struct cmap_impl *
cmap_impl_create(uint32_t mask)
{
    struct cmap_impl *impl;

    ovs_assert(is_pow2(mask + 1));

    impl = xzalloc_cacheline(sizeof *impl
                             + (mask + 1) * sizeof *impl->buckets);
    impl->n = 0;
    impl->max_n = calc_max_n(mask);
    impl->min_n = calc_min_n(mask);
    impl->mask = mask;
    impl->basis = random_uint32();

    return impl;
}

/* Initializes 'cmap' as an empty concurrent hash map. */
void
cmap_init(struct cmap *cmap)
{
    ovsrcu_set(&cmap->impl, cmap_impl_create(0));
}

/* Destroys 'cmap'.
 *
 * The client is responsible for destroying any data previously held in
 * 'cmap'. */
void
cmap_destroy(struct cmap *cmap)
{
    if (cmap) {
        ovsrcu_postpone(free_cacheline, cmap_get_impl(cmap));
    }
}

/* Returns the number of elements in 'cmap'. */
size_t
cmap_count(const struct cmap *cmap)
{
    return cmap_get_impl(cmap)->n;
}

/* Returns true if 'cmap' is empty, false otherwise. */
bool
cmap_is_empty(const struct cmap *cmap)
{
    return cmap_count(cmap) == 0;
}

static inline uint32_t
read_counter(const struct cmap_bucket *bucket_)
{
    struct cmap_bucket *bucket = CONST_CAST(struct cmap_bucket *, bucket_);
    uint32_t counter;

    atomic_read_explicit(&bucket->counter, &counter, memory_order_acquire);

    return counter;
}

static inline uint32_t
read_even_counter(const struct cmap_bucket *bucket)
{
    uint32_t counter;

    do {
        counter = read_counter(bucket);
    } while (OVS_UNLIKELY(counter & 1));

    return counter;
}

static inline bool
counter_changed(const struct cmap_bucket *b_, uint32_t c)
{
    struct cmap_bucket *b = CONST_CAST(struct cmap_bucket *, b_);
    uint32_t counter;

    /* Need to make sure the counter read is not moved up, before the hash and
     * cmap_node_next(). Using atomic_read_explicit with memory_order_acquire
     * would allow prior reads to be moved after the barrier.
     * atomic_thread_fence prevents all following memory accesses from moving
     * prior to preceding loads. */
    atomic_thread_fence(memory_order_acquire);
    atomic_read_relaxed(&b->counter, &counter);

    return OVS_UNLIKELY(counter != c);
}
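
/* Together, read_even_counter() and counter_changed() implement an optimistic
 * (seqlock-style) read protocol. A sketch of the pattern, as used by the
 * lookup functions below (illustrative only):
 *
 *     uint32_t c;
 *
 *     do {
 *         c = read_even_counter(bucket);    // Wait out in-progress writes.
 *         // ... read hashes[] and nodes[] from the bucket ...
 *     } while (counter_changed(bucket, c)); // Retry if a write intervened.
 */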

static inline const struct cmap_node *
cmap_find_in_bucket(const struct cmap_bucket *bucket, uint32_t hash)
{
    for (int i = 0; i < CMAP_K; i++) {
        if (bucket->hashes[i] == hash) {
            return cmap_node_next(&bucket->nodes[i]);
        }
    }
    return NULL;
}

static inline const struct cmap_node *
cmap_find__(const struct cmap_bucket *b1, const struct cmap_bucket *b2,
            uint32_t hash)
{
    uint32_t c1, c2;
    const struct cmap_node *node;

    do {
        do {
            c1 = read_even_counter(b1);
            node = cmap_find_in_bucket(b1, hash);
        } while (OVS_UNLIKELY(counter_changed(b1, c1)));
        if (node) {
            break;
        }
        do {
            c2 = read_even_counter(b2);
            node = cmap_find_in_bucket(b2, hash);
        } while (OVS_UNLIKELY(counter_changed(b2, c2)));
        if (node) {
            break;
        }
    } while (OVS_UNLIKELY(counter_changed(b1, c1)));

    return node;
}

/* Searches 'cmap' for an element with the specified 'hash'. If one or more is
 * found, returns a pointer to the first one, otherwise a null pointer. All of
 * the nodes on the returned list are guaranteed to have exactly the given
 * 'hash'.
 *
 * This function works even if 'cmap' is changing concurrently. If 'cmap' is
 * not changing, then cmap_find_protected() is slightly faster.
 *
 * CMAP_FOR_EACH_WITH_HASH is usually more convenient. */
const struct cmap_node *
cmap_find(const struct cmap *cmap, uint32_t hash)
{
    const struct cmap_impl *impl = cmap_get_impl(cmap);
    uint32_t h1 = rehash(impl, hash);
    uint32_t h2 = other_hash(h1);

    return cmap_find__(&impl->buckets[h1 & impl->mask],
                       &impl->buckets[h2 & impl->mask],
                       hash);
}
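
/* A sketch of direct cmap_find() use (illustrative only): since only the
 * 32-bit hash is matched, the caller still needs to compare its own key.
 * 'struct flow_entry' is a hypothetical client struct with an embedded
 * 'struct cmap_node node' member and an integer 'key':
 *
 *     const struct cmap_node *n = cmap_find(&my_map, hash_int(key, 0));
 *     struct flow_entry *e;
 *
 *     CMAP_NODE_FOR_EACH (e, node, n) {
 *         if (e->key == key) {
 *             // 'e' is the entry we were looking for.
 *         }
 *     }
 */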

/* Looks up each hash in 'hashes' whose corresponding bit in 'map' is 1, and
 * sets the corresponding pointer in 'nodes' if the hash value was found in
 * 'cmap'. Otherwise the 'nodes' value is left unchanged, i.e., no NULL
 * pointers are stored there.
 * Returns a map where a bit is set to 1 if the corresponding 'nodes' pointer
 * was stored, 0 otherwise.
 * Generally, the caller then wants to use CMAP_NODE_FOR_EACH to check for
 * hash collisions. */
unsigned long
cmap_find_batch(const struct cmap *cmap, unsigned long map,
                uint32_t hashes[], const struct cmap_node *nodes[])
{
    const struct cmap_impl *impl = cmap_get_impl(cmap);
    unsigned long result = map;
    int i;
    uint32_t h1s[sizeof map * CHAR_BIT];
    const struct cmap_bucket *b1s[sizeof map * CHAR_BIT];
    const struct cmap_bucket *b2s[sizeof map * CHAR_BIT];
    uint32_t c1s[sizeof map * CHAR_BIT];

    /* Compute hashes and prefetch 1st buckets. */
    ULLONG_FOR_EACH_1(i, map) {
        h1s[i] = rehash(impl, hashes[i]);
        b1s[i] = &impl->buckets[h1s[i] & impl->mask];
        OVS_PREFETCH(b1s[i]);
    }
    /* Lookups, Round 1. Only look up at the first bucket. */
    ULLONG_FOR_EACH_1(i, map) {
        uint32_t c1;
        const struct cmap_bucket *b1 = b1s[i];
        const struct cmap_node *node;

        do {
            c1 = read_even_counter(b1);
            node = cmap_find_in_bucket(b1, hashes[i]);
        } while (OVS_UNLIKELY(counter_changed(b1, c1)));

        if (!node) {
            /* Not found (yet); prefetch the 2nd bucket. */
            b2s[i] = &impl->buckets[other_hash(h1s[i]) & impl->mask];
            OVS_PREFETCH(b2s[i]);
            c1s[i] = c1; /* We may need to check this after Round 2. */
            continue;
        }
        /* Found. */
        ULLONG_SET0(map, i); /* Ignore this on round 2. */
        OVS_PREFETCH(node);
        nodes[i] = node;
    }
    /* Round 2. Look into the 2nd bucket, if needed. */
    ULLONG_FOR_EACH_1(i, map) {
        uint32_t c2;
        const struct cmap_bucket *b2 = b2s[i];
        const struct cmap_node *node;

        do {
            c2 = read_even_counter(b2);
            node = cmap_find_in_bucket(b2, hashes[i]);
        } while (OVS_UNLIKELY(counter_changed(b2, c2)));

        if (!node) {
            /* Not found, but the node may have been moved from b2 to b1 right
             * after we finished with b1 earlier. We just got a clean reading
             * of the 2nd bucket, so we check the counter of the 1st bucket
             * only. However, we need to check both buckets again, as the
             * entry may be moved again to the 2nd bucket. Basically, we
             * need to loop as long as it takes to get stable readings of
             * both buckets. cmap_find__() does that, and now that we have
             * fetched both buckets we can just use it. */
            if (OVS_UNLIKELY(counter_changed(b1s[i], c1s[i]))) {
                node = cmap_find__(b1s[i], b2s[i], hashes[i]);
                if (node) {
                    goto found;
                }
            }
            /* Not found. */
            ULLONG_SET0(result, i); /* Fix the result. */
            continue;
        }
found:
        OVS_PREFETCH(node);
        nodes[i] = node;
    }
    return result;
}
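
/* A sketch of cmap_find_batch() use (illustrative only; 'struct flow_entry'
 * and the 'keys' array are hypothetical client code):
 *
 *     int keys[2] = { 1, 2 };
 *     uint32_t hashes[2] = { hash_int(keys[0], 0), hash_int(keys[1], 0) };
 *     const struct cmap_node *nodes[2];
 *     unsigned long map = (1UL << 2) - 1;      // Bits 0 and 1: two lookups.
 *     unsigned long found = cmap_find_batch(&my_map, map, hashes, nodes);
 *     int i;
 *
 *     ULLONG_FOR_EACH_1 (i, found) {
 *         struct flow_entry *e;
 *
 *         CMAP_NODE_FOR_EACH (e, node, nodes[i]) {
 *             if (e->key == keys[i]) {
 *                 // Entry for keys[i] found.
 *             }
 *         }
 *     }
 */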

static int
cmap_find_slot_protected(struct cmap_bucket *b, uint32_t hash)
{
    int i;

    for (i = 0; i < CMAP_K; i++) {
        if (b->hashes[i] == hash && cmap_node_next_protected(&b->nodes[i])) {
            return i;
        }
    }
    return -1;
}

static struct cmap_node *
cmap_find_bucket_protected(struct cmap_impl *impl, uint32_t hash, uint32_t h)
{
    struct cmap_bucket *b = &impl->buckets[h & impl->mask];
    int i;

    for (i = 0; i < CMAP_K; i++) {
        if (b->hashes[i] == hash) {
            return cmap_node_next_protected(&b->nodes[i]);
        }
    }
    return NULL;
}

/* Like cmap_find(), but only for use if 'cmap' cannot change concurrently.
 *
 * CMAP_FOR_EACH_WITH_HASH_PROTECTED is usually more convenient. */
struct cmap_node *
cmap_find_protected(const struct cmap *cmap, uint32_t hash)
{
    struct cmap_impl *impl = cmap_get_impl(cmap);
    uint32_t h1 = rehash(impl, hash);
    uint32_t h2 = other_hash(h1);
    struct cmap_node *node;

    node = cmap_find_bucket_protected(impl, hash, h1);
    if (node) {
        return node;
    }
    return cmap_find_bucket_protected(impl, hash, h2);
}

static int
cmap_find_empty_slot_protected(const struct cmap_bucket *b)
{
    int i;

    for (i = 0; i < CMAP_K; i++) {
        if (!cmap_node_next_protected(&b->nodes[i])) {
            return i;
        }
    }
    return -1;
}

static void
cmap_set_bucket(struct cmap_bucket *b, int i,
                struct cmap_node *node, uint32_t hash)
{
    uint32_t c;

    atomic_read_explicit(&b->counter, &c, memory_order_acquire);
    atomic_store_explicit(&b->counter, c + 1, memory_order_release);
    ovsrcu_set(&b->nodes[i].next, node); /* Also atomic. */
    b->hashes[i] = hash;
    atomic_store_explicit(&b->counter, c + 2, memory_order_release);
}

/* Searches 'b' for a node with the given 'hash'. If it finds one, adds
 * 'new_node' to the node's linked list and returns true. If it does not find
 * one, returns false. */
static bool
cmap_insert_dup(struct cmap_node *new_node, uint32_t hash,
                struct cmap_bucket *b)
{
    int i;

    for (i = 0; i < CMAP_K; i++) {
        if (b->hashes[i] == hash) {
            struct cmap_node *node = cmap_node_next_protected(&b->nodes[i]);

            if (node) {
                struct cmap_node *p;

                /* The common case is that 'new_node' is a singleton,
                 * with a null 'next' pointer. Rehashing can add a
                 * longer chain, but due to our invariant of always
                 * keeping all nodes with the same (user) hash value in
                 * a single chain, rehashing will always insert the
                 * chain into an empty slot. The only way we can end up
                 * here is by the user inserting a chain of nodes at
                 * once. Find the end of the chain starting at
                 * 'new_node', then splice 'node' to the end of that
                 * chain. */
                p = new_node;
                for (;;) {
                    struct cmap_node *next = cmap_node_next_protected(p);

                    if (!next) {
                        break;
                    }
                    p = next;
                }
                ovsrcu_set_hidden(&p->next, node);
            } else {
                /* The hash value is there from some previous insertion, but
                 * the associated node has been removed. We're not really
                 * inserting a duplicate, but we can still reuse the slot.
                 * Carry on. */
            }

            /* Change the bucket to point to 'new_node'. This is a degenerate
             * form of cmap_set_bucket() that doesn't update the counter since
             * we're only touching one field and in a way that doesn't change
             * the bucket's meaning for readers. */
            ovsrcu_set(&b->nodes[i].next, new_node);

            return true;
        }
    }
    return false;
}

/* Searches 'b' for an empty slot. If successful, stores 'node' and 'hash' in
 * the slot and returns true. Otherwise, returns false. */
static bool
cmap_insert_bucket(struct cmap_node *node, uint32_t hash,
                   struct cmap_bucket *b)
{
    int i;

    for (i = 0; i < CMAP_K; i++) {
        if (!cmap_node_next_protected(&b->nodes[i])) {
            cmap_set_bucket(b, i, node, hash);
            return true;
        }
    }
    return false;
}

/* Returns the other bucket that b->nodes[slot] could occupy in 'impl'. (This
 * might be the same as 'b'.) */
static struct cmap_bucket *
other_bucket_protected(struct cmap_impl *impl, struct cmap_bucket *b, int slot)
{
    uint32_t h1 = rehash(impl, b->hashes[slot]);
    uint32_t h2 = other_hash(h1);
    uint32_t b_idx = b - impl->buckets;
    uint32_t other_h = (h1 & impl->mask) == b_idx ? h2 : h1;

    return &impl->buckets[other_h & impl->mask];
}

/* 'new_node' is to be inserted into 'impl', but both candidate buckets 'b1'
 * and 'b2' are full. This function attempts to rearrange buckets within
 * 'impl' to make room for 'new_node'.
 *
 * The implementation is a general-purpose breadth-first search. At first
 * glance, this is more complex than a random walk through 'impl' (suggested by
 * some references), but random walks have a tendency to loop back through a
 * single bucket. We have to move nodes backward along the path that we find,
 * so that no node actually disappears from the hash table, which means a
 * random walk would have to be careful to deal with loops. By contrast, a
 * successful breadth-first search always finds a *shortest* path through the
 * hash table, and a shortest path will never contain loops, so it avoids that
 * problem entirely.
 */
static bool
cmap_insert_bfs(struct cmap_impl *impl, struct cmap_node *new_node,
                uint32_t hash, struct cmap_bucket *b1, struct cmap_bucket *b2)
{
    enum { MAX_DEPTH = 4 };

    /* A path from 'start' to 'end' via the 'n' steps in 'slots[]'.
     *
     * One can follow the path via:
     *
     *     struct cmap_bucket *b;
     *     int i;
     *
     *     b = path->start;
     *     for (i = 0; i < path->n; i++) {
     *         b = other_bucket_protected(impl, b, path->slots[i]);
     *     }
     *     ovs_assert(b == path->end);
     */
    struct cmap_path {
        struct cmap_bucket *start; /* First bucket along the path. */
        struct cmap_bucket *end;   /* Last bucket on the path. */
        uint8_t slots[MAX_DEPTH];  /* Slots used for each hop. */
        int n;                     /* Number of slots[]. */
    };

    /* We need to limit the amount of work we do trying to find a path. It
     * might actually be impossible to rearrange the cmap, and after some time
     * it is likely to be easier to rehash the entire cmap.
     *
     * This value of MAX_QUEUE is an arbitrary limit suggested by one of the
     * references. Empirically, it seems to work OK. */
    enum { MAX_QUEUE = 500 };
    struct cmap_path queue[MAX_QUEUE];
    int head = 0;
    int tail = 0;

    /* Add 'b1' and 'b2' as starting points for the search. */
    queue[head].start = b1;
    queue[head].end = b1;
    queue[head].n = 0;
    head++;
    if (b1 != b2) {
        queue[head].start = b2;
        queue[head].end = b2;
        queue[head].n = 0;
        head++;
    }

    while (tail < head) {
        const struct cmap_path *path = &queue[tail++];
        struct cmap_bucket *this = path->end;
        int i;

        for (i = 0; i < CMAP_K; i++) {
            struct cmap_bucket *next = other_bucket_protected(impl, this, i);
            int j;

            if (this == next) {
                continue;
            }

            j = cmap_find_empty_slot_protected(next);
            if (j >= 0) {
                /* We've found a path along which we can rearrange the hash
                 * table: Start at path->start, follow all the slots in
                 * path->slots[], then follow slot 'i', then the bucket you
                 * arrive at has slot 'j' empty. */
                struct cmap_bucket *buckets[MAX_DEPTH + 2];
                int slots[MAX_DEPTH + 2];
                int k;

                /* Figure out the full sequence of slots. */
                for (k = 0; k < path->n; k++) {
                    slots[k] = path->slots[k];
                }
                slots[path->n] = i;
                slots[path->n + 1] = j;

                /* Figure out the full sequence of buckets. */
                buckets[0] = path->start;
                for (k = 0; k <= path->n; k++) {
                    buckets[k + 1] = other_bucket_protected(impl, buckets[k],
                                                            slots[k]);
                }

                /* Now the path is fully expressed. One can start from
                 * buckets[0], go via slots[0] to buckets[1], via slots[1] to
                 * buckets[2], and so on.
                 *
                 * Move all the nodes across the path "backward". After each
                 * step some node appears in two buckets. Thus, every node is
                 * always visible to a concurrent search. */
                for (k = path->n + 1; k > 0; k--) {
                    int slot = slots[k - 1];

                    cmap_set_bucket(
                        buckets[k], slots[k],
                        cmap_node_next_protected(&buckets[k - 1]->nodes[slot]),
                        buckets[k - 1]->hashes[slot]);
                }

                /* Finally, replace the first node on the path by
                 * 'new_node'. */
                cmap_set_bucket(buckets[0], slots[0], new_node, hash);

                return true;
            }

            if (path->n < MAX_DEPTH && head < MAX_QUEUE) {
                struct cmap_path *new_path = &queue[head++];

                *new_path = *path;
                new_path->end = next;
                new_path->slots[new_path->n++] = i;
            }
        }
    }

    return false;
}

/* Adds 'node', with the given 'hash', to 'impl'.
 *
 * 'node' is ordinarily a single node, with a null 'next' pointer. When
 * rehashing, however, it may be a longer chain of nodes. */
static bool
cmap_try_insert(struct cmap_impl *impl, struct cmap_node *node, uint32_t hash)
{
    uint32_t h1 = rehash(impl, hash);
    uint32_t h2 = other_hash(h1);
    struct cmap_bucket *b1 = &impl->buckets[h1 & impl->mask];
    struct cmap_bucket *b2 = &impl->buckets[h2 & impl->mask];

    return (OVS_UNLIKELY(cmap_insert_dup(node, hash, b1) ||
                         cmap_insert_dup(node, hash, b2)) ||
            OVS_LIKELY(cmap_insert_bucket(node, hash, b1) ||
                       cmap_insert_bucket(node, hash, b2)) ||
            cmap_insert_bfs(impl, node, hash, b1, b2));
}

/* Inserts 'node', with the given 'hash', into 'cmap'. The caller must ensure
 * that 'cmap' cannot change concurrently (from another thread). If duplicates
 * are undesirable, the caller must have already verified that 'cmap' does not
 * contain a duplicate of 'node'.
 *
 * Returns the current number of nodes in the cmap after the insertion. */
size_t
cmap_insert(struct cmap *cmap, struct cmap_node *node, uint32_t hash)
{
    struct cmap_impl *impl = cmap_get_impl(cmap);

    ovsrcu_set_hidden(&node->next, NULL);

    if (OVS_UNLIKELY(impl->n >= impl->max_n)) {
        COVERAGE_INC(cmap_expand);
        impl = cmap_rehash(cmap, (impl->mask << 1) | 1);
    }

    while (OVS_UNLIKELY(!cmap_try_insert(impl, node, hash))) {
        impl = cmap_rehash(cmap, impl->mask);
    }
    return ++impl->n;
}
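
/* A sketch of the usual "insert unless already present" pattern built on
 * cmap_insert() and cmap_find() (illustrative only; 'struct flow_entry' and
 * flow_entry_lookup() are hypothetical client code):
 *
 *     uint32_t hash = hash_int(new->key, 0);
 *
 *     if (!flow_entry_lookup(&my_map, new->key)) {  // e.g. via cmap_find().
 *         cmap_insert(&my_map, &new->node, hash);
 *     }
 *
 * The check is the caller's job: cmap_insert() itself happily stores
 * duplicates, chaining nodes with equal hashes as described above. */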

static bool
cmap_replace__(struct cmap_impl *impl, struct cmap_node *node,
               struct cmap_node *replacement, uint32_t hash, uint32_t h)
{
    struct cmap_bucket *b = &impl->buckets[h & impl->mask];
    int slot;

    slot = cmap_find_slot_protected(b, hash);
    if (slot < 0) {
        return false;
    }

    /* The pointer to 'node' is changed to point to 'replacement',
     * which is the next node if no replacement node is given. */
    if (!replacement) {
        replacement = cmap_node_next_protected(node);
    } else {
        /* 'replacement' takes the position of 'node' in the list. */
        ovsrcu_set_hidden(&replacement->next, cmap_node_next_protected(node));
    }

    struct cmap_node *iter = &b->nodes[slot];
    for (;;) {
        struct cmap_node *next = cmap_node_next_protected(iter);

        if (next == node) {
            ovsrcu_set(&iter->next, replacement);
            return true;
        }
        iter = next;
    }
}

/* Replaces 'old_node' in 'cmap' with 'new_node'. The caller must
 * ensure that 'cmap' cannot change concurrently (from another thread).
 *
 * 'old_node' must not be destroyed or modified or inserted back into 'cmap' or
 * into any other concurrent hash map while any other thread might be accessing
 * it. One correct way to do this is to free it from an RCU callback with
 * ovsrcu_postpone().
 *
 * Returns the current number of nodes in the cmap after the replacement. The
 * number of nodes decreases by one if 'new_node' is NULL. */
size_t
cmap_replace(struct cmap *cmap, struct cmap_node *old_node,
             struct cmap_node *new_node, uint32_t hash)
{
    struct cmap_impl *impl = cmap_get_impl(cmap);
    uint32_t h1 = rehash(impl, hash);
    uint32_t h2 = other_hash(h1);
    bool ok;

    ok = cmap_replace__(impl, old_node, new_node, hash, h1)
        || cmap_replace__(impl, old_node, new_node, hash, h2);
    ovs_assert(ok);

    if (!new_node) {
        impl->n--;
        if (OVS_UNLIKELY(impl->n < impl->min_n)) {
            COVERAGE_INC(cmap_shrink);
            impl = cmap_rehash(cmap, impl->mask >> 1);
        }
    }
    return impl->n;
}
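
/* A sketch of RCU-safe removal (illustrative only; 'e' is a hypothetical
 * 'struct flow_entry' that embeds a 'struct cmap_node node'). Removal is
 * just cmap_replace() with a NULL replacement, and the memory must outlive
 * any concurrent readers, hence the postponed free:
 *
 *     cmap_replace(&my_map, &e->node, NULL, hash_int(e->key, 0));
 *     ovsrcu_postpone(free, e);      // Freed only after a grace period.
 */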

static bool
cmap_try_rehash(const struct cmap_impl *old, struct cmap_impl *new)
{
    const struct cmap_bucket *b;

    for (b = old->buckets; b <= &old->buckets[old->mask]; b++) {
        int i;

        for (i = 0; i < CMAP_K; i++) {
            /* Possible optimization here because we know the hashes are
             * unique. */
            struct cmap_node *node = cmap_node_next_protected(&b->nodes[i]);

            if (node && !cmap_try_insert(new, node, b->hashes[i])) {
                return false;
            }
        }
    }
    return true;
}

static struct cmap_impl *
cmap_rehash(struct cmap *cmap, uint32_t mask)
{
    struct cmap_impl *old = cmap_get_impl(cmap);
    struct cmap_impl *new;

    new = cmap_impl_create(mask);
    ovs_assert(old->n < new->max_n);

    while (!cmap_try_rehash(old, new)) {
        memset(new->buckets, 0, (mask + 1) * sizeof *new->buckets);
        new->basis = random_uint32();
    }

    new->n = old->n;
    ovsrcu_set(&cmap->impl, new);
    ovsrcu_postpone(free_cacheline, old);

    return new;
}

struct cmap_cursor
cmap_cursor_start(const struct cmap *cmap)
{
    struct cmap_cursor cursor;

    cursor.impl = cmap_get_impl(cmap);
    cursor.bucket_idx = 0;
    cursor.entry_idx = 0;
    cursor.node = NULL;
    cmap_cursor_advance(&cursor);

    return cursor;
}

void
cmap_cursor_advance(struct cmap_cursor *cursor)
{
    const struct cmap_impl *impl = cursor->impl;

    if (cursor->node) {
        cursor->node = cmap_node_next(cursor->node);
        if (cursor->node) {
            return;
        }
    }

    while (cursor->bucket_idx <= impl->mask) {
        const struct cmap_bucket *b = &impl->buckets[cursor->bucket_idx];

        while (cursor->entry_idx < CMAP_K) {
            cursor->node = cmap_node_next(&b->nodes[cursor->entry_idx++]);
            if (cursor->node) {
                return;
            }
        }

        cursor->bucket_idx++;
        cursor->entry_idx = 0;
    }
}
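
/* The cursor functions above back the CMAP_FOR_EACH family of macros in
 * cmap.h. A sketch of whole-table iteration (illustrative only; 'struct
 * flow_entry' is a hypothetical client struct with an embedded 'node'):
 *
 *     struct flow_entry *e;
 *
 *     CMAP_FOR_EACH (e, node, &my_map) {
 *         // 'e' may be read here; concurrent changes to 'my_map' may or
 *         // may not be visible to this iteration.
 *     }
 */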

/* Returns the next node in 'cmap' in hash order, or NULL if no nodes remain in
 * 'cmap'. Uses '*pos' to determine where to begin iteration, and updates
 * '*pos' so that the next call resumes where this one left off.
 *
 * It's better to use plain CMAP_FOR_EACH and related functions, since they are
 * faster and better at dealing with cmaps that change during iteration.
 *
 * Before beginning iteration, set '*pos' to all zeros. */
struct cmap_node *
cmap_next_position(const struct cmap *cmap,
                   struct cmap_position *pos)
{
    struct cmap_impl *impl = cmap_get_impl(cmap);
    unsigned int bucket = pos->bucket;
    unsigned int entry = pos->entry;
    unsigned int offset = pos->offset;

    while (bucket <= impl->mask) {
        const struct cmap_bucket *b = &impl->buckets[bucket];

        while (entry < CMAP_K) {
            const struct cmap_node *node = cmap_node_next(&b->nodes[entry]);
            unsigned int i;

            for (i = 0; node; i++, node = cmap_node_next(node)) {
                if (i == offset) {
                    if (cmap_node_next(node)) {
                        offset++;
                    } else {
                        entry++;
                        offset = 0;
                    }
                    pos->bucket = bucket;
                    pos->entry = entry;
                    pos->offset = offset;
                    return CONST_CAST(struct cmap_node *, node);
                }
            }

            entry++;
            offset = 0;
        }

        bucket++;
        entry = offset = 0;
    }

    pos->bucket = pos->entry = pos->offset = 0;
    return NULL;
}
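
/* A sketch of position-based iteration (illustrative only; 'struct
 * flow_entry' is again a hypothetical client struct):
 *
 *     struct cmap_position pos = { 0, 0, 0 };
 *     struct cmap_node *node;
 *
 *     while ((node = cmap_next_position(&my_map, &pos))) {
 *         struct flow_entry *e = CONTAINER_OF(node, struct flow_entry, node);
 *         // Process 'e'.
 *     }
 */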