]>
Commit | Line | Data |
---|---|---|
0e666160 | 1 | /* |
b70e6976 | 2 | * Copyright (c) 2014, 2016 Nicira, Inc. |
0e666160 BP |
3 | * |
4 | * Licensed under the Apache License, Version 2.0 (the "License"); | |
5 | * you may not use this file except in compliance with the License. | |
6 | * You may obtain a copy of the License at: | |
7 | * | |
8 | * http://www.apache.org/licenses/LICENSE-2.0 | |
9 | * | |
10 | * Unless required by applicable law or agreed to in writing, software | |
11 | * distributed under the License is distributed on an "AS IS" BASIS, | |
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
13 | * See the License for the specific language governing permissions and | |
14 | * limitations under the License. | |
15 | */ | |
16 | ||
17 | #include <config.h> | |
18 | #include "cmap.h" | |
8f9392dc | 19 | #include "coverage.h" |
52a524eb | 20 | #include "bitmap.h" |
0e666160 BP |
21 | #include "hash.h" |
22 | #include "ovs-rcu.h" | |
23 | #include "random.h" | |
24 | #include "util.h" | |
25 | ||
8f9392dc AW |
26 | COVERAGE_DEFINE(cmap_expand); |
27 | COVERAGE_DEFINE(cmap_shrink); | |
28 | ||
0e666160 BP |
29 | /* Optimistic Concurrent Cuckoo Hash |
30 | * ================================= | |
31 | * | |
32 | * A "cuckoo hash" is an open addressing hash table schema, designed such that | |
33 | * a given element can be in one of only a small number of buckets 'd', each of | |
34 | * which holds up to a small number 'k' elements. Thus, the expected and | |
35 | * worst-case lookup times are O(1) because they require comparing no more than | |
36 | * a fixed number of elements (k * d). Inserting a new element can require | |
37 | * moving around existing elements, but it is also O(1) amortized expected | |
38 | * time. | |
39 | * | |
40 | * An optimistic concurrent hash table goes one step further, making it | |
41 | * possible for a single writer to execute concurrently with any number of | |
42 | * readers without requiring the readers to take any locks. | |
43 | * | |
44 | * This cuckoo hash implementation uses: | |
45 | * | |
46 | * - Two hash functions (d=2). More hash functions allow for a higher load | |
47 | * factor, but increasing 'k' is easier and the benefits of increasing 'd' | |
48 | * quickly fall off with the 'k' values used here. Also, the method of | |
49 | * generating hashes used in this implementation is hard to reasonably | |
50 | * extend beyond d=2. Finally, each additional hash function means that a | |
51 | * lookup has to look at least one extra cache line. | |
52 | * | |
53 | * - 5 or 7 elements per bucket (k=5 or k=7), chosen to make buckets | |
54 | * exactly one cache line in size. | |
55 | * | |
56 | * According to Erlingsson [4], these parameters suggest a maximum load factor | |
57 | * of about 93%. The current implementation is conservative, expanding the | |
58 | * hash table when it is over 85% full. | |
59 | * | |
8f9392dc AW |
 60 | * When the load factor is below 20%, the hash table will be shrunk by half. |
61 | * This is to reduce the memory utilization of the hash table and to avoid | |
62 | * the hash table occupying the top of heap chunk which prevents the trimming | |
63 | * of heap. | |
0e666160 BP |
64 | * |
65 | * Hash Functions | |
66 | * ============== | |
67 | * | |
68 | * A cuckoo hash requires multiple hash functions. When reorganizing the hash | |
69 | * becomes too difficult, it also requires the ability to change the hash | |
70 | * functions. Requiring the client to provide multiple hashes and to be able | |
71 | * to change them to new hashes upon insertion is inconvenient. | |
72 | * | |
73 | * This implementation takes another approach. The client provides a single, | |
74 | * fixed hash. The cuckoo hash internally "rehashes" this hash against a | |
75 | * randomly selected basis value (see rehash()). This rehashed value is one of | |
76 | * the two hashes. The other hash is computed by 16-bit circular rotation of | |
77 | * the rehashed value. Updating the basis changes the hash functions. | |
78 | * | |
79 | * To work properly, the hash functions used by a cuckoo hash must be | |
80 | * independent. If one hash function is a function of the other (e.g. h2(x) = | |
81 | * h1(x) + 1, or h2(x) = hash(h1(x))), then insertion will eventually fail | |
82 | * catastrophically (loop forever) because of collisions. With this rehashing | |
83 | * technique, the two hashes are completely independent for masks up to 16 bits | |
84 | * wide. For masks wider than 16 bits, only 32-n bits are independent between | |
85 | * the two hashes. Thus, it becomes risky to grow a cuckoo hash table beyond | |
86 | * about 2**24 buckets (about 71 million elements with k=5 and maximum load | |
87 | * 85%). Fortunately, Open vSwitch does not normally deal with hash tables | |
88 | * this large. | |
89 | * | |
90 | * | |
91 | * Handling Duplicates | |
92 | * =================== | |
93 | * | |
94 | * This cuckoo hash table implementation deals with duplicate client-provided | |
95 | * hash values by chaining: the second and subsequent cmap_nodes with a given | |
96 | * hash are chained off the initially inserted node's 'next' member. The hash | |
97 | * table maintains the invariant that a single client-provided hash value | |
98 | * exists in only a single chain in a single bucket (even though that hash | |
99 | * could be stored in two buckets). | |
100 | * | |
101 | * | |
102 | * References | |
103 | * ========== | |
104 | * | |
105 | * [1] D. Zhou, B. Fan, H. Lim, M. Kaminsky, D. G. Andersen, "Scalable, High | |
106 | * Performance Ethernet Forwarding with CuckooSwitch". In Proc. 9th | |
107 | * CoNEXT, Dec. 2013. | |
108 | * | |
109 | * [2] B. Fan, D. G. Andersen, and M. Kaminsky. "MemC3: Compact and concurrent | |
110 | * memcache with dumber caching and smarter hashing". In Proc. 10th USENIX | |
111 | * NSDI, Apr. 2013 | |
112 | * | |
113 | * [3] R. Pagh and F. Rodler. "Cuckoo hashing". Journal of Algorithms, 51(2): | |
114 | * 122-144, May 2004. | |
115 | * | |
116 | * [4] U. Erlingsson, M. Manasse, F. McSherry, "A Cool and Practical | |
117 | * Alternative to Traditional Hash Tables". In Proc. 7th Workshop on | |
118 | * Distributed Data and Structures (WDAS'06), 2006. | |
119 | */ | |
120 | /* An entry is an int and a pointer: 8 bytes on 32-bit, 12 bytes on 64-bit. */ | |
121 | #define CMAP_ENTRY_SIZE (4 + (UINTPTR_MAX == UINT32_MAX ? 4 : 8)) | |
122 | ||
123 | /* Number of entries per bucket: 7 on 32-bit, 5 on 64-bit. */ | |
124 | #define CMAP_K ((CACHE_LINE_SIZE - 4) / CMAP_ENTRY_SIZE) | |
125 | ||
126 | /* Pad to make a bucket a full cache line in size: 4 on 32-bit, 0 on 64-bit. */ | |
127 | #define CMAP_PADDING ((CACHE_LINE_SIZE - 4) - (CMAP_K * CMAP_ENTRY_SIZE)) | |
128 | ||
/* A cuckoo hash bucket.  Designed to be cache-aligned and exactly one cache
 * line long. */
struct cmap_bucket {
    /* Allows readers to track in-progress changes.  Initially zero, each
     * writer increments this value just before and just after each change (see
     * cmap_set_bucket()).  Thus, a reader can ensure that it gets a consistent
     * snapshot by waiting for the counter to become even (see
     * read_even_counter()), then checking that its value does not change while
     * examining the bucket (see cmap_find()).
     *
     * This is a seqlock-style protocol: an odd value means a write is in
     * progress, a change in value means the snapshot must be retried. */
    atomic_uint32_t counter;

    /* (hash, node) slots.  They are parallel arrays instead of an array of
     * structs to reduce the amount of space lost to padding.
     *
     * The slots are in no particular order.  A null pointer indicates that a
     * pair is unused.  In-use slots are not necessarily in the earliest
     * slots. */
    uint32_t hashes[CMAP_K];
    struct cmap_node nodes[CMAP_K];

    /* Padding to make cmap_bucket exactly one cache line long. */
#if CMAP_PADDING > 0
    uint8_t pad[CMAP_PADDING];
#endif
};
BUILD_ASSERT_DECL(sizeof(struct cmap_bucket) == CACHE_LINE_SIZE);
155 | ||
156 | /* Default maximum load factor (as a fraction of UINT32_MAX + 1) before | |
157 | * enlarging a cmap. Reasonable values lie between about 75% and 93%. Smaller | |
158 | * values waste memory; larger values increase the average insertion time. */ | |
159 | #define CMAP_MAX_LOAD ((uint32_t) (UINT32_MAX * .85)) | |
160 | ||
8f9392dc AW |
161 | /* Default minimum load factor (as a fraction of UINT32_MAX + 1) before |
162 | * shrinking a cmap. Currently, the value is chosen to be 20%, this | |
163 | * means cmap will have a 40% load factor after shrink. */ | |
164 | #define CMAP_MIN_LOAD ((uint32_t) (UINT32_MAX * .20)) | |
165 | ||
0e666160 BP |
/* The implementation of a concurrent hash map. */
struct cmap_impl {
    unsigned int n;             /* Number of in-use elements. */
    unsigned int max_n;         /* Max elements before enlarging. */
    unsigned int min_n;         /* Min elements before shrinking. */
    uint32_t mask;              /* Number of 'buckets', minus one. */
    uint32_t basis;             /* Basis for rehashing client's hash values. */

    /* Pad to end of cache line so 'buckets' starts cache-aligned. */
    uint8_t pad[CACHE_LINE_SIZE - 4 * 5];

    /* First of 'mask' + 1 buckets; the remaining 'mask' buckets are
     * allocated immediately following (see cmap_impl_create()). */
    struct cmap_bucket buckets[1];
};
BUILD_ASSERT_DECL(sizeof(struct cmap_impl) == CACHE_LINE_SIZE * 2);
177 | ||
178 | /* An empty cmap. */ | |
179 | OVS_ALIGNED_VAR(CACHE_LINE_SIZE) const struct cmap_impl empty_cmap; | |
0e666160 BP |
180 | |
181 | static struct cmap_impl *cmap_rehash(struct cmap *, uint32_t mask); | |
182 | ||
55847abe JR |
183 | /* Explicit inline keywords in utility functions seem to be necessary |
184 | * to prevent performance regression on cmap_find(). */ | |
185 | ||
0e666160 BP |
/* Given a rehashed value 'hash', returns the other hash for that rehashed
 * value.  This is symmetric: other_hash(other_hash(x)) == x.  (See also "Hash
 * Functions" at the top of this file.) */
static inline uint32_t
other_hash(uint32_t hash)
{
    /* 16-bit circular rotation; applying it twice restores the input. */
    return (hash >> 16) | (hash << 16);
}
194 | ||
/* Returns the rehashed value for 'hash' within 'impl'.  (See also "Hash
 * Functions" at the top of this file.)
 *
 * Mixing the client's hash with 'impl->basis' means that changing the basis
 * (as cmap_rehash() does) effectively selects a new pair of hash
 * functions. */
static inline uint32_t
rehash(const struct cmap_impl *impl, uint32_t hash)
{
    return hash_finish(impl->basis, hash);
}
202 | ||
55847abe JR |
/* Not always inlined without the inline keyword.
 *
 * Dereferences the RCU-protected pointer from 'cmap' to its current
 * implementation.  Callers must be within an RCU quiescent-state bracket
 * (readers) or be the single writer. */
static inline struct cmap_impl *
cmap_get_impl(const struct cmap *cmap)
{
    return ovsrcu_get(struct cmap_impl *, &cmap->impl);
}
209 | ||
210 | static uint32_t | |
211 | calc_max_n(uint32_t mask) | |
212 | { | |
213 | return ((uint64_t) (mask + 1) * CMAP_K * CMAP_MAX_LOAD) >> 32; | |
214 | } | |
215 | ||
8f9392dc AW |
216 | static uint32_t |
217 | calc_min_n(uint32_t mask) | |
218 | { | |
219 | return ((uint64_t) (mask + 1) * CMAP_K * CMAP_MIN_LOAD) >> 32; | |
220 | } | |
221 | ||
0e666160 BP |
/* Allocates and initializes an empty implementation with 'mask' + 1 buckets.
 * 'mask' + 1 must be a power of 2.  The hash basis is freshly randomized, so
 * the new table uses a different pair of hash functions than any previous
 * one. */
static struct cmap_impl *
cmap_impl_create(uint32_t mask)
{
    struct cmap_impl *impl;

    ovs_assert(is_pow2(mask + 1));

    /* There are 'mask + 1' buckets but struct cmap_impl has one bucket built
     * in, so we only need to add space for the extra 'mask' buckets. */
    impl = xzalloc_cacheline(sizeof *impl + mask * sizeof *impl->buckets);
    impl->n = 0;
    impl->max_n = calc_max_n(mask);
    impl->min_n = calc_min_n(mask);
    impl->mask = mask;
    impl->basis = random_uint32();

    return impl;
}
240 | ||
/* Initializes 'cmap' as an empty concurrent hash map.
 *
 * All empty cmaps share the statically allocated 'empty_cmap' implementation;
 * a real implementation is only allocated on first insertion (see
 * cmap_destroy(), which never frees 'empty_cmap'). */
void
cmap_init(struct cmap *cmap)
{
    ovsrcu_set(&cmap->impl, CONST_CAST(struct cmap_impl *, &empty_cmap));
}
247 | ||
/* Destroys 'cmap'.
 *
 * The client is responsible for destroying any data previously held in
 * 'cmap'.
 *
 * The implementation is freed via ovsrcu_postpone() so that concurrent
 * readers still traversing the old buckets remain safe; the shared static
 * 'empty_cmap' is never freed. */
void
cmap_destroy(struct cmap *cmap)
{
    if (cmap) {
        struct cmap_impl *impl = cmap_get_impl(cmap);
        if (impl != &empty_cmap) {
            ovsrcu_postpone(free_cacheline, impl);
        }
    }
}
262 | ||
/* Returns the number of elements in 'cmap'.
 *
 * The count is maintained only by the (single) writer, so a concurrent
 * reader may see a value that is slightly stale. */
size_t
cmap_count(const struct cmap *cmap)
{
    return cmap_get_impl(cmap)->n;
}
269 | ||
/* Returns true if 'cmap' holds no elements, false otherwise. */
bool
cmap_is_empty(const struct cmap *cmap)
{
    return !cmap_count(cmap);
}
276 | ||
55847abe JR |
/* Reads the change counter of 'bucket_' with acquire semantics, so that the
 * slot reads that follow in the caller cannot be reordered before this
 * counter read. */
static inline uint32_t
read_counter(const struct cmap_bucket *bucket_)
{
    /* The atomic API needs a non-const object even for a plain read. */
    struct cmap_bucket *bucket = CONST_CAST(struct cmap_bucket *, bucket_);
    uint32_t counter;

    atomic_read_explicit(&bucket->counter, &counter, memory_order_acquire);

    return counter;
}
287 | ||
55847abe JR |
/* Spins until 'bucket''s counter is even and returns that value.
 *
 * An odd counter means a writer is in the middle of modifying the bucket
 * (see cmap_set_bucket()); starting a read from an even counter, and later
 * verifying it with counter_changed(), gives the reader a consistent
 * snapshot. */
static inline uint32_t
read_even_counter(const struct cmap_bucket *bucket)
{
    uint32_t counter;

    do {
        counter = read_counter(bucket);
    } while (OVS_UNLIKELY(counter & 1));

    return counter;
}
299 | ||
55847abe JR |
/* Returns true if 'b_''s counter differs from 'c', i.e. a writer modified
 * the bucket while the caller was examining it and the caller must retry. */
static inline bool
counter_changed(const struct cmap_bucket *b_, uint32_t c)
{
    struct cmap_bucket *b = CONST_CAST(struct cmap_bucket *, b_);
    uint32_t counter;

    /* Need to make sure the counter read is not moved up, before the hash and
     * cmap_node_next().  Using atomic_read_explicit with memory_order_acquire
     * would allow prior reads to be moved after the barrier.
     * atomic_thread_fence prevents all following memory accesses from moving
     * prior to preceding loads. */
    atomic_thread_fence(memory_order_acquire);
    atomic_read_relaxed(&b->counter, &counter);

    return OVS_UNLIKELY(counter != c);
}
316 | ||
55847abe JR |
317 | static inline const struct cmap_node * |
318 | cmap_find_in_bucket(const struct cmap_bucket *bucket, uint32_t hash) | |
319 | { | |
320 | for (int i = 0; i < CMAP_K; i++) { | |
321 | if (bucket->hashes[i] == hash) { | |
322 | return cmap_node_next(&bucket->nodes[i]); | |
323 | } | |
324 | } | |
325 | return NULL; | |
326 | } | |
327 | ||
/* Searches buckets 'b1' and 'b2' for a node with the given 'hash'.
 *
 * Each bucket read is bracketed by read_even_counter()/counter_changed() so
 * that only consistent snapshots are trusted.  The outer loop re-checks
 * 'b1''s counter after a miss in 'b2' because a concurrent writer may have
 * moved the sought node from 'b2' into 'b1' after we examined 'b1'; we loop
 * until we have stable readings of both buckets. */
static inline const struct cmap_node *
cmap_find__(const struct cmap_bucket *b1, const struct cmap_bucket *b2,
            uint32_t hash)
{
    uint32_t c1, c2;
    const struct cmap_node *node;

    do {
        do {
            c1 = read_even_counter(b1);
            node = cmap_find_in_bucket(b1, hash);
        } while (OVS_UNLIKELY(counter_changed(b1, c1)));
        if (node) {
            break;
        }
        do {
            c2 = read_even_counter(b2);
            node = cmap_find_in_bucket(b2, hash);
        } while (OVS_UNLIKELY(counter_changed(b2, c2)));
        if (node) {
            break;
        }
    } while (OVS_UNLIKELY(counter_changed(b1, c1)));

    return node;
}
354 | ||
0e666160 BP |
/* Searches 'cmap' for an element with the specified 'hash'.  If one or more is
 * found, returns a pointer to the first one, otherwise a null pointer.  All of
 * the nodes on the returned list are guaranteed to have exactly the given
 * 'hash'.
 *
 * This function works even if 'cmap' is changing concurrently.  If 'cmap' is
 * not changing, then cmap_find_protected() is slightly faster.
 *
 * CMAP_FOR_EACH_WITH_HASH is usually more convenient. */
const struct cmap_node *
cmap_find(const struct cmap *cmap, uint32_t hash)
{
    const struct cmap_impl *impl = cmap_get_impl(cmap);
    /* The two candidate buckets: h1 is the rehashed client hash, h2 its
     * 16-bit rotation (see "Hash Functions" at the top of this file). */
    uint32_t h1 = rehash(impl, hash);
    uint32_t h2 = other_hash(h1);

    return cmap_find__(&impl->buckets[h1 & impl->mask],
                       &impl->buckets[h2 & impl->mask],
                       hash);
}
375 | ||
52a524eb JR |
/* Looks up multiple 'hashes', when the corresponding bit in 'map' is 1,
 * and sets the corresponding pointer in 'nodes', if the hash value was
 * found from the 'cmap'.  In other cases the 'nodes' values are not changed,
 * i.e., no NULL pointers are stored there.
 * Returns a map where a bit is set to 1 if the corresponding 'nodes' pointer
 * was stored, 0 otherwise.
 * Generally, the caller wants to use CMAP_NODE_FOR_EACH to verify for
 * hash collisions.
 *
 * The lookup is pipelined in two rounds so that each bucket can be
 * prefetched before it is examined, hiding cache-miss latency across the
 * batch. */
unsigned long
cmap_find_batch(const struct cmap *cmap, unsigned long map,
                uint32_t hashes[], const struct cmap_node *nodes[])
{
    const struct cmap_impl *impl = cmap_get_impl(cmap);
    unsigned long result = map;
    int i;
    uint32_t h1s[sizeof map * CHAR_BIT];
    const struct cmap_bucket *b1s[sizeof map * CHAR_BIT];
    const struct cmap_bucket *b2s[sizeof map * CHAR_BIT];
    uint32_t c1s[sizeof map * CHAR_BIT];

    /* Compute hashes and prefetch 1st buckets. */
    ULLONG_FOR_EACH_1(i, map) {
        h1s[i] = rehash(impl, hashes[i]);
        b1s[i] = &impl->buckets[h1s[i] & impl->mask];
        OVS_PREFETCH(b1s[i]);
    }
    /* Lookups, Round 1.  Only look up at the first bucket. */
    ULLONG_FOR_EACH_1(i, map) {
        uint32_t c1;
        const struct cmap_bucket *b1 = b1s[i];
        const struct cmap_node *node;

        do {
            c1 = read_even_counter(b1);
            node = cmap_find_in_bucket(b1, hashes[i]);
        } while (OVS_UNLIKELY(counter_changed(b1, c1)));

        if (!node) {
            /* Not found (yet); Prefetch the 2nd bucket. */
            b2s[i] = &impl->buckets[other_hash(h1s[i]) & impl->mask];
            OVS_PREFETCH(b2s[i]);
            c1s[i] = c1; /* We may need to check this after Round 2. */
            continue;
        }
        /* Found. */
        ULLONG_SET0(map, i); /* Ignore this on round 2. */
        OVS_PREFETCH(node);
        nodes[i] = node;
    }
    /* Round 2.  Look into the 2nd bucket, if needed. */
    ULLONG_FOR_EACH_1(i, map) {
        uint32_t c2;
        const struct cmap_bucket *b2 = b2s[i];
        const struct cmap_node *node;

        do {
            c2 = read_even_counter(b2);
            node = cmap_find_in_bucket(b2, hashes[i]);
        } while (OVS_UNLIKELY(counter_changed(b2, c2)));

        if (!node) {
            /* Not found, but the node may have been moved from b2 to b1 right
             * after we finished with b1 earlier.  We just got a clean reading
             * of the 2nd bucket, so we check the counter of the 1st bucket
             * only.  However, we need to check both buckets again, as the
             * entry may be moved again to the 2nd bucket.  Basically, we
             * need to loop as long as it takes to get stable readings of
             * both buckets.  cmap_find__() does that, and now that we have
             * fetched both buckets we can just use it. */
            if (OVS_UNLIKELY(counter_changed(b1s[i], c1s[i]))) {
                node = cmap_find__(b1s[i], b2s[i], hashes[i]);
                if (node) {
                    goto found;
                }
            }
            /* Not found. */
            ULLONG_SET0(result, i); /* Fix the result. */
            continue;
        }
found:
        OVS_PREFETCH(node);
        nodes[i] = node;
    }
    return result;
}
461 | ||
0e666160 | 462 | static int |
6ff0d5d6 | 463 | cmap_find_slot_protected(struct cmap_bucket *b, uint32_t hash) |
0e666160 BP |
464 | { |
465 | int i; | |
466 | ||
467 | for (i = 0; i < CMAP_K; i++) { | |
5c416811 | 468 | if (b->hashes[i] == hash && cmap_node_next_protected(&b->nodes[i])) { |
0e666160 BP |
469 | return i; |
470 | } | |
471 | } | |
472 | return -1; | |
473 | } | |
474 | ||
475 | static struct cmap_node * | |
476 | cmap_find_bucket_protected(struct cmap_impl *impl, uint32_t hash, uint32_t h) | |
477 | { | |
478 | struct cmap_bucket *b = &impl->buckets[h & impl->mask]; | |
479 | int i; | |
480 | ||
481 | for (i = 0; i < CMAP_K; i++) { | |
5c416811 JR |
482 | if (b->hashes[i] == hash) { |
483 | return cmap_node_next_protected(&b->nodes[i]); | |
0e666160 BP |
484 | } |
485 | } | |
486 | return NULL; | |
487 | } | |
488 | ||
/* Like cmap_find(), but only for use if 'cmap' cannot change concurrently.
 *
 * CMAP_FOR_EACH_WITH_HASH_PROTECTED is usually more convenient. */
struct cmap_node *
cmap_find_protected(const struct cmap *cmap, uint32_t hash)
{
    struct cmap_impl *impl = cmap_get_impl(cmap);
    uint32_t h1 = rehash(impl, hash);
    /* Fix: the second hash must be derived from the rehashed value 'h1',
     * exactly as cmap_find() and cmap_try_insert() do.  The previous code
     * computed other_hash(hash) from the raw client hash, which addresses
     * the wrong second bucket and could miss entries stored there. */
    uint32_t h2 = other_hash(h1);
    struct cmap_node *node;

    node = cmap_find_bucket_protected(impl, hash, h1);
    if (node) {
        return node;
    }
    return cmap_find_bucket_protected(impl, hash, h2);
}
506 | ||
507 | static int | |
6ff0d5d6 | 508 | cmap_find_empty_slot_protected(const struct cmap_bucket *b) |
0e666160 BP |
509 | { |
510 | int i; | |
511 | ||
512 | for (i = 0; i < CMAP_K; i++) { | |
6ff0d5d6 | 513 | if (!cmap_node_next_protected(&b->nodes[i])) { |
0e666160 BP |
514 | return i; |
515 | } | |
516 | } | |
517 | return -1; | |
518 | } | |
519 | ||
/* Stores ('hash', 'node') into slot 'i' of bucket 'b' using the two-phase
 * counter protocol: the counter is bumped to an odd value before the change
 * and to the next even value after it, so concurrent readers (see
 * read_even_counter() and counter_changed()) detect that they raced with
 * this write and retry. */
static void
cmap_set_bucket(struct cmap_bucket *b, int i,
                struct cmap_node *node, uint32_t hash)
{
    uint32_t c;

    atomic_read_explicit(&b->counter, &c, memory_order_acquire);
    /* Odd counter: write in progress. */
    atomic_store_explicit(&b->counter, c + 1, memory_order_release);
    ovsrcu_set(&b->nodes[i].next, node); /* Also atomic. */
    b->hashes[i] = hash;
    /* Even counter: write complete. */
    atomic_store_explicit(&b->counter, c + 2, memory_order_release);
}
532 | ||
/* Searches 'b' for a node with the given 'hash'.  If it finds one, adds
 * 'new_node' to the node's linked list and returns true.  If it does not find
 * one, returns false. */
static bool
cmap_insert_dup(struct cmap_node *new_node, uint32_t hash,
                struct cmap_bucket *b)
{
    int i;

    for (i = 0; i < CMAP_K; i++) {
        if (b->hashes[i] == hash) {
            struct cmap_node *node = cmap_node_next_protected(&b->nodes[i]);

            if (node) {
                struct cmap_node *p;

                /* The common case is that 'new_node' is a singleton,
                 * with a null 'next' pointer.  Rehashing can add a
                 * longer chain, but due to our invariant of always
                 * having all nodes with the same (user) hash value at
                 * a single chain, rehashing will always insert the
                 * chain to an empty node.  The only way we can end up
                 * here is by the user inserting a chain of nodes at
                 * once.  Find the end of the chain starting at
                 * 'new_node', then splice 'node' to the end of that
                 * chain. */
                p = new_node;
                for (;;) {
                    struct cmap_node *next = cmap_node_next_protected(p);

                    if (!next) {
                        break;
                    }
                    p = next;
                }
                ovsrcu_set_hidden(&p->next, node);
            } else {
                /* The hash value is there from some previous insertion, but
                 * the associated node has been removed.  We're not really
                 * inserting a duplicate, but we can still reuse the slot.
                 * Carry on. */
            }

            /* Change the bucket to point to 'new_node'.  This is a degenerate
             * form of cmap_set_bucket() that doesn't update the counter since
             * we're only touching one field and in a way that doesn't change
             * the bucket's meaning for readers. */
            ovsrcu_set(&b->nodes[i].next, new_node);

            return true;
        }
    }
    return false;
}
587 | ||
588 | /* Searches 'b' for an empty slot. If successful, stores 'node' and 'hash' in | |
589 | * the slot and returns true. Otherwise, returns false. */ | |
590 | static bool | |
591 | cmap_insert_bucket(struct cmap_node *node, uint32_t hash, | |
592 | struct cmap_bucket *b) | |
593 | { | |
594 | int i; | |
595 | ||
596 | for (i = 0; i < CMAP_K; i++) { | |
6ff0d5d6 | 597 | if (!cmap_node_next_protected(&b->nodes[i])) { |
0e666160 BP |
598 | cmap_set_bucket(b, i, node, hash); |
599 | return true; | |
600 | } | |
601 | } | |
602 | return false; | |
603 | } | |
604 | ||
/* Returns the other bucket that b->nodes[slot] could occupy in 'impl'.  (This
 * might be the same as 'b'.) */
static struct cmap_bucket *
other_bucket_protected(struct cmap_impl *impl, struct cmap_bucket *b, int slot)
{
    /* Recompute both candidate hashes from the stored (user) hash. */
    uint32_t h1 = rehash(impl, b->hashes[slot]);
    uint32_t h2 = other_hash(h1);
    uint32_t b_idx = b - impl->buckets;
    /* If 'b' is the h1 bucket, the alternative is h2; otherwise h1. */
    uint32_t other_h = (h1 & impl->mask) == b_idx ? h2 : h1;

    return &impl->buckets[other_h & impl->mask];
}
617 | ||
/* 'new_node' is to be inserted into 'impl', but both candidate buckets 'b1'
 * and 'b2' are full.  This function attempts to rearrange buckets within
 * 'impl' to make room for 'new_node'.
 *
 * The implementation is a general-purpose breadth-first search.  At first
 * glance, this is more complex than a random walk through 'impl' (suggested by
 * some references), but random walks have a tendency to loop back through a
 * single bucket.  We have to move nodes backward along the path that we find,
 * so that no node actually disappears from the hash table, which means a
 * random walk would have to be careful to deal with loops.  By contrast, a
 * successful breadth-first search always finds a *shortest* path through the
 * hash table, and a shortest path will never contain loops, so it avoids that
 * problem entirely.
 *
 * Returns true if a path was found and 'new_node' inserted, false if no
 * path exists within the search limits (the caller then rehashes). */
static bool
cmap_insert_bfs(struct cmap_impl *impl, struct cmap_node *new_node,
                uint32_t hash, struct cmap_bucket *b1, struct cmap_bucket *b2)
{
    enum { MAX_DEPTH = 4 };

    /* A path from 'start' to 'end' via the 'n' steps in 'slots[]'.
     *
     * One can follow the path via:
     *
     *     struct cmap_bucket *b;
     *     int i;
     *
     *     b = path->start;
     *     for (i = 0; i < path->n; i++) {
     *         b = other_bucket_protected(impl, b, path->slots[i]);
     *     }
     *     ovs_assert(b == path->end);
     */
    struct cmap_path {
        struct cmap_bucket *start; /* First bucket along the path. */
        struct cmap_bucket *end;   /* Last bucket on the path. */
        uint8_t slots[MAX_DEPTH];  /* Slots used for each hop. */
        int n;                     /* Number of slots[]. */
    };

    /* We need to limit the amount of work we do trying to find a path.  It
     * might actually be impossible to rearrange the cmap, and after some time
     * it is likely to be easier to rehash the entire cmap.
     *
     * This value of MAX_QUEUE is an arbitrary limit suggested by one of the
     * references.  Empirically, it seems to work OK. */
    enum { MAX_QUEUE = 500 };
    struct cmap_path queue[MAX_QUEUE];
    int head = 0;
    int tail = 0;

    /* Add 'b1' and 'b2' as starting points for the search. */
    queue[head].start = b1;
    queue[head].end = b1;
    queue[head].n = 0;
    head++;
    if (b1 != b2) {
        queue[head].start = b2;
        queue[head].end = b2;
        queue[head].n = 0;
        head++;
    }

    while (tail < head) {
        const struct cmap_path *path = &queue[tail++];
        struct cmap_bucket *this = path->end;
        int i;

        for (i = 0; i < CMAP_K; i++) {
            struct cmap_bucket *next = other_bucket_protected(impl, this, i);
            int j;

            if (this == next) {
                /* This slot's alternative bucket is itself; moving it
                 * frees nothing. */
                continue;
            }

            j = cmap_find_empty_slot_protected(next);
            if (j >= 0) {
                /* We've found a path along which we can rearrange the hash
                 * table:  Start at path->start, follow all the slots in
                 * path->slots[], then follow slot 'i', then the bucket you
                 * arrive at has slot 'j' empty. */
                struct cmap_bucket *buckets[MAX_DEPTH + 2];
                int slots[MAX_DEPTH + 2];
                int k;

                /* Figure out the full sequence of slots. */
                for (k = 0; k < path->n; k++) {
                    slots[k] = path->slots[k];
                }
                slots[path->n] = i;
                slots[path->n + 1] = j;

                /* Figure out the full sequence of buckets. */
                buckets[0] = path->start;
                for (k = 0; k <= path->n; k++) {
                    buckets[k + 1]
                        = other_bucket_protected(impl, buckets[k], slots[k]);
                }

                /* Now the path is fully expressed.  One can start from
                 * buckets[0], go via slots[0] to buckets[1], via slots[1] to
                 * buckets[2], and so on.
                 *
                 * Move all the nodes across the path "backward".  After each
                 * step some node appears in two buckets.  Thus, every node is
                 * always visible to a concurrent search. */
                for (k = path->n + 1; k > 0; k--) {
                    int slot = slots[k - 1];

                    cmap_set_bucket(
                        buckets[k], slots[k],
                        cmap_node_next_protected(&buckets[k - 1]->nodes[slot]),
                        buckets[k - 1]->hashes[slot]);
                }

                /* Finally, replace the first node on the path by
                 * 'new_node'. */
                cmap_set_bucket(buckets[0], slots[0], new_node, hash);

                return true;
            }

            if (path->n < MAX_DEPTH && head < MAX_QUEUE) {
                /* Extend the search one hop deeper through slot 'i'. */
                struct cmap_path *new_path = &queue[head++];

                *new_path = *path;
                new_path->end = next;
                new_path->slots[new_path->n++] = i;
            }
        }
    }

    return false;
}
752 | ||
/* Adds 'node', with the given 'hash', to 'impl'.
 *
 * 'node' is ordinarily a single node, with a null 'next' pointer.  When
 * rehashing, however, it may be a longer chain of nodes.
 *
 * Tries, in order: appending to an existing chain with the same hash
 * (cmap_insert_dup), an empty slot in either candidate bucket, and finally
 * a BFS rearrangement of the table.  Returns false only if all three fail,
 * in which case the caller rehashes the whole table. */
static bool
cmap_try_insert(struct cmap_impl *impl, struct cmap_node *node, uint32_t hash)
{
    uint32_t h1 = rehash(impl, hash);
    uint32_t h2 = other_hash(h1);
    struct cmap_bucket *b1 = &impl->buckets[h1 & impl->mask];
    struct cmap_bucket *b2 = &impl->buckets[h2 & impl->mask];

    return (OVS_UNLIKELY(cmap_insert_dup(node, hash, b1) ||
                         cmap_insert_dup(node, hash, b2)) ||
            OVS_LIKELY(cmap_insert_bucket(node, hash, b1) ||
                       cmap_insert_bucket(node, hash, b2)) ||
            cmap_insert_bfs(impl, node, hash, b1, b2));
}
771 | ||
772 | /* Inserts 'node', with the given 'hash', into 'cmap'. The caller must ensure | |
773 | * that 'cmap' cannot change concurrently (from another thread). If duplicates | |
774 | * are undesirable, the caller must have already verified that 'cmap' does not | |
ee58b469 JR |
775 | * contain a duplicate of 'node'. |
776 | * | |
777 | * Returns the current number of nodes in the cmap after the insertion. */ | |
778 | size_t | |
0e666160 BP |
779 | cmap_insert(struct cmap *cmap, struct cmap_node *node, uint32_t hash) |
780 | { | |
781 | struct cmap_impl *impl = cmap_get_impl(cmap); | |
782 | ||
7e5f06c3 | 783 | ovsrcu_set_hidden(&node->next, NULL); |
0e666160 BP |
784 | |
785 | if (OVS_UNLIKELY(impl->n >= impl->max_n)) { | |
8f9392dc | 786 | COVERAGE_INC(cmap_expand); |
0e666160 BP |
787 | impl = cmap_rehash(cmap, (impl->mask << 1) | 1); |
788 | } | |
789 | ||
790 | while (OVS_UNLIKELY(!cmap_try_insert(impl, node, hash))) { | |
791 | impl = cmap_rehash(cmap, impl->mask); | |
792 | } | |
ee58b469 | 793 | return ++impl->n; |
0e666160 BP |
794 | } |
795 | ||
/* Helper for cmap_replace(): in the bucket selected by basis-adjusted hash
 * 'h', unlinks 'node' from its hash chain, substituting 'replacement' (or,
 * if 'replacement' is null, node's successor, which removes 'node' from the
 * chain).  'hash' is the caller-provided (un-rebased) hash stored in the
 * bucket.
 *
 * Returns false if no slot in the bucket carries 'hash'; true on success.
 * Writer-side only: uses the *_protected accessors, so the caller must
 * exclude concurrent writers. */
static bool
cmap_replace__(struct cmap_impl *impl, struct cmap_node *node,
               struct cmap_node *replacement, uint32_t hash, uint32_t h)
{
    struct cmap_bucket *b = &impl->buckets[h & impl->mask];
    int slot;

    slot = cmap_find_slot_protected(b, hash);
    if (slot < 0) {
        return false;
    }

    /* The pointer to 'node' is changed to point to 'replacement',
     * which is the next node if no replacement node is given. */
    if (!replacement) {
        replacement = cmap_node_next_protected(node);
    } else {
        /* 'replacement' takes the position of 'node' in the list.
         * Publishing replacement->next before the swap below keeps the
         * chain intact for concurrent readers at every step. */
        ovsrcu_set_hidden(&replacement->next, cmap_node_next_protected(node));
    }

    /* Walk the chain from the slot head to find the link that points at
     * 'node', then atomically redirect it to 'replacement'.
     * NOTE(review): this loop assumes 'node' is present in the chain once a
     * matching slot is found (all equal-hash duplicates share one slot);
     * if it were absent, the walk would run off the end. */
    struct cmap_node *iter = &b->nodes[slot];
    for (;;) {
        struct cmap_node *next = cmap_node_next_protected(iter);

        if (next == node) {
            ovsrcu_set(&iter->next, replacement);
            return true;
        }
        iter = next;
    }
}
828 | ||
9d933fc7 JR |
829 | /* Replaces 'old_node' in 'cmap' with 'new_node'. The caller must |
830 | * ensure that 'cmap' cannot change concurrently (from another thread). | |
831 | * | |
832 | * 'old_node' must not be destroyed or modified or inserted back into 'cmap' or | |
833 | * into any other concurrent hash map while any other thread might be accessing | |
834 | * it. One correct way to do this is to free it from an RCU callback with | |
ee58b469 JR |
835 | * ovsrcu_postpone(). |
836 | * | |
837 | * Returns the current number of nodes in the cmap after the replacement. The | |
838 | * number of nodes decreases by one if 'new_node' is NULL. */ | |
839 | size_t | |
9d933fc7 JR |
840 | cmap_replace(struct cmap *cmap, struct cmap_node *old_node, |
841 | struct cmap_node *new_node, uint32_t hash) | |
0e666160 BP |
842 | { |
843 | struct cmap_impl *impl = cmap_get_impl(cmap); | |
844 | uint32_t h1 = rehash(impl, hash); | |
845 | uint32_t h2 = other_hash(h1); | |
846 | bool ok; | |
847 | ||
9d933fc7 JR |
848 | ok = cmap_replace__(impl, old_node, new_node, hash, h1) |
849 | || cmap_replace__(impl, old_node, new_node, hash, h2); | |
0e666160 | 850 | ovs_assert(ok); |
ee58b469 JR |
851 | |
852 | if (!new_node) { | |
853 | impl->n--; | |
8f9392dc AW |
854 | if (OVS_UNLIKELY(impl->n < impl->min_n)) { |
855 | COVERAGE_INC(cmap_shrink); | |
856 | impl = cmap_rehash(cmap, impl->mask >> 1); | |
857 | } | |
ee58b469 JR |
858 | } |
859 | return impl->n; | |
0e666160 BP |
860 | } |
861 | ||
862 | static bool | |
863 | cmap_try_rehash(const struct cmap_impl *old, struct cmap_impl *new) | |
864 | { | |
865 | const struct cmap_bucket *b; | |
866 | ||
867 | for (b = old->buckets; b <= &old->buckets[old->mask]; b++) { | |
868 | int i; | |
869 | ||
870 | for (i = 0; i < CMAP_K; i++) { | |
871 | /* possible optimization here because we know the hashes are | |
872 | * unique */ | |
6ff0d5d6 | 873 | struct cmap_node *node = cmap_node_next_protected(&b->nodes[i]); |
0e666160 | 874 | |
5c416811 | 875 | if (node && !cmap_try_insert(new, node, b->hashes[i])) { |
0e666160 BP |
876 | return false; |
877 | } | |
878 | } | |
879 | } | |
880 | return true; | |
881 | } | |
882 | ||
/* Rebuilds 'cmap' into a fresh implementation with the given bucket 'mask'
 * (table size mask + 1), re-inserting every node, then publishes the new
 * implementation via RCU.  Returns the new implementation.
 * Writer-side only; readers may run concurrently throughout. */
static struct cmap_impl *
cmap_rehash(struct cmap *cmap, uint32_t mask)
{
    struct cmap_impl *old = cmap_get_impl(cmap);
    struct cmap_impl *new;

    new = cmap_impl_create(mask);
    ovs_assert(old->n < new->max_n);

    /* Cuckoo placement can fail even in a larger table; keep clearing the
     * buckets and picking a fresh hash basis until every node fits. */
    while (!cmap_try_rehash(old, new)) {
        memset(new->buckets, 0, (mask + 1) * sizeof *new->buckets);
        new->basis = random_uint32();
    }

    /* Fill in the count before publishing, then atomically switch readers
     * over to 'new'. */
    new->n = old->n;
    ovsrcu_set(&cmap->impl, new);
    /* The statically allocated empty singleton must never be freed; any
     * other old implementation is freed only after a grace period, once no
     * reader can still hold a pointer into it. */
    if (old != &empty_cmap) {
        ovsrcu_postpone(free_cacheline, old);
    }

    return new;
}
905 | ||
78c8df12 BP |
906 | struct cmap_cursor |
907 | cmap_cursor_start(const struct cmap *cmap) | |
0e666160 | 908 | { |
78c8df12 BP |
909 | struct cmap_cursor cursor; |
910 | ||
911 | cursor.impl = cmap_get_impl(cmap); | |
912 | cursor.bucket_idx = 0; | |
913 | cursor.entry_idx = 0; | |
914 | cursor.node = NULL; | |
915 | cmap_cursor_advance(&cursor); | |
916 | ||
917 | return cursor; | |
0e666160 BP |
918 | } |
919 | ||
78c8df12 BP |
920 | void |
921 | cmap_cursor_advance(struct cmap_cursor *cursor) | |
0e666160 BP |
922 | { |
923 | const struct cmap_impl *impl = cursor->impl; | |
924 | ||
78c8df12 BP |
925 | if (cursor->node) { |
926 | cursor->node = cmap_node_next(cursor->node); | |
927 | if (cursor->node) { | |
928 | return; | |
0e666160 BP |
929 | } |
930 | } | |
931 | ||
932 | while (cursor->bucket_idx <= impl->mask) { | |
933 | const struct cmap_bucket *b = &impl->buckets[cursor->bucket_idx]; | |
934 | ||
935 | while (cursor->entry_idx < CMAP_K) { | |
78c8df12 BP |
936 | cursor->node = cmap_node_next(&b->nodes[cursor->entry_idx++]); |
937 | if (cursor->node) { | |
938 | return; | |
0e666160 BP |
939 | } |
940 | } | |
941 | ||
942 | cursor->bucket_idx++; | |
943 | cursor->entry_idx = 0; | |
944 | } | |
0e666160 BP |
945 | } |
946 | ||
/* Returns the next node in 'cmap' in hash order, or NULL if no nodes remain in
 * 'cmap'.  Uses '*pos' to determine where to begin iteration, and updates
 * '*pos' to pass on the next iteration into them before returning.
 *
 * It's better to use plain CMAP_FOR_EACH and related functions, since they are
 * faster and better at dealing with cmaps that change during iteration.
 *
 * Before beginning iteration, set '*pos' to all zeros. */
struct cmap_node *
cmap_next_position(const struct cmap *cmap,
                   struct cmap_position *pos)
{
    struct cmap_impl *impl = cmap_get_impl(cmap);
    /* Position is (bucket, slot entry, offset within the slot's chain). */
    unsigned int bucket = pos->bucket;
    unsigned int entry = pos->entry;
    unsigned int offset = pos->offset;

    while (bucket <= impl->mask) {
        const struct cmap_bucket *b = &impl->buckets[bucket];

        while (entry < CMAP_K) {
            const struct cmap_node *node = cmap_node_next(&b->nodes[entry]);
            unsigned int i;

            /* Walk the chain until we reach the node at 'offset'. */
            for (i = 0; node; i++, node = cmap_node_next(node)) {
                if (i == offset) {
                    /* Advance '*pos' past the node we are about to return:
                     * deeper into this chain if it continues, otherwise to
                     * the start of the next slot. */
                    if (cmap_node_next(node)) {
                        offset++;
                    } else {
                        entry++;
                        offset = 0;
                    }
                    pos->bucket = bucket;
                    pos->entry = entry;
                    pos->offset = offset;
                    return CONST_CAST(struct cmap_node *, node);
                }
            }

            /* Chain shorter than 'offset' (cmap may have changed since the
             * last call); fall through to the next slot. */
            entry++;
            offset = 0;
        }

        bucket++;
        entry = offset = 0;
    }

    /* Iteration finished: reset '*pos' so it is ready for a fresh pass. */
    pos->bucket = pos->entry = pos->offset = 0;
    return NULL;
}