]>
Commit | Line | Data |
---|---|---|
0e666160 BP |
1 | /* |
2 | * Copyright (c) 2014 Nicira, Inc. | |
3 | * | |
4 | * Licensed under the Apache License, Version 2.0 (the "License"); | |
5 | * you may not use this file except in compliance with the License. | |
6 | * You may obtain a copy of the License at: | |
7 | * | |
8 | * http://www.apache.org/licenses/LICENSE-2.0 | |
9 | * | |
10 | * Unless required by applicable law or agreed to in writing, software | |
11 | * distributed under the License is distributed on an "AS IS" BASIS, | |
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
13 | * See the License for the specific language governing permissions and | |
14 | * limitations under the License. | |
15 | */ | |
16 | ||
17 | #include <config.h> | |
18 | #include "cmap.h" | |
19 | #include "hash.h" | |
20 | #include "ovs-rcu.h" | |
21 | #include "random.h" | |
22 | #include "util.h" | |
23 | ||
24 | /* Optimistic Concurrent Cuckoo Hash | |
25 | * ================================= | |
26 | * | |
27 | * A "cuckoo hash" is an open addressing hash table schema, designed such that | |
28 | * a given element can be in one of only a small number of buckets 'd', each of | |
29 | * which holds up to a small number 'k' elements. Thus, the expected and | |
30 | * worst-case lookup times are O(1) because they require comparing no more than | |
31 | * a fixed number of elements (k * d). Inserting a new element can require | |
32 | * moving around existing elements, but it is also O(1) amortized expected | |
33 | * time. | |
34 | * | |
35 | * An optimistic concurrent hash table goes one step further, making it | |
36 | * possible for a single writer to execute concurrently with any number of | |
37 | * readers without requiring the readers to take any locks. | |
38 | * | |
39 | * This cuckoo hash implementation uses: | |
40 | * | |
41 | * - Two hash functions (d=2). More hash functions allow for a higher load | |
42 | * factor, but increasing 'k' is easier and the benefits of increasing 'd' | |
43 | * quickly fall off with the 'k' values used here. Also, the method of | |
44 | * generating hashes used in this implementation is hard to reasonably | |
45 | * extend beyond d=2. Finally, each additional hash function means that a | |
46 | * lookup has to look at least one extra cache line. | |
47 | * | |
48 | * - 5 or 7 elements per bucket (k=5 or k=7), chosen to make buckets | |
49 | * exactly one cache line in size. | |
50 | * | |
51 | * According to Erlingsson [4], these parameters suggest a maximum load factor | |
52 | * of about 93%. The current implementation is conservative, expanding the | |
53 | * hash table when it is over 85% full. | |
54 | * | |
55 | * | |
56 | * Hash Functions | |
57 | * ============== | |
58 | * | |
59 | * A cuckoo hash requires multiple hash functions. When reorganizing the hash | |
60 | * becomes too difficult, it also requires the ability to change the hash | |
61 | * functions. Requiring the client to provide multiple hashes and to be able | |
62 | * to change them to new hashes upon insertion is inconvenient. | |
63 | * | |
64 | * This implementation takes another approach. The client provides a single, | |
65 | * fixed hash. The cuckoo hash internally "rehashes" this hash against a | |
66 | * randomly selected basis value (see rehash()). This rehashed value is one of | |
67 | * the two hashes. The other hash is computed by 16-bit circular rotation of | |
68 | * the rehashed value. Updating the basis changes the hash functions. | |
69 | * | |
70 | * To work properly, the hash functions used by a cuckoo hash must be | |
71 | * independent. If one hash function is a function of the other (e.g. h2(x) = | |
72 | * h1(x) + 1, or h2(x) = hash(h1(x))), then insertion will eventually fail | |
73 | * catastrophically (loop forever) because of collisions. With this rehashing | |
74 | * technique, the two hashes are completely independent for masks up to 16 bits | |
75 | * wide. For masks wider than 16 bits, only 32-n bits are independent between | |
76 | * the two hashes. Thus, it becomes risky to grow a cuckoo hash table beyond | |
77 | * about 2**24 buckets (about 71 million elements with k=5 and maximum load | |
78 | * 85%). Fortunately, Open vSwitch does not normally deal with hash tables | |
79 | * this large. | |
80 | * | |
81 | * | |
82 | * Handling Duplicates | |
83 | * =================== | |
84 | * | |
85 | * This cuckoo hash table implementation deals with duplicate client-provided | |
86 | * hash values by chaining: the second and subsequent cmap_nodes with a given | |
87 | * hash are chained off the initially inserted node's 'next' member. The hash | |
88 | * table maintains the invariant that a single client-provided hash value | |
89 | * exists in only a single chain in a single bucket (even though that hash | |
90 | * could be stored in two buckets). | |
91 | * | |
92 | * | |
93 | * References | |
94 | * ========== | |
95 | * | |
96 | * [1] D. Zhou, B. Fan, H. Lim, M. Kaminsky, D. G. Andersen, "Scalable, High | |
97 | * Performance Ethernet Forwarding with CuckooSwitch". In Proc. 9th | |
98 | * CoNEXT, Dec. 2013. | |
99 | * | |
100 | * [2] B. Fan, D. G. Andersen, and M. Kaminsky. "MemC3: Compact and concurrent | |
101 | * memcache with dumber caching and smarter hashing". In Proc. 10th USENIX | |
102 | * NSDI, Apr. 2013 | |
103 | * | |
104 | * [3] R. Pagh and F. Rodler. "Cuckoo hashing". Journal of Algorithms, 51(2): | |
105 | * 122-144, May 2004. | |
106 | * | |
107 | * [4] U. Erlingsson, M. Manasse, F. McSherry, "A Cool and Practical | |
108 | * Alternative to Traditional Hash Tables". In Proc. 7th Workshop on | |
109 | * Distributed Data and Structures (WDAS'06), 2006. | |
110 | */ | |
111 | /* An entry is an int and a pointer: 8 bytes on 32-bit, 12 bytes on 64-bit. */ | |
112 | #define CMAP_ENTRY_SIZE (4 + (UINTPTR_MAX == UINT32_MAX ? 4 : 8)) | |
113 | ||
114 | /* Number of entries per bucket: 7 on 32-bit, 5 on 64-bit. */ | |
115 | #define CMAP_K ((CACHE_LINE_SIZE - 4) / CMAP_ENTRY_SIZE) | |
116 | ||
117 | /* Pad to make a bucket a full cache line in size: 4 on 32-bit, 0 on 64-bit. */ | |
118 | #define CMAP_PADDING ((CACHE_LINE_SIZE - 4) - (CMAP_K * CMAP_ENTRY_SIZE)) | |
119 | ||
120 | /* A cuckoo hash bucket. Designed to be cache-aligned and exactly one cache | |
121 | * line long. */ | |
122 | struct cmap_bucket { | |
123 | /* Allows readers to track in-progress changes. Initially zero, each | |
124 | * writer increments this value just before and just after each change (see | |
125 | * cmap_set_bucket()). Thus, a reader can ensure that it gets a consistent | |
126 | * snapshot by waiting for the counter to become even (see | |
127 | * read_even_counter()), then checking that its value does not change while | |
128 | * examining the bucket (see cmap_find()). */ | |
129 | atomic_uint32_t counter; | |
130 | ||
131 | /* (hash, node) slots. They are parallel arrays instead of an array of | |
132 | * structs to reduce the amount of space lost to padding. | |
133 | * | |
134 | * The slots are in no particular order. A null pointer indicates that a | |
135 | * pair is unused. In-use slots are not necessarily in the earliest | |
136 | * slots. */ | |
6ff0d5d6 JR |
137 | atomic_uint32_t hashes[CMAP_K]; |
138 | struct cmap_node nodes[CMAP_K]; | |
0e666160 BP |
139 | |
140 | /* Padding to make cmap_bucket exactly one cache line long. */ | |
141 | #if CMAP_PADDING > 0 | |
142 | uint8_t pad[CMAP_PADDING]; | |
143 | #endif | |
144 | }; | |
145 | BUILD_ASSERT_DECL(sizeof(struct cmap_bucket) == CACHE_LINE_SIZE); | |
146 | ||
147 | /* Default maximum load factor (as a fraction of UINT32_MAX + 1) before | |
148 | * enlarging a cmap. Reasonable values lie between about 75% and 93%. Smaller | |
149 | * values waste memory; larger values increase the average insertion time. */ | |
150 | #define CMAP_MAX_LOAD ((uint32_t) (UINT32_MAX * .85)) | |
151 | ||
152 | /* The implementation of a concurrent hash map. */ | |
153 | struct cmap_impl { | |
154 | unsigned int n; /* Number of in-use elements. */ | |
155 | unsigned int max_n; /* Max elements before enlarging. */ | |
156 | uint32_t mask; /* Number of 'buckets', minus one. */ | |
157 | uint32_t basis; /* Basis for rehashing client's hash values. */ | |
158 | ||
159 | /* Padding to make cmap_impl exactly one cache line long. */ | |
160 | uint8_t pad[CACHE_LINE_SIZE - sizeof(unsigned int) * 4]; | |
161 | ||
162 | struct cmap_bucket buckets[]; | |
163 | }; | |
164 | BUILD_ASSERT_DECL(sizeof(struct cmap_impl) == CACHE_LINE_SIZE); | |
165 | ||
6ff0d5d6 JR |
166 | static uint32_t cmap_get_hash__(const atomic_uint32_t *hash, |
167 | memory_order order) | |
168 | { | |
169 | uint32_t hash__; | |
170 | ||
171 | atomic_read_explicit(CONST_CAST(ATOMIC(uint32_t) *, hash), &hash__, order); | |
172 | return hash__; | |
173 | } | |
174 | ||
175 | #define cmap_get_hash(HASH) \ | |
176 | cmap_get_hash__(HASH, memory_order_acquire) | |
177 | #define cmap_get_hash_protected(HASH) \ | |
178 | cmap_get_hash__(HASH, memory_order_relaxed) | |
179 | ||
0e666160 BP |
180 | static struct cmap_impl *cmap_rehash(struct cmap *, uint32_t mask); |
181 | ||
182 | /* Given a rehashed value 'hash', returns the other hash for that rehashed | |
183 | * value. This is symmetric: other_hash(other_hash(x)) == x. (See also "Hash | |
184 | * Functions" at the top of this file.) */ | |
185 | static uint32_t | |
186 | other_hash(uint32_t hash) | |
187 | { | |
188 | return (hash << 16) | (hash >> 16); | |
189 | } | |
190 | ||
191 | /* Returns the rehashed value for 'hash' within 'impl'. (See also "Hash | |
192 | * Functions" at the top of this file.) */ | |
193 | static uint32_t | |
194 | rehash(const struct cmap_impl *impl, uint32_t hash) | |
195 | { | |
33c6a1b9 | 196 | return hash_finish(impl->basis, hash); |
0e666160 BP |
197 | } |
198 | ||
199 | static struct cmap_impl * | |
200 | cmap_get_impl(const struct cmap *cmap) | |
201 | { | |
202 | return ovsrcu_get(struct cmap_impl *, &cmap->impl); | |
203 | } | |
204 | ||
205 | static uint32_t | |
206 | calc_max_n(uint32_t mask) | |
207 | { | |
208 | return ((uint64_t) (mask + 1) * CMAP_K * CMAP_MAX_LOAD) >> 32; | |
209 | } | |
210 | ||
211 | static struct cmap_impl * | |
212 | cmap_impl_create(uint32_t mask) | |
213 | { | |
214 | struct cmap_impl *impl; | |
215 | ||
216 | ovs_assert(is_pow2(mask + 1)); | |
217 | ||
218 | impl = xzalloc_cacheline(sizeof *impl | |
219 | + (mask + 1) * sizeof *impl->buckets); | |
220 | impl->n = 0; | |
221 | impl->max_n = calc_max_n(mask); | |
222 | impl->mask = mask; | |
223 | impl->basis = random_uint32(); | |
224 | ||
225 | return impl; | |
226 | } | |
227 | ||
228 | /* Initializes 'cmap' as an empty concurrent hash map. */ | |
229 | void | |
230 | cmap_init(struct cmap *cmap) | |
231 | { | |
232 | ovsrcu_set(&cmap->impl, cmap_impl_create(0)); | |
233 | } | |
234 | ||
235 | /* Destroys 'cmap'. | |
236 | * | |
237 | * The client is responsible for destroying any data previously held in | |
238 | * 'cmap'. */ | |
239 | void | |
240 | cmap_destroy(struct cmap *cmap) | |
241 | { | |
242 | if (cmap) { | |
7530fe0d | 243 | ovsrcu_postpone(free_cacheline, cmap_get_impl(cmap)); |
0e666160 BP |
244 | } |
245 | } | |
246 | ||
247 | /* Returns the number of elements in 'cmap'. */ | |
248 | size_t | |
249 | cmap_count(const struct cmap *cmap) | |
250 | { | |
251 | return cmap_get_impl(cmap)->n; | |
252 | } | |
253 | ||
254 | /* Returns true if 'cmap' is empty, false otherwise. */ | |
255 | bool | |
256 | cmap_is_empty(const struct cmap *cmap) | |
257 | { | |
258 | return cmap_count(cmap) == 0; | |
259 | } | |
260 | ||
261 | static uint32_t | |
34a0d778 | 262 | read_counter(struct cmap_bucket *bucket) |
0e666160 BP |
263 | { |
264 | uint32_t counter; | |
265 | ||
34a0d778 | 266 | atomic_read_explicit(&bucket->counter, &counter, memory_order_acquire); |
0e666160 BP |
267 | return counter; |
268 | } | |
269 | ||
270 | static uint32_t | |
271 | read_even_counter(struct cmap_bucket *bucket) | |
272 | { | |
273 | uint32_t counter; | |
274 | ||
275 | do { | |
34a0d778 | 276 | counter = read_counter(bucket); |
0e666160 BP |
277 | } while (OVS_UNLIKELY(counter & 1)); |
278 | ||
279 | return counter; | |
280 | } | |
281 | ||
282 | static bool | |
283 | counter_changed(struct cmap_bucket *b, uint32_t c) | |
284 | { | |
34a0d778 | 285 | return OVS_UNLIKELY(read_counter(b) != c); |
0e666160 BP |
286 | } |
287 | ||
288 | /* Searches 'cmap' for an element with the specified 'hash'. If one or more is | |
289 | * found, returns a pointer to the first one, otherwise a null pointer. All of | |
290 | * the nodes on the returned list are guaranteed to have exactly the given | |
291 | * 'hash'. | |
292 | * | |
293 | * This function works even if 'cmap' is changing concurrently. If 'cmap' is | |
294 | * not changing, then cmap_find_protected() is slightly faster. | |
295 | * | |
296 | * CMAP_FOR_EACH_WITH_HASH is usually more convenient. */ | |
297 | struct cmap_node * | |
298 | cmap_find(const struct cmap *cmap, uint32_t hash) | |
299 | { | |
300 | struct cmap_impl *impl = cmap_get_impl(cmap); | |
301 | uint32_t h1 = rehash(impl, hash); | |
302 | uint32_t h2 = other_hash(h1); | |
303 | struct cmap_bucket *b1; | |
304 | struct cmap_bucket *b2; | |
305 | uint32_t c1, c2; | |
306 | int i; | |
307 | ||
308 | retry: | |
309 | b1 = &impl->buckets[h1 & impl->mask]; | |
310 | c1 = read_even_counter(b1); | |
311 | for (i = 0; i < CMAP_K; i++) { | |
6ff0d5d6 JR |
312 | struct cmap_node *node = cmap_node_next(&b1->nodes[i]); |
313 | ||
314 | if (node && cmap_get_hash(&b1->hashes[i]) == hash) { | |
0e666160 BP |
315 | if (counter_changed(b1, c1)) { |
316 | goto retry; | |
317 | } | |
318 | return node; | |
319 | } | |
320 | } | |
321 | ||
322 | b2 = &impl->buckets[h2 & impl->mask]; | |
323 | c2 = read_even_counter(b2); | |
324 | for (i = 0; i < CMAP_K; i++) { | |
6ff0d5d6 JR |
325 | struct cmap_node *node = cmap_node_next(&b2->nodes[i]); |
326 | ||
327 | if (node && cmap_get_hash(&b2->hashes[i]) == hash) { | |
0e666160 BP |
328 | if (counter_changed(b2, c2)) { |
329 | goto retry; | |
330 | } | |
331 | return node; | |
332 | } | |
333 | } | |
334 | ||
335 | if (counter_changed(b1, c1) || counter_changed(b2, c2)) { | |
336 | goto retry; | |
337 | } | |
338 | return NULL; | |
339 | } | |
340 | ||
341 | static int | |
6ff0d5d6 | 342 | cmap_find_slot_protected(struct cmap_bucket *b, uint32_t hash) |
0e666160 BP |
343 | { |
344 | int i; | |
345 | ||
346 | for (i = 0; i < CMAP_K; i++) { | |
6ff0d5d6 JR |
347 | struct cmap_node *node = cmap_node_next_protected(&b->nodes[i]); |
348 | ||
349 | if (node && cmap_get_hash_protected(&b->hashes[i]) == hash) { | |
0e666160 BP |
350 | return i; |
351 | } | |
352 | } | |
353 | return -1; | |
354 | } | |
355 | ||
356 | static struct cmap_node * | |
357 | cmap_find_bucket_protected(struct cmap_impl *impl, uint32_t hash, uint32_t h) | |
358 | { | |
359 | struct cmap_bucket *b = &impl->buckets[h & impl->mask]; | |
360 | int i; | |
361 | ||
362 | for (i = 0; i < CMAP_K; i++) { | |
6ff0d5d6 JR |
363 | struct cmap_node *node = cmap_node_next_protected(&b->nodes[i]); |
364 | ||
365 | if (node && cmap_get_hash_protected(&b->hashes[i]) == hash) { | |
0e666160 BP |
366 | return node; |
367 | } | |
368 | } | |
369 | return NULL; | |
370 | } | |
371 | ||
372 | /* Like cmap_find(), but only for use if 'cmap' cannot change concurrently. | |
373 | * | |
374 | * CMAP_FOR_EACH_WITH_HASH_PROTECTED is usually more convenient. */ | |
375 | struct cmap_node * | |
376 | cmap_find_protected(const struct cmap *cmap, uint32_t hash) | |
377 | { | |
378 | struct cmap_impl *impl = cmap_get_impl(cmap); | |
379 | uint32_t h1 = rehash(impl, hash); | |
380 | uint32_t h2 = other_hash(hash); | |
381 | struct cmap_node *node; | |
382 | ||
383 | node = cmap_find_bucket_protected(impl, hash, h1); | |
384 | if (node) { | |
385 | return node; | |
386 | } | |
387 | return cmap_find_bucket_protected(impl, hash, h2); | |
388 | } | |
389 | ||
390 | static int | |
6ff0d5d6 | 391 | cmap_find_empty_slot_protected(const struct cmap_bucket *b) |
0e666160 BP |
392 | { |
393 | int i; | |
394 | ||
395 | for (i = 0; i < CMAP_K; i++) { | |
6ff0d5d6 | 396 | if (!cmap_node_next_protected(&b->nodes[i])) { |
0e666160 BP |
397 | return i; |
398 | } | |
399 | } | |
400 | return -1; | |
401 | } | |
402 | ||
403 | static void | |
404 | cmap_set_bucket(struct cmap_bucket *b, int i, | |
405 | struct cmap_node *node, uint32_t hash) | |
406 | { | |
407 | uint32_t c; | |
408 | ||
409 | atomic_read_explicit(&b->counter, &c, memory_order_acquire); | |
410 | atomic_store_explicit(&b->counter, c + 1, memory_order_release); | |
6ff0d5d6 JR |
411 | ovsrcu_set(&b->nodes[i].next, node); /* Also atomic. */ |
412 | atomic_store_explicit(&b->hashes[i], hash, memory_order_release); | |
0e666160 BP |
413 | atomic_store_explicit(&b->counter, c + 2, memory_order_release); |
414 | } | |
415 | ||
416 | /* Searches 'b' for a node with the given 'hash'. If it finds one, adds | |
417 | * 'new_node' to the node's linked list and returns true. If it does not find | |
418 | * one, returns false. */ | |
419 | static bool | |
420 | cmap_insert_dup(struct cmap_node *new_node, uint32_t hash, | |
421 | struct cmap_bucket *b) | |
422 | { | |
423 | int i; | |
424 | ||
425 | for (i = 0; i < CMAP_K; i++) { | |
6ff0d5d6 JR |
426 | struct cmap_node *node = cmap_node_next_protected(&b->nodes[i]); |
427 | ||
428 | if (cmap_get_hash_protected(&b->hashes[i]) == hash) { | |
0e666160 BP |
429 | if (node) { |
430 | struct cmap_node *p; | |
431 | ||
6ff0d5d6 JR |
432 | /* The common case is that 'new_node' is a singleton, |
433 | * with a null 'next' pointer. Rehashing can add a | |
434 | * longer chain, but due to our invariant of always | |
435 | * having all nodes with the same (user) hash value at | |
436 | * a single chain, rehashing will always insert the | |
437 | * chain to an empty node. The only way we can end up | |
438 | * here is by the user inserting a chain of nodes at | |
439 | * once. Find the end of the chain starting at | |
440 | * 'new_node', then splice 'node' to the end of that | |
441 | * chain. */ | |
0e666160 BP |
442 | p = new_node; |
443 | for (;;) { | |
444 | struct cmap_node *next = cmap_node_next_protected(p); | |
6ff0d5d6 | 445 | |
0e666160 BP |
446 | if (!next) { |
447 | break; | |
448 | } | |
449 | p = next; | |
450 | } | |
7e5f06c3 | 451 | ovsrcu_set_hidden(&p->next, node); |
0e666160 BP |
452 | } else { |
453 | /* The hash value is there from some previous insertion, but | |
454 | * the associated node has been removed. We're not really | |
455 | * inserting a duplicate, but we can still reuse the slot. | |
456 | * Carry on. */ | |
457 | } | |
458 | ||
459 | /* Change the bucket to point to 'new_node'. This is a degenerate | |
460 | * form of cmap_set_bucket() that doesn't update the counter since | |
461 | * we're only touching one field and in a way that doesn't change | |
462 | * the bucket's meaning for readers. */ | |
6ff0d5d6 | 463 | ovsrcu_set(&b->nodes[i].next, new_node); |
0e666160 BP |
464 | |
465 | return true; | |
466 | } | |
467 | } | |
468 | return false; | |
469 | } | |
470 | ||
471 | /* Searches 'b' for an empty slot. If successful, stores 'node' and 'hash' in | |
472 | * the slot and returns true. Otherwise, returns false. */ | |
473 | static bool | |
474 | cmap_insert_bucket(struct cmap_node *node, uint32_t hash, | |
475 | struct cmap_bucket *b) | |
476 | { | |
477 | int i; | |
478 | ||
479 | for (i = 0; i < CMAP_K; i++) { | |
6ff0d5d6 | 480 | if (!cmap_node_next_protected(&b->nodes[i])) { |
0e666160 BP |
481 | cmap_set_bucket(b, i, node, hash); |
482 | return true; | |
483 | } | |
484 | } | |
485 | return false; | |
486 | } | |
487 | ||
488 | /* Returns the other bucket that b->nodes[slot] could occupy in 'impl'. (This | |
489 | * might be the same as 'b'.) */ | |
490 | static struct cmap_bucket * | |
6ff0d5d6 | 491 | other_bucket_protected(struct cmap_impl *impl, struct cmap_bucket *b, int slot) |
0e666160 | 492 | { |
6ff0d5d6 | 493 | uint32_t h1 = rehash(impl, cmap_get_hash_protected(&b->hashes[slot])); |
0e666160 BP |
494 | uint32_t h2 = other_hash(h1); |
495 | uint32_t b_idx = b - impl->buckets; | |
496 | uint32_t other_h = (h1 & impl->mask) == b_idx ? h2 : h1; | |
497 | ||
498 | return &impl->buckets[other_h & impl->mask]; | |
499 | } | |
500 | ||
501 | /* 'new_node' is to be inserted into 'impl', but both candidate buckets 'b1' | |
502 | * and 'b2' are full. This function attempts to rearrange buckets within | |
503 | * 'impl' to make room for 'new_node'. | |
504 | * | |
505 | * The implementation is a general-purpose breadth-first search. At first | |
506 | * glance, this is more complex than a random walk through 'impl' (suggested by | |
507 | * some references), but random walks have a tendency to loop back through a | |
508 | * single bucket. We have to move nodes backward along the path that we find, | |
509 | * so that no node actually disappears from the hash table, which means a | |
510 | * random walk would have to be careful to deal with loops. By contrast, a | |
511 | * successful breadth-first search always finds a *shortest* path through the | |
512 | * hash table, and a shortest path will never contain loops, so it avoids that | |
513 | * problem entirely. | |
514 | */ | |
515 | static bool | |
516 | cmap_insert_bfs(struct cmap_impl *impl, struct cmap_node *new_node, | |
517 | uint32_t hash, struct cmap_bucket *b1, struct cmap_bucket *b2) | |
518 | { | |
52a6bdaf | 519 | enum { MAX_DEPTH = 4 }; |
0e666160 BP |
520 | |
521 | /* A path from 'start' to 'end' via the 'n' steps in 'slots[]'. | |
522 | * | |
523 | * One can follow the path via: | |
524 | * | |
525 | * struct cmap_bucket *b; | |
526 | * int i; | |
527 | * | |
528 | * b = path->start; | |
529 | * for (i = 0; i < path->n; i++) { | |
6ff0d5d6 | 530 | * b = other_bucket_protected(impl, b, path->slots[i]); |
0e666160 BP |
531 | * } |
532 | * ovs_assert(b == path->end); | |
533 | */ | |
534 | struct cmap_path { | |
535 | struct cmap_bucket *start; /* First bucket along the path. */ | |
536 | struct cmap_bucket *end; /* Last bucket on the path. */ | |
52a6bdaf | 537 | uint8_t slots[MAX_DEPTH]; /* Slots used for each hop. */ |
0e666160 BP |
538 | int n; /* Number of slots[]. */ |
539 | }; | |
540 | ||
541 | /* We need to limit the amount of work we do trying to find a path. It | |
542 | * might actually be impossible to rearrange the cmap, and after some time | |
543 | * it is likely to be easier to rehash the entire cmap. | |
544 | * | |
545 | * This value of MAX_QUEUE is an arbitrary limit suggested by one of the | |
546 | * references. Empirically, it seems to work OK. */ | |
547 | enum { MAX_QUEUE = 500 }; | |
548 | struct cmap_path queue[MAX_QUEUE]; | |
549 | int head = 0; | |
550 | int tail = 0; | |
551 | ||
552 | /* Add 'b1' and 'b2' as starting points for the search. */ | |
553 | queue[head].start = b1; | |
554 | queue[head].end = b1; | |
555 | queue[head].n = 0; | |
556 | head++; | |
557 | if (b1 != b2) { | |
558 | queue[head].start = b2; | |
559 | queue[head].end = b2; | |
560 | queue[head].n = 0; | |
561 | head++; | |
562 | } | |
563 | ||
564 | while (tail < head) { | |
565 | const struct cmap_path *path = &queue[tail++]; | |
566 | struct cmap_bucket *this = path->end; | |
567 | int i; | |
568 | ||
569 | for (i = 0; i < CMAP_K; i++) { | |
6ff0d5d6 | 570 | struct cmap_bucket *next = other_bucket_protected(impl, this, i); |
0e666160 BP |
571 | int j; |
572 | ||
573 | if (this == next) { | |
574 | continue; | |
575 | } | |
576 | ||
6ff0d5d6 | 577 | j = cmap_find_empty_slot_protected(next); |
0e666160 BP |
578 | if (j >= 0) { |
579 | /* We've found a path along which we can rearrange the hash | |
580 | * table: Start at path->start, follow all the slots in | |
581 | * path->slots[], then follow slot 'i', then the bucket you | |
582 | * arrive at has slot 'j' empty. */ | |
52a6bdaf GS |
583 | struct cmap_bucket *buckets[MAX_DEPTH + 2]; |
584 | int slots[MAX_DEPTH + 2]; | |
0e666160 BP |
585 | int k; |
586 | ||
587 | /* Figure out the full sequence of slots. */ | |
588 | for (k = 0; k < path->n; k++) { | |
589 | slots[k] = path->slots[k]; | |
590 | } | |
591 | slots[path->n] = i; | |
592 | slots[path->n + 1] = j; | |
593 | ||
594 | /* Figure out the full sequence of buckets. */ | |
595 | buckets[0] = path->start; | |
596 | for (k = 0; k <= path->n; k++) { | |
6ff0d5d6 | 597 | buckets[k + 1] = other_bucket_protected(impl, buckets[k], slots[k]); |
0e666160 BP |
598 | } |
599 | ||
600 | /* Now the path is fully expressed. One can start from | |
601 | * buckets[0], go via slots[0] to buckets[1], via slots[1] to | |
602 | * buckets[2], and so on. | |
603 | * | |
604 | * Move all the nodes across the path "backward". After each | |
605 | * step some node appears in two buckets. Thus, every node is | |
606 | * always visible to a concurrent search. */ | |
607 | for (k = path->n + 1; k > 0; k--) { | |
608 | int slot = slots[k - 1]; | |
609 | ||
610 | cmap_set_bucket(buckets[k], slots[k], | |
6ff0d5d6 JR |
611 | cmap_node_next_protected(&buckets[k - 1]->nodes[slot]), |
612 | cmap_get_hash_protected(&buckets[k - 1]->hashes[slot])); | |
0e666160 BP |
613 | } |
614 | ||
615 | /* Finally, replace the first node on the path by | |
616 | * 'new_node'. */ | |
617 | cmap_set_bucket(buckets[0], slots[0], new_node, hash); | |
618 | ||
619 | return true; | |
620 | } | |
621 | ||
52a6bdaf | 622 | if (path->n < MAX_DEPTH && head < MAX_QUEUE) { |
0e666160 BP |
623 | struct cmap_path *new_path = &queue[head++]; |
624 | ||
625 | *new_path = *path; | |
626 | new_path->end = next; | |
627 | new_path->slots[new_path->n++] = i; | |
628 | } | |
629 | } | |
630 | } | |
631 | ||
632 | return false; | |
633 | } | |
634 | ||
635 | /* Adds 'node', with the given 'hash', to 'impl'. | |
636 | * | |
637 | * 'node' is ordinarily a single node, with a null 'next' pointer. When | |
638 | * rehashing, however, it may be a longer chain of nodes. */ | |
639 | static bool | |
640 | cmap_try_insert(struct cmap_impl *impl, struct cmap_node *node, uint32_t hash) | |
641 | { | |
642 | uint32_t h1 = rehash(impl, hash); | |
643 | uint32_t h2 = other_hash(h1); | |
644 | struct cmap_bucket *b1 = &impl->buckets[h1 & impl->mask]; | |
645 | struct cmap_bucket *b2 = &impl->buckets[h2 & impl->mask]; | |
646 | ||
647 | return (OVS_UNLIKELY(cmap_insert_dup(node, hash, b1) || | |
648 | cmap_insert_dup(node, hash, b2)) || | |
649 | OVS_LIKELY(cmap_insert_bucket(node, hash, b1) || | |
650 | cmap_insert_bucket(node, hash, b2)) || | |
651 | cmap_insert_bfs(impl, node, hash, b1, b2)); | |
652 | } | |
653 | ||
654 | /* Inserts 'node', with the given 'hash', into 'cmap'. The caller must ensure | |
655 | * that 'cmap' cannot change concurrently (from another thread). If duplicates | |
656 | * are undesirable, the caller must have already verified that 'cmap' does not | |
657 | * contain a duplicate of 'node'. */ | |
658 | void | |
659 | cmap_insert(struct cmap *cmap, struct cmap_node *node, uint32_t hash) | |
660 | { | |
661 | struct cmap_impl *impl = cmap_get_impl(cmap); | |
662 | ||
7e5f06c3 | 663 | ovsrcu_set_hidden(&node->next, NULL); |
0e666160 BP |
664 | |
665 | if (OVS_UNLIKELY(impl->n >= impl->max_n)) { | |
666 | impl = cmap_rehash(cmap, (impl->mask << 1) | 1); | |
667 | } | |
668 | ||
669 | while (OVS_UNLIKELY(!cmap_try_insert(impl, node, hash))) { | |
670 | impl = cmap_rehash(cmap, impl->mask); | |
671 | } | |
672 | impl->n++; | |
673 | } | |
674 | ||
675 | static bool | |
9d933fc7 JR |
676 | cmap_replace__(struct cmap_impl *impl, struct cmap_node *node, |
677 | struct cmap_node *replacement, uint32_t hash, uint32_t h) | |
0e666160 BP |
678 | { |
679 | struct cmap_bucket *b = &impl->buckets[h & impl->mask]; | |
680 | int slot; | |
681 | ||
6ff0d5d6 | 682 | slot = cmap_find_slot_protected(b, hash); |
0e666160 BP |
683 | if (slot < 0) { |
684 | return false; | |
685 | } | |
686 | ||
9d933fc7 JR |
687 | /* The pointer to 'node' is changed to point to 'replacement', |
688 | * which is the next node if no replacement node is given. */ | |
689 | if (!replacement) { | |
690 | replacement = cmap_node_next_protected(node); | |
691 | } else { | |
692 | /* 'replacement' takes the position of 'node' in the list. */ | |
7e5f06c3 | 693 | ovsrcu_set_hidden(&replacement->next, cmap_node_next_protected(node)); |
9d933fc7 JR |
694 | } |
695 | ||
6ff0d5d6 JR |
696 | struct cmap_node *iter = &b->nodes[slot]; |
697 | for (;;) { | |
698 | struct cmap_node *next = cmap_node_next_protected(iter); | |
699 | ||
700 | if (next == node) { | |
701 | ovsrcu_set(&iter->next, replacement); | |
702 | return true; | |
0e666160 | 703 | } |
6ff0d5d6 | 704 | iter = next; |
0e666160 | 705 | } |
0e666160 BP |
706 | } |
707 | ||
708 | /* Removes 'node' from 'cmap'. The caller must ensure that 'cmap' cannot | |
709 | * change concurrently (from another thread). | |
710 | * | |
711 | * 'node' must not be destroyed or modified or inserted back into 'cmap' or | |
712 | * into any other concurrent hash map while any other thread might be accessing | |
713 | * it. One correct way to do this is to free it from an RCU callback with | |
714 | * ovsrcu_postpone(). */ | |
715 | void | |
716 | cmap_remove(struct cmap *cmap, struct cmap_node *node, uint32_t hash) | |
9d933fc7 JR |
717 | { |
718 | cmap_replace(cmap, node, NULL, hash); | |
719 | cmap_get_impl(cmap)->n--; | |
720 | } | |
721 | ||
722 | /* Replaces 'old_node' in 'cmap' with 'new_node'. The caller must | |
723 | * ensure that 'cmap' cannot change concurrently (from another thread). | |
724 | * | |
725 | * 'old_node' must not be destroyed or modified or inserted back into 'cmap' or | |
726 | * into any other concurrent hash map while any other thread might be accessing | |
727 | * it. One correct way to do this is to free it from an RCU callback with | |
728 | * ovsrcu_postpone(). */ | |
729 | void | |
730 | cmap_replace(struct cmap *cmap, struct cmap_node *old_node, | |
731 | struct cmap_node *new_node, uint32_t hash) | |
0e666160 BP |
732 | { |
733 | struct cmap_impl *impl = cmap_get_impl(cmap); | |
734 | uint32_t h1 = rehash(impl, hash); | |
735 | uint32_t h2 = other_hash(h1); | |
736 | bool ok; | |
737 | ||
9d933fc7 JR |
738 | ok = cmap_replace__(impl, old_node, new_node, hash, h1) |
739 | || cmap_replace__(impl, old_node, new_node, hash, h2); | |
0e666160 | 740 | ovs_assert(ok); |
0e666160 BP |
741 | } |
742 | ||
743 | static bool | |
744 | cmap_try_rehash(const struct cmap_impl *old, struct cmap_impl *new) | |
745 | { | |
746 | const struct cmap_bucket *b; | |
747 | ||
748 | for (b = old->buckets; b <= &old->buckets[old->mask]; b++) { | |
749 | int i; | |
750 | ||
751 | for (i = 0; i < CMAP_K; i++) { | |
752 | /* possible optimization here because we know the hashes are | |
753 | * unique */ | |
6ff0d5d6 | 754 | struct cmap_node *node = cmap_node_next_protected(&b->nodes[i]); |
0e666160 | 755 | |
6ff0d5d6 JR |
756 | if (node && |
757 | !cmap_try_insert(new, node, | |
758 | cmap_get_hash_protected(&b->hashes[i]))) { | |
0e666160 BP |
759 | return false; |
760 | } | |
761 | } | |
762 | } | |
763 | return true; | |
764 | } | |
765 | ||
766 | static struct cmap_impl * | |
767 | cmap_rehash(struct cmap *cmap, uint32_t mask) | |
768 | { | |
769 | struct cmap_impl *old = cmap_get_impl(cmap); | |
770 | struct cmap_impl *new; | |
771 | ||
772 | new = cmap_impl_create(mask); | |
773 | ovs_assert(old->n < new->max_n); | |
774 | ||
775 | while (!cmap_try_rehash(old, new)) { | |
776 | memset(new->buckets, 0, (mask + 1) * sizeof *new->buckets); | |
777 | new->basis = random_uint32(); | |
778 | } | |
779 | ||
780 | new->n = old->n; | |
781 | ovsrcu_set(&cmap->impl, new); | |
782 | ovsrcu_postpone(free_cacheline, old); | |
783 | ||
784 | return new; | |
785 | } | |
786 | ||
78c8df12 BP |
787 | struct cmap_cursor |
788 | cmap_cursor_start(const struct cmap *cmap) | |
0e666160 | 789 | { |
78c8df12 BP |
790 | struct cmap_cursor cursor; |
791 | ||
792 | cursor.impl = cmap_get_impl(cmap); | |
793 | cursor.bucket_idx = 0; | |
794 | cursor.entry_idx = 0; | |
795 | cursor.node = NULL; | |
796 | cmap_cursor_advance(&cursor); | |
797 | ||
798 | return cursor; | |
0e666160 BP |
799 | } |
800 | ||
78c8df12 BP |
801 | void |
802 | cmap_cursor_advance(struct cmap_cursor *cursor) | |
0e666160 BP |
803 | { |
804 | const struct cmap_impl *impl = cursor->impl; | |
805 | ||
78c8df12 BP |
806 | if (cursor->node) { |
807 | cursor->node = cmap_node_next(cursor->node); | |
808 | if (cursor->node) { | |
809 | return; | |
0e666160 BP |
810 | } |
811 | } | |
812 | ||
813 | while (cursor->bucket_idx <= impl->mask) { | |
814 | const struct cmap_bucket *b = &impl->buckets[cursor->bucket_idx]; | |
815 | ||
816 | while (cursor->entry_idx < CMAP_K) { | |
78c8df12 BP |
817 | cursor->node = cmap_node_next(&b->nodes[cursor->entry_idx++]); |
818 | if (cursor->node) { | |
819 | return; | |
0e666160 BP |
820 | } |
821 | } | |
822 | ||
823 | cursor->bucket_idx++; | |
824 | cursor->entry_idx = 0; | |
825 | } | |
0e666160 BP |
826 | } |
827 | ||
828 | /* Returns the next node in 'cmap' in hash order, or NULL if no nodes remain in | |
829 | * 'cmap'. Uses '*pos' to determine where to begin iteration, and updates | |
830 | * '*pos' to pass on the next iteration into them before returning. | |
831 | * | |
832 | * It's better to use plain CMAP_FOR_EACH and related functions, since they are | |
833 | * faster and better at dealing with cmaps that change during iteration. | |
834 | * | |
835 | * Before beginning iteration, set '*pos' to all zeros. */ | |
836 | struct cmap_node * | |
837 | cmap_next_position(const struct cmap *cmap, | |
838 | struct cmap_position *pos) | |
839 | { | |
840 | struct cmap_impl *impl = cmap_get_impl(cmap); | |
841 | unsigned int bucket = pos->bucket; | |
842 | unsigned int entry = pos->entry; | |
843 | unsigned int offset = pos->offset; | |
844 | ||
845 | while (bucket <= impl->mask) { | |
846 | const struct cmap_bucket *b = &impl->buckets[bucket]; | |
847 | ||
848 | while (entry < CMAP_K) { | |
6ff0d5d6 | 849 | const struct cmap_node *node = cmap_node_next(&b->nodes[entry]); |
0e666160 BP |
850 | unsigned int i; |
851 | ||
cd50723c | 852 | for (i = 0; node; i++, node = cmap_node_next(node)) { |
0e666160 BP |
853 | if (i == offset) { |
854 | if (cmap_node_next(node)) { | |
855 | offset++; | |
856 | } else { | |
857 | entry++; | |
858 | offset = 0; | |
859 | } | |
860 | pos->bucket = bucket; | |
861 | pos->entry = entry; | |
862 | pos->offset = offset; | |
863 | return CONST_CAST(struct cmap_node *, node); | |
864 | } | |
865 | } | |
866 | ||
867 | entry++; | |
868 | offset = 0; | |
869 | } | |
870 | ||
871 | bucket++; | |
872 | entry = offset = 0; | |
873 | } | |
874 | ||
875 | pos->bucket = pos->entry = pos->offset = 0; | |
876 | return NULL; | |
877 | } |