/*
 * Copyright (c) 2014, 2016 Nicira, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/* Coverage counters bumped each time the table is resized (see
 * cmap_insert() and cmap_replace()). */
COVERAGE_DEFINE(cmap_expand);
COVERAGE_DEFINE(cmap_shrink);
29 /* Optimistic Concurrent Cuckoo Hash
30 * =================================
32 * A "cuckoo hash" is an open addressing hash table schema, designed such that
33 * a given element can be in one of only a small number of buckets 'd', each of
34 * which holds up to a small number 'k' elements. Thus, the expected and
35 * worst-case lookup times are O(1) because they require comparing no more than
36 * a fixed number of elements (k * d). Inserting a new element can require
37 * moving around existing elements, but it is also O(1) amortized expected
40 * An optimistic concurrent hash table goes one step further, making it
41 * possible for a single writer to execute concurrently with any number of
42 * readers without requiring the readers to take any locks.
44 * This cuckoo hash implementation uses:
46 * - Two hash functions (d=2). More hash functions allow for a higher load
47 * factor, but increasing 'k' is easier and the benefits of increasing 'd'
48 * quickly fall off with the 'k' values used here. Also, the method of
49 * generating hashes used in this implementation is hard to reasonably
50 * extend beyond d=2. Finally, each additional hash function means that a
51 * lookup has to look at least one extra cache line.
53 * - 5 or 7 elements per bucket (k=5 or k=7), chosen to make buckets
54 * exactly one cache line in size.
56 * According to Erlingsson [4], these parameters suggest a maximum load factor
57 * of about 93%. The current implementation is conservative, expanding the
58 * hash table when it is over 85% full.
60 * When the load factor is below 20%, the hash table will be shrinked by half.
61 * This is to reduce the memory utilization of the hash table and to avoid
62 * the hash table occupying the top of heap chunk which prevents the trimming
68 * A cuckoo hash requires multiple hash functions. When reorganizing the hash
69 * becomes too difficult, it also requires the ability to change the hash
70 * functions. Requiring the client to provide multiple hashes and to be able
71 * to change them to new hashes upon insertion is inconvenient.
73 * This implementation takes another approach. The client provides a single,
74 * fixed hash. The cuckoo hash internally "rehashes" this hash against a
75 * randomly selected basis value (see rehash()). This rehashed value is one of
76 * the two hashes. The other hash is computed by 16-bit circular rotation of
77 * the rehashed value. Updating the basis changes the hash functions.
79 * To work properly, the hash functions used by a cuckoo hash must be
80 * independent. If one hash function is a function of the other (e.g. h2(x) =
81 * h1(x) + 1, or h2(x) = hash(h1(x))), then insertion will eventually fail
82 * catastrophically (loop forever) because of collisions. With this rehashing
83 * technique, the two hashes are completely independent for masks up to 16 bits
84 * wide. For masks wider than 16 bits, only 32-n bits are independent between
85 * the two hashes. Thus, it becomes risky to grow a cuckoo hash table beyond
86 * about 2**24 buckets (about 71 million elements with k=5 and maximum load
87 * 85%). Fortunately, Open vSwitch does not normally deal with hash tables
94 * This cuckoo hash table implementation deals with duplicate client-provided
95 * hash values by chaining: the second and subsequent cmap_nodes with a given
96 * hash are chained off the initially inserted node's 'next' member. The hash
97 * table maintains the invariant that a single client-provided hash value
98 * exists in only a single chain in a single bucket (even though that hash
99 * could be stored in two buckets).
105 * [1] D. Zhou, B. Fan, H. Lim, M. Kaminsky, D. G. Andersen, "Scalable, High
106 * Performance Ethernet Forwarding with CuckooSwitch". In Proc. 9th
109 * [2] B. Fan, D. G. Andersen, and M. Kaminsky. "MemC3: Compact and concurrent
110 * memcache with dumber caching and smarter hashing". In Proc. 10th USENIX
113 * [3] R. Pagh and F. Rodler. "Cuckoo hashing". Journal of Algorithms, 51(2):
116 * [4] U. Erlingsson, M. Manasse, F. McSherry, "A Cool and Practical
117 * Alternative to Traditional Hash Tables". In Proc. 7th Workshop on
118 * Distributed Data and Structures (WDAS'06), 2006.
/* An entry is an int and a pointer: 8 bytes on 32-bit, 12 bytes on 64-bit. */
#define CMAP_ENTRY_SIZE (4 + (UINTPTR_MAX == UINT32_MAX ? 4 : 8))

/* Number of entries per bucket: 7 on 32-bit, 5 on 64-bit for 64B cacheline. */
#define CMAP_K ((CACHE_LINE_SIZE - 4) / CMAP_ENTRY_SIZE)
126 /* A cuckoo hash bucket. Designed to be cache-aligned and exactly one cache
129 /* Padding to make cmap_bucket exactly one cache line long. */
130 PADDED_MEMBERS(CACHE_LINE_SIZE
,
131 /* Allows readers to track in-progress changes. Initially zero, each
132 * writer increments this value just before and just after each change
133 * (see cmap_set_bucket()). Thus, a reader can ensure that it gets a
134 * consistent snapshot by waiting for the counter to become even (see
135 * read_even_counter()), then checking that its value does not change
136 * while examining the bucket (see cmap_find()). */
137 atomic_uint32_t counter
;
139 /* (hash, node) slots. They are parallel arrays instead of an array of
140 * structs to reduce the amount of space lost to padding.
142 * The slots are in no particular order. A null pointer indicates that
143 * a pair is unused. In-use slots are not necessarily in the earliest
145 uint32_t hashes
[CMAP_K
];
146 struct cmap_node nodes
[CMAP_K
];
149 BUILD_ASSERT_DECL(sizeof(struct cmap_bucket
) == CACHE_LINE_SIZE
);
/* Default maximum load factor (as a fraction of UINT32_MAX + 1) before
 * enlarging a cmap.  Reasonable values lie between about 75% and 93%.  Smaller
 * values waste memory; larger values increase the average insertion time. */
#define CMAP_MAX_LOAD ((uint32_t) (UINT32_MAX * .85))

/* Default minimum load factor (as a fraction of UINT32_MAX + 1) before
 * shrinking a cmap.  Currently, the value is chosen to be 20%, this
 * means cmap will have a 40% load factor after shrink. */
#define CMAP_MIN_LOAD ((uint32_t) (UINT32_MAX * .20))
161 /* The implementation of a concurrent hash map. */
163 PADDED_MEMBERS_CACHELINE_MARKER(CACHE_LINE_SIZE
, cacheline0
,
164 unsigned int n
; /* Number of in-use elements. */
165 unsigned int max_n
; /* Max elements before enlarging. */
166 unsigned int min_n
; /* Min elements before shrinking. */
167 uint32_t mask
; /* Number of 'buckets', minus one. */
168 uint32_t basis
; /* Basis for rehashing client's
172 PADDED_MEMBERS_CACHELINE_MARKER(CACHE_LINE_SIZE
, cacheline1
,
173 struct cmap_bucket buckets
[1];
176 BUILD_ASSERT_DECL(sizeof(struct cmap_impl
) == CACHE_LINE_SIZE
* 2);
179 OVS_ALIGNED_VAR(CACHE_LINE_SIZE
) const struct cmap_impl empty_cmap
;
181 static struct cmap_impl
*cmap_rehash(struct cmap
*, uint32_t mask
);
183 /* Explicit inline keywords in utility functions seem to be necessary
184 * to prevent performance regression on cmap_find(). */
/* Given a rehashed value 'hash', returns the other hash for that rehashed
 * value.  This is symmetric: other_hash(other_hash(x)) == x.  (See also "Hash
 * Functions" at the top of this file.)
 *
 * Implemented as a 16-bit circular rotation, so the two hashes share no bit
 * positions for masks up to 16 bits wide. */
static inline uint32_t
other_hash(uint32_t hash)
{
    return (hash << 16) | (hash >> 16);
}
195 /* Returns the rehashed value for 'hash' within 'impl'. (See also "Hash
196 * Functions" at the top of this file.) */
197 static inline uint32_t
198 rehash(const struct cmap_impl
*impl
, uint32_t hash
)
200 return hash_finish(impl
->basis
, hash
);
203 /* Not always without the inline keyword. */
204 static inline struct cmap_impl
*
205 cmap_get_impl(const struct cmap
*cmap
)
207 return ovsrcu_get(struct cmap_impl
*, &cmap
->impl
);
211 calc_max_n(uint32_t mask
)
213 return ((uint64_t) (mask
+ 1) * CMAP_K
* CMAP_MAX_LOAD
) >> 32;
217 calc_min_n(uint32_t mask
)
219 return ((uint64_t) (mask
+ 1) * CMAP_K
* CMAP_MIN_LOAD
) >> 32;
222 static struct cmap_impl
*
223 cmap_impl_create(uint32_t mask
)
225 struct cmap_impl
*impl
;
227 ovs_assert(is_pow2(mask
+ 1));
229 /* There are 'mask + 1' buckets but struct cmap_impl has one bucket built
230 * in, so we only need to add space for the extra 'mask' buckets. */
231 impl
= xzalloc_cacheline(sizeof *impl
+ mask
* sizeof *impl
->buckets
);
233 impl
->max_n
= calc_max_n(mask
);
234 impl
->min_n
= calc_min_n(mask
);
236 impl
->basis
= random_uint32();
241 /* Initializes 'cmap' as an empty concurrent hash map. */
243 cmap_init(struct cmap
*cmap
)
245 ovsrcu_set(&cmap
->impl
, CONST_CAST(struct cmap_impl
*, &empty_cmap
));
250 * The client is responsible for destroying any data previously held in
253 cmap_destroy(struct cmap
*cmap
)
256 struct cmap_impl
*impl
= cmap_get_impl(cmap
);
257 if (impl
!= &empty_cmap
) {
258 ovsrcu_postpone(free_cacheline
, impl
);
263 /* Returns the number of elements in 'cmap'. */
265 cmap_count(const struct cmap
*cmap
)
267 return cmap_get_impl(cmap
)->n
;
270 /* Returns true if 'cmap' is empty, false otherwise. */
272 cmap_is_empty(const struct cmap
*cmap
)
274 return cmap_count(cmap
) == 0;
277 static inline uint32_t
278 read_counter(const struct cmap_bucket
*bucket_
)
280 struct cmap_bucket
*bucket
= CONST_CAST(struct cmap_bucket
*, bucket_
);
283 atomic_read_explicit(&bucket
->counter
, &counter
, memory_order_acquire
);
288 static inline uint32_t
289 read_even_counter(const struct cmap_bucket
*bucket
)
294 counter
= read_counter(bucket
);
295 } while (OVS_UNLIKELY(counter
& 1));
301 counter_changed(const struct cmap_bucket
*b_
, uint32_t c
)
303 struct cmap_bucket
*b
= CONST_CAST(struct cmap_bucket
*, b_
);
306 /* Need to make sure the counter read is not moved up, before the hash and
307 * cmap_node_next(). Using atomic_read_explicit with memory_order_acquire
308 * would allow prior reads to be moved after the barrier.
309 * atomic_thread_fence prevents all following memory accesses from moving
310 * prior to preceding loads. */
311 atomic_thread_fence(memory_order_acquire
);
312 atomic_read_relaxed(&b
->counter
, &counter
);
314 return OVS_UNLIKELY(counter
!= c
);
317 static inline const struct cmap_node
*
318 cmap_find_in_bucket(const struct cmap_bucket
*bucket
, uint32_t hash
)
320 for (int i
= 0; i
< CMAP_K
; i
++) {
321 if (bucket
->hashes
[i
] == hash
) {
322 return cmap_node_next(&bucket
->nodes
[i
]);
328 static inline const struct cmap_node
*
329 cmap_find__(const struct cmap_bucket
*b1
, const struct cmap_bucket
*b2
,
333 const struct cmap_node
*node
;
337 c1
= read_even_counter(b1
);
338 node
= cmap_find_in_bucket(b1
, hash
);
339 } while (OVS_UNLIKELY(counter_changed(b1
, c1
)));
344 c2
= read_even_counter(b2
);
345 node
= cmap_find_in_bucket(b2
, hash
);
346 } while (OVS_UNLIKELY(counter_changed(b2
, c2
)));
350 } while (OVS_UNLIKELY(counter_changed(b1
, c1
)));
355 /* Searches 'cmap' for an element with the specified 'hash'. If one or more is
356 * found, returns a pointer to the first one, otherwise a null pointer. All of
357 * the nodes on the returned list are guaranteed to have exactly the given
360 * This function works even if 'cmap' is changing concurrently. If 'cmap' is
361 * not changing, then cmap_find_protected() is slightly faster.
363 * CMAP_FOR_EACH_WITH_HASH is usually more convenient. */
364 const struct cmap_node
*
365 cmap_find(const struct cmap
*cmap
, uint32_t hash
)
367 const struct cmap_impl
*impl
= cmap_get_impl(cmap
);
368 uint32_t h1
= rehash(impl
, hash
);
369 uint32_t h2
= other_hash(h1
);
371 return cmap_find__(&impl
->buckets
[h1
& impl
->mask
],
372 &impl
->buckets
[h2
& impl
->mask
],
376 /* Find a node by the index of the entry of cmap. Index N means the N/CMAP_K
377 * bucket and N%CMAP_K entry in that bucket.
378 * Notice that it is not protected by the optimistic lock (versioning) because
379 * it does not compare the hashes. Currently it is only used by the datapath
382 * Return node for the entry of index or NULL if the index beyond boundary */
383 const struct cmap_node
*
384 cmap_find_by_index(const struct cmap
*cmap
, uint32_t index
)
386 const struct cmap_impl
*impl
= cmap_get_impl(cmap
);
388 uint32_t b
= index
/ CMAP_K
;
389 uint32_t e
= index
% CMAP_K
;
391 if (b
> impl
->mask
) {
395 const struct cmap_bucket
*bucket
= &impl
->buckets
[b
];
397 return cmap_node_next(&bucket
->nodes
[e
]);
400 /* Find the index of certain hash value. Currently only used by the datapath
403 * Return the index of the entry if found, or UINT32_MAX if not found. The
404 * function assumes entry index cannot be larger than UINT32_MAX. */
406 cmap_find_index(const struct cmap
*cmap
, uint32_t hash
)
408 const struct cmap_impl
*impl
= cmap_get_impl(cmap
);
409 uint32_t h1
= rehash(impl
, hash
);
410 uint32_t h2
= other_hash(h1
);
412 uint32_t b_index1
= h1
& impl
->mask
;
413 uint32_t b_index2
= h2
& impl
->mask
;
416 uint32_t index
= UINT32_MAX
;
418 const struct cmap_bucket
*b1
= &impl
->buckets
[b_index1
];
419 const struct cmap_bucket
*b2
= &impl
->buckets
[b_index2
];
423 c1
= read_even_counter(b1
);
424 for (int i
= 0; i
< CMAP_K
; i
++) {
425 if (b1
->hashes
[i
] == hash
) {
426 index
= b_index1
* CMAP_K
+ i
;
429 } while (OVS_UNLIKELY(counter_changed(b1
, c1
)));
430 if (index
!= UINT32_MAX
) {
434 c2
= read_even_counter(b2
);
435 for (int i
= 0; i
< CMAP_K
; i
++) {
436 if (b2
->hashes
[i
] == hash
) {
437 index
= b_index2
* CMAP_K
+ i
;
440 } while (OVS_UNLIKELY(counter_changed(b2
, c2
)));
442 if (index
!= UINT32_MAX
) {
445 } while (OVS_UNLIKELY(counter_changed(b1
, c1
)));
450 /* Looks up multiple 'hashes', when the corresponding bit in 'map' is 1,
451 * and sets the corresponding pointer in 'nodes', if the hash value was
452 * found from the 'cmap'. In other cases the 'nodes' values are not changed,
453 * i.e., no NULL pointers are stored there.
454 * Returns a map where a bit is set to 1 if the corresponding 'nodes' pointer
455 * was stored, 0 otherwise.
456 * Generally, the caller wants to use CMAP_NODE_FOR_EACH to verify for
457 * hash collisions. */
459 cmap_find_batch(const struct cmap
*cmap
, unsigned long map
,
460 uint32_t hashes
[], const struct cmap_node
*nodes
[])
462 const struct cmap_impl
*impl
= cmap_get_impl(cmap
);
463 unsigned long result
= map
;
465 uint32_t h1s
[sizeof map
* CHAR_BIT
];
466 const struct cmap_bucket
*b1s
[sizeof map
* CHAR_BIT
];
467 const struct cmap_bucket
*b2s
[sizeof map
* CHAR_BIT
];
468 uint32_t c1s
[sizeof map
* CHAR_BIT
];
470 /* Compute hashes and prefetch 1st buckets. */
471 ULLONG_FOR_EACH_1(i
, map
) {
472 h1s
[i
] = rehash(impl
, hashes
[i
]);
473 b1s
[i
] = &impl
->buckets
[h1s
[i
] & impl
->mask
];
474 OVS_PREFETCH(b1s
[i
]);
476 /* Lookups, Round 1. Only look up at the first bucket. */
477 ULLONG_FOR_EACH_1(i
, map
) {
479 const struct cmap_bucket
*b1
= b1s
[i
];
480 const struct cmap_node
*node
;
483 c1
= read_even_counter(b1
);
484 node
= cmap_find_in_bucket(b1
, hashes
[i
]);
485 } while (OVS_UNLIKELY(counter_changed(b1
, c1
)));
488 /* Not found (yet); Prefetch the 2nd bucket. */
489 b2s
[i
] = &impl
->buckets
[other_hash(h1s
[i
]) & impl
->mask
];
490 OVS_PREFETCH(b2s
[i
]);
491 c1s
[i
] = c1
; /* We may need to check this after Round 2. */
495 ULLONG_SET0(map
, i
); /* Ignore this on round 2. */
499 /* Round 2. Look into the 2nd bucket, if needed. */
500 ULLONG_FOR_EACH_1(i
, map
) {
502 const struct cmap_bucket
*b2
= b2s
[i
];
503 const struct cmap_node
*node
;
506 c2
= read_even_counter(b2
);
507 node
= cmap_find_in_bucket(b2
, hashes
[i
]);
508 } while (OVS_UNLIKELY(counter_changed(b2
, c2
)));
511 /* Not found, but the node may have been moved from b2 to b1 right
512 * after we finished with b1 earlier. We just got a clean reading
513 * of the 2nd bucket, so we check the counter of the 1st bucket
514 * only. However, we need to check both buckets again, as the
515 * entry may be moved again to the 2nd bucket. Basically, we
516 * need to loop as long as it takes to get stable readings of
517 * both buckets. cmap_find__() does that, and now that we have
518 * fetched both buckets we can just use it. */
519 if (OVS_UNLIKELY(counter_changed(b1s
[i
], c1s
[i
]))) {
520 node
= cmap_find__(b1s
[i
], b2s
[i
], hashes
[i
]);
526 ULLONG_SET0(result
, i
); /* Fix the result. */
537 cmap_find_slot_protected(struct cmap_bucket
*b
, uint32_t hash
)
541 for (i
= 0; i
< CMAP_K
; i
++) {
542 if (b
->hashes
[i
] == hash
&& cmap_node_next_protected(&b
->nodes
[i
])) {
549 static struct cmap_node
*
550 cmap_find_bucket_protected(struct cmap_impl
*impl
, uint32_t hash
, uint32_t h
)
552 struct cmap_bucket
*b
= &impl
->buckets
[h
& impl
->mask
];
555 for (i
= 0; i
< CMAP_K
; i
++) {
556 if (b
->hashes
[i
] == hash
) {
557 return cmap_node_next_protected(&b
->nodes
[i
]);
563 /* Like cmap_find(), but only for use if 'cmap' cannot change concurrently.
565 * CMAP_FOR_EACH_WITH_HASH_PROTECTED is usually more convenient. */
567 cmap_find_protected(const struct cmap
*cmap
, uint32_t hash
)
569 struct cmap_impl
*impl
= cmap_get_impl(cmap
);
570 uint32_t h1
= rehash(impl
, hash
);
571 uint32_t h2
= other_hash(h1
);
572 struct cmap_node
*node
;
574 node
= cmap_find_bucket_protected(impl
, hash
, h1
);
578 return cmap_find_bucket_protected(impl
, hash
, h2
);
582 cmap_find_empty_slot_protected(const struct cmap_bucket
*b
)
586 for (i
= 0; i
< CMAP_K
; i
++) {
587 if (!cmap_node_next_protected(&b
->nodes
[i
])) {
595 cmap_set_bucket(struct cmap_bucket
*b
, int i
,
596 struct cmap_node
*node
, uint32_t hash
)
600 atomic_read_explicit(&b
->counter
, &c
, memory_order_acquire
);
601 atomic_store_explicit(&b
->counter
, c
+ 1, memory_order_release
);
602 ovsrcu_set(&b
->nodes
[i
].next
, node
); /* Also atomic. */
604 atomic_store_explicit(&b
->counter
, c
+ 2, memory_order_release
);
607 /* Searches 'b' for a node with the given 'hash'. If it finds one, adds
608 * 'new_node' to the node's linked list and returns true. If it does not find
609 * one, returns false. */
611 cmap_insert_dup(struct cmap_node
*new_node
, uint32_t hash
,
612 struct cmap_bucket
*b
)
616 for (i
= 0; i
< CMAP_K
; i
++) {
617 if (b
->hashes
[i
] == hash
) {
618 struct cmap_node
*node
= cmap_node_next_protected(&b
->nodes
[i
]);
623 /* The common case is that 'new_node' is a singleton,
624 * with a null 'next' pointer. Rehashing can add a
625 * longer chain, but due to our invariant of always
626 * having all nodes with the same (user) hash value at
627 * a single chain, rehashing will always insert the
628 * chain to an empty node. The only way we can end up
629 * here is by the user inserting a chain of nodes at
630 * once. Find the end of the chain starting at
631 * 'new_node', then splice 'node' to the end of that
635 struct cmap_node
*next
= cmap_node_next_protected(p
);
642 ovsrcu_set_hidden(&p
->next
, node
);
644 /* The hash value is there from some previous insertion, but
645 * the associated node has been removed. We're not really
646 * inserting a duplicate, but we can still reuse the slot.
650 /* Change the bucket to point to 'new_node'. This is a degenerate
651 * form of cmap_set_bucket() that doesn't update the counter since
652 * we're only touching one field and in a way that doesn't change
653 * the bucket's meaning for readers. */
654 ovsrcu_set(&b
->nodes
[i
].next
, new_node
);
662 /* Searches 'b' for an empty slot. If successful, stores 'node' and 'hash' in
663 * the slot and returns true. Otherwise, returns false. */
665 cmap_insert_bucket(struct cmap_node
*node
, uint32_t hash
,
666 struct cmap_bucket
*b
)
670 for (i
= 0; i
< CMAP_K
; i
++) {
671 if (!cmap_node_next_protected(&b
->nodes
[i
])) {
672 cmap_set_bucket(b
, i
, node
, hash
);
679 /* Returns the other bucket that b->nodes[slot] could occupy in 'impl'. (This
680 * might be the same as 'b'.) */
681 static struct cmap_bucket
*
682 other_bucket_protected(struct cmap_impl
*impl
, struct cmap_bucket
*b
, int slot
)
684 uint32_t h1
= rehash(impl
, b
->hashes
[slot
]);
685 uint32_t h2
= other_hash(h1
);
686 uint32_t b_idx
= b
- impl
->buckets
;
687 uint32_t other_h
= (h1
& impl
->mask
) == b_idx
? h2
: h1
;
689 return &impl
->buckets
[other_h
& impl
->mask
];
692 /* 'new_node' is to be inserted into 'impl', but both candidate buckets 'b1'
693 * and 'b2' are full. This function attempts to rearrange buckets within
694 * 'impl' to make room for 'new_node'.
696 * The implementation is a general-purpose breadth-first search. At first
697 * glance, this is more complex than a random walk through 'impl' (suggested by
698 * some references), but random walks have a tendency to loop back through a
699 * single bucket. We have to move nodes backward along the path that we find,
700 * so that no node actually disappears from the hash table, which means a
701 * random walk would have to be careful to deal with loops. By contrast, a
702 * successful breadth-first search always finds a *shortest* path through the
703 * hash table, and a shortest path will never contain loops, so it avoids that
707 cmap_insert_bfs(struct cmap_impl
*impl
, struct cmap_node
*new_node
,
708 uint32_t hash
, struct cmap_bucket
*b1
, struct cmap_bucket
*b2
)
710 enum { MAX_DEPTH
= 4 };
712 /* A path from 'start' to 'end' via the 'n' steps in 'slots[]'.
714 * One can follow the path via:
716 * struct cmap_bucket *b;
720 * for (i = 0; i < path->n; i++) {
721 * b = other_bucket_protected(impl, b, path->slots[i]);
723 * ovs_assert(b == path->end);
726 struct cmap_bucket
*start
; /* First bucket along the path. */
727 struct cmap_bucket
*end
; /* Last bucket on the path. */
728 uint8_t slots
[MAX_DEPTH
]; /* Slots used for each hop. */
729 int n
; /* Number of slots[]. */
732 /* We need to limit the amount of work we do trying to find a path. It
733 * might actually be impossible to rearrange the cmap, and after some time
734 * it is likely to be easier to rehash the entire cmap.
736 * This value of MAX_QUEUE is an arbitrary limit suggested by one of the
737 * references. Empirically, it seems to work OK. */
738 enum { MAX_QUEUE
= 500 };
739 struct cmap_path queue
[MAX_QUEUE
];
743 /* Add 'b1' and 'b2' as starting points for the search. */
744 queue
[head
].start
= b1
;
745 queue
[head
].end
= b1
;
749 queue
[head
].start
= b2
;
750 queue
[head
].end
= b2
;
755 while (tail
< head
) {
756 const struct cmap_path
*path
= &queue
[tail
++];
757 struct cmap_bucket
*this = path
->end
;
760 for (i
= 0; i
< CMAP_K
; i
++) {
761 struct cmap_bucket
*next
= other_bucket_protected(impl
, this, i
);
768 j
= cmap_find_empty_slot_protected(next
);
770 /* We've found a path along which we can rearrange the hash
771 * table: Start at path->start, follow all the slots in
772 * path->slots[], then follow slot 'i', then the bucket you
773 * arrive at has slot 'j' empty. */
774 struct cmap_bucket
*buckets
[MAX_DEPTH
+ 2];
775 int slots
[MAX_DEPTH
+ 2];
778 /* Figure out the full sequence of slots. */
779 for (k
= 0; k
< path
->n
; k
++) {
780 slots
[k
] = path
->slots
[k
];
783 slots
[path
->n
+ 1] = j
;
785 /* Figure out the full sequence of buckets. */
786 buckets
[0] = path
->start
;
787 for (k
= 0; k
<= path
->n
; k
++) {
788 buckets
[k
+ 1] = other_bucket_protected(impl
, buckets
[k
], slots
[k
]);
791 /* Now the path is fully expressed. One can start from
792 * buckets[0], go via slots[0] to buckets[1], via slots[1] to
793 * buckets[2], and so on.
795 * Move all the nodes across the path "backward". After each
796 * step some node appears in two buckets. Thus, every node is
797 * always visible to a concurrent search. */
798 for (k
= path
->n
+ 1; k
> 0; k
--) {
799 int slot
= slots
[k
- 1];
802 buckets
[k
], slots
[k
],
803 cmap_node_next_protected(&buckets
[k
- 1]->nodes
[slot
]),
804 buckets
[k
- 1]->hashes
[slot
]);
807 /* Finally, replace the first node on the path by
809 cmap_set_bucket(buckets
[0], slots
[0], new_node
, hash
);
814 if (path
->n
< MAX_DEPTH
&& head
< MAX_QUEUE
) {
815 struct cmap_path
*new_path
= &queue
[head
++];
818 new_path
->end
= next
;
819 new_path
->slots
[new_path
->n
++] = i
;
827 /* Adds 'node', with the given 'hash', to 'impl'.
829 * 'node' is ordinarily a single node, with a null 'next' pointer. When
830 * rehashing, however, it may be a longer chain of nodes. */
832 cmap_try_insert(struct cmap_impl
*impl
, struct cmap_node
*node
, uint32_t hash
)
834 uint32_t h1
= rehash(impl
, hash
);
835 uint32_t h2
= other_hash(h1
);
836 struct cmap_bucket
*b1
= &impl
->buckets
[h1
& impl
->mask
];
837 struct cmap_bucket
*b2
= &impl
->buckets
[h2
& impl
->mask
];
839 return (OVS_UNLIKELY(cmap_insert_dup(node
, hash
, b1
) ||
840 cmap_insert_dup(node
, hash
, b2
)) ||
841 OVS_LIKELY(cmap_insert_bucket(node
, hash
, b1
) ||
842 cmap_insert_bucket(node
, hash
, b2
)) ||
843 cmap_insert_bfs(impl
, node
, hash
, b1
, b2
));
846 /* Inserts 'node', with the given 'hash', into 'cmap'. The caller must ensure
847 * that 'cmap' cannot change concurrently (from another thread). If duplicates
848 * are undesirable, the caller must have already verified that 'cmap' does not
849 * contain a duplicate of 'node'.
851 * Returns the current number of nodes in the cmap after the insertion. */
853 cmap_insert(struct cmap
*cmap
, struct cmap_node
*node
, uint32_t hash
)
855 struct cmap_impl
*impl
= cmap_get_impl(cmap
);
857 ovsrcu_set_hidden(&node
->next
, NULL
);
859 if (OVS_UNLIKELY(impl
->n
>= impl
->max_n
)) {
860 COVERAGE_INC(cmap_expand
);
861 impl
= cmap_rehash(cmap
, (impl
->mask
<< 1) | 1);
864 while (OVS_UNLIKELY(!cmap_try_insert(impl
, node
, hash
))) {
865 impl
= cmap_rehash(cmap
, impl
->mask
);
871 cmap_replace__(struct cmap_impl
*impl
, struct cmap_node
*node
,
872 struct cmap_node
*replacement
, uint32_t hash
, uint32_t h
)
874 struct cmap_bucket
*b
= &impl
->buckets
[h
& impl
->mask
];
877 slot
= cmap_find_slot_protected(b
, hash
);
882 /* The pointer to 'node' is changed to point to 'replacement',
883 * which is the next node if no replacement node is given. */
885 replacement
= cmap_node_next_protected(node
);
887 /* 'replacement' takes the position of 'node' in the list. */
888 ovsrcu_set_hidden(&replacement
->next
, cmap_node_next_protected(node
));
891 struct cmap_node
*iter
= &b
->nodes
[slot
];
893 struct cmap_node
*next
= cmap_node_next_protected(iter
);
896 ovsrcu_set(&iter
->next
, replacement
);
903 /* Replaces 'old_node' in 'cmap' with 'new_node'. The caller must
904 * ensure that 'cmap' cannot change concurrently (from another thread).
906 * 'old_node' must not be destroyed or modified or inserted back into 'cmap' or
907 * into any other concurrent hash map while any other thread might be accessing
908 * it. One correct way to do this is to free it from an RCU callback with
911 * Returns the current number of nodes in the cmap after the replacement. The
912 * number of nodes decreases by one if 'new_node' is NULL. */
914 cmap_replace(struct cmap
*cmap
, struct cmap_node
*old_node
,
915 struct cmap_node
*new_node
, uint32_t hash
)
917 struct cmap_impl
*impl
= cmap_get_impl(cmap
);
918 uint32_t h1
= rehash(impl
, hash
);
919 uint32_t h2
= other_hash(h1
);
921 ovs_assert(cmap_replace__(impl
, old_node
, new_node
, hash
, h1
) ||
922 cmap_replace__(impl
, old_node
, new_node
, hash
, h2
));
926 if (OVS_UNLIKELY(impl
->n
< impl
->min_n
)) {
927 COVERAGE_INC(cmap_shrink
);
928 impl
= cmap_rehash(cmap
, impl
->mask
>> 1);
935 cmap_try_rehash(const struct cmap_impl
*old
, struct cmap_impl
*new)
937 const struct cmap_bucket
*b
;
939 for (b
= old
->buckets
; b
<= &old
->buckets
[old
->mask
]; b
++) {
942 for (i
= 0; i
< CMAP_K
; i
++) {
943 /* possible optimization here because we know the hashes are
945 struct cmap_node
*node
= cmap_node_next_protected(&b
->nodes
[i
]);
947 if (node
&& !cmap_try_insert(new, node
, b
->hashes
[i
])) {
955 static struct cmap_impl
*
956 cmap_rehash(struct cmap
*cmap
, uint32_t mask
)
958 struct cmap_impl
*old
= cmap_get_impl(cmap
);
959 struct cmap_impl
*new;
961 new = cmap_impl_create(mask
);
962 ovs_assert(old
->n
< new->max_n
);
964 while (!cmap_try_rehash(old
, new)) {
965 memset(new->buckets
, 0, (mask
+ 1) * sizeof *new->buckets
);
966 new->basis
= random_uint32();
970 ovsrcu_set(&cmap
->impl
, new);
971 if (old
!= &empty_cmap
) {
972 ovsrcu_postpone(free_cacheline
, old
);
979 cmap_cursor_start(const struct cmap
*cmap
)
981 struct cmap_cursor cursor
;
983 cursor
.impl
= cmap_get_impl(cmap
);
984 cursor
.bucket_idx
= 0;
985 cursor
.entry_idx
= 0;
987 cmap_cursor_advance(&cursor
);
993 cmap_cursor_advance(struct cmap_cursor
*cursor
)
995 const struct cmap_impl
*impl
= cursor
->impl
;
998 cursor
->node
= cmap_node_next(cursor
->node
);
1004 while (cursor
->bucket_idx
<= impl
->mask
) {
1005 const struct cmap_bucket
*b
= &impl
->buckets
[cursor
->bucket_idx
];
1007 while (cursor
->entry_idx
< CMAP_K
) {
1008 cursor
->node
= cmap_node_next(&b
->nodes
[cursor
->entry_idx
++]);
1014 cursor
->bucket_idx
++;
1015 cursor
->entry_idx
= 0;
1019 /* Returns the next node in 'cmap' in hash order, or NULL if no nodes remain in
1020 * 'cmap'. Uses '*pos' to determine where to begin iteration, and updates
1021 * '*pos' to pass on the next iteration into them before returning.
1023 * It's better to use plain CMAP_FOR_EACH and related functions, since they are
1024 * faster and better at dealing with cmaps that change during iteration.
1026 * Before beginning iteration, set '*pos' to all zeros. */
1028 cmap_next_position(const struct cmap
*cmap
,
1029 struct cmap_position
*pos
)
1031 struct cmap_impl
*impl
= cmap_get_impl(cmap
);
1032 unsigned int bucket
= pos
->bucket
;
1033 unsigned int entry
= pos
->entry
;
1034 unsigned int offset
= pos
->offset
;
1036 while (bucket
<= impl
->mask
) {
1037 const struct cmap_bucket
*b
= &impl
->buckets
[bucket
];
1039 while (entry
< CMAP_K
) {
1040 const struct cmap_node
*node
= cmap_node_next(&b
->nodes
[entry
]);
1043 for (i
= 0; node
; i
++, node
= cmap_node_next(node
)) {
1045 if (cmap_node_next(node
)) {
1051 pos
->bucket
= bucket
;
1053 pos
->offset
= offset
;
1054 return CONST_CAST(struct cmap_node
*, node
);
1066 pos
->bucket
= pos
->entry
= pos
->offset
= 0;