/*
 * Resizable, Scalable, Concurrent Hash Table
 *
 * Copyright (c) 2014 Thomas Graf <tgraf@suug.ch>
 * Copyright (c) 2008-2014 Patrick McHardy <kaber@trash.net>
 *
 * Based on the following paper:
 * https://www.usenix.org/legacy/event/atc11/tech/final_files/Triplett.pdf
 *
 * Code partially derived from nft_hash
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */

#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/log2.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/mm.h>
#include <linux/jhash.h>
#include <linux/random.h>
#include <linux/rhashtable.h>

#define HASH_DEFAULT_SIZE	64UL
#define HASH_MIN_SIZE		4UL
#define BUCKET_LOCKS_PER_CPU	128UL

enum {
	RHT_LOCK_NORMAL,
	RHT_LOCK_NESTED,
	RHT_LOCK_NESTED2,
};

/* The bucket lock is selected based on the hash and protects mutations
 * on a group of hash buckets.
 *
 * IMPORTANT: When holding the bucket lock of both the old and new table
 * during expansions and shrinking, the old bucket lock must always be
 * acquired first.
 */
static spinlock_t *bucket_lock(const struct bucket_table *tbl, u32 hash)
{
	return &tbl->locks[hash & tbl->locks_mask];
}
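
/* A worked example, not from the original source: with 128 bucket locks,
 * locks_mask is 127, so any two hashes that agree in their low seven bits
 * contend on the same spinlock:
 *
 *	bucket_lock(tbl, 0x005);	returns &tbl->locks[5]
 *	bucket_lock(tbl, 0x105);	returns &tbl->locks[5], same lock
 *	bucket_lock(tbl, 0x006);	returns &tbl->locks[6], independent
 */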

#define ASSERT_RHT_MUTEX(HT) BUG_ON(!lockdep_rht_mutex_is_held(HT))
#define ASSERT_BUCKET_LOCK(TBL, HASH) \
	BUG_ON(!lockdep_rht_bucket_is_held(TBL, HASH))

#ifdef CONFIG_PROVE_LOCKING
int lockdep_rht_mutex_is_held(struct rhashtable *ht)
{
	return (debug_locks) ? lockdep_is_held(&ht->mutex) : 1;
}
EXPORT_SYMBOL_GPL(lockdep_rht_mutex_is_held);

int lockdep_rht_bucket_is_held(const struct bucket_table *tbl, u32 hash)
{
	spinlock_t *lock = bucket_lock(tbl, hash);

	return (debug_locks) ? lockdep_is_held(lock) : 1;
}
EXPORT_SYMBOL_GPL(lockdep_rht_bucket_is_held);
#endif

static void *rht_obj(const struct rhashtable *ht, const struct rhash_head *he)
{
	return (void *) he - ht->p.head_offset;
}

static u32 rht_bucket_index(const struct bucket_table *tbl, u32 hash)
{
	return hash & (tbl->size - 1);
}

static u32 obj_raw_hashfn(const struct rhashtable *ht, const void *ptr)
{
	u32 hash;

	if (unlikely(!ht->p.key_len))
		hash = ht->p.obj_hashfn(ptr, ht->p.hash_rnd);
	else
		hash = ht->p.hashfn(ptr + ht->p.key_offset, ht->p.key_len,
				    ht->p.hash_rnd);

	return hash;
}

static u32 key_hashfn(struct rhashtable *ht, const void *key, u32 len)
{
	struct bucket_table *tbl = rht_dereference_rcu(ht->tbl, ht);
	u32 hash;

	hash = ht->p.hashfn(key, len, ht->p.hash_rnd);

	return rht_bucket_index(tbl, hash);
}

static u32 head_hashfn(const struct rhashtable *ht,
		       const struct bucket_table *tbl,
		       const struct rhash_head *he)
{
	return rht_bucket_index(tbl, obj_raw_hashfn(ht, rht_obj(ht, he)));
}

static struct rhash_head __rcu **bucket_tail(struct bucket_table *tbl, u32 n)
{
	struct rhash_head __rcu **pprev;

	for (pprev = &tbl->buckets[n];
	     rht_dereference_bucket(*pprev, tbl, n);
	     pprev = &rht_dereference_bucket(*pprev, tbl, n)->next)
		;

	return pprev;
}

static int alloc_bucket_locks(struct rhashtable *ht, struct bucket_table *tbl)
{
	unsigned int i, size;
#if defined(CONFIG_PROVE_LOCKING)
	unsigned int nr_pcpus = 2;
#else
	unsigned int nr_pcpus = num_possible_cpus();
#endif

	nr_pcpus = min_t(unsigned int, nr_pcpus, 32UL);
	size = roundup_pow_of_two(nr_pcpus * ht->p.locks_mul);

	/* Never allocate more than one lock per bucket */
	size = min_t(unsigned int, size, tbl->size);

	if (sizeof(spinlock_t) != 0) {
#ifdef CONFIG_NUMA
		if (size * sizeof(spinlock_t) > PAGE_SIZE)
			tbl->locks = vmalloc(size * sizeof(spinlock_t));
		else
#endif
			tbl->locks = kmalloc_array(size, sizeof(spinlock_t),
						   GFP_KERNEL);
		if (!tbl->locks)
			return -ENOMEM;
		for (i = 0; i < size; i++)
			spin_lock_init(&tbl->locks[i]);
	}
	tbl->locks_mask = size - 1;

	return 0;
}
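
/* Sizing sketch with invented numbers (not from the original source): on a
 * 4-CPU system with the default locks_mul of 128, size becomes
 * roundup_pow_of_two(4 * 128) = 512, then is capped at tbl->size, so the
 * initial 64-bucket table gets exactly one lock per bucket. The
 * sizeof(spinlock_t) check covers uniprocessor builds without lock
 * debugging, where spinlock_t can be an empty struct and no lock array is
 * needed.
 */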

static void bucket_table_free(const struct bucket_table *tbl)
{
	if (tbl)
		kvfree(tbl->locks);

	kvfree(tbl);
}

static struct bucket_table *bucket_table_alloc(struct rhashtable *ht,
					       size_t nbuckets)
{
	struct bucket_table *tbl;
	size_t size;

	size = sizeof(*tbl) + nbuckets * sizeof(tbl->buckets[0]);
	tbl = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
	if (tbl == NULL)
		tbl = vzalloc(size);

	if (tbl == NULL)
		return NULL;

	tbl->size = nbuckets;

	if (alloc_bucket_locks(ht, tbl) < 0) {
		bucket_table_free(tbl);
		return NULL;
	}

	return tbl;
}

/**
 * rht_grow_above_75 - returns true if nelems > 0.75 * table-size
 * @ht: hash table
 * @new_size: new table size
 */
bool rht_grow_above_75(const struct rhashtable *ht, size_t new_size)
{
	/* Expand table when exceeding 75% load */
	return atomic_read(&ht->nelems) > (new_size / 4 * 3);
}
EXPORT_SYMBOL_GPL(rht_grow_above_75);

/**
 * rht_shrink_below_30 - returns true if nelems < 0.3 * table-size
 * @ht: hash table
 * @new_size: new table size
 */
bool rht_shrink_below_30(const struct rhashtable *ht, size_t new_size)
{
	/* Shrink table beneath 30% load */
	return atomic_read(&ht->nelems) < (new_size * 3 / 10);
}
EXPORT_SYMBOL_GPL(rht_shrink_below_30);
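
/* Worked example (illustrative): for new_size = 64, rht_grow_above_75()
 * fires once nelems > 64 / 4 * 3 = 48, and rht_shrink_below_30() fires once
 * nelems < 64 * 3 / 10 = 19, i.e. at 18 elements or fewer. Integer division
 * keeps both checks cheap at the cost of slightly conservative thresholds.
 */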

static void hashtable_chain_unzip(const struct rhashtable *ht,
				  const struct bucket_table *new_tbl,
				  struct bucket_table *old_tbl,
				  size_t old_hash)
{
	struct rhash_head *he, *p, *next;
	spinlock_t *new_bucket_lock, *new_bucket_lock2 = NULL;
	unsigned int new_hash, new_hash2;

	ASSERT_BUCKET_LOCK(old_tbl, old_hash);

	/* Old bucket empty, no work needed. */
	p = rht_dereference_bucket(old_tbl->buckets[old_hash], old_tbl,
				   old_hash);
	if (!p)
		return;

	new_hash = new_hash2 = head_hashfn(ht, new_tbl, p);
	new_bucket_lock = bucket_lock(new_tbl, new_hash);

	/* Advance the old bucket pointer one or more times until it
	 * reaches a node that doesn't hash to the same bucket as the
	 * previous node p. Call that previous node p.
	 */
	rht_for_each_continue(he, p->next, old_tbl, old_hash) {
		new_hash2 = head_hashfn(ht, new_tbl, he);
		if (new_hash != new_hash2)
			break;
		p = he;
	}
	rcu_assign_pointer(old_tbl->buckets[old_hash], p->next);

	spin_lock_bh_nested(new_bucket_lock, RHT_LOCK_NESTED);

	/* If we have encountered an entry that maps to a different bucket in
	 * the new table, lock down that bucket as well as we might cut off
	 * the end of the chain.
	 */
	new_bucket_lock2 = bucket_lock(new_tbl, new_hash2);
	if (new_bucket_lock != new_bucket_lock2)
		spin_lock_bh_nested(new_bucket_lock2, RHT_LOCK_NESTED2);

	/* Find the subsequent node which does hash to the same
	 * bucket as node P, or NULL if no such node exists.
	 */
	next = NULL;
	if (he) {
		rht_for_each_continue(he, he->next, old_tbl, old_hash) {
			if (head_hashfn(ht, new_tbl, he) == new_hash) {
				next = he;
				break;
			}
		}
	}

	/* Set p's next pointer to that subsequent node pointer,
	 * bypassing the nodes which do not hash to p's bucket
	 */
	rcu_assign_pointer(p->next, next);

	if (new_bucket_lock != new_bucket_lock2)
		spin_unlock_bh(new_bucket_lock2);
	spin_unlock_bh(new_bucket_lock);
}
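
/* Unzip illustration (invented bucket numbers, added for exposition): after
 * growing from 2 to 4 buckets, old bucket 1 may still head a "zipped" chain
 * of entries destined for new buckets 1 and 3:
 *
 *	A(->1) -> B(->1) -> C(->3) -> D(->1)
 *
 * One pass advances p to B, repoints old bucket 1 at C, and sets B->next to
 * D, so the chain seen from new bucket 1 becomes A -> B -> D while C stays
 * reachable from new bucket 3. Passes repeat, separated by RCU grace
 * periods, until every old bucket is empty.
 */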

static void link_old_to_new(struct bucket_table *new_tbl,
			    unsigned int new_hash, struct rhash_head *entry)
{
	spinlock_t *new_bucket_lock;

	new_bucket_lock = bucket_lock(new_tbl, new_hash);

	spin_lock_bh_nested(new_bucket_lock, RHT_LOCK_NESTED);
	rcu_assign_pointer(*bucket_tail(new_tbl, new_hash), entry);
	spin_unlock_bh(new_bucket_lock);
}

/**
 * rhashtable_expand - Expand hash table while allowing concurrent lookups
 * @ht: the hash table to expand
 *
 * A secondary bucket array is allocated and the hash entries are migrated
 * while keeping them on both lists until the end of the RCU grace period.
 *
 * This function may only be called in a context where it is safe to call
 * synchronize_rcu(), e.g. not within a rcu_read_lock() section.
 *
 * The caller must ensure that no concurrent resizing occurs by holding
 * ht->mutex.
 *
 * It is valid to have concurrent insertions and deletions protected by per
 * bucket locks or concurrent RCU protected lookups and traversals.
 */
int rhashtable_expand(struct rhashtable *ht)
{
	struct bucket_table *new_tbl, *old_tbl = rht_dereference(ht->tbl, ht);
	struct rhash_head *he;
	spinlock_t *old_bucket_lock;
	unsigned int new_hash, old_hash;
	bool complete = false;

	ASSERT_RHT_MUTEX(ht);

	if (ht->p.max_shift && ht->shift >= ht->p.max_shift)
		return 0;

	new_tbl = bucket_table_alloc(ht, old_tbl->size * 2);
	if (new_tbl == NULL)
		return -ENOMEM;

	ht->shift++;

	/* Make insertions go into the new, empty table right away. Deletions
	 * and lookups will be attempted in both tables until we synchronize.
	 * The synchronize_rcu() guarantees that the new table has been picked
	 * up by all readers, so no new additions go into the old table while
	 * we relink.
	 */
	rcu_assign_pointer(ht->future_tbl, new_tbl);
	synchronize_rcu();

	/* For each new bucket, search the corresponding old bucket for the
	 * first entry that hashes to the new bucket, and link the end of
	 * the newly formed bucket chain (containing entries added to the
	 * future table) to that entry. Since all the entries which will end
	 * up in the new bucket appear in the same old bucket, this constructs
	 * an entirely valid new hash table, but with multiple buckets
	 * "zipped" together into a single imprecise chain.
	 */
	for (new_hash = 0; new_hash < new_tbl->size; new_hash++) {
		old_hash = rht_bucket_index(old_tbl, new_hash);
		old_bucket_lock = bucket_lock(old_tbl, old_hash);

		spin_lock_bh(old_bucket_lock);
		rht_for_each(he, old_tbl, old_hash) {
			if (head_hashfn(ht, new_tbl, he) == new_hash) {
				link_old_to_new(new_tbl, new_hash, he);
				break;
			}
		}
		spin_unlock_bh(old_bucket_lock);
	}

	/* Publish the new table pointer. Lookups may now traverse
	 * the new table, but they will not benefit from any
	 * additional efficiency until later steps unzip the buckets.
	 */
	rcu_assign_pointer(ht->tbl, new_tbl);

	/* Unzip interleaved hash chains */
	while (!complete && !ht->being_destroyed) {
		/* Wait for readers. All new readers will see the new
		 * table, and thus no references to the old table will
		 * remain.
		 */
		synchronize_rcu();

		/* For each bucket in the old table (each of which
		 * contains items from multiple buckets of the new
		 * table): ...
		 */
		complete = true;
		for (old_hash = 0; old_hash < old_tbl->size; old_hash++) {
			old_bucket_lock = bucket_lock(old_tbl, old_hash);
			spin_lock_bh(old_bucket_lock);

			hashtable_chain_unzip(ht, new_tbl, old_tbl, old_hash);
			if (old_tbl->buckets[old_hash] != NULL)
				complete = false;

			spin_unlock_bh(old_bucket_lock);
		}
	}

	bucket_table_free(old_tbl);
	return 0;
}
EXPORT_SYMBOL_GPL(rhashtable_expand);
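
/* Mapping example (invented sizes, for exposition): growing from 4 to 8
 * buckets, old_hash = new_hash & 3, so new buckets 0 and 4 both draw from
 * old bucket 0, new buckets 1 and 5 from old bucket 1, and so on. Every old
 * chain is thus unzipped into exactly two new chains.
 */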

/**
 * rhashtable_shrink - Shrink hash table while allowing concurrent lookups
 * @ht: the hash table to shrink
 *
 * This function may only be called in a context where it is safe to call
 * synchronize_rcu(), e.g. not within a rcu_read_lock() section.
 *
 * The caller must ensure that no concurrent resizing occurs by holding
 * ht->mutex.
 *
 * It is valid to have concurrent insertions and deletions protected by per
 * bucket locks or concurrent RCU protected lookups and traversals.
 */
int rhashtable_shrink(struct rhashtable *ht)
{
	struct bucket_table *new_tbl, *tbl = rht_dereference(ht->tbl, ht);
	spinlock_t *new_bucket_lock, *old_bucket_lock1, *old_bucket_lock2;
	unsigned int new_hash;

	ASSERT_RHT_MUTEX(ht);

	if (ht->shift <= ht->p.min_shift)
		return 0;

	new_tbl = bucket_table_alloc(ht, tbl->size / 2);
	if (new_tbl == NULL)
		return -ENOMEM;

	rcu_assign_pointer(ht->future_tbl, new_tbl);
	synchronize_rcu();

	/* Link the first entry in the old bucket to the end of the
	 * bucket in the new table. As entries are concurrently being
	 * added to the new table, lock down the new bucket. As we
	 * always divide the size in half when shrinking, each bucket
	 * in the new table maps to exactly two buckets in the old
	 * table.
	 *
	 * As removals can occur concurrently on the old table, we need
	 * to lock down both matching buckets in the old table.
	 */
	for (new_hash = 0; new_hash < new_tbl->size; new_hash++) {
		old_bucket_lock1 = bucket_lock(tbl, new_hash);
		old_bucket_lock2 = bucket_lock(tbl, new_hash + new_tbl->size);
		new_bucket_lock = bucket_lock(new_tbl, new_hash);

		spin_lock_bh(old_bucket_lock1);
		spin_lock_bh_nested(old_bucket_lock2, RHT_LOCK_NESTED);
		spin_lock_bh_nested(new_bucket_lock, RHT_LOCK_NESTED2);

		rcu_assign_pointer(*bucket_tail(new_tbl, new_hash),
				   tbl->buckets[new_hash]);
		rcu_assign_pointer(*bucket_tail(new_tbl, new_hash),
				   tbl->buckets[new_hash + new_tbl->size]);

		spin_unlock_bh(new_bucket_lock);
		spin_unlock_bh(old_bucket_lock2);
		spin_unlock_bh(old_bucket_lock1);
	}

	/* Publish the new, valid hash table */
	rcu_assign_pointer(ht->tbl, new_tbl);
	ht->shift--;

	/* Wait for readers. No new readers will have references to the
	 * old hash table.
	 */
	synchronize_rcu();

	bucket_table_free(tbl);

	return 0;
}
EXPORT_SYMBOL_GPL(rhashtable_shrink);
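
/* Mapping example (invented sizes, for exposition): halving from 8 to 4
 * buckets, new bucket 1 absorbs old buckets 1 and 1 + 4 = 5. Note that
 * bucket_tail() is re-evaluated between the two rcu_assign_pointer() calls
 * above, so the second old chain is appended after the first instead of
 * overwriting it.
 */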

static void rht_deferred_worker(struct work_struct *work)
{
	struct rhashtable *ht;
	struct bucket_table *tbl;

	ht = container_of(work, struct rhashtable, run_work.work);
	mutex_lock(&ht->mutex);
	tbl = rht_dereference(ht->tbl, ht);

	if (ht->p.grow_decision && ht->p.grow_decision(ht, tbl->size))
		rhashtable_expand(ht);
	else if (ht->p.shrink_decision && ht->p.shrink_decision(ht, tbl->size))
		rhashtable_shrink(ht);

	mutex_unlock(&ht->mutex);
}

/**
 * rhashtable_insert - insert object into hash table
 * @ht: hash table
 * @obj: pointer to hash head inside object
 *
 * Will take a per bucket spinlock to protect against mutual mutations
 * on the same bucket. Multiple insertions may occur in parallel unless
 * they map to the same bucket lock.
 *
 * It is safe to call this function from atomic context.
 *
 * Will trigger an automatic deferred table resizing if the size grows
 * beyond the watermark indicated by grow_decision() which can be passed
 * to rhashtable_init().
 */
void rhashtable_insert(struct rhashtable *ht, struct rhash_head *obj)
{
	struct bucket_table *tbl;
	spinlock_t *lock;
	unsigned hash;

	rcu_read_lock();

	tbl = rht_dereference_rcu(ht->future_tbl, ht);
	hash = head_hashfn(ht, tbl, obj);
	lock = bucket_lock(tbl, hash);

	spin_lock_bh(lock);
	RCU_INIT_POINTER(obj->next, tbl->buckets[hash]);
	rcu_assign_pointer(tbl->buckets[hash], obj);
	spin_unlock_bh(lock);

	atomic_inc(&ht->nelems);

	/* Only grow the table if no resizing is currently in progress. */
	if (ht->tbl == ht->future_tbl &&
	    ht->p.grow_decision && ht->p.grow_decision(ht, tbl->size))
		schedule_delayed_work(&ht->run_work, 0);

	rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(rhashtable_insert);
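
/* Usage sketch (hypothetical caller, borrowing the test_obj layout from the
 * self test at the bottom of this file):
 *
 *	struct test_obj *obj = kzalloc(sizeof(*obj), GFP_KERNEL);
 *
 *	if (!obj)
 *		return -ENOMEM;
 *	obj->value = 42;
 *	rhashtable_insert(ht, &obj->node);
 *
 * Only the embedded rhash_head is passed in; the key is read out of the
 * object itself via the configured key_offset/key_len.
 */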

/**
 * rhashtable_remove - remove object from hash table
 * @ht: hash table
 * @obj: pointer to hash head inside object
 *
 * Since the hash chain is single linked, the removal operation needs to
 * walk the bucket chain upon removal. The removal operation is thus
 * considerably slow if the hash table is not correctly sized.
 *
 * Will automatically shrink the table via rhashtable_shrink() if the
 * shrink_decision function specified at rhashtable_init() returns true.
 *
 * It is valid to have concurrent insertions and deletions protected by per
 * bucket locks or concurrent RCU protected lookups and traversals.
 */
bool rhashtable_remove(struct rhashtable *ht, struct rhash_head *obj)
{
	struct bucket_table *tbl;
	struct rhash_head __rcu **pprev;
	struct rhash_head *he;
	spinlock_t *lock;
	unsigned int hash;

	rcu_read_lock();
	tbl = rht_dereference_rcu(ht->tbl, ht);
	hash = head_hashfn(ht, tbl, obj);

	lock = bucket_lock(tbl, hash);
	spin_lock_bh(lock);

restart:
	pprev = &tbl->buckets[hash];
	rht_for_each(he, tbl, hash) {
		if (he != obj) {
			pprev = &he->next;
			continue;
		}

		rcu_assign_pointer(*pprev, obj->next);
		atomic_dec(&ht->nelems);

		spin_unlock_bh(lock);

		/* Only shrink if no resizing is currently in progress. */
		if (ht->tbl == ht->future_tbl &&
		    ht->p.shrink_decision &&
		    ht->p.shrink_decision(ht, tbl->size))
			schedule_delayed_work(&ht->run_work, 0);

		rcu_read_unlock();

		return true;
	}

	if (tbl != rht_dereference_rcu(ht->tbl, ht)) {
		spin_unlock_bh(lock);

		tbl = rht_dereference_rcu(ht->tbl, ht);
		hash = head_hashfn(ht, tbl, obj);

		lock = bucket_lock(tbl, hash);
		spin_lock_bh(lock);
		goto restart;
	}

	spin_unlock_bh(lock);
	rcu_read_unlock();

	return false;
}
EXPORT_SYMBOL_GPL(rhashtable_remove);

/**
 * rhashtable_lookup - lookup key in hash table
 * @ht: hash table
 * @key: pointer to key
 *
 * Computes the hash value for the key and traverses the bucket chain looking
 * for an entry with an identical key. The first matching entry is returned.
 *
 * This lookup function may only be used for fixed key hash tables (key_len
 * parameter set). It will BUG() if used inappropriately.
 *
 * Lookups may occur in parallel with hashtable mutations and resizing.
 */
void *rhashtable_lookup(struct rhashtable *ht, const void *key)
{
	const struct bucket_table *tbl, *old_tbl;
	struct rhash_head *he;
	u32 hash;

	BUG_ON(!ht->p.key_len);

	rcu_read_lock();
	old_tbl = rht_dereference_rcu(ht->tbl, ht);
	tbl = rht_dereference_rcu(ht->future_tbl, ht);
	hash = key_hashfn(ht, key, ht->p.key_len);
restart:
	rht_for_each_rcu(he, tbl, rht_bucket_index(tbl, hash)) {
		if (memcmp(rht_obj(ht, he) + ht->p.key_offset, key,
			   ht->p.key_len))
			continue;
		rcu_read_unlock();
		return rht_obj(ht, he);
	}

	if (unlikely(tbl != old_tbl)) {
		tbl = old_tbl;
		goto restart;
	}

	rcu_read_unlock();
	return NULL;
}
EXPORT_SYMBOL_GPL(rhashtable_lookup);
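
/* Usage sketch (hypothetical caller, not part of the original file):
 *
 *	u32 key = 42;
 *	struct test_obj *obj;
 *
 *	rcu_read_lock();
 *	obj = rhashtable_lookup(ht, &key);
 *	if (obj)
 *		pr_info("found value %d\n", obj->value);
 *	rcu_read_unlock();
 *
 * The caller holds its own rcu_read_lock() because the returned object may
 * be freed by a concurrent writer once the read-side critical section ends.
 */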

/**
 * rhashtable_lookup_compare - search hash table with compare function
 * @ht: hash table
 * @key: the pointer to the key
 * @compare: compare function, must return true on match
 * @arg: argument passed on to compare function
 *
 * Traverses the bucket chain behind the provided hash value and calls the
 * specified compare function for each entry.
 *
 * Lookups may occur in parallel with hashtable mutations and resizing.
 *
 * Returns the first entry on which the compare function returned true.
 */
void *rhashtable_lookup_compare(struct rhashtable *ht, const void *key,
				bool (*compare)(void *, void *), void *arg)
{
	const struct bucket_table *tbl, *old_tbl;
	struct rhash_head *he;
	u32 hash;

	rcu_read_lock();

	old_tbl = rht_dereference_rcu(ht->tbl, ht);
	tbl = rht_dereference_rcu(ht->future_tbl, ht);
	hash = key_hashfn(ht, key, ht->p.key_len);
restart:
	rht_for_each_rcu(he, tbl, rht_bucket_index(tbl, hash)) {
		if (!compare(rht_obj(ht, he), arg))
			continue;
		rcu_read_unlock();
		return rht_obj(ht, he);
	}

	if (unlikely(tbl != old_tbl)) {
		tbl = old_tbl;
		goto restart;
	}
	rcu_read_unlock();

	return NULL;
}
EXPORT_SYMBOL_GPL(rhashtable_lookup_compare);
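
/* Example compare function (hypothetical, for illustration): key equality
 * on an int member, using the key both for hashing and as the compare
 * argument:
 *
 *	static bool test_obj_cmpfn(void *ptr, void *arg)
 *	{
 *		const struct test_obj *obj = ptr;
 *
 *		return obj->value == *(const int *)arg;
 *	}
 *
 *	obj = rhashtable_lookup_compare(ht, &key, test_obj_cmpfn, &key);
 */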

static size_t rounded_hashtable_size(struct rhashtable_params *params)
{
	return max(roundup_pow_of_two(params->nelem_hint * 4 / 3),
		   1UL << params->min_shift);
}
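
/* Worked example (illustrative): nelem_hint = 100 gives 100 * 4 / 3 = 133,
 * rounded up to the next power of two, 256, so a table sized from the hint
 * starts well below the 75% growth watermark.
 */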

/**
 * rhashtable_init - initialize a new hash table
 * @ht: hash table to be initialized
 * @params: configuration parameters
 *
 * Initializes a new hash table based on the provided configuration
 * parameters. A table can be configured either with a variable or
 * fixed length key:
 *
 * Configuration Example 1: Fixed length keys
 * struct test_obj {
 *	int			key;
 *	void			*my_member;
 *	struct rhash_head	node;
 * };
 *
 * struct rhashtable_params params = {
 *	.head_offset = offsetof(struct test_obj, node),
 *	.key_offset = offsetof(struct test_obj, key),
 *	.key_len = sizeof(int),
 *	.hashfn = jhash,
 * };
 *
 * Configuration Example 2: Variable length keys
 * struct test_obj {
 *	[...]
 *	struct rhash_head	node;
 * };
 *
 * u32 my_hash_fn(const void *data, u32 seed)
 * {
 *	struct test_obj *obj = data;
 *
 *	return [... hash ...];
 * }
 *
 * struct rhashtable_params params = {
 *	.head_offset = offsetof(struct test_obj, node),
 *	.hashfn = jhash,
 *	.obj_hashfn = my_hash_fn,
 * };
 */
int rhashtable_init(struct rhashtable *ht, struct rhashtable_params *params)
{
	struct bucket_table *tbl;
	size_t size;

	size = HASH_DEFAULT_SIZE;

	if ((params->key_len && !params->hashfn) ||
	    (!params->key_len && !params->obj_hashfn))
		return -EINVAL;

	params->min_shift = max_t(size_t, params->min_shift,
				  ilog2(HASH_MIN_SIZE));

	if (params->nelem_hint)
		size = rounded_hashtable_size(params);

	memset(ht, 0, sizeof(*ht));
	mutex_init(&ht->mutex);
	memcpy(&ht->p, params, sizeof(*params));

	if (params->locks_mul)
		ht->p.locks_mul = roundup_pow_of_two(params->locks_mul);
	else
		ht->p.locks_mul = BUCKET_LOCKS_PER_CPU;

	tbl = bucket_table_alloc(ht, size);
	if (tbl == NULL)
		return -ENOMEM;

	ht->shift = ilog2(tbl->size);
	RCU_INIT_POINTER(ht->tbl, tbl);
	RCU_INIT_POINTER(ht->future_tbl, tbl);

	if (!ht->p.hash_rnd)
		get_random_bytes(&ht->p.hash_rnd, sizeof(ht->p.hash_rnd));

	if (ht->p.grow_decision || ht->p.shrink_decision)
		INIT_DEFERRABLE_WORK(&ht->run_work, rht_deferred_worker);

	return 0;
}
EXPORT_SYMBOL_GPL(rhashtable_init);

/**
 * rhashtable_destroy - destroy hash table
 * @ht: the hash table to destroy
 *
 * Frees the bucket array. This function is not rcu safe, therefore the caller
 * has to make sure that no resizing may happen by unpublishing the hashtable
 * and waiting for an RCU grace period before releasing the bucket array.
 */
void rhashtable_destroy(struct rhashtable *ht)
{
	ht->being_destroyed = true;

	mutex_lock(&ht->mutex);

	cancel_delayed_work(&ht->run_work);
	bucket_table_free(rht_dereference(ht->tbl, ht));

	mutex_unlock(&ht->mutex);
}
EXPORT_SYMBOL_GPL(rhashtable_destroy);

/**************************************************************************
 * Self Test
 **************************************************************************/

#ifdef CONFIG_TEST_RHASHTABLE

#define TEST_HT_SIZE	8
#define TEST_ENTRIES	2048
#define TEST_PTR	((void *) 0xdeadbeef)
#define TEST_NEXPANDS	4

struct test_obj {
	void			*ptr;
	int			value;
	struct rhash_head	node;
};

static int __init test_rht_lookup(struct rhashtable *ht)
{
	unsigned int i;

	for (i = 0; i < TEST_ENTRIES * 2; i++) {
		struct test_obj *obj;
		bool expected = !(i % 2);
		u32 key = i;

		obj = rhashtable_lookup(ht, &key);

		if (expected && !obj) {
			pr_warn("Test failed: Could not find key %u\n", key);
			return -ENOENT;
		} else if (!expected && obj) {
			pr_warn("Test failed: Unexpected entry found for key %u\n",
				key);
			return -EEXIST;
		} else if (expected && obj) {
			if (obj->ptr != TEST_PTR || obj->value != i) {
				pr_warn("Test failed: Lookup value mismatch %p!=%p, %u!=%u\n",
					obj->ptr, TEST_PTR, obj->value, i);
				return -EINVAL;
			}
		}
	}

	return 0;
}

static void test_bucket_stats(struct rhashtable *ht, bool quiet)
{
	unsigned int cnt, rcu_cnt, i, total = 0;
	struct rhash_head *pos;
	struct test_obj *obj;
	struct bucket_table *tbl;

	tbl = rht_dereference_rcu(ht->tbl, ht);
	for (i = 0; i < tbl->size; i++) {
		rcu_cnt = cnt = 0;

		if (!quiet)
			pr_info(" [%#4x/%zu]", i, tbl->size);

		rht_for_each_entry_rcu(obj, pos, tbl, i, node) {
			cnt++;
			total++;
			if (!quiet)
				pr_cont(" [%p],", obj);
		}

		rht_for_each_entry_rcu(obj, pos, tbl, i, node)
			rcu_cnt++;

		if (rcu_cnt != cnt)
			pr_warn("Test failed: Chain count mismatch %d != %d",
				cnt, rcu_cnt);

		if (!quiet)
			pr_cont("\n [%#x] first element: %p, chain length: %u\n",
				i, tbl->buckets[i], cnt);
	}

	pr_info(" Traversal complete: counted=%u, nelems=%u, entries=%d\n",
		total, atomic_read(&ht->nelems), TEST_ENTRIES);

	if (total != atomic_read(&ht->nelems) || total != TEST_ENTRIES)
		pr_warn("Test failed: Total count mismatch ^^^");
}

static int __init test_rhashtable(struct rhashtable *ht)
{
	struct bucket_table *tbl;
	struct test_obj *obj;
	struct rhash_head *pos, *next;
	int err;
	unsigned int i;

	/*
	 * Insertion Test:
	 * Insert TEST_ENTRIES into table with all keys even numbers
	 */
	pr_info(" Adding %d keys\n", TEST_ENTRIES);
	for (i = 0; i < TEST_ENTRIES; i++) {
		struct test_obj *obj;

		obj = kzalloc(sizeof(*obj), GFP_KERNEL);
		if (!obj) {
			err = -ENOMEM;
			goto error;
		}

		obj->ptr = TEST_PTR;
		obj->value = i * 2;

		rhashtable_insert(ht, &obj->node);
	}

	rcu_read_lock();
	test_bucket_stats(ht, true);
	test_rht_lookup(ht);
	rcu_read_unlock();

	for (i = 0; i < TEST_NEXPANDS; i++) {
		pr_info(" Table expansion iteration %u...\n", i);
		mutex_lock(&ht->mutex);
		rhashtable_expand(ht);
		mutex_unlock(&ht->mutex);

		rcu_read_lock();
		pr_info(" Verifying lookups...\n");
		test_rht_lookup(ht);
		rcu_read_unlock();
	}

	for (i = 0; i < TEST_NEXPANDS; i++) {
		pr_info(" Table shrinkage iteration %u...\n", i);
		mutex_lock(&ht->mutex);
		rhashtable_shrink(ht);
		mutex_unlock(&ht->mutex);

		rcu_read_lock();
		pr_info(" Verifying lookups...\n");
		test_rht_lookup(ht);
		rcu_read_unlock();
	}

	rcu_read_lock();
	test_bucket_stats(ht, true);
	rcu_read_unlock();

	pr_info(" Deleting %d keys\n", TEST_ENTRIES);
	for (i = 0; i < TEST_ENTRIES; i++) {
		u32 key = i * 2;

		obj = rhashtable_lookup(ht, &key);
		BUG_ON(!obj);

		rhashtable_remove(ht, &obj->node);
		kfree(obj);
	}

	return 0;

error:
	tbl = rht_dereference_rcu(ht->tbl, ht);
	for (i = 0; i < tbl->size; i++)
		rht_for_each_entry_safe(obj, pos, next, tbl, i, node)
			kfree(obj);

	return err;
}

static int __init test_rht_init(void)
{
	struct rhashtable ht;
	struct rhashtable_params params = {
		.nelem_hint = TEST_HT_SIZE,
		.head_offset = offsetof(struct test_obj, node),
		.key_offset = offsetof(struct test_obj, value),
		.key_len = sizeof(int),
		.hashfn = jhash,
		.grow_decision = rht_grow_above_75,
		.shrink_decision = rht_shrink_below_30,
	};
	int err;

	pr_info("Running resizable hashtable tests...\n");

	err = rhashtable_init(&ht, &params);
	if (err < 0) {
		pr_warn("Test failed: Unable to initialize hashtable: %d\n",
			err);
		return err;
	}

	err = test_rhashtable(&ht);

	rhashtable_destroy(&ht);

	return err;
}

subsys_initcall(test_rht_init);

#endif /* CONFIG_TEST_RHASHTABLE */