1/*
2 * Resizable, Scalable, Concurrent Hash Table
3 *
4 * Copyright (c) 2014 Thomas Graf <tgraf@suug.ch>
5 * Copyright (c) 2008-2014 Patrick McHardy <kaber@trash.net>
6 *
7 * Based on the following paper:
8 * https://www.usenix.org/legacy/event/atc11/tech/final_files/Triplett.pdf
9 *
10 * Code partially derived from nft_hash
11 *
12 * This program is free software; you can redistribute it and/or modify
13 * it under the terms of the GNU General Public License version 2 as
14 * published by the Free Software Foundation.
15 */
16
17#include <linux/kernel.h>
18#include <linux/init.h>
19#include <linux/log2.h>
20#include <linux/slab.h>
21#include <linux/vmalloc.h>
22#include <linux/mm.h>
23#include <linux/jhash.h>
24#include <linux/random.h>
25#include <linux/rhashtable.h>
26
27#define HASH_DEFAULT_SIZE 64UL
28#define HASH_MIN_SIZE 4UL
29#define BUCKET_LOCKS_PER_CPU 128UL
30
31enum {
32 RHT_LOCK_NORMAL,
33 RHT_LOCK_NESTED,
34 RHT_LOCK_NESTED2,
35};
36
37/* The bucket lock is selected based on the hash and protects mutations
38 * on a group of hash buckets.
39 *
40 * IMPORTANT: When holding the bucket lock of both the old and new table
41 * during expansions and shrinking, the old bucket lock must always be
42 * acquired first.
43 */
44static spinlock_t *bucket_lock(const struct bucket_table *tbl, u32 hash)
45{
46 return &tbl->locks[hash & tbl->locks_mask];
47}
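/* Example (illustrative numbers): with 1024 buckets and a lock array of
 * 256 entries (locks_mask = 255), hashes 0x012a and 0x042a fall into
 * different buckets but map to the same lock 0x2a, so mutations on those
 * two buckets serialize on one spinlock.
 *
 * Sketch of the ordering rule above (hypothetical helper, assuming only
 * the bucket_lock() helper and the lock classes defined in this file):
 *
 * static void lock_old_then_new(const struct bucket_table *old_tbl,
 * const struct bucket_table *new_tbl,
 * u32 old_hash, u32 new_hash)
 * {
 * spin_lock_bh(bucket_lock(old_tbl, old_hash));
 * spin_lock_bh_nested(bucket_lock(new_tbl, new_hash), RHT_LOCK_NESTED);
 * }
 */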
48
49#define ASSERT_RHT_MUTEX(HT) BUG_ON(!lockdep_rht_mutex_is_held(HT))
50#define ASSERT_BUCKET_LOCK(TBL, HASH) \
51 BUG_ON(!lockdep_rht_bucket_is_held(TBL, HASH))
52
53#ifdef CONFIG_PROVE_LOCKING
54int lockdep_rht_mutex_is_held(struct rhashtable *ht)
55{
56 return (debug_locks) ? lockdep_is_held(&ht->mutex) : 1;
57}
58EXPORT_SYMBOL_GPL(lockdep_rht_mutex_is_held);
59
60int lockdep_rht_bucket_is_held(const struct bucket_table *tbl, u32 hash)
61{
62 spinlock_t *lock = bucket_lock(tbl, hash);
63
64 return (debug_locks) ? lockdep_is_held(lock) : 1;
65}
66EXPORT_SYMBOL_GPL(lockdep_rht_bucket_is_held);
67#endif
68
69static void *rht_obj(const struct rhashtable *ht, const struct rhash_head *he)
70{
71 return (void *) he - ht->p.head_offset;
72}
73
74static u32 rht_bucket_index(const struct bucket_table *tbl, u32 hash)
75{
76 return hash & (tbl->size - 1);
77}
78
79static u32 obj_raw_hashfn(const struct rhashtable *ht, const void *ptr)
80{
81 u32 hash;
82
83 if (unlikely(!ht->p.key_len))
84 hash = ht->p.obj_hashfn(ptr, ht->p.hash_rnd);
85 else
86 hash = ht->p.hashfn(ptr + ht->p.key_offset, ht->p.key_len,
87 ht->p.hash_rnd);
88
89 return hash;
90}
91
92static u32 key_hashfn(struct rhashtable *ht, const void *key, u32 len)
93{
94 struct bucket_table *tbl = rht_dereference_rcu(ht->tbl, ht);
95 u32 hash;
96
97 hash = ht->p.hashfn(key, len, ht->p.hash_rnd);
98
99 return rht_bucket_index(tbl, hash);
100}
101
102static u32 head_hashfn(const struct rhashtable *ht,
103 const struct bucket_table *tbl,
104 const struct rhash_head *he)
105{
106 return rht_bucket_index(tbl, obj_raw_hashfn(ht, rht_obj(ht, he)));
107}
108
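/* Worked example (illustrative numbers): with tbl->size == 64 the bucket
 * index is hash & 63, so a raw hash of 0x9e3779b9 lands in bucket 0x39.
 * obj_raw_hashfn() uses obj_hashfn() when no fixed key_len is configured
 * and otherwise hashes key_len bytes at key_offset; both paths mix in
 * hash_rnd, so bucket placement differs from one table instance to the
 * next.
 */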
109static struct rhash_head __rcu **bucket_tail(struct bucket_table *tbl, u32 n)
110{
111 struct rhash_head __rcu **pprev;
112
113 for (pprev = &tbl->buckets[n];
114 rht_dereference_bucket(*pprev, tbl, n);
115 pprev = &rht_dereference_bucket(*pprev, tbl, n)->next)
116 ;
117
118 return pprev;
119}
120
121static int alloc_bucket_locks(struct rhashtable *ht, struct bucket_table *tbl)
122{
123 unsigned int i, size;
124#if defined(CONFIG_PROVE_LOCKING)
125 unsigned int nr_pcpus = 2;
126#else
127 unsigned int nr_pcpus = num_possible_cpus();
128#endif
129
130 nr_pcpus = min_t(unsigned int, nr_pcpus, 32UL);
131 size = roundup_pow_of_two(nr_pcpus * ht->p.locks_mul);
132
133 /* Never allocate more than one lock per bucket */
134 size = min_t(unsigned int, size, tbl->size);
135
136 if (sizeof(spinlock_t) != 0) {
137#ifdef CONFIG_NUMA
138 if (size * sizeof(spinlock_t) > PAGE_SIZE)
139 tbl->locks = vmalloc(size * sizeof(spinlock_t));
140 else
141#endif
142 tbl->locks = kmalloc_array(size, sizeof(spinlock_t),
143 GFP_KERNEL);
144 if (!tbl->locks)
145 return -ENOMEM;
146 for (i = 0; i < size; i++)
147 spin_lock_init(&tbl->locks[i]);
148 }
149 tbl->locks_mask = size - 1;
150
151 return 0;
152}
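/* Sizing example (illustrative numbers): with 4 possible CPUs and the
 * default locks_mul of 128, size = roundup_pow_of_two(4 * 128) = 512
 * locks, and a table of only 256 buckets caps that at 256 so there is
 * never more than one lock per bucket. Under CONFIG_PROVE_LOCKING,
 * nr_pcpus is pinned to 2, which keeps the lock array small while lockdep
 * is tracking every spinlock.
 */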
153
154static void bucket_table_free(const struct bucket_table *tbl)
155{
156 if (tbl)
157 kvfree(tbl->locks);
158
159 kvfree(tbl);
160}
161
162static struct bucket_table *bucket_table_alloc(struct rhashtable *ht,
163 size_t nbuckets)
164{
165 struct bucket_table *tbl;
166 size_t size;
167
168 size = sizeof(*tbl) + nbuckets * sizeof(tbl->buckets[0]);
169 tbl = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
170 if (tbl == NULL)
171 tbl = vzalloc(size);
172
173 if (tbl == NULL)
174 return NULL;
175
176 tbl->size = nbuckets;
177
178 if (alloc_bucket_locks(ht, tbl) < 0) {
179 bucket_table_free(tbl);
180 return NULL;
181 }
182
183 return tbl;
184}
185
186/**
187 * rht_grow_above_75 - returns true if nelems > 0.75 * table-size
188 * @ht: hash table
189 * @new_size: new table size
190 */
191bool rht_grow_above_75(const struct rhashtable *ht, size_t new_size)
192{
193 /* Expand table when exceeding 75% load */
194 return atomic_read(&ht->nelems) > (new_size / 4 * 3);
195}
196EXPORT_SYMBOL_GPL(rht_grow_above_75);
197
198/**
199 * rht_shrink_below_30 - returns true if nelems < 0.3 * table-size
200 * @ht: hash table
201 * @new_size: new table size
202 */
203bool rht_shrink_below_30(const struct rhashtable *ht, size_t new_size)
204{
205 /* Shrink table beneath 30% load */
206 return atomic_read(&ht->nelems) < (new_size * 3 / 10);
207}
208EXPORT_SYMBOL_GPL(rht_shrink_below_30);
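/* Watermark example (illustrative numbers): for a table of 64 buckets the
 * table is grown once nelems exceeds 64 / 4 * 3 = 48 entries and shrunk
 * once nelems drops below 64 * 3 / 10 = 19 entries. The gap between the
 * 30% and 75% watermarks provides hysteresis, so a load hovering around
 * one threshold does not trigger back-to-back resizes.
 */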
209
210static void hashtable_chain_unzip(const struct rhashtable *ht,
211 const struct bucket_table *new_tbl,
212 struct bucket_table *old_tbl,
213 size_t old_hash)
214{
215 struct rhash_head *he, *p, *next;
216 spinlock_t *new_bucket_lock, *new_bucket_lock2 = NULL;
217 unsigned int new_hash, new_hash2;
218
219 ASSERT_BUCKET_LOCK(old_tbl, old_hash);
220
221 /* Old bucket empty, no work needed. */
222 p = rht_dereference_bucket(old_tbl->buckets[old_hash], old_tbl,
223 old_hash);
224 if (!p)
225 return;
226
227 new_hash = new_hash2 = head_hashfn(ht, new_tbl, p);
228 new_bucket_lock = bucket_lock(new_tbl, new_hash);
229
230 /* Advance the old bucket pointer one or more times until it
231 * reaches a node that doesn't hash to the same bucket as the
232 * previous node; call that previous node p.
233 */
234 rht_for_each_continue(he, p->next, old_tbl, old_hash) {
235 new_hash2 = head_hashfn(ht, new_tbl, he);
236 if (new_hash != new_hash2)
237 break;
238 p = he;
239 }
240 rcu_assign_pointer(old_tbl->buckets[old_hash], p->next);
241
242 spin_lock_bh_nested(new_bucket_lock, RHT_LOCK_NESTED);
243
244 /* If we have encountered an entry that maps to a different bucket in
245 * the new table, lock down that bucket as well as we might cut off
246 * the end of the chain.
247 */
248 new_bucket_lock2 = bucket_lock(new_tbl, new_hash2);
249 if (new_bucket_lock != new_bucket_lock2)
250 spin_lock_bh_nested(new_bucket_lock2, RHT_LOCK_NESTED2);
251
252 /* Find the subsequent node which does hash to the same
253 * bucket as node P, or NULL if no such node exists.
254 */
255 next = NULL;
256 if (he) {
257 rht_for_each_continue(he, he->next, old_tbl, old_hash) {
258 if (head_hashfn(ht, new_tbl, he) == new_hash) {
259 next = he;
260 break;
261 }
262 }
263 }
264
265 /* Set p's next pointer to that subsequent node pointer,
266 * bypassing the nodes which do not hash to p's bucket
267 */
268 rcu_assign_pointer(p->next, next);
269
270 if (new_bucket_lock != new_bucket_lock2)
271 spin_unlock_bh(new_bucket_lock2);
272 spin_unlock_bh(new_bucket_lock);
273}
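/* Unzip example (illustrative chain): after doubling from 2 to 4 buckets,
 * old bucket 1 might hold A(1) -> B(3) -> C(1) -> D(3), the parenthesised
 * number being each entry's bucket in the new table. One pass leaves p at
 * A, the end of the leading run of new-bucket-1 entries, re-points old
 * bucket 1 at B and links A -> C, so new bucket 1's chain no longer runs
 * through B. Later passes in rhashtable_expand(), each separated by a
 * grace period, repeat this until the old bucket drains and every new
 * chain holds only its own entries.
 */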
274
275static void link_old_to_new(struct bucket_table *new_tbl,
276 unsigned int new_hash, struct rhash_head *entry)
277{
278 spinlock_t *new_bucket_lock;
279
280 new_bucket_lock = bucket_lock(new_tbl, new_hash);
281
282 spin_lock_bh_nested(new_bucket_lock, RHT_LOCK_NESTED);
283 rcu_assign_pointer(*bucket_tail(new_tbl, new_hash), entry);
284 spin_unlock_bh(new_bucket_lock);
285}
286
287/**
288 * rhashtable_expand - Expand hash table while allowing concurrent lookups
289 * @ht: the hash table to expand
290 *
291 * A secondary bucket array is allocated and the hash entries are migrated
292 * while keeping them on both lists until the end of the RCU grace period.
293 *
294 * This function may only be called in a context where it is safe to call
295 * synchronize_rcu(), e.g. not within a rcu_read_lock() section.
296 *
297 * The caller must ensure that no concurrent resizing occurs by holding
298 * ht->mutex.
299 *
300 * It is valid to have concurrent insertions and deletions protected by per
301 * bucket locks or concurrent RCU protected lookups and traversals.
7e1e7763 302 */
303int rhashtable_expand(struct rhashtable *ht)
304{
305 struct bucket_table *new_tbl, *old_tbl = rht_dereference(ht->tbl, ht);
306 struct rhash_head *he;
307 spinlock_t *old_bucket_lock;
308 unsigned int new_hash, old_hash;
309 bool complete = false;
310
311 ASSERT_RHT_MUTEX(ht);
312
313 if (ht->p.max_shift && ht->shift >= ht->p.max_shift)
314 return 0;
315
316 new_tbl = bucket_table_alloc(ht, old_tbl->size * 2);
317 if (new_tbl == NULL)
318 return -ENOMEM;
319
320 ht->shift++;
321
322 /* Make insertions go into the new, empty table right away. Deletions
323 * and lookups will be attempted in both tables until we synchronize.
324 * The synchronize_rcu() guarantees that the new table is picked up
325 * so no new additions go into the old table while we relink.
326 */
327 rcu_assign_pointer(ht->future_tbl, new_tbl);
328 synchronize_rcu();
329
330 /* For each new bucket, search the corresponding old bucket for the
331 * first entry that hashes to the new bucket, and link the end of
332 * newly formed bucket chain (containing entries added to future
333 * table) to that entry. Since all the entries which will end up in
334 * the new bucket appear in the same old bucket, this constructs an
335 * entirely valid new hash table, but with multiple buckets
336 * "zipped" together into a single imprecise chain.
337 */
338 for (new_hash = 0; new_hash < new_tbl->size; new_hash++) {
339 old_hash = rht_bucket_index(old_tbl, new_hash);
340 old_bucket_lock = bucket_lock(old_tbl, old_hash);
341
342 spin_lock_bh(old_bucket_lock);
343 rht_for_each(he, old_tbl, old_hash) {
344 if (head_hashfn(ht, new_tbl, he) == new_hash) {
345 link_old_to_new(new_tbl, new_hash, he);
346 break;
347 }
348 }
349 spin_unlock_bh(old_bucket_lock);
350 }
351
352 /* Publish the new table pointer. Lookups may now traverse
353 * the new table, but they will not benefit from any
354 * additional efficiency until later steps unzip the buckets.
355 */
356 rcu_assign_pointer(ht->tbl, new_tbl);
357
358 /* Unzip interleaved hash chains */
359 while (!complete && !ht->being_destroyed) {
360 /* Wait for readers. All new readers will see the new
361 * table, and thus no references to the old table will
362 * remain.
363 */
364 synchronize_rcu();
365
366 /* For each bucket in the old table (each of which
367 * contains items from multiple buckets of the new
368 * table): ...
369 */
370 complete = true;
371 for (old_hash = 0; old_hash < old_tbl->size; old_hash++) {
372 old_bucket_lock = bucket_lock(old_tbl, old_hash);
373 spin_lock_bh(old_bucket_lock);
374
375 hashtable_chain_unzip(ht, new_tbl, old_tbl, old_hash);
376 if (old_tbl->buckets[old_hash] != NULL)
377 complete = false;
378
379 spin_unlock_bh(old_bucket_lock);
380 }
381 }
382
383 bucket_table_free(old_tbl);
384 return 0;
385}
386EXPORT_SYMBOL_GPL(rhashtable_expand);
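/* Usage sketch (mirrors the self test below): an explicit expansion must
 * be serialized against the deferred worker by holding ht->mutex:
 *
 * int err;
 *
 * mutex_lock(&ht->mutex);
 * err = rhashtable_expand(ht);
 * mutex_unlock(&ht->mutex);
 *
 * When a grow_decision such as rht_grow_above_75() is configured,
 * rhashtable_insert() schedules rht_deferred_worker() instead, which takes
 * ht->mutex itself before expanding.
 */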
387
388/**
389 * rhashtable_shrink - Shrink hash table while allowing concurrent lookups
390 * @ht: the hash table to shrink
391 *
392 * This function may only be called in a context where it is safe to call
393 * synchronize_rcu(), e.g. not within a rcu_read_lock() section.
394 *
395 * The caller must ensure that no concurrent resizing occurs by holding
396 * ht->mutex.
397 *
398 * The caller must ensure that no concurrent table mutations take place.
399 * It is however valid to have concurrent lookups if they are RCU protected.
400 *
401 * It is valid to have concurrent insertions and deletions protected by per
402 * bucket locks or concurrent RCU protected lookups and traversals.
7e1e7763 403 */
404int rhashtable_shrink(struct rhashtable *ht)
405{
406 struct bucket_table *new_tbl, *tbl = rht_dereference(ht->tbl, ht);
407 spinlock_t *new_bucket_lock, *old_bucket_lock1, *old_bucket_lock2;
408 unsigned int new_hash;
409
410 ASSERT_RHT_MUTEX(ht);
411
412 if (ht->shift <= ht->p.min_shift)
413 return 0;
414
415 new_tbl = bucket_table_alloc(ht, tbl->size / 2);
416 if (new_tbl == NULL)
417 return -ENOMEM;
418
419 rcu_assign_pointer(ht->future_tbl, new_tbl);
420 synchronize_rcu();
421
422 /* Link the first entry in the old bucket to the end of the
423 * bucket in the new table. As entries are concurrently being
424 * added to the new table, lock down the new bucket. As we
425 * always divide the size in half when shrinking, each bucket
426 * in the new table maps to exactly two buckets in the old
427 * table.
428 *
429 * As removals can occur concurrently on the old table, we need
430 * to lock down both matching buckets in the old table.
7e1e7763 431 */
432 for (new_hash = 0; new_hash < new_tbl->size; new_hash++) {
433 old_bucket_lock1 = bucket_lock(tbl, new_hash);
434 old_bucket_lock2 = bucket_lock(tbl, new_hash + new_tbl->size);
435 new_bucket_lock = bucket_lock(new_tbl, new_hash);
436
437 spin_lock_bh(old_bucket_lock1);
438 spin_lock_bh_nested(old_bucket_lock2, RHT_LOCK_NESTED);
439 spin_lock_bh_nested(new_bucket_lock, RHT_LOCK_NESTED2);
440
441 rcu_assign_pointer(*bucket_tail(new_tbl, new_hash),
442 tbl->buckets[new_hash]);
443 rcu_assign_pointer(*bucket_tail(new_tbl, new_hash),
444 tbl->buckets[new_hash + new_tbl->size]);
445
446 spin_unlock_bh(new_bucket_lock);
447 spin_unlock_bh(old_bucket_lock2);
448 spin_unlock_bh(old_bucket_lock1);
449 }
450
451 /* Publish the new, valid hash table */
452 rcu_assign_pointer(ht->tbl, new_tbl);
453 ht->shift--;
454
455 /* Wait for readers. No new readers will have references to the
456 * old hash table.
457 */
458 synchronize_rcu();
459
460 bucket_table_free(tbl);
461
462 return 0;
463}
464EXPORT_SYMBOL_GPL(rhashtable_shrink);
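/* Mapping example (illustrative numbers): when shrinking from 8 to 4
 * buckets, new bucket 1 collects old buckets 1 and 1 + 4 = 5, since an
 * index in the halved table is simply the old index masked with the new
 * (size - 1). Both old chains are appended to the new bucket's tail under
 * the three locks taken above, before the new table is published.
 */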
465
466static void rht_deferred_worker(struct work_struct *work)
467{
468 struct rhashtable *ht;
469 struct bucket_table *tbl;
470
471 ht = container_of(work, struct rhashtable, run_work.work);
472 mutex_lock(&ht->mutex);
473 tbl = rht_dereference(ht->tbl, ht);
474
475 if (ht->p.grow_decision && ht->p.grow_decision(ht, tbl->size))
476 rhashtable_expand(ht);
477 else if (ht->p.shrink_decision && ht->p.shrink_decision(ht, tbl->size))
478 rhashtable_shrink(ht);
479
480 mutex_unlock(&ht->mutex);
481}
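/* Configuration sketch (hypothetical test_obj layout, modelled on the
 * examples in rhashtable_init() and the self test below): deferred
 * resizing is enabled simply by supplying the decision callbacks; the
 * worker above then runs under ht->mutex whenever an insertion or removal
 * schedules it.
 *
 * struct rhashtable_params params = {
 * .head_offset = offsetof(struct test_obj, node),
 * .key_offset = offsetof(struct test_obj, key),
 * .key_len = sizeof(int),
 * .hashfn = jhash,
 * .grow_decision = rht_grow_above_75,
 * .shrink_decision = rht_shrink_below_30,
 * };
 */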
482
483/**
484 * rhashtable_insert - insert object into hash table
485 * @ht: hash table
486 * @obj: pointer to hash head inside object
7e1e7763 487 *
97defe1e
TG
488 * Will take a per bucket spinlock to protect against mutual mutations
489 * on the same bucket. Multiple insertions may occur in parallel unless
490 * they map to the same bucket lock.
7e1e7763 491 *
492 * It is safe to call this function from atomic context.
493 *
494 * Will trigger an automatic deferred table resizing if the size grows
495 * beyond the watermark indicated by grow_decision() which can be passed
496 * to rhashtable_init().
7e1e7763 497 */
498void rhashtable_insert(struct rhashtable *ht, struct rhash_head *obj)
499{
500 struct bucket_table *tbl;
501 spinlock_t *lock;
502 unsigned hash;
503
504 rcu_read_lock();
505
506 tbl = rht_dereference_rcu(ht->future_tbl, ht);
507 hash = head_hashfn(ht, tbl, obj);
508 lock = bucket_lock(tbl, hash);
509
510 spin_lock_bh(lock);
7e1e7763
TG
511 RCU_INIT_POINTER(obj->next, tbl->buckets[hash]);
512 rcu_assign_pointer(tbl->buckets[hash], obj);
513 spin_unlock_bh(lock);
514
515 atomic_inc(&ht->nelems);
516
517 /* Only grow the table if no resizing is currently in progress. */
518 if (ht->tbl != ht->future_tbl &&
519 ht->p.grow_decision && ht->p.grow_decision(ht, tbl->size))
520 schedule_delayed_work(&ht->run_work, 0);
521
522 rcu_read_unlock();
7e1e7763
TG
523}
524EXPORT_SYMBOL_GPL(rhashtable_insert);
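/* Usage sketch (hypothetical test_obj as in the examples above; error
 * handling trimmed):
 *
 * struct test_obj *obj = kzalloc(sizeof(*obj), GFP_KERNEL);
 *
 * obj->key = 42;
 * rhashtable_insert(ht, &obj->node);
 *
 * The RCU_INIT_POINTER()/rcu_assign_pointer() pair above links the entry
 * before publishing it at the bucket head, so concurrent lookups see the
 * new entry either completely or not at all.
 */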
525
526/**
527 * rhashtable_remove - remove object from hash table
528 * @ht: hash table
529 * @obj: pointer to hash head inside object
530 *
531 * Since the hash chain is singly linked, the removal operation needs to
532 * walk the bucket chain upon removal. The removal operation is thus
533 * considerably slow if the hash table is not correctly sized.
534 *
535 * Will automatically shrink the table via rhashtable_shrink() if the
536 * shrink_decision function specified at rhashtable_init() returns true.
537 *
538 * The caller must ensure that no concurrent table mutations occur. It is
539 * however valid to have concurrent lookups if they are RCU protected.
540 */
541bool rhashtable_remove(struct rhashtable *ht, struct rhash_head *obj)
542{
543 struct bucket_table *tbl;
544 struct rhash_head __rcu **pprev;
545 struct rhash_head *he;
97defe1e
TG
546 spinlock_t *lock;
547 unsigned int hash;
548
549 rcu_read_lock();
550 tbl = rht_dereference_rcu(ht->tbl, ht);
551 hash = head_hashfn(ht, tbl, obj);
552
553 lock = bucket_lock(tbl, hash);
554 spin_lock_bh(lock);
555
556restart:
557 pprev = &tbl->buckets[hash];
558 rht_for_each(he, tbl, hash) {
559 if (he != obj) {
560 pprev = &he->next;
561 continue;
562 }
563
564 rcu_assign_pointer(*pprev, obj->next);
565 atomic_dec(&ht->nelems);
566
567 spin_unlock_bh(lock);
568
569 if (ht->tbl != ht->future_tbl &&
570 ht->p.shrink_decision &&
571 ht->p.shrink_decision(ht, tbl->size))
572 schedule_delayed_work(&ht->run_work, 0);
573
574 rcu_read_unlock();
575
576 return true;
577 }
578
97defe1e
TG
579 if (tbl != rht_dereference_rcu(ht->tbl, ht)) {
580 spin_unlock_bh(lock);
581
582 tbl = rht_dereference_rcu(ht->tbl, ht);
583 hash = head_hashfn(ht, tbl, obj);
584
585 lock = bucket_lock(tbl, hash);
586 spin_lock_bh(lock);
587 goto restart;
588 }
589
590 spin_unlock_bh(lock);
591 rcu_read_unlock();
592
593 return false;
594}
595EXPORT_SYMBOL_GPL(rhashtable_remove);
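/* Usage sketch (mirrors the self test below; hypothetical key value):
 *
 * u32 key = 42;
 * struct test_obj *obj;
 *
 * obj = rhashtable_lookup(ht, &key);
 * if (obj && rhashtable_remove(ht, &obj->node))
 * kfree(obj);
 *
 * The immediate kfree() is only safe when, as in the self test, no RCU
 * readers can still be traversing the chain; otherwise the object must be
 * freed after a grace period, e.g. via kfree_rcu() on an embedded
 * struct rcu_head.
 */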
596
597/**
598 * rhashtable_lookup - lookup key in hash table
599 * @ht: hash table
600 * @key: pointer to key
601 *
602 * Computes the hash value for the key and traverses the bucket chain looking
603 * for an entry with an identical key. The first matching entry is returned.
604 *
605 * This lookup function may only be used for fixed key hash tables (key_len
606 * parameter set). It will BUG() if used inappropriately.
607 *
608 * Lookups may occur in parallel with hashtable mutations and resizing.
609 */
610void *rhashtable_lookup(struct rhashtable *ht, const void *key)
611{
612 const struct bucket_table *tbl, *old_tbl;
613 struct rhash_head *he;
614 u32 hash;
615
616 BUG_ON(!ht->p.key_len);
617
618 rcu_read_lock();
619 old_tbl = rht_dereference_rcu(ht->tbl, ht);
620 tbl = rht_dereference_rcu(ht->future_tbl, ht);
621 hash = key_hashfn(ht, key, ht->p.key_len);
622restart:
623 rht_for_each_rcu(he, tbl, rht_bucket_index(tbl, hash)) {
624 if (memcmp(rht_obj(ht, he) + ht->p.key_offset, key,
625 ht->p.key_len))
626 continue;
627 rcu_read_unlock();
628 return rht_obj(ht, he);
629 }
630
631 if (unlikely(tbl != old_tbl)) {
632 tbl = old_tbl;
633 goto restart;
634 }
635
636 rcu_read_unlock();
637 return NULL;
638}
639EXPORT_SYMBOL_GPL(rhashtable_lookup);
640
641/**
642 * rhashtable_lookup_compare - search hash table with compare function
643 * @ht: hash table
644 * @key: the pointer to the key
645 * @compare: compare function, must return true on match
646 * @arg: argument passed on to compare function
647 *
648 * Traverses the bucket chain behind the provided hash value and calls the
649 * specified compare function for each entry.
650 *
651 * Lookups may occur in parallel with hashtable mutations and resizing.
652 *
653 * Returns the first entry on which the compare function returned true.
654 */
655void *rhashtable_lookup_compare(struct rhashtable *ht, const void *key,
656 bool (*compare)(void *, void *), void *arg)
657{
658 const struct bucket_table *tbl, *old_tbl;
659 struct rhash_head *he;
660 u32 hash;
661
662 rcu_read_lock();
663
664 old_tbl = rht_dereference_rcu(ht->tbl, ht);
665 tbl = rht_dereference_rcu(ht->future_tbl, ht);
666 hash = key_hashfn(ht, key, ht->p.key_len);
667restart:
668 rht_for_each_rcu(he, tbl, rht_bucket_index(tbl, hash)) {
669 if (!compare(rht_obj(ht, he), arg))
670 continue;
671 rcu_read_unlock();
672 return rht_obj(ht, he);
673 }
674
675 if (unlikely(tbl != old_tbl)) {
676 tbl = old_tbl;
677 goto restart;
678 }
679 rcu_read_unlock();
680
681 return NULL;
682}
683EXPORT_SYMBOL_GPL(rhashtable_lookup_compare);
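/* Callback sketch (hypothetical test_obj with an int key member): the
 * first argument is the object containing the rhash_head, the second is
 * the opaque arg passed to rhashtable_lookup_compare():
 *
 * static bool test_obj_cmpfn(void *ptr, void *arg)
 * {
 * const struct test_obj *obj = ptr;
 *
 * return obj->key == *(int *)arg;
 * }
 *
 * The caller still supplies the key so that key_hashfn() can pick the
 * bucket; the compare callback only decides which entry in that chain
 * matches.
 */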
684
685static size_t rounded_hashtable_size(struct rhashtable_params *params)
686{
687 return max(roundup_pow_of_two(params->nelem_hint * 4 / 3),
688 1UL << params->min_shift);
689}
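/* Sizing example (illustrative numbers): an nelem_hint of 100 yields
 * roundup_pow_of_two(100 * 4 / 3) = roundup_pow_of_two(133) = 256 buckets,
 * i.e. a 4/3 headroom factor before rounding up to a power of two, and
 * never fewer than 1 << min_shift buckets (HASH_MIN_SIZE = 4 with the
 * default min_shift).
 */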
690
691/**
692 * rhashtable_init - initialize a new hash table
693 * @ht: hash table to be initialized
694 * @params: configuration parameters
695 *
696 * Initializes a new hash table based on the provided configuration
697 * parameters. A table can be configured either with a variable or
698 * fixed length key:
699 *
700 * Configuration Example 1: Fixed length keys
701 * struct test_obj {
702 * int key;
703 * void * my_member;
704 * struct rhash_head node;
705 * };
706 *
707 * struct rhashtable_params params = {
708 * .head_offset = offsetof(struct test_obj, node),
709 * .key_offset = offsetof(struct test_obj, key),
710 * .key_len = sizeof(int),
711 * .hashfn = jhash,
712 * };
713 *
714 * Configuration Example 2: Variable length keys
715 * struct test_obj {
716 * [...]
717 * struct rhash_head node;
718 * };
719 *
720 * u32 my_hash_fn(const void *data, u32 seed)
721 * {
722 * struct test_obj *obj = data;
723 *
724 * return [... hash ...];
725 * }
726 *
727 * struct rhashtable_params params = {
728 * .head_offset = offsetof(struct test_obj, node),
729 * .hashfn = jhash,
730 * .obj_hashfn = my_hash_fn,
731 * };
732 */
733int rhashtable_init(struct rhashtable *ht, struct rhashtable_params *params)
734{
735 struct bucket_table *tbl;
736 size_t size;
737
738 size = HASH_DEFAULT_SIZE;
739
740 if ((params->key_len && !params->hashfn) ||
741 (!params->key_len && !params->obj_hashfn))
742 return -EINVAL;
743
744 params->min_shift = max_t(size_t, params->min_shift,
745 ilog2(HASH_MIN_SIZE));
746
747 if (params->nelem_hint)
748 size = rounded_hashtable_size(params);
749
750 memset(ht, 0, sizeof(*ht));
751 mutex_init(&ht->mutex);
752 memcpy(&ht->p, params, sizeof(*params));
753
754 if (params->locks_mul)
755 ht->p.locks_mul = roundup_pow_of_two(params->locks_mul);
756 else
757 ht->p.locks_mul = BUCKET_LOCKS_PER_CPU;
758
759 tbl = bucket_table_alloc(ht, size);
7e1e7763
TG
760 if (tbl == NULL)
761 return -ENOMEM;
762
763 ht->shift = ilog2(tbl->size);
764 RCU_INIT_POINTER(ht->tbl, tbl);
765 RCU_INIT_POINTER(ht->future_tbl, tbl);
766
767 if (!ht->p.hash_rnd)
768 get_random_bytes(&ht->p.hash_rnd, sizeof(ht->p.hash_rnd));
769
97defe1e
TG
770 if (ht->p.grow_decision || ht->p.shrink_decision)
771 INIT_DEFERRABLE_WORK(&ht->run_work, rht_deferred_worker);
772
7e1e7763
TG
773 return 0;
774}
775EXPORT_SYMBOL_GPL(rhashtable_init);
776
777/**
778 * rhashtable_destroy - destroy hash table
779 * @ht: the hash table to destroy
780 *
781 * Frees the bucket array. This function is not rcu safe, therefore the caller
782 * has to make sure that no resizing may happen by unpublishing the hashtable
783 * and waiting for the quiescent cycle before releasing the bucket array.
784 */
785void rhashtable_destroy(struct rhashtable *ht)
786{
787 ht->being_destroyed = true;
788
789 mutex_lock(&ht->mutex);
790
791 cancel_delayed_work(&ht->run_work);
792 bucket_table_free(rht_dereference(ht->tbl, ht));
793
794 mutex_unlock(&ht->mutex);
795}
796EXPORT_SYMBOL_GPL(rhashtable_destroy);
797
798/**************************************************************************
799 * Self Test
800 **************************************************************************/
801
802#ifdef CONFIG_TEST_RHASHTABLE
803
804#define TEST_HT_SIZE 8
805#define TEST_ENTRIES 2048
806#define TEST_PTR ((void *) 0xdeadbeef)
807#define TEST_NEXPANDS 4
808
809struct test_obj {
810 void *ptr;
811 int value;
812 struct rhash_head node;
813};
814
815static int __init test_rht_lookup(struct rhashtable *ht)
816{
817 unsigned int i;
818
819 for (i = 0; i < TEST_ENTRIES * 2; i++) {
820 struct test_obj *obj;
821 bool expected = !(i % 2);
822 u32 key = i;
823
824 obj = rhashtable_lookup(ht, &key);
825
826 if (expected && !obj) {
827 pr_warn("Test failed: Could not find key %u\n", key);
828 return -ENOENT;
829 } else if (!expected && obj) {
830 pr_warn("Test failed: Unexpected entry found for key %u\n",
831 key);
832 return -EEXIST;
833 } else if (expected && obj) {
834 if (obj->ptr != TEST_PTR || obj->value != i) {
835 pr_warn("Test failed: Lookup value mismatch %p!=%p, %u!=%u\n",
836 obj->ptr, TEST_PTR, obj->value, i);
837 return -EINVAL;
838 }
839 }
840 }
841
842 return 0;
843}
844
845static void test_bucket_stats(struct rhashtable *ht, bool quiet)
846{
847 unsigned int cnt, rcu_cnt, i, total = 0;
848 struct rhash_head *pos;
849 struct test_obj *obj;
850 struct bucket_table *tbl;
851
852 tbl = rht_dereference_rcu(ht->tbl, ht);
853 for (i = 0; i < tbl->size; i++) {
854 rcu_cnt = cnt = 0;
855
856 if (!quiet)
857 pr_info(" [%#4x/%zu]", i, tbl->size);
858
859 rht_for_each_entry_rcu(obj, pos, tbl, i, node) {
860 cnt++;
861 total++;
862 if (!quiet)
863 pr_cont(" [%p],", obj);
864 }
865
866 rht_for_each_entry_rcu(obj, pos, tbl, i, node)
867 rcu_cnt++;
868
869 if (rcu_cnt != cnt)
870 pr_warn("Test failed: Chain count mismatch %d != %d",
871 cnt, rcu_cnt);
872
873 if (!quiet)
874 pr_cont("\n [%#x] first element: %p, chain length: %u\n",
875 i, tbl->buckets[i], cnt);
876 }
877
878 pr_info(" Traversal complete: counted=%u, nelems=%u, entries=%d\n",
879 total, atomic_read(&ht->nelems), TEST_ENTRIES);
880
881 if (total != atomic_read(&ht->nelems) || total != TEST_ENTRIES)
882 pr_warn("Test failed: Total count mismatch ^^^");
883}
884
885static int __init test_rhashtable(struct rhashtable *ht)
886{
887 struct bucket_table *tbl;
888 struct test_obj *obj;
889 struct rhash_head *pos, *next;
890 int err;
891 unsigned int i;
892
893 /*
894 * Insertion Test:
895 * Insert TEST_ENTRIES into table with all keys even numbers
896 */
897 pr_info(" Adding %d keys\n", TEST_ENTRIES);
898 for (i = 0; i < TEST_ENTRIES; i++) {
899 struct test_obj *obj;
900
901 obj = kzalloc(sizeof(*obj), GFP_KERNEL);
902 if (!obj) {
903 err = -ENOMEM;
904 goto error;
905 }
906
907 obj->ptr = TEST_PTR;
908 obj->value = i * 2;
909
910 rhashtable_insert(ht, &obj->node);
911 }
912
913 rcu_read_lock();
914 test_bucket_stats(ht, true);
915 test_rht_lookup(ht);
916 rcu_read_unlock();
917
918 for (i = 0; i < TEST_NEXPANDS; i++) {
919 pr_info(" Table expansion iteration %u...\n", i);
920 mutex_lock(&ht->mutex);
921 rhashtable_expand(ht);
922 mutex_unlock(&ht->mutex);
923
924 rcu_read_lock();
925 pr_info(" Verifying lookups...\n");
926 test_rht_lookup(ht);
927 rcu_read_unlock();
928 }
929
930 for (i = 0; i < TEST_NEXPANDS; i++) {
931 pr_info(" Table shrinkage iteration %u...\n", i);
932 mutex_lock(&ht->mutex);
933 rhashtable_shrink(ht);
934 mutex_unlock(&ht->mutex);
935
936 rcu_read_lock();
937 pr_info(" Verifying lookups...\n");
938 test_rht_lookup(ht);
939 rcu_read_unlock();
940 }
941
942 rcu_read_lock();
943 test_bucket_stats(ht, true);
944 rcu_read_unlock();
945
946 pr_info(" Deleting %d keys\n", TEST_ENTRIES);
947 for (i = 0; i < TEST_ENTRIES; i++) {
948 u32 key = i * 2;
949
950 obj = rhashtable_lookup(ht, &key);
951 BUG_ON(!obj);
952
953 rhashtable_remove(ht, &obj->node);
954 kfree(obj);
955 }
956
957 return 0;
958
959error:
960 tbl = rht_dereference_rcu(ht->tbl, ht);
961 for (i = 0; i < tbl->size; i++)
962 rht_for_each_entry_safe(obj, pos, next, tbl, i, node)
963 kfree(obj);
964
965 return err;
966}
967
968static int __init test_rht_init(void)
969{
970 struct rhashtable ht;
971 struct rhashtable_params params = {
972 .nelem_hint = TEST_HT_SIZE,
973 .head_offset = offsetof(struct test_obj, node),
974 .key_offset = offsetof(struct test_obj, value),
975 .key_len = sizeof(int),
976 .hashfn = jhash,
977 .grow_decision = rht_grow_above_75,
978 .shrink_decision = rht_shrink_below_30,
979 };
980 int err;
981
982 pr_info("Running resizable hashtable tests...\n");
983
984 err = rhashtable_init(&ht, &params);
985 if (err < 0) {
986 pr_warn("Test failed: Unable to initialize hashtable: %d\n",
987 err);
988 return err;
989 }
990
991 err = test_rhashtable(&ht);
992
993 rhashtable_destroy(&ht);
994
995 return err;
996}
997
998subsys_initcall(test_rht_init);
999
1000#endif /* CONFIG_TEST_RHASHTABLE */