/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
 * Copyright (c) 2016 Facebook
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of version 2 of the GNU General Public
 * License as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 */
#include <linux/bpf.h>
#include <linux/jhash.h>
#include <linux/filter.h>
#include "percpu_freelist.h"
#include "bpf_lru_list.h"

struct bucket {
	struct hlist_head head;
	raw_spinlock_t lock;
};

struct bpf_htab {
	struct bpf_map map;
	struct bucket *buckets;
	void *elems;
	union {
		struct pcpu_freelist freelist;
		struct bpf_lru lru;
	};
	void __percpu *extra_elems;
	atomic_t count;	/* number of elements in this hashtable */
	u32 n_buckets;	/* number of hash buckets */
	u32 elem_size;	/* size of each element in bytes */
};

enum extra_elem_state {
	HTAB_NOT_AN_EXTRA_ELEM = 0,
	HTAB_EXTRA_ELEM_FREE,
	HTAB_EXTRA_ELEM_USED
};

/* each htab element is struct htab_elem + key + value */
struct htab_elem {
	union {
		struct hlist_node hash_node;
		struct {
			void *padding;
			union {
				struct bpf_htab *htab;
				struct pcpu_freelist_node fnode;
			};
		};
	};
	union {
		struct rcu_head rcu;
		enum extra_elem_state state;
		struct bpf_lru_node lru_node;
	};
	u32 hash;
	char key[0] __aligned(8);
};

static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node);

static bool htab_is_lru(const struct bpf_htab *htab)
{
	return htab->map.map_type == BPF_MAP_TYPE_LRU_HASH ||
		htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH;
}

static bool htab_is_percpu(const struct bpf_htab *htab)
{
	return htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH ||
		htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH;
}

static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size,
				     void __percpu *pptr)
{
	*(void __percpu **)(l->key + key_size) = pptr;
}

static inline void __percpu *htab_elem_get_ptr(struct htab_elem *l, u32 key_size)
{
	return *(void __percpu **)(l->key + key_size);
}

static struct htab_elem *get_htab_elem(struct bpf_htab *htab, int i)
{
	return (struct htab_elem *) (htab->elems + i * htab->elem_size);
}

static void htab_free_elems(struct bpf_htab *htab)
{
	int i;

	if (!htab_is_percpu(htab))
		goto free_elems;

	for (i = 0; i < htab->map.max_entries; i++) {
		void __percpu *pptr;

		pptr = htab_elem_get_ptr(get_htab_elem(htab, i),
					 htab->map.key_size);
		free_percpu(pptr);
	}
free_elems:
	bpf_map_area_free(htab->elems);
}

static struct htab_elem *prealloc_lru_pop(struct bpf_htab *htab, void *key,
					  u32 hash)
{
	struct bpf_lru_node *node = bpf_lru_pop_free(&htab->lru, hash);
	struct htab_elem *l;

	if (node) {
		l = container_of(node, struct htab_elem, lru_node);
		memcpy(l->key, key, htab->map.key_size);
		return l;
	}

	return NULL;
}

static int prealloc_init(struct bpf_htab *htab)
{
	int err = -ENOMEM, i;

	htab->elems = bpf_map_area_alloc(htab->elem_size *
					 htab->map.max_entries);
	if (!htab->elems)
		return -ENOMEM;

	if (!htab_is_percpu(htab))
		goto skip_percpu_elems;

	for (i = 0; i < htab->map.max_entries; i++) {
		u32 size = round_up(htab->map.value_size, 8);
		void __percpu *pptr;

		pptr = __alloc_percpu_gfp(size, 8, GFP_USER | __GFP_NOWARN);
		if (!pptr)
			goto free_elems;
		htab_elem_set_ptr(get_htab_elem(htab, i), htab->map.key_size,
				  pptr);
	}

skip_percpu_elems:
	if (htab_is_lru(htab))
		err = bpf_lru_init(&htab->lru,
				   htab->map.map_flags & BPF_F_NO_COMMON_LRU,
				   offsetof(struct htab_elem, hash) -
				   offsetof(struct htab_elem, lru_node),
				   htab_lru_map_delete_node,
				   htab);
	else
		err = pcpu_freelist_init(&htab->freelist);

	if (err)
		goto free_elems;

	if (htab_is_lru(htab))
		bpf_lru_populate(&htab->lru, htab->elems,
				 offsetof(struct htab_elem, lru_node),
				 htab->elem_size, htab->map.max_entries);
	else
		pcpu_freelist_populate(&htab->freelist,
				       htab->elems + offsetof(struct htab_elem, fnode),
				       htab->elem_size, htab->map.max_entries);

	return 0;

free_elems:
	htab_free_elems(htab);
	return err;
}

static void prealloc_destroy(struct bpf_htab *htab)
{
	htab_free_elems(htab);

	if (htab_is_lru(htab))
		bpf_lru_destroy(&htab->lru);
	else
		pcpu_freelist_destroy(&htab->freelist);
}

static int alloc_extra_elems(struct bpf_htab *htab)
{
	void __percpu *pptr;
	int cpu;

	pptr = __alloc_percpu_gfp(htab->elem_size, 8, GFP_USER | __GFP_NOWARN);
	if (!pptr)
		return -ENOMEM;

	for_each_possible_cpu(cpu) {
		((struct htab_elem *)per_cpu_ptr(pptr, cpu))->state =
			HTAB_EXTRA_ELEM_FREE;
	}
	htab->extra_elems = pptr;
	return 0;
}

/* Called from syscall */
static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
{
	bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
		       attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
	bool lru = (attr->map_type == BPF_MAP_TYPE_LRU_HASH ||
		    attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
	/* percpu_lru means each cpu has its own LRU list.
	 * it is different from BPF_MAP_TYPE_PERCPU_HASH where
	 * the map's value itself is percpu.  percpu_lru has
	 * nothing to do with the map's value.
	 */
	bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU);
	bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC);
	struct bpf_htab *htab;
	int err, i;
	u64 cost;

	BUILD_BUG_ON(offsetof(struct htab_elem, htab) !=
		     offsetof(struct htab_elem, hash_node.pprev));
	BUILD_BUG_ON(offsetof(struct htab_elem, fnode.next) !=
		     offsetof(struct htab_elem, hash_node.pprev));

	if (lru && !capable(CAP_SYS_ADMIN))
		/* The LRU implementation is much more complicated than
		 * other maps.  Hence, limit to CAP_SYS_ADMIN for now.
		 */
		return ERR_PTR(-EPERM);

	if (attr->map_flags & ~(BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU))
		/* reserved bits should not be used */
		return ERR_PTR(-EINVAL);

	if (!lru && percpu_lru)
		return ERR_PTR(-EINVAL);

	if (lru && !prealloc)
		return ERR_PTR(-ENOTSUPP);

	htab = kzalloc(sizeof(*htab), GFP_USER);
	if (!htab)
		return ERR_PTR(-ENOMEM);

	/* mandatory map attributes */
	htab->map.map_type = attr->map_type;
	htab->map.key_size = attr->key_size;
	htab->map.value_size = attr->value_size;
	htab->map.max_entries = attr->max_entries;
	htab->map.map_flags = attr->map_flags;

	/* check sanity of attributes.
	 * value_size == 0 may be allowed in the future to use map as a set
	 */
	err = -EINVAL;
	if (htab->map.max_entries == 0 || htab->map.key_size == 0 ||
	    htab->map.value_size == 0)
		goto free_htab;

	if (percpu_lru) {
		/* Ensure each CPU's lru list has >=1 elements.
		 * While at it, give each lru list the same number
		 * of elements.
		 */
		htab->map.max_entries = roundup(attr->max_entries,
						num_possible_cpus());
		if (htab->map.max_entries < attr->max_entries)
			htab->map.max_entries = rounddown(attr->max_entries,
							  num_possible_cpus());
	}

	/* hash table size must be power of 2 */
	htab->n_buckets = roundup_pow_of_two(htab->map.max_entries);

	err = -E2BIG;
	if (htab->map.key_size > MAX_BPF_STACK)
		/* eBPF programs initialize keys on stack, so they cannot be
		 * larger than max stack size
		 */
		goto free_htab;

	if (htab->map.value_size >= KMALLOC_MAX_SIZE -
	    MAX_BPF_STACK - sizeof(struct htab_elem))
		/* if value_size is bigger, the user space won't be able to
		 * access the elements via bpf syscall. This check also makes
		 * sure that the elem_size doesn't overflow and it's
		 * kmalloc-able later in htab_map_update_elem()
		 */
		goto free_htab;

	if (percpu && round_up(htab->map.value_size, 8) > PCPU_MIN_UNIT_SIZE)
		/* make sure the size for pcpu_alloc() is reasonable */
		goto free_htab;

	htab->elem_size = sizeof(struct htab_elem) +
			  round_up(htab->map.key_size, 8);
	if (percpu)
		htab->elem_size += sizeof(void *);
	else
		htab->elem_size += round_up(htab->map.value_size, 8);

	/* prevent zero size kmalloc and check for u32 overflow */
	if (htab->n_buckets == 0 ||
	    htab->n_buckets > U32_MAX / sizeof(struct bucket))
		goto free_htab;

	cost = (u64) htab->n_buckets * sizeof(struct bucket) +
	       (u64) htab->elem_size * htab->map.max_entries;

	if (percpu)
		cost += (u64) round_up(htab->map.value_size, 8) *
			num_possible_cpus() * htab->map.max_entries;
	else
		cost += (u64) htab->elem_size * num_possible_cpus();

	if (cost >= U32_MAX - PAGE_SIZE)
		/* make sure page count doesn't overflow */
		goto free_htab;

	htab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;

	/* if map size is larger than memlock limit, reject it early */
	err = bpf_map_precharge_memlock(htab->map.pages);
	if (err)
		goto free_htab;

	err = -ENOMEM;
	htab->buckets = bpf_map_area_alloc(htab->n_buckets *
					   sizeof(struct bucket));
	if (!htab->buckets)
		goto free_htab;

	for (i = 0; i < htab->n_buckets; i++) {
		INIT_HLIST_HEAD(&htab->buckets[i].head);
		raw_spin_lock_init(&htab->buckets[i].lock);
	}

	if (!percpu && !lru) {
		/* lru itself can remove the least used element, so
		 * there is no need for an extra elem during map_update.
		 */
		err = alloc_extra_elems(htab);
		if (err)
			goto free_buckets;
	}

	if (prealloc) {
		err = prealloc_init(htab);
		if (err)
			goto free_extra_elems;
	}

	return &htab->map;

free_extra_elems:
	free_percpu(htab->extra_elems);
free_buckets:
	bpf_map_area_free(htab->buckets);
free_htab:
	kfree(htab);
	return ERR_PTR(err);
}

static inline u32 htab_map_hash(const void *key, u32 key_len)
{
	return jhash(key, key_len, 0);
}

static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash)
{
	return &htab->buckets[hash & (htab->n_buckets - 1)];
}

static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash)
{
	return &__select_bucket(htab, hash)->head;
}

static struct htab_elem *lookup_elem_raw(struct hlist_head *head, u32 hash,
					 void *key, u32 key_size)
{
	struct htab_elem *l;

	hlist_for_each_entry_rcu(l, head, hash_node)
		if (l->hash == hash && !memcmp(&l->key, key, key_size))
			return l;

	return NULL;
}

/* Called from syscall or from eBPF program */
static void *__htab_map_lookup_elem(struct bpf_map *map, void *key)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct hlist_head *head;
	struct htab_elem *l;
	u32 hash, key_size;

	/* Must be called with rcu_read_lock. */
	WARN_ON_ONCE(!rcu_read_lock_held());

	key_size = map->key_size;

	hash = htab_map_hash(key, key_size);

	head = select_bucket(htab, hash);

	l = lookup_elem_raw(head, hash, key, key_size);

	return l;
}

static void *htab_map_lookup_elem(struct bpf_map *map, void *key)
{
	struct htab_elem *l = __htab_map_lookup_elem(map, key);

	if (l)
		return l->key + round_up(map->key_size, 8);

	return NULL;
}

static void *htab_lru_map_lookup_elem(struct bpf_map *map, void *key)
{
	struct htab_elem *l = __htab_map_lookup_elem(map, key);

	if (l) {
		bpf_lru_node_set_ref(&l->lru_node);
		return l->key + round_up(map->key_size, 8);
	}

	return NULL;
}

/* It is called from the bpf_lru_list when the LRU needs to delete
 * older elements from the htab.
 */
static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node)
{
	struct bpf_htab *htab = (struct bpf_htab *)arg;
	struct htab_elem *l, *tgt_l;
	struct hlist_head *head;
	unsigned long flags;
	struct bucket *b;

	tgt_l = container_of(node, struct htab_elem, lru_node);
	b = __select_bucket(htab, tgt_l->hash);
	head = &b->head;

	raw_spin_lock_irqsave(&b->lock, flags);

	hlist_for_each_entry_rcu(l, head, hash_node)
		if (l == tgt_l) {
			hlist_del_rcu(&l->hash_node);
			break;
		}

	raw_spin_unlock_irqrestore(&b->lock, flags);

	return l == tgt_l;
}

/* Called from syscall */
static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct hlist_head *head;
	struct htab_elem *l, *next_l;
	u32 hash, key_size;
	int i;

	WARN_ON_ONCE(!rcu_read_lock_held());

	key_size = map->key_size;

	hash = htab_map_hash(key, key_size);

	head = select_bucket(htab, hash);

	/* lookup the key */
	l = lookup_elem_raw(head, hash, key, key_size);

	if (!l) {
		i = 0;
		goto find_first_elem;
	}

	/* key was found, get next key in the same bucket */
	next_l = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(&l->hash_node)),
				  struct htab_elem, hash_node);

	if (next_l) {
		/* if next elem in this hash list is not NULL, just return it */
		memcpy(next_key, next_l->key, key_size);
		return 0;
	}

	/* no more elements in this hash list, go to the next bucket */
	i = hash & (htab->n_buckets - 1);
	i++;

find_first_elem:
	/* iterate over buckets */
	for (; i < htab->n_buckets; i++) {
		head = select_bucket(htab, i);

		/* pick first element in the bucket */
		next_l = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)),
					  struct htab_elem, hash_node);
		if (next_l) {
			/* if it's not empty, just return it */
			memcpy(next_key, next_l->key, key_size);
			return 0;
		}
	}

	/* iterated over all buckets and all elements */
	return -ENOENT;
}

static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l)
{
	if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH)
		free_percpu(htab_elem_get_ptr(l, htab->map.key_size));
	kfree(l);
}

static void htab_elem_free_rcu(struct rcu_head *head)
{
	struct htab_elem *l = container_of(head, struct htab_elem, rcu);
	struct bpf_htab *htab = l->htab;

	/* must increment bpf_prog_active to avoid kprobe+bpf triggering while
	 * we're calling kfree, otherwise deadlock is possible if kprobes
	 * are placed somewhere inside of slub
	 */
	preempt_disable();
	__this_cpu_inc(bpf_prog_active);
	htab_elem_free(htab, l);
	__this_cpu_dec(bpf_prog_active);
	preempt_enable();
}

static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
{
	if (l->state == HTAB_EXTRA_ELEM_USED) {
		l->state = HTAB_EXTRA_ELEM_FREE;
		return;
	}

	if (!(htab->map.map_flags & BPF_F_NO_PREALLOC)) {
		pcpu_freelist_push(&htab->freelist, &l->fnode);
	} else {
		atomic_dec(&htab->count);
		l->htab = htab;
		call_rcu(&l->rcu, htab_elem_free_rcu);
	}
}

static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr,
			    void *value, bool onallcpus)
{
	if (!onallcpus) {
		/* copy true value_size bytes */
		memcpy(this_cpu_ptr(pptr), value, htab->map.value_size);
	} else {
		u32 size = round_up(htab->map.value_size, 8);
		int off = 0, cpu;

		for_each_possible_cpu(cpu) {
			bpf_long_memcpy(per_cpu_ptr(pptr, cpu),
					value + off, size);
			off += size;
		}
	}
}

static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
					 void *value, u32 key_size, u32 hash,
					 bool percpu, bool onallcpus,
					 bool old_elem_exists)
{
	u32 size = htab->map.value_size;
	bool prealloc = !(htab->map.map_flags & BPF_F_NO_PREALLOC);
	struct htab_elem *l_new;
	void __percpu *pptr;
	int err = 0;

	if (prealloc) {
		struct pcpu_freelist_node *l;

		l = pcpu_freelist_pop(&htab->freelist);
		if (!l)
			err = -E2BIG;
		else
			l_new = container_of(l, struct htab_elem, fnode);
	} else {
		if (atomic_inc_return(&htab->count) > htab->map.max_entries) {
			atomic_dec(&htab->count);
			err = -E2BIG;
		} else {
			l_new = kmalloc(htab->elem_size,
					GFP_ATOMIC | __GFP_NOWARN);
			if (!l_new)
				return ERR_PTR(-ENOMEM);
		}
	}

	if (err) {
		if (!old_elem_exists)
			return ERR_PTR(err);

		/* if we're updating the existing element and the hash table
		 * is full, use per-cpu extra elems
		 */
		l_new = this_cpu_ptr(htab->extra_elems);
		if (l_new->state != HTAB_EXTRA_ELEM_FREE)
			return ERR_PTR(-E2BIG);
		l_new->state = HTAB_EXTRA_ELEM_USED;
	} else {
		l_new->state = HTAB_NOT_AN_EXTRA_ELEM;
	}

	memcpy(l_new->key, key, key_size);
	if (percpu) {
		/* round up value_size to 8 bytes */
		size = round_up(size, 8);

		if (prealloc) {
			pptr = htab_elem_get_ptr(l_new, key_size);
		} else {
			/* alloc_percpu zero-fills */
			pptr = __alloc_percpu_gfp(size, 8,
						  GFP_ATOMIC | __GFP_NOWARN);
			if (!pptr) {
				kfree(l_new);
				return ERR_PTR(-ENOMEM);
			}
		}

		pcpu_copy_value(htab, pptr, value, onallcpus);

		if (!prealloc)
			htab_elem_set_ptr(l_new, key_size, pptr);
	} else {
		memcpy(l_new->key + round_up(key_size, 8), value, size);
	}

	l_new->hash = hash;
	return l_new;
}

static int check_flags(struct bpf_htab *htab, struct htab_elem *l_old,
		       u64 map_flags)
{
	if (l_old && map_flags == BPF_NOEXIST)
		/* elem already exists */
		return -EEXIST;

	if (!l_old && map_flags == BPF_EXIST)
		/* elem doesn't exist, cannot update it */
		return -ENOENT;

	return 0;
}

/* Called from syscall or from eBPF program */
static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
				u64 map_flags)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct htab_elem *l_new = NULL, *l_old;
	struct hlist_head *head;
	unsigned long flags;
	struct bucket *b;
	u32 key_size, hash;
	int ret;

	if (unlikely(map_flags > BPF_EXIST))
		/* unknown flags */
		return -EINVAL;

	WARN_ON_ONCE(!rcu_read_lock_held());

	key_size = map->key_size;

	hash = htab_map_hash(key, key_size);

	b = __select_bucket(htab, hash);
	head = &b->head;

	/* bpf_map_update_elem() can be called in_irq() */
	raw_spin_lock_irqsave(&b->lock, flags);

	l_old = lookup_elem_raw(head, hash, key, key_size);

	ret = check_flags(htab, l_old, map_flags);
	if (ret)
		goto err;

	l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false,
				!!l_old);
	if (IS_ERR(l_new)) {
		/* all pre-allocated elements are in use or memory exhausted */
		ret = PTR_ERR(l_new);
		goto err;
	}

	/* add new element to the head of the list, so that
	 * concurrent search will find it before old elem
	 */
	hlist_add_head_rcu(&l_new->hash_node, head);
	if (l_old) {
		hlist_del_rcu(&l_old->hash_node);
		free_htab_elem(htab, l_old);
	}
	ret = 0;
err:
	raw_spin_unlock_irqrestore(&b->lock, flags);
	return ret;
}

static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value,
				    u64 map_flags)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct htab_elem *l_new, *l_old = NULL;
	struct hlist_head *head;
	unsigned long flags;
	struct bucket *b;
	u32 key_size, hash;
	int ret;

	if (unlikely(map_flags > BPF_EXIST))
		/* unknown flags */
		return -EINVAL;

	WARN_ON_ONCE(!rcu_read_lock_held());

	key_size = map->key_size;

	hash = htab_map_hash(key, key_size);

	b = __select_bucket(htab, hash);
	head = &b->head;

	/* For LRU, we need to alloc before taking bucket's
	 * spinlock because getting free nodes from LRU may need
	 * to remove older elements from htab and this removal
	 * operation will need a bucket lock.
	 */
	l_new = prealloc_lru_pop(htab, key, hash);
	if (!l_new)
		return -ENOMEM;
	memcpy(l_new->key + round_up(map->key_size, 8), value, map->value_size);

	/* bpf_map_update_elem() can be called in_irq() */
	raw_spin_lock_irqsave(&b->lock, flags);

	l_old = lookup_elem_raw(head, hash, key, key_size);

	ret = check_flags(htab, l_old, map_flags);
	if (ret)
		goto err;

	/* add new element to the head of the list, so that
	 * concurrent search will find it before old elem
	 */
	hlist_add_head_rcu(&l_new->hash_node, head);
	if (l_old) {
		bpf_lru_node_set_ref(&l_new->lru_node);
		hlist_del_rcu(&l_old->hash_node);
	}
	ret = 0;

err:
	raw_spin_unlock_irqrestore(&b->lock, flags);

	if (ret)
		bpf_lru_push_free(&htab->lru, &l_new->lru_node);
	else if (l_old)
		bpf_lru_push_free(&htab->lru, &l_old->lru_node);

	return ret;
}

static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
					 void *value, u64 map_flags,
					 bool onallcpus)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct htab_elem *l_new = NULL, *l_old;
	struct hlist_head *head;
	unsigned long flags;
	struct bucket *b;
	u32 key_size, hash;
	int ret;

	if (unlikely(map_flags > BPF_EXIST))
		/* unknown flags */
		return -EINVAL;

	WARN_ON_ONCE(!rcu_read_lock_held());

	key_size = map->key_size;

	hash = htab_map_hash(key, key_size);

	b = __select_bucket(htab, hash);
	head = &b->head;

	/* bpf_map_update_elem() can be called in_irq() */
	raw_spin_lock_irqsave(&b->lock, flags);

	l_old = lookup_elem_raw(head, hash, key, key_size);

	ret = check_flags(htab, l_old, map_flags);
	if (ret)
		goto err;

	if (l_old) {
		/* per-cpu hash map can update value in-place */
		pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size),
				value, onallcpus);
	} else {
		l_new = alloc_htab_elem(htab, key, value, key_size,
					hash, true, onallcpus, false);
		if (IS_ERR(l_new)) {
			ret = PTR_ERR(l_new);
			goto err;
		}
		hlist_add_head_rcu(&l_new->hash_node, head);
	}
	ret = 0;
err:
	raw_spin_unlock_irqrestore(&b->lock, flags);
	return ret;
}

static int __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
					     void *value, u64 map_flags,
					     bool onallcpus)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct htab_elem *l_new = NULL, *l_old;
	struct hlist_head *head;
	unsigned long flags;
	struct bucket *b;
	u32 key_size, hash;
	int ret;

	if (unlikely(map_flags > BPF_EXIST))
		/* unknown flags */
		return -EINVAL;

	WARN_ON_ONCE(!rcu_read_lock_held());

	key_size = map->key_size;

	hash = htab_map_hash(key, key_size);

	b = __select_bucket(htab, hash);
	head = &b->head;

	/* For LRU, we need to alloc before taking bucket's
	 * spinlock because LRU's elem alloc may need
	 * to remove older elem from htab and this removal
	 * operation will need a bucket lock.
	 */
	if (map_flags != BPF_EXIST) {
		l_new = prealloc_lru_pop(htab, key, hash);
		if (!l_new)
			return -ENOMEM;
	}

	/* bpf_map_update_elem() can be called in_irq() */
	raw_spin_lock_irqsave(&b->lock, flags);

	l_old = lookup_elem_raw(head, hash, key, key_size);

	ret = check_flags(htab, l_old, map_flags);
	if (ret)
		goto err;

	if (l_old) {
		bpf_lru_node_set_ref(&l_old->lru_node);

		/* per-cpu hash map can update value in-place */
		pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size),
				value, onallcpus);
	} else {
		pcpu_copy_value(htab, htab_elem_get_ptr(l_new, key_size),
				value, onallcpus);
		hlist_add_head_rcu(&l_new->hash_node, head);
		l_new = NULL;
	}
	ret = 0;
err:
	raw_spin_unlock_irqrestore(&b->lock, flags);
	if (l_new)
		bpf_lru_push_free(&htab->lru, &l_new->lru_node);
	return ret;
}

static int htab_percpu_map_update_elem(struct bpf_map *map, void *key,
				       void *value, u64 map_flags)
{
	return __htab_percpu_map_update_elem(map, key, value, map_flags, false);
}

static int htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
					   void *value, u64 map_flags)
{
	return __htab_lru_percpu_map_update_elem(map, key, value, map_flags,
						 false);
}

/* Called from syscall or from eBPF program */
static int htab_map_delete_elem(struct bpf_map *map, void *key)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct hlist_head *head;
	struct bucket *b;
	struct htab_elem *l;
	unsigned long flags;
	u32 hash, key_size;
	int ret = -ENOENT;

	WARN_ON_ONCE(!rcu_read_lock_held());

	key_size = map->key_size;

	hash = htab_map_hash(key, key_size);
	b = __select_bucket(htab, hash);
	head = &b->head;

	raw_spin_lock_irqsave(&b->lock, flags);

	l = lookup_elem_raw(head, hash, key, key_size);

	if (l) {
		hlist_del_rcu(&l->hash_node);
		free_htab_elem(htab, l);
		ret = 0;
	}

	raw_spin_unlock_irqrestore(&b->lock, flags);
	return ret;
}

static int htab_lru_map_delete_elem(struct bpf_map *map, void *key)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct hlist_head *head;
	struct bucket *b;
	struct htab_elem *l;
	unsigned long flags;
	u32 hash, key_size;
	int ret = -ENOENT;

	WARN_ON_ONCE(!rcu_read_lock_held());

	key_size = map->key_size;

	hash = htab_map_hash(key, key_size);
	b = __select_bucket(htab, hash);
	head = &b->head;

	raw_spin_lock_irqsave(&b->lock, flags);

	l = lookup_elem_raw(head, hash, key, key_size);

	if (l) {
		hlist_del_rcu(&l->hash_node);
		ret = 0;
	}

	raw_spin_unlock_irqrestore(&b->lock, flags);
	if (l)
		bpf_lru_push_free(&htab->lru, &l->lru_node);
	return ret;
}

static void delete_all_elements(struct bpf_htab *htab)
{
	int i;

	for (i = 0; i < htab->n_buckets; i++) {
		struct hlist_head *head = select_bucket(htab, i);
		struct hlist_node *n;
		struct htab_elem *l;

		hlist_for_each_entry_safe(l, n, head, hash_node) {
			hlist_del_rcu(&l->hash_node);
			if (l->state != HTAB_EXTRA_ELEM_USED)
				htab_elem_free(htab, l);
		}
	}
}
/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
static void htab_map_free(struct bpf_map *map)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);

	/* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
	 * so the programs (can be more than one that used this map) were
	 * disconnected from events. Wait for outstanding critical sections in
	 * these programs to complete
	 */
	synchronize_rcu();

	/* some of free_htab_elem() callbacks for elements of this map may
	 * not have executed. Wait for them.
	 */
	rcu_barrier();
	if (htab->map.map_flags & BPF_F_NO_PREALLOC)
		delete_all_elements(htab);
	else
		prealloc_destroy(htab);

	free_percpu(htab->extra_elems);
	bpf_map_area_free(htab->buckets);
	kfree(htab);
}

static const struct bpf_map_ops htab_ops = {
	.map_alloc = htab_map_alloc,
	.map_free = htab_map_free,
	.map_get_next_key = htab_map_get_next_key,
	.map_lookup_elem = htab_map_lookup_elem,
	.map_update_elem = htab_map_update_elem,
	.map_delete_elem = htab_map_delete_elem,
};

static struct bpf_map_type_list htab_type __ro_after_init = {
	.ops = &htab_ops,
	.type = BPF_MAP_TYPE_HASH,
};

static const struct bpf_map_ops htab_lru_ops = {
	.map_alloc = htab_map_alloc,
	.map_free = htab_map_free,
	.map_get_next_key = htab_map_get_next_key,
	.map_lookup_elem = htab_lru_map_lookup_elem,
	.map_update_elem = htab_lru_map_update_elem,
	.map_delete_elem = htab_lru_map_delete_elem,
};

static struct bpf_map_type_list htab_lru_type __ro_after_init = {
	.ops = &htab_lru_ops,
	.type = BPF_MAP_TYPE_LRU_HASH,
};

/* Called from eBPF program */
static void *htab_percpu_map_lookup_elem(struct bpf_map *map, void *key)
{
	struct htab_elem *l = __htab_map_lookup_elem(map, key);

	if (l)
		return this_cpu_ptr(htab_elem_get_ptr(l, map->key_size));
	else
		return NULL;
}

static void *htab_lru_percpu_map_lookup_elem(struct bpf_map *map, void *key)
{
	struct htab_elem *l = __htab_map_lookup_elem(map, key);

	if (l) {
		bpf_lru_node_set_ref(&l->lru_node);
		return this_cpu_ptr(htab_elem_get_ptr(l, map->key_size));
	}

	return NULL;
}

int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct htab_elem *l;
	void __percpu *pptr;
	int ret = -ENOENT;
	int cpu, off = 0;
	u32 size;

	/* per_cpu areas are zero-filled and bpf programs can only
	 * access 'value_size' of them, so copying rounded areas
	 * will not leak any kernel data
	 */
	size = round_up(map->value_size, 8);
	rcu_read_lock();
	l = __htab_map_lookup_elem(map, key);
	if (!l)
		goto out;
	if (htab_is_lru(htab))
		bpf_lru_node_set_ref(&l->lru_node);
	pptr = htab_elem_get_ptr(l, map->key_size);
	for_each_possible_cpu(cpu) {
		bpf_long_memcpy(value + off,
				per_cpu_ptr(pptr, cpu), size);
		off += size;
	}
	ret = 0;
out:
	rcu_read_unlock();
	return ret;
}

int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value,
			   u64 map_flags)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	int ret;

	rcu_read_lock();
	if (htab_is_lru(htab))
		ret = __htab_lru_percpu_map_update_elem(map, key, value,
							map_flags, true);
	else
		ret = __htab_percpu_map_update_elem(map, key, value, map_flags,
						    true);
	rcu_read_unlock();

	return ret;
}

static const struct bpf_map_ops htab_percpu_ops = {
	.map_alloc = htab_map_alloc,
	.map_free = htab_map_free,
	.map_get_next_key = htab_map_get_next_key,
	.map_lookup_elem = htab_percpu_map_lookup_elem,
	.map_update_elem = htab_percpu_map_update_elem,
	.map_delete_elem = htab_map_delete_elem,
};

static struct bpf_map_type_list htab_percpu_type __ro_after_init = {
	.ops = &htab_percpu_ops,
	.type = BPF_MAP_TYPE_PERCPU_HASH,
};

static const struct bpf_map_ops htab_lru_percpu_ops = {
	.map_alloc = htab_map_alloc,
	.map_free = htab_map_free,
	.map_get_next_key = htab_map_get_next_key,
	.map_lookup_elem = htab_lru_percpu_map_lookup_elem,
	.map_update_elem = htab_lru_percpu_map_update_elem,
	.map_delete_elem = htab_lru_map_delete_elem,
};

static struct bpf_map_type_list htab_lru_percpu_type __ro_after_init = {
	.ops = &htab_lru_percpu_ops,
	.type = BPF_MAP_TYPE_LRU_PERCPU_HASH,
};

static int __init register_htab_map(void)
{
	bpf_register_map_type(&htab_type);
	bpf_register_map_type(&htab_percpu_type);
	bpf_register_map_type(&htab_lru_type);
	bpf_register_map_type(&htab_lru_percpu_type);
	return 0;
}
late_initcall(register_htab_map);