/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
 * Copyright (c) 2016 Facebook
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of version 2 of the GNU General Public
 * License as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 */
#include <linux/bpf.h>
#include <linux/jhash.h>
#include <linux/filter.h>
#include <linux/vmalloc.h>
#include "percpu_freelist.h"
#include "bpf_lru_list.h"

struct bucket {
        struct hlist_head head;
        raw_spinlock_t lock;
};

struct bpf_htab {
        struct bpf_map map;
        struct bucket *buckets;
        void *elems;
        union {
                struct pcpu_freelist freelist;
                struct bpf_lru lru;
        };
        void __percpu *extra_elems;
        atomic_t count; /* number of elements in this hashtable */
        u32 n_buckets;  /* number of hash buckets */
        u32 elem_size;  /* size of each element in bytes */
};

enum extra_elem_state {
        HTAB_NOT_AN_EXTRA_ELEM = 0,
        HTAB_EXTRA_ELEM_FREE,
        HTAB_EXTRA_ELEM_USED
};

/* each htab element is struct htab_elem + key + value */
struct htab_elem {
        union {
                struct hlist_node hash_node;
                struct bpf_htab *htab;
                struct pcpu_freelist_node fnode;
        };
        union {
                struct rcu_head rcu;
                enum extra_elem_state state;
                struct bpf_lru_node lru_node;
        };
        u32 hash;
        char key[0] __aligned(8);
};

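/* In-memory layout of one element (descriptive note, derived from the
 * elem_size calculation in htab_map_alloc()):
 *
 *   struct htab_elem | key (key_size, padded to 8 bytes) | value area
 *
 * For BPF_MAP_TYPE_PERCPU_HASH the value area holds only a pointer to the
 * per-CPU value storage; htab_elem_set_ptr()/htab_elem_get_ptr() below
 * access that slot. For the other hash map types the value is stored
 * inline, rounded up to 8 bytes.
 */
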
static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node);

static bool htab_is_lru(const struct bpf_htab *htab)
{
        return htab->map.map_type == BPF_MAP_TYPE_LRU_HASH;
}

static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size,
                                     void __percpu *pptr)
{
        *(void __percpu **)(l->key + key_size) = pptr;
}

static inline void __percpu *htab_elem_get_ptr(struct htab_elem *l, u32 key_size)
{
        return *(void __percpu **)(l->key + key_size);
}

static struct htab_elem *get_htab_elem(struct bpf_htab *htab, int i)
{
        return (struct htab_elem *) (htab->elems + i * htab->elem_size);
}

static void htab_free_elems(struct bpf_htab *htab)
{
        int i;

        if (htab->map.map_type != BPF_MAP_TYPE_PERCPU_HASH)
                goto free_elems;

        for (i = 0; i < htab->map.max_entries; i++) {
                void __percpu *pptr;

                pptr = htab_elem_get_ptr(get_htab_elem(htab, i),
                                         htab->map.key_size);
                free_percpu(pptr);
        }
free_elems:
        vfree(htab->elems);
}

static struct htab_elem *prealloc_lru_pop(struct bpf_htab *htab, void *key,
                                          u32 hash)
{
        struct bpf_lru_node *node = bpf_lru_pop_free(&htab->lru, hash);
        struct htab_elem *l;

        if (node) {
                l = container_of(node, struct htab_elem, lru_node);
                memcpy(l->key, key, htab->map.key_size);
                return l;
        }

        return NULL;
}

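/* Note: bpf_lru_pop_free() may have to evict a least recently used element
 * to produce a free node; in that case the LRU core calls back into
 * htab_lru_map_delete_node() below, which takes the bucket lock of the
 * evicted element.
 */
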
static int prealloc_init(struct bpf_htab *htab)
{
        int err = -ENOMEM, i;

        htab->elems = vzalloc(htab->elem_size * htab->map.max_entries);
        if (!htab->elems)
                return -ENOMEM;

        if (htab->map.map_type != BPF_MAP_TYPE_PERCPU_HASH)
                goto skip_percpu_elems;

        for (i = 0; i < htab->map.max_entries; i++) {
                u32 size = round_up(htab->map.value_size, 8);
                void __percpu *pptr;

                pptr = __alloc_percpu_gfp(size, 8, GFP_USER | __GFP_NOWARN);
                if (!pptr)
                        goto free_elems;
                htab_elem_set_ptr(get_htab_elem(htab, i), htab->map.key_size,
                                  pptr);
        }

skip_percpu_elems:
        if (htab_is_lru(htab))
                err = bpf_lru_init(&htab->lru,
                                   htab->map.map_flags & BPF_F_NO_COMMON_LRU,
                                   offsetof(struct htab_elem, hash) -
                                   offsetof(struct htab_elem, lru_node),
                                   htab_lru_map_delete_node,
                                   htab);
        else
                err = pcpu_freelist_init(&htab->freelist);

        if (err)
                goto free_elems;

        if (htab_is_lru(htab))
                bpf_lru_populate(&htab->lru, htab->elems,
                                 offsetof(struct htab_elem, lru_node),
                                 htab->elem_size, htab->map.max_entries);
        else
                pcpu_freelist_populate(&htab->freelist, htab->elems,
                                       htab->elem_size, htab->map.max_entries);

        return 0;

free_elems:
        htab_free_elems(htab);
        return err;
}

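/* Summary: with preallocation (the default), every element is carved out of
 * one vzalloc'd array and handed either to the per-CPU freelist (regular and
 * per-CPU hash maps) or to the LRU lists (LRU hash maps), so map updates do
 * not need to allocate memory at run time.
 */
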
static void prealloc_destroy(struct bpf_htab *htab)
{
        htab_free_elems(htab);

        if (htab_is_lru(htab))
                bpf_lru_destroy(&htab->lru);
        else
                pcpu_freelist_destroy(&htab->freelist);
}

static int alloc_extra_elems(struct bpf_htab *htab)
{
        void __percpu *pptr;
        int cpu;

        pptr = __alloc_percpu_gfp(htab->elem_size, 8, GFP_USER | __GFP_NOWARN);
        if (!pptr)
                return -ENOMEM;

        for_each_possible_cpu(cpu) {
                ((struct htab_elem *)per_cpu_ptr(pptr, cpu))->state =
                        HTAB_EXTRA_ELEM_FREE;
        }
        htab->extra_elems = pptr;
        return 0;
}

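/* The extra elements allocated above are one spare htab_elem per possible
 * CPU. alloc_htab_elem() falls back to the current CPU's spare element when
 * an existing key is being updated while the map is already full, so such an
 * update can still succeed.
 */
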
/* Called from syscall */
static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
{
        bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_HASH;
        bool lru = attr->map_type == BPF_MAP_TYPE_LRU_HASH;
        /* percpu_lru means each cpu has its own LRU list.
         * It is different from BPF_MAP_TYPE_PERCPU_HASH where
         * the map's value itself is percpu.  percpu_lru has
         * nothing to do with the map's value.
         */
        bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU);
        bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC);
        struct bpf_htab *htab;
        int err, i;
        u64 cost;

        if (lru && !capable(CAP_SYS_ADMIN))
                /* The LRU implementation is much more complicated than the
                 * other maps.  Hence, limit it to CAP_SYS_ADMIN for now.
                 */
                return ERR_PTR(-EPERM);

        if (attr->map_flags & ~(BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU))
                /* reserved bits should not be used */
                return ERR_PTR(-EINVAL);

        if (!lru && percpu_lru)
                return ERR_PTR(-EINVAL);

        if (lru && !prealloc)
                return ERR_PTR(-ENOTSUPP);

        htab = kzalloc(sizeof(*htab), GFP_USER);
        if (!htab)
                return ERR_PTR(-ENOMEM);

        /* mandatory map attributes */
        htab->map.map_type = attr->map_type;
        htab->map.key_size = attr->key_size;
        htab->map.value_size = attr->value_size;
        htab->map.max_entries = attr->max_entries;
        htab->map.map_flags = attr->map_flags;

        /* check sanity of attributes.
         * value_size == 0 may be allowed in the future to use map as a set
         */
        err = -EINVAL;
        if (htab->map.max_entries == 0 || htab->map.key_size == 0 ||
            htab->map.value_size == 0)
                goto free_htab;

        if (percpu_lru) {
                /* Ensure each CPU's LRU list has >= 1 element.
                 * While we are at it, give each LRU list the same
                 * number of elements.
                 */
                htab->map.max_entries = roundup(attr->max_entries,
                                                num_possible_cpus());
                if (htab->map.max_entries < attr->max_entries)
                        htab->map.max_entries = rounddown(attr->max_entries,
                                                          num_possible_cpus());
        }

        /* hash table size must be power of 2 */
        htab->n_buckets = roundup_pow_of_two(htab->map.max_entries);

        err = -E2BIG;
        if (htab->map.key_size > MAX_BPF_STACK)
                /* eBPF programs initialize keys on stack, so they cannot be
                 * larger than max stack size
                 */
                goto free_htab;

        if (htab->map.value_size >= (1 << (KMALLOC_SHIFT_MAX - 1)) -
            MAX_BPF_STACK - sizeof(struct htab_elem))
                /* if value_size is bigger, the user space won't be able to
                 * access the elements via bpf syscall. This check also makes
                 * sure that the elem_size doesn't overflow and it's
                 * kmalloc-able later in htab_map_update_elem()
                 */
                goto free_htab;

        if (percpu && round_up(htab->map.value_size, 8) > PCPU_MIN_UNIT_SIZE)
                /* make sure the size for pcpu_alloc() is reasonable */
                goto free_htab;

        htab->elem_size = sizeof(struct htab_elem) +
                          round_up(htab->map.key_size, 8);
        if (percpu)
                htab->elem_size += sizeof(void *);
        else
                htab->elem_size += round_up(htab->map.value_size, 8);

        /* prevent zero size kmalloc and check for u32 overflow */
        if (htab->n_buckets == 0 ||
            htab->n_buckets > U32_MAX / sizeof(struct bucket))
                goto free_htab;

        cost = (u64) htab->n_buckets * sizeof(struct bucket) +
               (u64) htab->elem_size * htab->map.max_entries;

        if (percpu)
                cost += (u64) round_up(htab->map.value_size, 8) *
                        num_possible_cpus() * htab->map.max_entries;
        else
                cost += (u64) htab->elem_size * num_possible_cpus();

        if (cost >= U32_MAX - PAGE_SIZE)
                /* make sure page count doesn't overflow */
                goto free_htab;

        htab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;

        /* if map size is larger than memlock limit, reject it early */
        err = bpf_map_precharge_memlock(htab->map.pages);
        if (err)
                goto free_htab;

        err = -ENOMEM;
        htab->buckets = kmalloc_array(htab->n_buckets, sizeof(struct bucket),
                                      GFP_USER | __GFP_NOWARN);

        if (!htab->buckets) {
                htab->buckets = vmalloc(htab->n_buckets * sizeof(struct bucket));
                if (!htab->buckets)
                        goto free_htab;
        }

        for (i = 0; i < htab->n_buckets; i++) {
                INIT_HLIST_HEAD(&htab->buckets[i].head);
                raw_spin_lock_init(&htab->buckets[i].lock);
        }

        if (!percpu && !lru) {
                /* The LRU map itself can remove the least recently used
                 * element, so there is no need for an extra elem during
                 * map_update.
                 */
                err = alloc_extra_elems(htab);
                if (err)
                        goto free_buckets;
        }

        if (prealloc) {
                err = prealloc_init(htab);
                if (err)
                        goto free_extra_elems;
        }

        return &htab->map;

free_extra_elems:
        free_percpu(htab->extra_elems);
free_buckets:
        kvfree(htab->buckets);
free_htab:
        kfree(htab);
        return ERR_PTR(err);
}

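/* Worked sizing example (illustrative only): creating an LRU hash with
 * max_entries = 1000 and BPF_F_NO_COMMON_LRU on a system with 16 possible
 * CPUs rounds max_entries up to 1008, so each per-CPU LRU list owns 63
 * elements; n_buckets then becomes the next power of two, 1024.
 */
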
static inline u32 htab_map_hash(const void *key, u32 key_len)
{
        return jhash(key, key_len, 0);
}

static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash)
{
        return &htab->buckets[hash & (htab->n_buckets - 1)];
}

static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash)
{
        return &__select_bucket(htab, hash)->head;
}

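/* Keys are hashed with jhash (seed 0) and a bucket is picked by masking the
 * hash with n_buckets - 1, which is valid because n_buckets is always rounded
 * up to a power of two in htab_map_alloc().
 */
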
static struct htab_elem *lookup_elem_raw(struct hlist_head *head, u32 hash,
                                         void *key, u32 key_size)
{
        struct htab_elem *l;

        hlist_for_each_entry_rcu(l, head, hash_node)
                if (l->hash == hash && !memcmp(&l->key, key, key_size))
                        return l;

        return NULL;
}

/* Called from syscall or from eBPF program */
static void *__htab_map_lookup_elem(struct bpf_map *map, void *key)
{
        struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
        struct hlist_head *head;
        struct htab_elem *l;
        u32 hash, key_size;

        /* Must be called with rcu_read_lock. */
        WARN_ON_ONCE(!rcu_read_lock_held());

        key_size = map->key_size;

        hash = htab_map_hash(key, key_size);

        head = select_bucket(htab, hash);

        l = lookup_elem_raw(head, hash, key, key_size);

        return l;
}

static void *htab_map_lookup_elem(struct bpf_map *map, void *key)
{
        struct htab_elem *l = __htab_map_lookup_elem(map, key);

        if (l)
                return l->key + round_up(map->key_size, 8);

        return NULL;
}

static void *htab_lru_map_lookup_elem(struct bpf_map *map, void *key)
{
        struct htab_elem *l = __htab_map_lookup_elem(map, key);

        if (l) {
                bpf_lru_node_set_ref(&l->lru_node);
                return l->key + round_up(map->key_size, 8);
        }

        return NULL;
}

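/* The LRU variant differs from the plain lookup only in that it sets the
 * reference bit of the element's LRU node, so a successful lookup counts as
 * "recently used" and makes the element less likely to be evicted.
 */
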
/* It is called from the bpf_lru_list when the LRU needs to delete
 * older elements from the htab.
 */
static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node)
{
        struct bpf_htab *htab = (struct bpf_htab *)arg;
        struct htab_elem *l, *tgt_l;
        struct hlist_head *head;
        unsigned long flags;
        struct bucket *b;

        tgt_l = container_of(node, struct htab_elem, lru_node);
        b = __select_bucket(htab, tgt_l->hash);
        head = &b->head;

        raw_spin_lock_irqsave(&b->lock, flags);

        hlist_for_each_entry_rcu(l, head, hash_node)
                if (l == tgt_l) {
                        hlist_del_rcu(&l->hash_node);
                        break;
                }

        raw_spin_unlock_irqrestore(&b->lock, flags);

        return l == tgt_l;
}

/* Called from syscall */
static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
{
        struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
        struct hlist_head *head;
        struct htab_elem *l, *next_l;
        u32 hash, key_size;
        int i;

        WARN_ON_ONCE(!rcu_read_lock_held());

        key_size = map->key_size;

        hash = htab_map_hash(key, key_size);

        head = select_bucket(htab, hash);

        /* lookup the key */
        l = lookup_elem_raw(head, hash, key, key_size);

        if (!l) {
                i = 0;
                goto find_first_elem;
        }

        /* key was found, get next key in the same bucket */
        next_l = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(&l->hash_node)),
                                  struct htab_elem, hash_node);

        if (next_l) {
                /* if next elem in this hash list is non-zero, just return it */
                memcpy(next_key, next_l->key, key_size);
                return 0;
        }

        /* no more elements in this hash list, go to the next bucket */
        i = hash & (htab->n_buckets - 1);
        i++;

find_first_elem:
        /* iterate over buckets */
        for (; i < htab->n_buckets; i++) {
                head = select_bucket(htab, i);

                /* pick first element in the bucket */
                next_l = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)),
                                          struct htab_elem, hash_node);
                if (next_l) {
                        /* if it's not empty, just return it */
                        memcpy(next_key, next_l->key, key_size);
                        return 0;
                }
        }

        /* iterated over all buckets and all elements */
        return -ENOENT;
}

static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l)
{
        if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH)
                free_percpu(htab_elem_get_ptr(l, htab->map.key_size));
        kfree(l);
}

static void htab_elem_free_rcu(struct rcu_head *head)
{
        struct htab_elem *l = container_of(head, struct htab_elem, rcu);
        struct bpf_htab *htab = l->htab;

        /* must increment bpf_prog_active to avoid kprobe+bpf triggering while
         * we're calling kfree, otherwise deadlock is possible if kprobes
         * are placed somewhere inside of slub
         */
        preempt_disable();
        __this_cpu_inc(bpf_prog_active);
        htab_elem_free(htab, l);
        __this_cpu_dec(bpf_prog_active);
        preempt_enable();
}

static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
{
        if (l->state == HTAB_EXTRA_ELEM_USED) {
                l->state = HTAB_EXTRA_ELEM_FREE;
                return;
        }

        if (!(htab->map.map_flags & BPF_F_NO_PREALLOC)) {
                pcpu_freelist_push(&htab->freelist, &l->fnode);
        } else {
                atomic_dec(&htab->count);
                l->htab = htab;
                call_rcu(&l->rcu, htab_elem_free_rcu);
        }
}

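/* free_htab_elem() therefore has three paths: a per-CPU extra element is
 * simply marked free again, a preallocated element is pushed back onto the
 * per-CPU freelist, and a dynamically allocated element is freed via
 * call_rcu() so concurrent RCU readers can finish with it first.
 */
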
static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr,
                            void *value, bool onallcpus)
{
        if (!onallcpus) {
                /* copy true value_size bytes */
                memcpy(this_cpu_ptr(pptr), value, htab->map.value_size);
        } else {
                u32 size = round_up(htab->map.value_size, 8);
                int off = 0, cpu;

                for_each_possible_cpu(cpu) {
                        bpf_long_memcpy(per_cpu_ptr(pptr, cpu),
                                        value + off, size);
                        off += size;
                }
        }
}

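/* When onallcpus is set (updates coming from the syscall path), the user
 * buffer is laid out as one value per possible CPU, each slot padded to
 * round_up(value_size, 8) bytes; bpf_percpu_hash_copy() below reads the
 * values back out in the same layout.
 */
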
static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
                                         void *value, u32 key_size, u32 hash,
                                         bool percpu, bool onallcpus,
                                         bool old_elem_exists)
{
        u32 size = htab->map.value_size;
        bool prealloc = !(htab->map.map_flags & BPF_F_NO_PREALLOC);
        struct htab_elem *l_new;
        void __percpu *pptr;
        int err = 0;

        if (prealloc) {
                l_new = (struct htab_elem *)pcpu_freelist_pop(&htab->freelist);
                if (!l_new)
                        err = -E2BIG;
        } else {
                if (atomic_inc_return(&htab->count) > htab->map.max_entries) {
                        atomic_dec(&htab->count);
                        err = -E2BIG;
                } else {
                        l_new = kmalloc(htab->elem_size,
                                        GFP_ATOMIC | __GFP_NOWARN);
                        if (!l_new)
                                return ERR_PTR(-ENOMEM);
                }
        }

        if (err) {
                if (!old_elem_exists)
                        return ERR_PTR(err);

                /* if we're updating the existing element and the hash table
                 * is full, use per-cpu extra elems
                 */
                l_new = this_cpu_ptr(htab->extra_elems);
                if (l_new->state != HTAB_EXTRA_ELEM_FREE)
                        return ERR_PTR(-E2BIG);
                l_new->state = HTAB_EXTRA_ELEM_USED;
        } else {
                l_new->state = HTAB_NOT_AN_EXTRA_ELEM;
        }

        memcpy(l_new->key, key, key_size);
        if (percpu) {
                /* round up value_size to 8 bytes */
                size = round_up(size, 8);

                if (prealloc) {
                        pptr = htab_elem_get_ptr(l_new, key_size);
                } else {
                        /* alloc_percpu zero-fills */
                        pptr = __alloc_percpu_gfp(size, 8,
                                                  GFP_ATOMIC | __GFP_NOWARN);
                        if (!pptr) {
                                kfree(l_new);
                                return ERR_PTR(-ENOMEM);
                        }
                }

                pcpu_copy_value(htab, pptr, value, onallcpus);

                if (!prealloc)
                        htab_elem_set_ptr(l_new, key_size, pptr);
        } else {
                memcpy(l_new->key + round_up(key_size, 8), value, size);
        }

        l_new->hash = hash;
        return l_new;
}

static int check_flags(struct bpf_htab *htab, struct htab_elem *l_old,
                       u64 map_flags)
{
        if (l_old && map_flags == BPF_NOEXIST)
                /* elem already exists */
                return -EEXIST;

        if (!l_old && map_flags == BPF_EXIST)
                /* elem doesn't exist, cannot update it */
                return -ENOENT;

        return 0;
}

/* Called from syscall or from eBPF program */
static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
                                u64 map_flags)
{
        struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
        struct htab_elem *l_new = NULL, *l_old;
        struct hlist_head *head;
        unsigned long flags;
        struct bucket *b;
        u32 key_size, hash;
        int ret;

        if (unlikely(map_flags > BPF_EXIST))
                /* unknown flags */
                return -EINVAL;

        WARN_ON_ONCE(!rcu_read_lock_held());

        key_size = map->key_size;

        hash = htab_map_hash(key, key_size);

        b = __select_bucket(htab, hash);
        head = &b->head;

        /* bpf_map_update_elem() can be called in_irq() */
        raw_spin_lock_irqsave(&b->lock, flags);

        l_old = lookup_elem_raw(head, hash, key, key_size);

        ret = check_flags(htab, l_old, map_flags);
        if (ret)
                goto err;

        l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false,
                                !!l_old);
        if (IS_ERR(l_new)) {
                /* all pre-allocated elements are in use or memory exhausted */
                ret = PTR_ERR(l_new);
                goto err;
        }

        /* add new element to the head of the list, so that
         * concurrent search will find it before old elem
         */
        hlist_add_head_rcu(&l_new->hash_node, head);
        if (l_old) {
                hlist_del_rcu(&l_old->hash_node);
                free_htab_elem(htab, l_old);
        }
        ret = 0;
err:
        raw_spin_unlock_irqrestore(&b->lock, flags);
        return ret;
}

static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value,
                                    u64 map_flags)
{
        struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
        struct htab_elem *l_new, *l_old = NULL;
        struct hlist_head *head;
        unsigned long flags;
        struct bucket *b;
        u32 key_size, hash;
        int ret;

        if (unlikely(map_flags > BPF_EXIST))
                /* unknown flags */
                return -EINVAL;

        WARN_ON_ONCE(!rcu_read_lock_held());

        key_size = map->key_size;

        hash = htab_map_hash(key, key_size);

        b = __select_bucket(htab, hash);
        head = &b->head;

        /* For LRU, we need to alloc before taking bucket's
         * spinlock because getting free nodes from LRU may need
         * to remove older elements from htab and this removal
         * operation will need a bucket lock.
         */
        l_new = prealloc_lru_pop(htab, key, hash);
        if (!l_new)
                return -ENOMEM;
        memcpy(l_new->key + round_up(map->key_size, 8), value, map->value_size);

        /* bpf_map_update_elem() can be called in_irq() */
        raw_spin_lock_irqsave(&b->lock, flags);

        l_old = lookup_elem_raw(head, hash, key, key_size);

        ret = check_flags(htab, l_old, map_flags);
        if (ret)
                goto err;

        /* add new element to the head of the list, so that
         * concurrent search will find it before old elem
         */
        hlist_add_head_rcu(&l_new->hash_node, head);
        if (l_old) {
                bpf_lru_node_set_ref(&l_new->lru_node);
                hlist_del_rcu(&l_old->hash_node);
        }
        ret = 0;

err:
        raw_spin_unlock_irqrestore(&b->lock, flags);

        if (ret)
                bpf_lru_push_free(&htab->lru, &l_new->lru_node);
        else if (l_old)
                bpf_lru_push_free(&htab->lru, &l_old->lru_node);

        return ret;
}

static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
                                         void *value, u64 map_flags,
                                         bool onallcpus)
{
        struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
        struct htab_elem *l_new = NULL, *l_old;
        struct hlist_head *head;
        unsigned long flags;
        struct bucket *b;
        u32 key_size, hash;
        int ret;

        if (unlikely(map_flags > BPF_EXIST))
                /* unknown flags */
                return -EINVAL;

        WARN_ON_ONCE(!rcu_read_lock_held());

        key_size = map->key_size;

        hash = htab_map_hash(key, key_size);

        b = __select_bucket(htab, hash);
        head = &b->head;

        /* bpf_map_update_elem() can be called in_irq() */
        raw_spin_lock_irqsave(&b->lock, flags);

        l_old = lookup_elem_raw(head, hash, key, key_size);

        ret = check_flags(htab, l_old, map_flags);
        if (ret)
                goto err;

        if (l_old) {
                /* per-cpu hash map can update value in-place */
                pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size),
                                value, onallcpus);
        } else {
                l_new = alloc_htab_elem(htab, key, value, key_size,
                                        hash, true, onallcpus, false);
                if (IS_ERR(l_new)) {
                        ret = PTR_ERR(l_new);
                        goto err;
                }
                hlist_add_head_rcu(&l_new->hash_node, head);
        }
        ret = 0;
err:
        raw_spin_unlock_irqrestore(&b->lock, flags);
        return ret;
}

static int htab_percpu_map_update_elem(struct bpf_map *map, void *key,
                                       void *value, u64 map_flags)
{
        return __htab_percpu_map_update_elem(map, key, value, map_flags, false);
}

/* Called from syscall or from eBPF program */
static int htab_map_delete_elem(struct bpf_map *map, void *key)
{
        struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
        struct hlist_head *head;
        struct bucket *b;
        struct htab_elem *l;
        unsigned long flags;
        u32 hash, key_size;
        int ret = -ENOENT;

        WARN_ON_ONCE(!rcu_read_lock_held());

        key_size = map->key_size;

        hash = htab_map_hash(key, key_size);
        b = __select_bucket(htab, hash);
        head = &b->head;

        raw_spin_lock_irqsave(&b->lock, flags);

        l = lookup_elem_raw(head, hash, key, key_size);

        if (l) {
                hlist_del_rcu(&l->hash_node);
                free_htab_elem(htab, l);
                ret = 0;
        }

        raw_spin_unlock_irqrestore(&b->lock, flags);
        return ret;
}

static int htab_lru_map_delete_elem(struct bpf_map *map, void *key)
{
        struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
        struct hlist_head *head;
        struct bucket *b;
        struct htab_elem *l;
        unsigned long flags;
        u32 hash, key_size;
        int ret = -ENOENT;

        WARN_ON_ONCE(!rcu_read_lock_held());

        key_size = map->key_size;

        hash = htab_map_hash(key, key_size);
        b = __select_bucket(htab, hash);
        head = &b->head;

        raw_spin_lock_irqsave(&b->lock, flags);

        l = lookup_elem_raw(head, hash, key, key_size);

        if (l) {
                hlist_del_rcu(&l->hash_node);
                ret = 0;
        }

        raw_spin_unlock_irqrestore(&b->lock, flags);
        if (l)
                bpf_lru_push_free(&htab->lru, &l->lru_node);
        return ret;
}

static void delete_all_elements(struct bpf_htab *htab)
{
        int i;

        for (i = 0; i < htab->n_buckets; i++) {
                struct hlist_head *head = select_bucket(htab, i);
                struct hlist_node *n;
                struct htab_elem *l;

                hlist_for_each_entry_safe(l, n, head, hash_node) {
                        hlist_del_rcu(&l->hash_node);
                        if (l->state != HTAB_EXTRA_ELEM_USED)
                                htab_elem_free(htab, l);
                }
        }
}

/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
static void htab_map_free(struct bpf_map *map)
{
        struct bpf_htab *htab = container_of(map, struct bpf_htab, map);

        /* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
         * so the programs (can be more than one that used this map) were
         * disconnected from events. Wait for outstanding critical sections in
         * these programs to complete
         */
        synchronize_rcu();

        /* some of free_htab_elem() callbacks for elements of this map may
         * not have executed. Wait for them.
         */
        rcu_barrier();
        if (htab->map.map_flags & BPF_F_NO_PREALLOC)
                delete_all_elements(htab);
        else
                prealloc_destroy(htab);

        free_percpu(htab->extra_elems);
        kvfree(htab->buckets);
        kfree(htab);
}

static const struct bpf_map_ops htab_ops = {
        .map_alloc = htab_map_alloc,
        .map_free = htab_map_free,
        .map_get_next_key = htab_map_get_next_key,
        .map_lookup_elem = htab_map_lookup_elem,
        .map_update_elem = htab_map_update_elem,
        .map_delete_elem = htab_map_delete_elem,
};

static struct bpf_map_type_list htab_type __read_mostly = {
        .ops = &htab_ops,
        .type = BPF_MAP_TYPE_HASH,
};

static const struct bpf_map_ops htab_lru_ops = {
        .map_alloc = htab_map_alloc,
        .map_free = htab_map_free,
        .map_get_next_key = htab_map_get_next_key,
        .map_lookup_elem = htab_lru_map_lookup_elem,
        .map_update_elem = htab_lru_map_update_elem,
        .map_delete_elem = htab_lru_map_delete_elem,
};

static struct bpf_map_type_list htab_lru_type __read_mostly = {
        .ops = &htab_lru_ops,
        .type = BPF_MAP_TYPE_LRU_HASH,
};

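/* Illustrative only (not part of this file): a minimal userspace sketch of
 * creating the LRU hash map type registered above via the bpf(2) syscall.
 * The raw syscall wrapper is an assumption; libbpf or a local helper would
 * normally be used instead.
 *
 *      union bpf_attr attr = {
 *              .map_type    = BPF_MAP_TYPE_LRU_HASH,
 *              .key_size    = sizeof(__u32),
 *              .value_size  = sizeof(__u64),
 *              .max_entries = 1024,
 *              .map_flags   = BPF_F_NO_COMMON_LRU, // optional: per-CPU LRU lists
 *      };
 *      int map_fd = syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));
 *
 * Creating this map type requires CAP_SYS_ADMIN, as enforced in
 * htab_map_alloc() above.
 */
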
/* Called from eBPF program */
static void *htab_percpu_map_lookup_elem(struct bpf_map *map, void *key)
{
        struct htab_elem *l = __htab_map_lookup_elem(map, key);

        if (l)
                return this_cpu_ptr(htab_elem_get_ptr(l, map->key_size));
        else
                return NULL;
}

int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value)
{
        struct htab_elem *l;
        void __percpu *pptr;
        int ret = -ENOENT;
        int cpu, off = 0;
        u32 size;

        /* per_cpu areas are zero-filled and bpf programs can only
         * access 'value_size' of them, so copying rounded areas
         * will not leak any kernel data
         */
        size = round_up(map->value_size, 8);
        rcu_read_lock();
        l = __htab_map_lookup_elem(map, key);
        if (!l)
                goto out;
        pptr = htab_elem_get_ptr(l, map->key_size);
        for_each_possible_cpu(cpu) {
                bpf_long_memcpy(value + off,
                                per_cpu_ptr(pptr, cpu), size);
                off += size;
        }
        ret = 0;
out:
        rcu_read_unlock();
        return ret;
}

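/* Callers of bpf_percpu_hash_copy() (the syscall-side lookup for per-CPU
 * hash maps) must supply a buffer of at least
 * round_up(value_size, 8) * num_possible_cpus() bytes, since one padded
 * value slot is written per possible CPU.
 */
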
int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value,
                           u64 map_flags)
{
        int ret;

        rcu_read_lock();
        ret = __htab_percpu_map_update_elem(map, key, value, map_flags, true);
        rcu_read_unlock();

        return ret;
}

static const struct bpf_map_ops htab_percpu_ops = {
        .map_alloc = htab_map_alloc,
        .map_free = htab_map_free,
        .map_get_next_key = htab_map_get_next_key,
        .map_lookup_elem = htab_percpu_map_lookup_elem,
        .map_update_elem = htab_percpu_map_update_elem,
        .map_delete_elem = htab_map_delete_elem,
};

static struct bpf_map_type_list htab_percpu_type __read_mostly = {
        .ops = &htab_percpu_ops,
        .type = BPF_MAP_TYPE_PERCPU_HASH,
};

static int __init register_htab_map(void)
{
        bpf_register_map_type(&htab_type);
        bpf_register_map_type(&htab_percpu_type);
        bpf_register_map_type(&htab_lru_type);
        return 0;
}
late_initcall(register_htab_map);