/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
 * Copyright (c) 2016,2017 Facebook
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of version 2 of the GNU General Public
 * License as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 */
#include <linux/bpf.h>
#include <linux/err.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/filter.h>
#include <linux/perf_event.h>

#include "map_in_map.h"

static void bpf_array_free_percpu(struct bpf_array *array)
{
	int i;

	for (i = 0; i < array->map.max_entries; i++)
		free_percpu(array->pptrs[i]);
}

static int bpf_array_alloc_percpu(struct bpf_array *array)
{
	void __percpu *ptr;
	int i;

	for (i = 0; i < array->map.max_entries; i++) {
		ptr = __alloc_percpu_gfp(array->elem_size, 8,
					 GFP_USER | __GFP_NOWARN);
		if (!ptr) {
			bpf_array_free_percpu(array);
			return -ENOMEM;
		}
		array->pptrs[i] = ptr;
	}

	return 0;
}

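/* Note: the 8-byte alignment requested from __alloc_percpu_gfp() above
 * pairs with elem_size == round_up(value_size, 8) used throughout this
 * file, so bpf_long_memcpy() in bpf_percpu_array_copy() and
 * bpf_percpu_array_update() can move each value in whole 8-byte chunks.
 */
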
/* Called from syscall */
static struct bpf_map *array_map_alloc(union bpf_attr *attr)
{
	bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY;
	int numa_node = bpf_map_attr_numa_node(attr);
	struct bpf_array *array;
	u64 array_size;
	u32 elem_size;

	/* check sanity of attributes */
	if (attr->max_entries == 0 || attr->key_size != 4 ||
	    attr->value_size == 0 || attr->map_flags & ~BPF_F_NUMA_NODE ||
	    (percpu && numa_node != NUMA_NO_NODE))
		return ERR_PTR(-EINVAL);

	if (attr->value_size > KMALLOC_MAX_SIZE)
		/* if value_size is bigger, the user space won't be able to
		 * access the elements.
		 */
		return ERR_PTR(-E2BIG);

	elem_size = round_up(attr->value_size, 8);

	array_size = sizeof(*array);
	if (percpu)
		array_size += (u64) attr->max_entries * sizeof(void *);
	else
		array_size += (u64) attr->max_entries * elem_size;

	/* make sure there is no u32 overflow later in round_up() */
	if (array_size >= U32_MAX - PAGE_SIZE)
		return ERR_PTR(-ENOMEM);

	/* allocate all map elements and zero-initialize them */
	array = bpf_map_area_alloc(array_size, numa_node);
	if (!array)
		return ERR_PTR(-ENOMEM);

	/* copy mandatory map attributes */
	array->map.map_type = attr->map_type;
	array->map.key_size = attr->key_size;
	array->map.value_size = attr->value_size;
	array->map.max_entries = attr->max_entries;
	array->map.map_flags = attr->map_flags;
	array->map.numa_node = numa_node;
	array->elem_size = elem_size;

	if (!percpu)
		goto out;

	array_size += (u64) attr->max_entries * elem_size * num_possible_cpus();

	if (array_size >= U32_MAX - PAGE_SIZE ||
	    bpf_array_alloc_percpu(array)) {
		bpf_map_area_free(array);
		return ERR_PTR(-ENOMEM);
	}
out:
	array->map.pages = round_up(array_size, PAGE_SIZE) >> PAGE_SHIFT;

	return &array->map;
}

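/* Resulting layout (illustrative sketch; value and pptrs share the tail
 * of struct bpf_array):
 *
 *	BPF_MAP_TYPE_ARRAY		BPF_MAP_TYPE_PERCPU_ARRAY
 *	+------------------+		+------------------+
 *	| struct bpf_array |		| struct bpf_array |
 *	+------------------+		+------------------+
 *	| value[0]         |		| pptrs[0]         | -> per-cpu area
 *	| value[1]         |		| pptrs[1]         | -> per-cpu area
 *	| ...              |		| ...              |
 *	+------------------+		+------------------+
 *
 * where each value slot is elem_size == round_up(value_size, 8) bytes
 * and the per-cpu areas come from bpf_array_alloc_percpu().
 */
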
/* Called from syscall or from eBPF program */
static void *array_map_lookup_elem(struct bpf_map *map, void *key)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	u32 index = *(u32 *)key;

	if (unlikely(index >= array->map.max_entries))
		return NULL;

	return array->value + array->elem_size * index;
}

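/* BPF-side usage sketch (hypothetical map "my_array" of type
 * BPF_MAP_TYPE_ARRAY with 8-byte values):
 *
 *	u32 key = 0;
 *	u64 *val = bpf_map_lookup_elem(&my_array, &key);
 *	if (val)
 *		__sync_fetch_and_add(val, 1);
 *
 * The returned pointer aliases the map's value area directly; no copy
 * is made, so stores through it are immediately visible to the syscall
 * side and to other programs sharing the map.
 */
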
/* emit BPF instructions equivalent to C code of array_map_lookup_elem() */
static u32 array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
{
	struct bpf_insn *insn = insn_buf;
	u32 elem_size = round_up(map->value_size, 8);
	const int ret = BPF_REG_0;
	const int map_ptr = BPF_REG_1;
	const int index = BPF_REG_2;

	*insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value));
	*insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0);
	*insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 3);

	if (is_power_of_2(elem_size)) {
		*insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(elem_size));
	} else {
		*insn++ = BPF_ALU64_IMM(BPF_MUL, ret, elem_size);
	}
	*insn++ = BPF_ALU64_REG(BPF_ADD, ret, map_ptr);
	*insn++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1);
	*insn++ = BPF_MOV64_IMM(ret, 0);
	return insn - insn_buf;
}

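/* The instructions emitted above correspond roughly to (r0 = ret,
 * r1 = map_ptr, r2 = pointer to the key):
 *
 *	r1 += offsetof(struct bpf_array, value)
 *	r0 = *(u32 *)(r2 + 0)         (load index)
 *	if r0 >= max_entries goto L1  (bounds check)
 *	r0 <<= ilog2(elem_size)       (or r0 *= elem_size)
 *	r0 += r1                      (&array->value[index])
 *	goto L2
 * L1:	r0 = 0                        (NULL on out-of-range index)
 * L2:
 *
 * letting the verifier inline array lookups instead of emitting a
 * helper call.
 */
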
/* Called from eBPF program */
static void *percpu_array_map_lookup_elem(struct bpf_map *map, void *key)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	u32 index = *(u32 *)key;

	if (unlikely(index >= array->map.max_entries))
		return NULL;

	return this_cpu_ptr(array->pptrs[index]);
}

int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	u32 index = *(u32 *)key;
	void __percpu *pptr;
	int cpu, off = 0;
	u32 size;

	if (unlikely(index >= array->map.max_entries))
		return -ENOENT;

	/* per_cpu areas are zero-filled and bpf programs can only
	 * access 'value_size' of them, so copying rounded areas
	 * will not leak any kernel data
	 */
	size = round_up(map->value_size, 8);
	rcu_read_lock();
	pptr = array->pptrs[index];
	for_each_possible_cpu(cpu) {
		bpf_long_memcpy(value + off, per_cpu_ptr(pptr, cpu), size);
		off += size;
	}
	rcu_read_unlock();
	return 0;
}

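/* Userspace sizing sketch: a syscall lookup on a per-cpu array must
 * supply a buffer of round_up(value_size, 8) * num_possible_cpus()
 * bytes, since the loop above writes 'size' bytes at offset 'off' for
 * every possible CPU. E.g. value_size == 4 on a 64-CPU machine needs
 * 8 * 64 == 512 bytes, even though each per-CPU value is only 4 bytes.
 */
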
/* Called from syscall */
static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	u32 index = key ? *(u32 *)key : U32_MAX;
	u32 *next = (u32 *)next_key;

	if (index >= array->map.max_entries) {
		*next = 0;
		return 0;
	}

	if (index == array->map.max_entries - 1)
		return -ENOENT;

	*next = index + 1;
	return 0;
}

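/* Userspace iteration sketch (using the usual bpf syscall wrappers,
 * e.g. libbpf's, not the kernel functions in this file):
 *
 *	err = bpf_map_get_next_key(map_fd, NULL, &key);  (yields index 0)
 *	while (!err) {
 *		bpf_map_lookup_elem(map_fd, &key, value);
 *		err = bpf_map_get_next_key(map_fd, &key, &key);
 *	}
 *
 * A NULL key maps to index U32_MAX above and therefore returns index 0;
 * the last index returns -ENOENT, terminating the walk.
 */
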
/* Called from syscall or from eBPF program */
static int array_map_update_elem(struct bpf_map *map, void *key, void *value,
				 u64 map_flags)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	u32 index = *(u32 *)key;

	if (unlikely(map_flags > BPF_EXIST))
		/* unknown flags */
		return -EINVAL;

	if (unlikely(index >= array->map.max_entries))
		/* all elements were pre-allocated, cannot insert a new one */
		return -E2BIG;

	if (unlikely(map_flags == BPF_NOEXIST))
		/* all elements already exist */
		return -EEXIST;

	if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
		memcpy(this_cpu_ptr(array->pptrs[index]),
		       value, map->value_size);
	else
		memcpy(array->value + array->elem_size * index,
		       value, map->value_size);
	return 0;
}

int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value,
			    u64 map_flags)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	u32 index = *(u32 *)key;
	void __percpu *pptr;
	int cpu, off = 0;
	u32 size;

	if (unlikely(map_flags > BPF_EXIST))
		/* unknown flags */
		return -EINVAL;

	if (unlikely(index >= array->map.max_entries))
		/* all elements were pre-allocated, cannot insert a new one */
		return -E2BIG;

	if (unlikely(map_flags == BPF_NOEXIST))
		/* all elements already exist */
		return -EEXIST;

	/* the user space will provide round_up(value_size, 8) bytes that
	 * will be copied into per-cpu area. bpf programs can only access
	 * value_size of it. During lookup the same extra bytes will be
	 * returned or zeros which were zero-filled by percpu_alloc,
	 * so no kernel data leaks possible
	 */
	size = round_up(map->value_size, 8);
	rcu_read_lock();
	pptr = array->pptrs[index];
	for_each_possible_cpu(cpu) {
		bpf_long_memcpy(per_cpu_ptr(pptr, cpu), value + off, size);
		off += size;
	}
	rcu_read_unlock();
	return 0;
}

/* Called from syscall or from eBPF program */
static int array_map_delete_elem(struct bpf_map *map, void *key)
{
	return -EINVAL;
}

/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
static void array_map_free(struct bpf_map *map)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);

	/* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
	 * so the programs (can be more than one that used this map) were
	 * disconnected from events. Wait for outstanding programs to complete
	 * and free the array
	 */
	synchronize_rcu();

	if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
		bpf_array_free_percpu(array);

	bpf_map_area_free(array);
}

const struct bpf_map_ops array_map_ops = {
	.map_alloc = array_map_alloc,
	.map_free = array_map_free,
	.map_get_next_key = array_map_get_next_key,
	.map_lookup_elem = array_map_lookup_elem,
	.map_update_elem = array_map_update_elem,
	.map_delete_elem = array_map_delete_elem,
	.map_gen_lookup = array_map_gen_lookup,
};

const struct bpf_map_ops percpu_array_map_ops = {
	.map_alloc = array_map_alloc,
	.map_free = array_map_free,
	.map_get_next_key = array_map_get_next_key,
	.map_lookup_elem = percpu_array_map_lookup_elem,
	.map_update_elem = array_map_update_elem,
	.map_delete_elem = array_map_delete_elem,
};

static struct bpf_map *fd_array_map_alloc(union bpf_attr *attr)
{
	/* only file descriptors can be stored in this type of map */
	if (attr->value_size != sizeof(u32))
		return ERR_PTR(-EINVAL);
	return array_map_alloc(attr);
}

static void fd_array_map_free(struct bpf_map *map)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	int i;

	synchronize_rcu();

	/* make sure it's empty */
	for (i = 0; i < array->map.max_entries; i++)
		BUG_ON(array->ptrs[i] != NULL);

	bpf_map_area_free(array);
}

static void *fd_array_map_lookup_elem(struct bpf_map *map, void *key)
{
	return ERR_PTR(-EOPNOTSUPP);
}

/* only called from syscall */
int bpf_fd_array_map_lookup_elem(struct bpf_map *map, void *key, u32 *value)
{
	void **elem, *ptr;
	int ret = 0;

	if (!map->ops->map_fd_sys_lookup_elem)
		return -ENOTSUPP;

	rcu_read_lock();
	elem = array_map_lookup_elem(map, key);
	if (elem && (ptr = READ_ONCE(*elem)))
		*value = map->ops->map_fd_sys_lookup_elem(ptr);
	else
		ret = -ENOENT;
	rcu_read_unlock();

	return ret;
}

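/* Note: the stored kernel pointer is never handed to userspace here;
 * map_fd_sys_lookup_elem() first converts it to a stable identifier.
 * For prog arrays that is the program id (see
 * prog_fd_array_sys_lookup_elem() below), which userspace can turn
 * back into an fd via BPF_PROG_GET_FD_BY_ID.
 */
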
/* only called from syscall */
int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file,
				 void *key, void *value, u64 map_flags)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	void *new_ptr, *old_ptr;
	u32 index = *(u32 *)key, ufd;

	if (map_flags != BPF_ANY)
		return -EINVAL;

	if (index >= array->map.max_entries)
		return -E2BIG;

	ufd = *(u32 *)value;
	new_ptr = map->ops->map_fd_get_ptr(map, map_file, ufd);
	if (IS_ERR(new_ptr))
		return PTR_ERR(new_ptr);

	old_ptr = xchg(array->ptrs + index, new_ptr);
	if (old_ptr)
		map->ops->map_fd_put_ptr(old_ptr);

	return 0;
}

static int fd_array_map_delete_elem(struct bpf_map *map, void *key)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	void *old_ptr;
	u32 index = *(u32 *)key;

	if (index >= array->map.max_entries)
		return -E2BIG;

	old_ptr = xchg(array->ptrs + index, NULL);
	if (old_ptr) {
		map->ops->map_fd_put_ptr(old_ptr);
		return 0;
	} else {
		return -ENOENT;
	}
}

static void *prog_fd_array_get_ptr(struct bpf_map *map,
				   struct file *map_file, int fd)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	struct bpf_prog *prog = bpf_prog_get(fd);

	if (IS_ERR(prog))
		return prog;

	if (!bpf_prog_array_compatible(array, prog)) {
		bpf_prog_put(prog);
		return ERR_PTR(-EINVAL);
	}

	return prog;
}

static void prog_fd_array_put_ptr(void *ptr)
{
	bpf_prog_put(ptr);
}

static u32 prog_fd_array_sys_lookup_elem(void *ptr)
{
	return ((struct bpf_prog *)ptr)->aux->id;
}

/* decrement refcnt of all bpf_progs that are stored in this map */
void bpf_fd_array_map_clear(struct bpf_map *map)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	int i;

	for (i = 0; i < array->map.max_entries; i++)
		fd_array_map_delete_elem(map, &i);
}

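/* Note: fd_array_map_delete_elem() takes the key by pointer, so passing
 * &i walks every index from 0 to max_entries - 1 and drops the
 * reference held on each stored object via map_fd_put_ptr().
 */
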
const struct bpf_map_ops prog_array_map_ops = {
	.map_alloc = fd_array_map_alloc,
	.map_free = fd_array_map_free,
	.map_get_next_key = array_map_get_next_key,
	.map_lookup_elem = fd_array_map_lookup_elem,
	.map_delete_elem = fd_array_map_delete_elem,
	.map_fd_get_ptr = prog_fd_array_get_ptr,
	.map_fd_put_ptr = prog_fd_array_put_ptr,
	.map_fd_sys_lookup_elem = prog_fd_array_sys_lookup_elem,
};

static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file,
						   struct file *map_file)
{
	struct bpf_event_entry *ee;

	ee = kzalloc(sizeof(*ee), GFP_ATOMIC);
	if (ee) {
		ee->event = perf_file->private_data;
		ee->perf_file = perf_file;
		ee->map_file = map_file;
	}

	return ee;
}

static void __bpf_event_entry_free(struct rcu_head *rcu)
{
	struct bpf_event_entry *ee;

	ee = container_of(rcu, struct bpf_event_entry, rcu);
	fput(ee->perf_file);
	kfree(ee);
}

static void bpf_event_entry_free_rcu(struct bpf_event_entry *ee)
{
	call_rcu(&ee->rcu, __bpf_event_entry_free);
}

static void *perf_event_fd_array_get_ptr(struct bpf_map *map,
					 struct file *map_file, int fd)
{
	struct bpf_event_entry *ee;
	struct perf_event *event;
	struct file *perf_file;
	u64 value;

	perf_file = perf_event_get(fd);
	if (IS_ERR(perf_file))
		return perf_file;

	ee = ERR_PTR(-EOPNOTSUPP);
	event = perf_file->private_data;
	if (perf_event_read_local(event, &value) == -EOPNOTSUPP)
		goto err_out;

	ee = bpf_event_entry_gen(perf_file, map_file);
	if (ee)
		return ee;
	ee = ERR_PTR(-ENOMEM);
err_out:
	fput(perf_file);
	return ee;
}

static void perf_event_fd_array_put_ptr(void *ptr)
{
	bpf_event_entry_free_rcu(ptr);
}

static void perf_event_fd_array_release(struct bpf_map *map,
					struct file *map_file)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	struct bpf_event_entry *ee;
	int i;

	rcu_read_lock();
	for (i = 0; i < array->map.max_entries; i++) {
		ee = READ_ONCE(array->ptrs[i]);
		if (ee && ee->map_file == map_file)
			fd_array_map_delete_elem(map, &i);
	}
	rcu_read_unlock();
}

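/* Rationale (sketch): each entry pins a perf event file via
 * ee->perf_file, while that perf event may in turn reference the
 * program using this map, so entries left in place could form an
 * unreleasable reference cycle. Flushing the entries installed through
 * this map_file when the map's file is released breaks the cycle.
 */
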
const struct bpf_map_ops perf_event_array_map_ops = {
	.map_alloc = fd_array_map_alloc,
	.map_free = fd_array_map_free,
	.map_get_next_key = array_map_get_next_key,
	.map_lookup_elem = fd_array_map_lookup_elem,
	.map_delete_elem = fd_array_map_delete_elem,
	.map_fd_get_ptr = perf_event_fd_array_get_ptr,
	.map_fd_put_ptr = perf_event_fd_array_put_ptr,
	.map_release = perf_event_fd_array_release,
};

#ifdef CONFIG_CGROUPS
static void *cgroup_fd_array_get_ptr(struct bpf_map *map,
				     struct file *map_file /* not used */,
				     int fd)
{
	return cgroup_get_from_fd(fd);
}

static void cgroup_fd_array_put_ptr(void *ptr)
{
	/* cgroup_put free cgrp after a rcu grace period */
	cgroup_put(ptr);
}

static void cgroup_fd_array_free(struct bpf_map *map)
{
	bpf_fd_array_map_clear(map);
	fd_array_map_free(map);
}

const struct bpf_map_ops cgroup_array_map_ops = {
	.map_alloc = fd_array_map_alloc,
	.map_free = cgroup_fd_array_free,
	.map_get_next_key = array_map_get_next_key,
	.map_lookup_elem = fd_array_map_lookup_elem,
	.map_delete_elem = fd_array_map_delete_elem,
	.map_fd_get_ptr = cgroup_fd_array_get_ptr,
	.map_fd_put_ptr = cgroup_fd_array_put_ptr,
};
#endif

static struct bpf_map *array_of_map_alloc(union bpf_attr *attr)
{
	struct bpf_map *map, *inner_map_meta;

	inner_map_meta = bpf_map_meta_alloc(attr->inner_map_fd);
	if (IS_ERR(inner_map_meta))
		return inner_map_meta;

	map = fd_array_map_alloc(attr);
	if (IS_ERR(map)) {
		bpf_map_meta_free(inner_map_meta);
		return map;
	}

	map->inner_map_meta = inner_map_meta;

	return map;
}

static void array_of_map_free(struct bpf_map *map)
{
	/* map->inner_map_meta is only accessed by syscall which
	 * is protected by fdget/fdput.
	 */
	bpf_map_meta_free(map->inner_map_meta);
	bpf_fd_array_map_clear(map);
	fd_array_map_free(map);
}

static void *array_of_map_lookup_elem(struct bpf_map *map, void *key)
{
	struct bpf_map **inner_map = array_map_lookup_elem(map, key);

	if (!inner_map)
		return NULL;

	return READ_ONCE(*inner_map);
}

static u32 array_of_map_gen_lookup(struct bpf_map *map,
				   struct bpf_insn *insn_buf)
{
	u32 elem_size = round_up(map->value_size, 8);
	struct bpf_insn *insn = insn_buf;
	const int ret = BPF_REG_0;
	const int map_ptr = BPF_REG_1;
	const int index = BPF_REG_2;

	*insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value));
	*insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0);
	*insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 5);
	if (is_power_of_2(elem_size))
		*insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(elem_size));
	else
		*insn++ = BPF_ALU64_IMM(BPF_MUL, ret, elem_size);
	*insn++ = BPF_ALU64_REG(BPF_ADD, ret, map_ptr);
	*insn++ = BPF_LDX_MEM(BPF_DW, ret, ret, 0);
	*insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 1);
	*insn++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1);
	*insn++ = BPF_MOV64_IMM(ret, 0);

	return insn - insn_buf;
}

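/* Same idea as array_map_gen_lookup() above, with two extra steps after
 * computing &array->value[index]: the stored inner-map pointer is
 * loaded (the BPF_DW load) and compared against 0, so an empty slot in
 * the array-of-maps yields NULL rather than a pointer into the value
 * area.
 */
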
const struct bpf_map_ops array_of_maps_map_ops = {
	.map_alloc = array_of_map_alloc,
	.map_free = array_of_map_free,
	.map_get_next_key = array_map_get_next_key,
	.map_lookup_elem = array_of_map_lookup_elem,
	.map_delete_elem = fd_array_map_delete_elem,
	.map_fd_get_ptr = bpf_map_fd_get_ptr,
	.map_fd_put_ptr = bpf_map_fd_put_ptr,
	.map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem,
	.map_gen_lookup = array_of_map_gen_lookup,
};