/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
 * Copyright (c) 2016,2017 Facebook
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of version 2 of the GNU General Public
 * License as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 */
13 #include <linux/bpf.h>
14 #include <linux/err.h>
15 #include <linux/slab.h>
17 #include <linux/filter.h>
18 #include <linux/perf_event.h>
20 #include "map_in_map.h"
22 static void bpf_array_free_percpu(struct bpf_array
*array
)
26 for (i
= 0; i
< array
->map
.max_entries
; i
++)
27 free_percpu(array
->pptrs
[i
]);
30 static int bpf_array_alloc_percpu(struct bpf_array
*array
)
35 for (i
= 0; i
< array
->map
.max_entries
; i
++) {
36 ptr
= __alloc_percpu_gfp(array
->elem_size
, 8,
37 GFP_USER
| __GFP_NOWARN
);
39 bpf_array_free_percpu(array
);
42 array
->pptrs
[i
] = ptr
;
48 /* Called from syscall */
49 static struct bpf_map
*array_map_alloc(union bpf_attr
*attr
)
51 bool percpu
= attr
->map_type
== BPF_MAP_TYPE_PERCPU_ARRAY
;
52 struct bpf_array
*array
;
56 /* check sanity of attributes */
57 if (attr
->max_entries
== 0 || attr
->key_size
!= 4 ||
58 attr
->value_size
== 0 || attr
->map_flags
)
59 return ERR_PTR(-EINVAL
);
61 if (attr
->value_size
> KMALLOC_MAX_SIZE
)
62 /* if value_size is bigger, the user space won't be able to
63 * access the elements.
65 return ERR_PTR(-E2BIG
);
67 elem_size
= round_up(attr
->value_size
, 8);
69 array_size
= sizeof(*array
);
71 array_size
+= (u64
) attr
->max_entries
* sizeof(void *);
73 array_size
+= (u64
) attr
->max_entries
* elem_size
;
75 /* make sure there is no u32 overflow later in round_up() */
76 if (array_size
>= U32_MAX
- PAGE_SIZE
)
77 return ERR_PTR(-ENOMEM
);
79 /* allocate all map elements and zero-initialize them */
80 array
= bpf_map_area_alloc(array_size
);
82 return ERR_PTR(-ENOMEM
);
84 /* copy mandatory map attributes */
85 array
->map
.map_type
= attr
->map_type
;
86 array
->map
.key_size
= attr
->key_size
;
87 array
->map
.value_size
= attr
->value_size
;
88 array
->map
.max_entries
= attr
->max_entries
;
89 array
->map
.map_flags
= attr
->map_flags
;
90 array
->elem_size
= elem_size
;
95 array_size
+= (u64
) attr
->max_entries
* elem_size
* num_possible_cpus();
97 if (array_size
>= U32_MAX
- PAGE_SIZE
||
98 elem_size
> PCPU_MIN_UNIT_SIZE
|| bpf_array_alloc_percpu(array
)) {
99 bpf_map_area_free(array
);
100 return ERR_PTR(-ENOMEM
);
103 array
->map
.pages
= round_up(array_size
, PAGE_SIZE
) >> PAGE_SHIFT
;
108 /* Called from syscall or from eBPF program */
109 static void *array_map_lookup_elem(struct bpf_map
*map
, void *key
)
111 struct bpf_array
*array
= container_of(map
, struct bpf_array
, map
);
112 u32 index
= *(u32
*)key
;
114 if (unlikely(index
>= array
->map
.max_entries
))
117 return array
->value
+ array
->elem_size
* index
;
120 /* emit BPF instructions equivalent to C code of array_map_lookup_elem() */
121 static u32
array_map_gen_lookup(struct bpf_map
*map
, struct bpf_insn
*insn_buf
)
123 struct bpf_insn
*insn
= insn_buf
;
124 u32 elem_size
= round_up(map
->value_size
, 8);
125 const int ret
= BPF_REG_0
;
126 const int map_ptr
= BPF_REG_1
;
127 const int index
= BPF_REG_2
;
129 *insn
++ = BPF_ALU64_IMM(BPF_ADD
, map_ptr
, offsetof(struct bpf_array
, value
));
130 *insn
++ = BPF_LDX_MEM(BPF_W
, ret
, index
, 0);
131 *insn
++ = BPF_JMP_IMM(BPF_JGE
, ret
, map
->max_entries
, 3);
133 if (is_power_of_2(elem_size
)) {
134 *insn
++ = BPF_ALU64_IMM(BPF_LSH
, ret
, ilog2(elem_size
));
136 *insn
++ = BPF_ALU64_IMM(BPF_MUL
, ret
, elem_size
);
138 *insn
++ = BPF_ALU64_REG(BPF_ADD
, ret
, map_ptr
);
139 *insn
++ = BPF_JMP_IMM(BPF_JA
, 0, 0, 1);
140 *insn
++ = BPF_MOV64_IMM(ret
, 0);
141 return insn
- insn_buf
;
144 /* Called from eBPF program */
145 static void *percpu_array_map_lookup_elem(struct bpf_map
*map
, void *key
)
147 struct bpf_array
*array
= container_of(map
, struct bpf_array
, map
);
148 u32 index
= *(u32
*)key
;
150 if (unlikely(index
>= array
->map
.max_entries
))
153 return this_cpu_ptr(array
->pptrs
[index
]);
156 int bpf_percpu_array_copy(struct bpf_map
*map
, void *key
, void *value
)
158 struct bpf_array
*array
= container_of(map
, struct bpf_array
, map
);
159 u32 index
= *(u32
*)key
;
164 if (unlikely(index
>= array
->map
.max_entries
))
167 /* per_cpu areas are zero-filled and bpf programs can only
168 * access 'value_size' of them, so copying rounded areas
169 * will not leak any kernel data
171 size
= round_up(map
->value_size
, 8);
173 pptr
= array
->pptrs
[index
];
174 for_each_possible_cpu(cpu
) {
175 bpf_long_memcpy(value
+ off
, per_cpu_ptr(pptr
, cpu
), size
);
182 /* Called from syscall */
183 static int array_map_get_next_key(struct bpf_map
*map
, void *key
, void *next_key
)
185 struct bpf_array
*array
= container_of(map
, struct bpf_array
, map
);
186 u32 index
= key
? *(u32
*)key
: U32_MAX
;
187 u32
*next
= (u32
*)next_key
;
189 if (index
>= array
->map
.max_entries
) {
194 if (index
== array
->map
.max_entries
- 1)
201 /* Called from syscall or from eBPF program */
202 static int array_map_update_elem(struct bpf_map
*map
, void *key
, void *value
,
205 struct bpf_array
*array
= container_of(map
, struct bpf_array
, map
);
206 u32 index
= *(u32
*)key
;
208 if (unlikely(map_flags
> BPF_EXIST
))
212 if (unlikely(index
>= array
->map
.max_entries
))
213 /* all elements were pre-allocated, cannot insert a new one */
216 if (unlikely(map_flags
== BPF_NOEXIST
))
217 /* all elements already exist */
220 if (array
->map
.map_type
== BPF_MAP_TYPE_PERCPU_ARRAY
)
221 memcpy(this_cpu_ptr(array
->pptrs
[index
]),
222 value
, map
->value_size
);
224 memcpy(array
->value
+ array
->elem_size
* index
,
225 value
, map
->value_size
);
229 int bpf_percpu_array_update(struct bpf_map
*map
, void *key
, void *value
,
232 struct bpf_array
*array
= container_of(map
, struct bpf_array
, map
);
233 u32 index
= *(u32
*)key
;
238 if (unlikely(map_flags
> BPF_EXIST
))
242 if (unlikely(index
>= array
->map
.max_entries
))
243 /* all elements were pre-allocated, cannot insert a new one */
246 if (unlikely(map_flags
== BPF_NOEXIST
))
247 /* all elements already exist */
250 /* the user space will provide round_up(value_size, 8) bytes that
251 * will be copied into per-cpu area. bpf programs can only access
252 * value_size of it. During lookup the same extra bytes will be
253 * returned or zeros which were zero-filled by percpu_alloc,
254 * so no kernel data leaks possible
256 size
= round_up(map
->value_size
, 8);
258 pptr
= array
->pptrs
[index
];
259 for_each_possible_cpu(cpu
) {
260 bpf_long_memcpy(per_cpu_ptr(pptr
, cpu
), value
+ off
, size
);
/*
 * Called from syscall or from eBPF program.
 *
 * Deletion is rejected unconditionally; neither argument is examined.
 *
 * NOTE(review): the body was missing entirely and the -EINVAL return was
 * reconstructed; confirm against the tree.
 *
 * Return: always -EINVAL.
 */
static int array_map_delete_elem(struct bpf_map *map, void *key)
{
	return -EINVAL;
}
273 /* Called when map->refcnt goes to zero, either from workqueue or from syscall */
274 static void array_map_free(struct bpf_map
*map
)
276 struct bpf_array
*array
= container_of(map
, struct bpf_array
, map
);
278 /* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
279 * so the programs (can be more than one that used this map) were
280 * disconnected from events. Wait for outstanding programs to complete
285 if (array
->map
.map_type
== BPF_MAP_TYPE_PERCPU_ARRAY
)
286 bpf_array_free_percpu(array
);
288 bpf_map_area_free(array
);
291 const struct bpf_map_ops array_map_ops
= {
292 .map_alloc
= array_map_alloc
,
293 .map_free
= array_map_free
,
294 .map_get_next_key
= array_map_get_next_key
,
295 .map_lookup_elem
= array_map_lookup_elem
,
296 .map_update_elem
= array_map_update_elem
,
297 .map_delete_elem
= array_map_delete_elem
,
298 .map_gen_lookup
= array_map_gen_lookup
,
301 const struct bpf_map_ops percpu_array_map_ops
= {
302 .map_alloc
= array_map_alloc
,
303 .map_free
= array_map_free
,
304 .map_get_next_key
= array_map_get_next_key
,
305 .map_lookup_elem
= percpu_array_map_lookup_elem
,
306 .map_update_elem
= array_map_update_elem
,
307 .map_delete_elem
= array_map_delete_elem
,
310 static struct bpf_map
*fd_array_map_alloc(union bpf_attr
*attr
)
312 /* only file descriptors can be stored in this type of map */
313 if (attr
->value_size
!= sizeof(u32
))
314 return ERR_PTR(-EINVAL
);
315 return array_map_alloc(attr
);
318 static void fd_array_map_free(struct bpf_map
*map
)
320 struct bpf_array
*array
= container_of(map
, struct bpf_array
, map
);
325 /* make sure it's empty */
326 for (i
= 0; i
< array
->map
.max_entries
; i
++)
327 BUG_ON(array
->ptrs
[i
] != NULL
);
329 bpf_map_area_free(array
);
/*
 * Lookup handler shared by the fd array map flavours; it never returns an
 * element and ignores both arguments.
 *
 * NOTE(review): the body was missing entirely and the NULL return was
 * reconstructed; confirm against the tree.
 *
 * Return: always NULL.
 */
static void *fd_array_map_lookup_elem(struct bpf_map *map, void *key)
{
	return NULL;
}
337 /* only called from syscall */
338 int bpf_fd_array_map_update_elem(struct bpf_map
*map
, struct file
*map_file
,
339 void *key
, void *value
, u64 map_flags
)
341 struct bpf_array
*array
= container_of(map
, struct bpf_array
, map
);
342 void *new_ptr
, *old_ptr
;
343 u32 index
= *(u32
*)key
, ufd
;
345 if (map_flags
!= BPF_ANY
)
348 if (index
>= array
->map
.max_entries
)
352 new_ptr
= map
->ops
->map_fd_get_ptr(map
, map_file
, ufd
);
354 return PTR_ERR(new_ptr
);
356 old_ptr
= xchg(array
->ptrs
+ index
, new_ptr
);
358 map
->ops
->map_fd_put_ptr(old_ptr
);
363 static int fd_array_map_delete_elem(struct bpf_map
*map
, void *key
)
365 struct bpf_array
*array
= container_of(map
, struct bpf_array
, map
);
367 u32 index
= *(u32
*)key
;
369 if (index
>= array
->map
.max_entries
)
372 old_ptr
= xchg(array
->ptrs
+ index
, NULL
);
374 map
->ops
->map_fd_put_ptr(old_ptr
);
381 static void *prog_fd_array_get_ptr(struct bpf_map
*map
,
382 struct file
*map_file
, int fd
)
384 struct bpf_array
*array
= container_of(map
, struct bpf_array
, map
);
385 struct bpf_prog
*prog
= bpf_prog_get(fd
);
390 if (!bpf_prog_array_compatible(array
, prog
)) {
392 return ERR_PTR(-EINVAL
);
/*
 * map_fd_put_ptr() callback of the program array.
 *
 * NOTE(review): the body was missing entirely; releasing the entry with
 * bpf_prog_put() is reconstructed as the counterpart of the bpf_prog_get()
 * in prog_fd_array_get_ptr(). Confirm against the tree.
 */
static void prog_fd_array_put_ptr(void *ptr)
{
	bpf_prog_put(ptr);
}
403 /* decrement refcnt of all bpf_progs that are stored in this map */
404 void bpf_fd_array_map_clear(struct bpf_map
*map
)
406 struct bpf_array
*array
= container_of(map
, struct bpf_array
, map
);
409 for (i
= 0; i
< array
->map
.max_entries
; i
++)
410 fd_array_map_delete_elem(map
, &i
);
413 const struct bpf_map_ops prog_array_map_ops
= {
414 .map_alloc
= fd_array_map_alloc
,
415 .map_free
= fd_array_map_free
,
416 .map_get_next_key
= array_map_get_next_key
,
417 .map_lookup_elem
= fd_array_map_lookup_elem
,
418 .map_delete_elem
= fd_array_map_delete_elem
,
419 .map_fd_get_ptr
= prog_fd_array_get_ptr
,
420 .map_fd_put_ptr
= prog_fd_array_put_ptr
,
423 static struct bpf_event_entry
*bpf_event_entry_gen(struct file
*perf_file
,
424 struct file
*map_file
)
426 struct bpf_event_entry
*ee
;
428 ee
= kzalloc(sizeof(*ee
), GFP_ATOMIC
);
430 ee
->event
= perf_file
->private_data
;
431 ee
->perf_file
= perf_file
;
432 ee
->map_file
= map_file
;
438 static void __bpf_event_entry_free(struct rcu_head
*rcu
)
440 struct bpf_event_entry
*ee
;
442 ee
= container_of(rcu
, struct bpf_event_entry
, rcu
);
447 static void bpf_event_entry_free_rcu(struct bpf_event_entry
*ee
)
449 call_rcu(&ee
->rcu
, __bpf_event_entry_free
);
452 static void *perf_event_fd_array_get_ptr(struct bpf_map
*map
,
453 struct file
*map_file
, int fd
)
455 const struct perf_event_attr
*attr
;
456 struct bpf_event_entry
*ee
;
457 struct perf_event
*event
;
458 struct file
*perf_file
;
460 perf_file
= perf_event_get(fd
);
461 if (IS_ERR(perf_file
))
464 event
= perf_file
->private_data
;
465 ee
= ERR_PTR(-EINVAL
);
467 attr
= perf_event_attrs(event
);
468 if (IS_ERR(attr
) || attr
->inherit
)
471 switch (attr
->type
) {
472 case PERF_TYPE_SOFTWARE
:
473 if (attr
->config
!= PERF_COUNT_SW_BPF_OUTPUT
)
477 case PERF_TYPE_HARDWARE
:
478 ee
= bpf_event_entry_gen(perf_file
, map_file
);
481 ee
= ERR_PTR(-ENOMEM
);
/*
 * map_fd_put_ptr() callback of the perf event array: hand the entry to
 * bpf_event_entry_free_rcu().
 */
static void perf_event_fd_array_put_ptr(void *ptr)
{
	bpf_event_entry_free_rcu(ptr);
}
497 static void perf_event_fd_array_release(struct bpf_map
*map
,
498 struct file
*map_file
)
500 struct bpf_array
*array
= container_of(map
, struct bpf_array
, map
);
501 struct bpf_event_entry
*ee
;
505 for (i
= 0; i
< array
->map
.max_entries
; i
++) {
506 ee
= READ_ONCE(array
->ptrs
[i
]);
507 if (ee
&& ee
->map_file
== map_file
)
508 fd_array_map_delete_elem(map
, &i
);
513 const struct bpf_map_ops perf_event_array_map_ops
= {
514 .map_alloc
= fd_array_map_alloc
,
515 .map_free
= fd_array_map_free
,
516 .map_get_next_key
= array_map_get_next_key
,
517 .map_lookup_elem
= fd_array_map_lookup_elem
,
518 .map_delete_elem
= fd_array_map_delete_elem
,
519 .map_fd_get_ptr
= perf_event_fd_array_get_ptr
,
520 .map_fd_put_ptr
= perf_event_fd_array_put_ptr
,
521 .map_release
= perf_event_fd_array_release
,
524 #ifdef CONFIG_CGROUPS
525 static void *cgroup_fd_array_get_ptr(struct bpf_map
*map
,
526 struct file
*map_file
/* not used */,
529 return cgroup_get_from_fd(fd
);
/*
 * map_fd_put_ptr() callback of the cgroup array.
 *
 * NOTE(review): the cgroup_put() call was reconstructed from the comment
 * inside the body; confirm against the tree.
 */
static void cgroup_fd_array_put_ptr(void *ptr)
{
	/* cgroup_put free cgrp after a rcu grace period */
	cgroup_put(ptr);
}
/*
 * map_free() callback of the cgroup array: clear all slots first, because
 * fd_array_map_free() requires the array to be empty.
 */
static void cgroup_fd_array_free(struct bpf_map *map)
{
	bpf_fd_array_map_clear(map);
	fd_array_map_free(map);
}
544 const struct bpf_map_ops cgroup_array_map_ops
= {
545 .map_alloc
= fd_array_map_alloc
,
546 .map_free
= cgroup_fd_array_free
,
547 .map_get_next_key
= array_map_get_next_key
,
548 .map_lookup_elem
= fd_array_map_lookup_elem
,
549 .map_delete_elem
= fd_array_map_delete_elem
,
550 .map_fd_get_ptr
= cgroup_fd_array_get_ptr
,
551 .map_fd_put_ptr
= cgroup_fd_array_put_ptr
,
555 static struct bpf_map
*array_of_map_alloc(union bpf_attr
*attr
)
557 struct bpf_map
*map
, *inner_map_meta
;
559 inner_map_meta
= bpf_map_meta_alloc(attr
->inner_map_fd
);
560 if (IS_ERR(inner_map_meta
))
561 return inner_map_meta
;
563 map
= fd_array_map_alloc(attr
);
565 bpf_map_meta_free(inner_map_meta
);
569 map
->inner_map_meta
= inner_map_meta
;
574 static void array_of_map_free(struct bpf_map
*map
)
576 /* map->inner_map_meta is only accessed by syscall which
577 * is protected by fdget/fdput.
579 bpf_map_meta_free(map
->inner_map_meta
);
580 bpf_fd_array_map_clear(map
);
581 fd_array_map_free(map
);
584 static void *array_of_map_lookup_elem(struct bpf_map
*map
, void *key
)
586 struct bpf_map
**inner_map
= array_map_lookup_elem(map
, key
);
591 return READ_ONCE(*inner_map
);
594 const struct bpf_map_ops array_of_maps_map_ops
= {
595 .map_alloc
= array_of_map_alloc
,
596 .map_free
= array_of_map_free
,
597 .map_get_next_key
= array_map_get_next_key
,
598 .map_lookup_elem
= array_of_map_lookup_elem
,
599 .map_delete_elem
= fd_array_map_delete_elem
,
600 .map_fd_get_ptr
= bpf_map_fd_get_ptr
,
601 .map_fd_put_ptr
= bpf_map_fd_put_ptr
,