/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of version 2 of the GNU General Public
 * License as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 */
#include <linux/bpf.h>
#include <linux/err.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/filter.h>
#include <linux/perf_event.h>
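
/* Array maps come in two core flavours, plain (BPF_MAP_TYPE_ARRAY) and
 * per-cpu (BPF_MAP_TYPE_PERCPU_ARRAY), plus a family of fd-based array
 * maps (prog, perf event and, under CONFIG_CGROUPS, cgroup arrays) whose
 * elements are kernel objects resolved from user-supplied file
 * descriptors. All flavours share the fixed-size, pre-allocated backing
 * store set up in array_map_alloc() below.
 */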

static void bpf_array_free_percpu(struct bpf_array *array)
{
	int i;

	for (i = 0; i < array->map.max_entries; i++)
		free_percpu(array->pptrs[i]);
}

static int bpf_array_alloc_percpu(struct bpf_array *array)
{
	void __percpu *ptr;
	int i;

	for (i = 0; i < array->map.max_entries; i++) {
		ptr = __alloc_percpu_gfp(array->elem_size, 8,
					 GFP_USER | __GFP_NOWARN);
		if (!ptr) {
			/* roll back the elements allocated so far */
			bpf_array_free_percpu(array);
			return -ENOMEM;
		}
		array->pptrs[i] = ptr;
	}

	return 0;
}
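
/* A bpf_array is one contiguous allocation: the struct itself followed
 * either by the flat element storage (plain arrays) or by one per-cpu
 * pointer per element (per-cpu arrays). That is why array_map_alloc()
 * below sizes the region as sizeof(*array) plus max_entries times the
 * per-slot cost.
 */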

/* Called from syscall */
static struct bpf_map *array_map_alloc(union bpf_attr *attr)
{
	bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY;
	struct bpf_array *array;
	u64 array_size;
	u32 elem_size;

	/* check sanity of attributes */
	if (attr->max_entries == 0 || attr->key_size != 4 ||
	    attr->value_size == 0 || attr->map_flags)
		return ERR_PTR(-EINVAL);

	if (attr->value_size > KMALLOC_MAX_SIZE)
		/* if value_size is bigger, the user space won't be able to
		 * access the elements.
		 */
		return ERR_PTR(-E2BIG);

	elem_size = round_up(attr->value_size, 8);

	array_size = sizeof(*array);
	if (percpu)
		array_size += (u64) attr->max_entries * sizeof(void *);
	else
		array_size += (u64) attr->max_entries * elem_size;

	/* make sure there is no u32 overflow later in round_up() */
	if (array_size >= U32_MAX - PAGE_SIZE)
		return ERR_PTR(-ENOMEM);

	/* allocate all map elements and zero-initialize them */
	array = bpf_map_area_alloc(array_size);
	if (!array)
		return ERR_PTR(-ENOMEM);

	/* copy mandatory map attributes */
	array->map.map_type = attr->map_type;
	array->map.key_size = attr->key_size;
	array->map.value_size = attr->value_size;
	array->map.max_entries = attr->max_entries;
	array->elem_size = elem_size;

	if (!percpu)
		goto out;

	array_size += (u64) attr->max_entries * elem_size * num_possible_cpus();

	if (array_size >= U32_MAX - PAGE_SIZE ||
	    elem_size > PCPU_MIN_UNIT_SIZE || bpf_array_alloc_percpu(array)) {
		bpf_map_area_free(array);
		return ERR_PTR(-ENOMEM);
	}
out:
	array->map.pages = round_up(array_size, PAGE_SIZE) >> PAGE_SHIFT;

	return &array->map;
}
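
/* Lookup on a plain array returns a pointer straight into the map's value
 * area, so both the syscall path and BPF programs read and write elements
 * in place, with no copy and no per-lookup allocation.
 */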

/* Called from syscall or from eBPF program */
static void *array_map_lookup_elem(struct bpf_map *map, void *key)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	u32 index = *(u32 *)key;

	if (unlikely(index >= array->map.max_entries))
		return NULL;

	return array->value + array->elem_size * index;
}

/* Called from eBPF program */
static void *percpu_array_map_lookup_elem(struct bpf_map *map, void *key)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	u32 index = *(u32 *)key;

	if (unlikely(index >= array->map.max_entries))
		return NULL;

	return this_cpu_ptr(array->pptrs[index]);
}
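
/* For per-cpu arrays the syscall path cannot hand out a raw per-cpu
 * pointer, so bpf_percpu_array_copy() below gathers every CPU's copy of
 * one element into the caller's flat buffer: num_possible_cpus() chunks
 * of round_up(value_size, 8) bytes each.
 */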

int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	u32 index = *(u32 *)key;
	void __percpu *pptr;
	int cpu, off = 0;
	u32 size;

	if (unlikely(index >= array->map.max_entries))
		return -ENOENT;

	/* per_cpu areas are zero-filled and bpf programs can only
	 * access 'value_size' of them, so copying rounded areas
	 * will not leak any kernel data
	 */
	size = round_up(map->value_size, 8);
	rcu_read_lock();
	pptr = array->pptrs[index];
	for_each_possible_cpu(cpu) {
		bpf_long_memcpy(value + off, per_cpu_ptr(pptr, cpu), size);
		off += size;
	}
	rcu_read_unlock();
	return 0;
}

/* Called from syscall */
static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	u32 index = *(u32 *)key;
	u32 *next = (u32 *)next_key;

	if (index >= array->map.max_entries) {
		*next = 0;
		return 0;
	}

	if (index == array->map.max_entries - 1)
		return -ENOENT;

	*next = index + 1;
	return 0;
}
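
/* Note for the syscall path: per the contract above, passing any
 * out-of-range key (commonly -1) to BPF_MAP_GET_NEXT_KEY restarts
 * iteration at index 0, and -ENOENT is only returned once the last
 * valid index has been handed out.
 */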

/* Called from syscall or from eBPF program */
static int array_map_update_elem(struct bpf_map *map, void *key, void *value,
				 u64 map_flags)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	u32 index = *(u32 *)key;

	if (unlikely(map_flags > BPF_EXIST))
		/* unknown flags */
		return -EINVAL;

	if (unlikely(index >= array->map.max_entries))
		/* all elements were pre-allocated, cannot insert a new one */
		return -E2BIG;

	if (unlikely(map_flags == BPF_NOEXIST))
		/* all elements already exist */
		return -EEXIST;

	if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
		memcpy(this_cpu_ptr(array->pptrs[index]),
		       value, map->value_size);
	else
		memcpy(array->value + array->elem_size * index,
		       value, map->value_size);
	return 0;
}
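
/* Syscall-side counterpart of bpf_percpu_array_copy(): writes one value
 * per possible CPU from a single flat user buffer.
 */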

int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value,
			    u64 map_flags)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	u32 index = *(u32 *)key;
	void __percpu *pptr;
	int cpu, off = 0;
	u32 size;

	if (unlikely(map_flags > BPF_EXIST))
		/* unknown flags */
		return -EINVAL;

	if (unlikely(index >= array->map.max_entries))
		/* all elements were pre-allocated, cannot insert a new one */
		return -E2BIG;

	if (unlikely(map_flags == BPF_NOEXIST))
		/* all elements already exist */
		return -EEXIST;

	/* the user space will provide round_up(value_size, 8) bytes that
	 * will be copied into per-cpu area. bpf programs can only access
	 * value_size of it. During lookup the same extra bytes will be
	 * returned or zeros which were zero-filled by percpu_alloc,
	 * so no kernel data leaks possible
	 */
	size = round_up(map->value_size, 8);
	rcu_read_lock();
	pptr = array->pptrs[index];
	for_each_possible_cpu(cpu) {
		bpf_long_memcpy(per_cpu_ptr(pptr, cpu), value + off, size);
		off += size;
	}
	rcu_read_unlock();
	return 0;
}

/* Called from syscall or from eBPF program */
static int array_map_delete_elem(struct bpf_map *map, void *key)
{
	/* elements are pre-allocated for the map's lifetime and cannot
	 * be removed individually
	 */
	return -EINVAL;
}

/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
static void array_map_free(struct bpf_map *map)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);

	/* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
	 * so the programs (can be more than one that used this map) were
	 * disconnected from events. Wait for outstanding programs to complete
	 * and free the array
	 */
	synchronize_rcu();

	if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
		bpf_array_free_percpu(array);

	bpf_map_area_free(array);
}

static const struct bpf_map_ops array_ops = {
	.map_alloc = array_map_alloc,
	.map_free = array_map_free,
	.map_get_next_key = array_map_get_next_key,
	.map_lookup_elem = array_map_lookup_elem,
	.map_update_elem = array_map_update_elem,
	.map_delete_elem = array_map_delete_elem,
};

static struct bpf_map_type_list array_type __read_mostly = {
	.ops = &array_ops,
	.type = BPF_MAP_TYPE_ARRAY,
};

static const struct bpf_map_ops percpu_array_ops = {
	.map_alloc = array_map_alloc,
	.map_free = array_map_free,
	.map_get_next_key = array_map_get_next_key,
	.map_lookup_elem = percpu_array_map_lookup_elem,
	.map_update_elem = array_map_update_elem,
	.map_delete_elem = array_map_delete_elem,
};

static struct bpf_map_type_list percpu_array_type __read_mostly = {
	.ops = &percpu_array_ops,
	.type = BPF_MAP_TYPE_PERCPU_ARRAY,
};

static int __init register_array_map(void)
{
	bpf_register_map_type(&array_type);
	bpf_register_map_type(&percpu_array_type);
	return 0;
}
late_initcall(register_array_map);
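
/* Usage sketch (userspace side, illustrative only): creating a plain
 * array map through the bpf(2) syscall. Per the sanity checks in
 * array_map_alloc(), key_size must be 4, value_size non-zero and
 * map_flags zero.
 *
 *	union bpf_attr attr = {
 *		.map_type    = BPF_MAP_TYPE_ARRAY,
 *		.key_size    = 4,
 *		.value_size  = 8,
 *		.max_entries = 256,
 *	};
 *	int map_fd = syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));
 */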

static struct bpf_map *fd_array_map_alloc(union bpf_attr *attr)
{
	/* only file descriptors can be stored in this type of map */
	if (attr->value_size != sizeof(u32))
		return ERR_PTR(-EINVAL);
	return array_map_alloc(attr);
}

static void fd_array_map_free(struct bpf_map *map)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	int i;

	synchronize_rcu();

	/* make sure it's empty */
	for (i = 0; i < array->map.max_entries; i++)
		BUG_ON(array->ptrs[i] != NULL);

	bpf_map_area_free(array);
}

static void *fd_array_map_lookup_elem(struct bpf_map *map, void *key)
{
	return NULL;
}
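
/* fd-based array maps hold kernel object pointers, so the lookup above
 * never exposes an element; updates and deletions go through the fd
 * conversion helpers below instead.
 */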

/* only called from syscall */
int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file,
				 void *key, void *value, u64 map_flags)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	void *new_ptr, *old_ptr;
	u32 index = *(u32 *)key, ufd;

	if (map_flags != BPF_ANY)
		return -EINVAL;

	if (index >= array->map.max_entries)
		return -E2BIG;

	ufd = *(u32 *)value;
	new_ptr = map->ops->map_fd_get_ptr(map, map_file, ufd);
	if (IS_ERR(new_ptr))
		return PTR_ERR(new_ptr);

	old_ptr = xchg(array->ptrs + index, new_ptr);
	if (old_ptr)
		map->ops->map_fd_put_ptr(old_ptr);

	return 0;
}

static int fd_array_map_delete_elem(struct bpf_map *map, void *key)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	void *old_ptr;
	u32 index = *(u32 *)key;

	if (index >= array->map.max_entries)
		return -E2BIG;

	old_ptr = xchg(array->ptrs + index, NULL);
	if (old_ptr) {
		map->ops->map_fd_put_ptr(old_ptr);
		return 0;
	} else {
		return -ENOENT;
	}
}

static void *prog_fd_array_get_ptr(struct bpf_map *map,
				   struct file *map_file, int fd)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	struct bpf_prog *prog = bpf_prog_get(fd);

	if (IS_ERR(prog))
		return prog;

	if (!bpf_prog_array_compatible(array, prog)) {
		bpf_prog_put(prog);
		return ERR_PTR(-EINVAL);
	}

	return prog;
}

static void prog_fd_array_put_ptr(void *ptr)
{
	bpf_prog_put(ptr);
}

/* decrement refcnt of all bpf_progs that are stored in this map */
void bpf_fd_array_map_clear(struct bpf_map *map)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	int i;

	for (i = 0; i < array->map.max_entries; i++)
		fd_array_map_delete_elem(map, &i);
}

static const struct bpf_map_ops prog_array_ops = {
	.map_alloc = fd_array_map_alloc,
	.map_free = fd_array_map_free,
	.map_get_next_key = array_map_get_next_key,
	.map_lookup_elem = fd_array_map_lookup_elem,
	.map_delete_elem = fd_array_map_delete_elem,
	.map_fd_get_ptr = prog_fd_array_get_ptr,
	.map_fd_put_ptr = prog_fd_array_put_ptr,
};

static struct bpf_map_type_list prog_array_type __read_mostly = {
	.ops = &prog_array_ops,
	.type = BPF_MAP_TYPE_PROG_ARRAY,
};

static int __init register_prog_array_map(void)
{
	bpf_register_map_type(&prog_array_type);
	return 0;
}
late_initcall(register_prog_array_map);
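
/* Prog arrays exist to serve bpf_tail_call(): a running program jumps to
 * the program stored at a given index. Illustrative sketch (BPF program
 * side, 'jmp_table' and 'idx' are placeholder names):
 *
 *	bpf_tail_call(ctx, &jmp_table, idx);
 *	// execution falls through here when idx is empty or out of range
 */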

static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file,
						   struct file *map_file)
{
	struct bpf_event_entry *ee;

	ee = kzalloc(sizeof(*ee), GFP_ATOMIC);
	if (ee) {
		ee->event = perf_file->private_data;
		ee->perf_file = perf_file;
		ee->map_file = map_file;
	}

	return ee;
}

static void __bpf_event_entry_free(struct rcu_head *rcu)
{
	struct bpf_event_entry *ee;

	ee = container_of(rcu, struct bpf_event_entry, rcu);
	fput(ee->perf_file);
	kfree(ee);
}

static void bpf_event_entry_free_rcu(struct bpf_event_entry *ee)
{
	call_rcu(&ee->rcu, __bpf_event_entry_free);
}
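
/* Entries are freed via RCU so that a BPF program which is concurrently
 * dereferencing an element of the perf event array never sees the backing
 * struct file dropped under it.
 */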

static void *perf_event_fd_array_get_ptr(struct bpf_map *map,
					 struct file *map_file, int fd)
{
	const struct perf_event_attr *attr;
	struct bpf_event_entry *ee;
	struct perf_event *event;
	struct file *perf_file;

	perf_file = perf_event_get(fd);
	if (IS_ERR(perf_file))
		return perf_file;

	event = perf_file->private_data;
	ee = ERR_PTR(-EINVAL);

	attr = perf_event_attrs(event);
	if (IS_ERR(attr) || attr->inherit)
		goto err_out;

	switch (attr->type) {
	case PERF_TYPE_SOFTWARE:
		if (attr->config != PERF_COUNT_SW_BPF_OUTPUT)
			goto err_out;
		/* fall-through */
	case PERF_TYPE_RAW:
	case PERF_TYPE_HARDWARE:
		ee = bpf_event_entry_gen(perf_file, map_file);
		if (ee)
			return ee;
		ee = ERR_PTR(-ENOMEM);
		/* fall-through */
	default:
		break;
	}

err_out:
	fput(perf_file);
	return ee;
}

static void perf_event_fd_array_put_ptr(void *ptr)
{
	bpf_event_entry_free_rcu(ptr);
}

static void perf_event_fd_array_release(struct bpf_map *map,
					struct file *map_file)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	struct bpf_event_entry *ee;
	int i;

	rcu_read_lock();
	for (i = 0; i < array->map.max_entries; i++) {
		ee = READ_ONCE(array->ptrs[i]);
		if (ee && ee->map_file == map_file)
			fd_array_map_delete_elem(map, &i);
	}
	rcu_read_unlock();
}

static const struct bpf_map_ops perf_event_array_ops = {
	.map_alloc = fd_array_map_alloc,
	.map_free = fd_array_map_free,
	.map_get_next_key = array_map_get_next_key,
	.map_lookup_elem = fd_array_map_lookup_elem,
	.map_delete_elem = fd_array_map_delete_elem,
	.map_fd_get_ptr = perf_event_fd_array_get_ptr,
	.map_fd_put_ptr = perf_event_fd_array_put_ptr,
	.map_release = perf_event_fd_array_release,
};

static struct bpf_map_type_list perf_event_array_type __read_mostly = {
	.ops = &perf_event_array_ops,
	.type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
};

static int __init register_perf_event_array_map(void)
{
	bpf_register_map_type(&perf_event_array_type);
	return 0;
}
late_initcall(register_perf_event_array_map);
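
/* Perf event arrays are the map type consumed by the bpf_perf_event_output()
 * helper: the program picks a slot (typically the current CPU's index) and
 * the helper emits a sample into that slot's perf ring buffer.
 */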

#ifdef CONFIG_CGROUPS
static void *cgroup_fd_array_get_ptr(struct bpf_map *map,
				     struct file *map_file /* not used */,
				     int fd)
{
	return cgroup_get_from_fd(fd);
}

static void cgroup_fd_array_put_ptr(void *ptr)
{
	/* cgroup_put() frees cgrp after a rcu grace period */
	cgroup_put(ptr);
}

static void cgroup_fd_array_free(struct bpf_map *map)
{
	bpf_fd_array_map_clear(map);
	fd_array_map_free(map);
}

static const struct bpf_map_ops cgroup_array_ops = {
	.map_alloc = fd_array_map_alloc,
	.map_free = cgroup_fd_array_free,
	.map_get_next_key = array_map_get_next_key,
	.map_lookup_elem = fd_array_map_lookup_elem,
	.map_delete_elem = fd_array_map_delete_elem,
	.map_fd_get_ptr = cgroup_fd_array_get_ptr,
	.map_fd_put_ptr = cgroup_fd_array_put_ptr,
};

static struct bpf_map_type_list cgroup_array_type __read_mostly = {
	.ops = &cgroup_array_ops,
	.type = BPF_MAP_TYPE_CGROUP_ARRAY,
};

static int __init register_cgroup_array_map(void)
{
	bpf_register_map_type(&cgroup_array_type);
	return 0;
}
late_initcall(register_cgroup_array_map);
#endif