]>
Commit | Line | Data |
---|---|---|
457f4436 AN |
1 | #include <linux/bpf.h> |
2 | #include <linux/btf.h> | |
3 | #include <linux/err.h> | |
4 | #include <linux/irq_work.h> | |
5 | #include <linux/slab.h> | |
6 | #include <linux/filter.h> | |
7 | #include <linux/mm.h> | |
8 | #include <linux/vmalloc.h> | |
9 | #include <linux/wait.h> | |
10 | #include <linux/poll.h> | |
11 | #include <uapi/linux/btf.h> | |
12 | ||
13 | #define RINGBUF_CREATE_FLAG_MASK (BPF_F_NUMA_NODE) | |
14 | ||
15 | /* non-mmap()'able part of bpf_ringbuf (everything up to consumer page) */ | |
16 | #define RINGBUF_PGOFF \ | |
17 | (offsetof(struct bpf_ringbuf, consumer_pos) >> PAGE_SHIFT) | |
18 | /* consumer page and producer page */ | |
19 | #define RINGBUF_POS_PAGES 2 | |
20 | ||
21 | #define RINGBUF_MAX_RECORD_SZ (UINT_MAX/4) | |
22 | ||
23 | /* Maximum size of ring buffer area is limited by 32-bit page offset within | |
24 | * record header, counted in pages. Reserve 8 bits for extensibility, and take | |
25 | * into account few extra pages for consumer/producer pages and | |
26 | * non-mmap()'able parts. This gives 64GB limit, which seems plenty for single | |
27 | * ring buffer. | |
28 | */ | |
29 | #define RINGBUF_MAX_DATA_SZ \ | |
30 | (((1ULL << 24) - RINGBUF_POS_PAGES - RINGBUF_PGOFF) * PAGE_SIZE) | |
31 | ||
32 | struct bpf_ringbuf { | |
33 | wait_queue_head_t waitq; | |
34 | struct irq_work work; | |
35 | u64 mask; | |
36 | struct page **pages; | |
37 | int nr_pages; | |
38 | spinlock_t spinlock ____cacheline_aligned_in_smp; | |
39 | /* Consumer and producer counters are put into separate pages to allow | |
40 | * mapping consumer page as r/w, but restrict producer page to r/o. | |
41 | * This protects producer position from being modified by user-space | |
42 | * application and ruining in-kernel position tracking. | |
43 | */ | |
44 | unsigned long consumer_pos __aligned(PAGE_SIZE); | |
45 | unsigned long producer_pos __aligned(PAGE_SIZE); | |
46 | char data[] __aligned(PAGE_SIZE); | |
47 | }; | |
48 | ||
49 | struct bpf_ringbuf_map { | |
50 | struct bpf_map map; | |
51 | struct bpf_map_memory memory; | |
52 | struct bpf_ringbuf *rb; | |
53 | }; | |
54 | ||
55 | /* 8-byte ring buffer record header structure */ | |
56 | struct bpf_ringbuf_hdr { | |
57 | u32 len; | |
58 | u32 pg_off; | |
59 | }; | |
60 | ||
61 | static struct bpf_ringbuf *bpf_ringbuf_area_alloc(size_t data_sz, int numa_node) | |
62 | { | |
63 | const gfp_t flags = GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN | | |
64 | __GFP_ZERO; | |
65 | int nr_meta_pages = RINGBUF_PGOFF + RINGBUF_POS_PAGES; | |
66 | int nr_data_pages = data_sz >> PAGE_SHIFT; | |
67 | int nr_pages = nr_meta_pages + nr_data_pages; | |
68 | struct page **pages, *page; | |
69 | struct bpf_ringbuf *rb; | |
70 | size_t array_size; | |
71 | int i; | |
72 | ||
73 | /* Each data page is mapped twice to allow "virtual" | |
74 | * continuous read of samples wrapping around the end of ring | |
75 | * buffer area: | |
76 | * ------------------------------------------------------ | |
77 | * | meta pages | real data pages | same data pages | | |
78 | * ------------------------------------------------------ | |
79 | * | | 1 2 3 4 5 6 7 8 9 | 1 2 3 4 5 6 7 8 9 | | |
80 | * ------------------------------------------------------ | |
81 | * | | TA DA | TA DA | | |
82 | * ------------------------------------------------------ | |
83 | * ^^^^^^^ | |
84 | * | | |
85 | * Here, no need to worry about special handling of wrapped-around | |
86 | * data due to double-mapped data pages. This works both in kernel and | |
87 | * when mmap()'ed in user-space, simplifying both kernel and | |
88 | * user-space implementations significantly. | |
89 | */ | |
90 | array_size = (nr_meta_pages + 2 * nr_data_pages) * sizeof(*pages); | |
91 | if (array_size > PAGE_SIZE) | |
92 | pages = vmalloc_node(array_size, numa_node); | |
93 | else | |
94 | pages = kmalloc_node(array_size, flags, numa_node); | |
95 | if (!pages) | |
96 | return NULL; | |
97 | ||
98 | for (i = 0; i < nr_pages; i++) { | |
99 | page = alloc_pages_node(numa_node, flags, 0); | |
100 | if (!page) { | |
101 | nr_pages = i; | |
102 | goto err_free_pages; | |
103 | } | |
104 | pages[i] = page; | |
105 | if (i >= nr_meta_pages) | |
106 | pages[nr_data_pages + i] = page; | |
107 | } | |
108 | ||
109 | rb = vmap(pages, nr_meta_pages + 2 * nr_data_pages, | |
110 | VM_ALLOC | VM_USERMAP, PAGE_KERNEL); | |
111 | if (rb) { | |
112 | rb->pages = pages; | |
113 | rb->nr_pages = nr_pages; | |
114 | return rb; | |
115 | } | |
116 | ||
117 | err_free_pages: | |
118 | for (i = 0; i < nr_pages; i++) | |
119 | __free_page(pages[i]); | |
120 | kvfree(pages); | |
121 | return NULL; | |
122 | } | |
123 | ||
124 | static void bpf_ringbuf_notify(struct irq_work *work) | |
125 | { | |
126 | struct bpf_ringbuf *rb = container_of(work, struct bpf_ringbuf, work); | |
127 | ||
128 | wake_up_all(&rb->waitq); | |
129 | } | |
130 | ||
131 | static struct bpf_ringbuf *bpf_ringbuf_alloc(size_t data_sz, int numa_node) | |
132 | { | |
133 | struct bpf_ringbuf *rb; | |
134 | ||
457f4436 AN |
135 | rb = bpf_ringbuf_area_alloc(data_sz, numa_node); |
136 | if (!rb) | |
137 | return ERR_PTR(-ENOMEM); | |
138 | ||
139 | spin_lock_init(&rb->spinlock); | |
140 | init_waitqueue_head(&rb->waitq); | |
141 | init_irq_work(&rb->work, bpf_ringbuf_notify); | |
142 | ||
143 | rb->mask = data_sz - 1; | |
144 | rb->consumer_pos = 0; | |
145 | rb->producer_pos = 0; | |
146 | ||
147 | return rb; | |
148 | } | |
149 | ||
150 | static struct bpf_map *ringbuf_map_alloc(union bpf_attr *attr) | |
151 | { | |
152 | struct bpf_ringbuf_map *rb_map; | |
153 | u64 cost; | |
154 | int err; | |
155 | ||
156 | if (attr->map_flags & ~RINGBUF_CREATE_FLAG_MASK) | |
157 | return ERR_PTR(-EINVAL); | |
158 | ||
159 | if (attr->key_size || attr->value_size || | |
517bbe19 AN |
160 | !is_power_of_2(attr->max_entries) || |
161 | !PAGE_ALIGNED(attr->max_entries)) | |
457f4436 AN |
162 | return ERR_PTR(-EINVAL); |
163 | ||
517bbe19 AN |
164 | #ifdef CONFIG_64BIT |
165 | /* on 32-bit arch, it's impossible to overflow record's hdr->pgoff */ | |
166 | if (attr->max_entries > RINGBUF_MAX_DATA_SZ) | |
167 | return ERR_PTR(-E2BIG); | |
168 | #endif | |
169 | ||
457f4436 AN |
170 | rb_map = kzalloc(sizeof(*rb_map), GFP_USER); |
171 | if (!rb_map) | |
172 | return ERR_PTR(-ENOMEM); | |
173 | ||
174 | bpf_map_init_from_attr(&rb_map->map, attr); | |
175 | ||
176 | cost = sizeof(struct bpf_ringbuf_map) + | |
177 | sizeof(struct bpf_ringbuf) + | |
178 | attr->max_entries; | |
179 | err = bpf_map_charge_init(&rb_map->map.memory, cost); | |
180 | if (err) | |
181 | goto err_free_map; | |
182 | ||
183 | rb_map->rb = bpf_ringbuf_alloc(attr->max_entries, rb_map->map.numa_node); | |
184 | if (IS_ERR(rb_map->rb)) { | |
185 | err = PTR_ERR(rb_map->rb); | |
186 | goto err_uncharge; | |
187 | } | |
188 | ||
189 | return &rb_map->map; | |
190 | ||
191 | err_uncharge: | |
192 | bpf_map_charge_finish(&rb_map->map.memory); | |
193 | err_free_map: | |
194 | kfree(rb_map); | |
195 | return ERR_PTR(err); | |
196 | } | |
197 | ||
198 | static void bpf_ringbuf_free(struct bpf_ringbuf *rb) | |
199 | { | |
200 | /* copy pages pointer and nr_pages to local variable, as we are going | |
201 | * to unmap rb itself with vunmap() below | |
202 | */ | |
203 | struct page **pages = rb->pages; | |
204 | int i, nr_pages = rb->nr_pages; | |
205 | ||
206 | vunmap(rb); | |
207 | for (i = 0; i < nr_pages; i++) | |
208 | __free_page(pages[i]); | |
209 | kvfree(pages); | |
210 | } | |
211 | ||
212 | static void ringbuf_map_free(struct bpf_map *map) | |
213 | { | |
214 | struct bpf_ringbuf_map *rb_map; | |
215 | ||
457f4436 AN |
216 | rb_map = container_of(map, struct bpf_ringbuf_map, map); |
217 | bpf_ringbuf_free(rb_map->rb); | |
218 | kfree(rb_map); | |
219 | } | |
220 | ||
221 | static void *ringbuf_map_lookup_elem(struct bpf_map *map, void *key) | |
222 | { | |
223 | return ERR_PTR(-ENOTSUPP); | |
224 | } | |
225 | ||
226 | static int ringbuf_map_update_elem(struct bpf_map *map, void *key, void *value, | |
227 | u64 flags) | |
228 | { | |
229 | return -ENOTSUPP; | |
230 | } | |
231 | ||
232 | static int ringbuf_map_delete_elem(struct bpf_map *map, void *key) | |
233 | { | |
234 | return -ENOTSUPP; | |
235 | } | |
236 | ||
237 | static int ringbuf_map_get_next_key(struct bpf_map *map, void *key, | |
238 | void *next_key) | |
239 | { | |
240 | return -ENOTSUPP; | |
241 | } | |
242 | ||
243 | static size_t bpf_ringbuf_mmap_page_cnt(const struct bpf_ringbuf *rb) | |
244 | { | |
245 | size_t data_pages = (rb->mask + 1) >> PAGE_SHIFT; | |
246 | ||
247 | /* consumer page + producer page + 2 x data pages */ | |
248 | return RINGBUF_POS_PAGES + 2 * data_pages; | |
249 | } | |
250 | ||
251 | static int ringbuf_map_mmap(struct bpf_map *map, struct vm_area_struct *vma) | |
252 | { | |
253 | struct bpf_ringbuf_map *rb_map; | |
254 | size_t mmap_sz; | |
255 | ||
256 | rb_map = container_of(map, struct bpf_ringbuf_map, map); | |
257 | mmap_sz = bpf_ringbuf_mmap_page_cnt(rb_map->rb) << PAGE_SHIFT; | |
258 | ||
259 | if (vma->vm_pgoff * PAGE_SIZE + (vma->vm_end - vma->vm_start) > mmap_sz) | |
260 | return -EINVAL; | |
261 | ||
262 | return remap_vmalloc_range(vma, rb_map->rb, | |
263 | vma->vm_pgoff + RINGBUF_PGOFF); | |
264 | } | |
265 | ||
266 | static unsigned long ringbuf_avail_data_sz(struct bpf_ringbuf *rb) | |
267 | { | |
268 | unsigned long cons_pos, prod_pos; | |
269 | ||
270 | cons_pos = smp_load_acquire(&rb->consumer_pos); | |
271 | prod_pos = smp_load_acquire(&rb->producer_pos); | |
272 | return prod_pos - cons_pos; | |
273 | } | |
274 | ||
275 | static __poll_t ringbuf_map_poll(struct bpf_map *map, struct file *filp, | |
276 | struct poll_table_struct *pts) | |
277 | { | |
278 | struct bpf_ringbuf_map *rb_map; | |
279 | ||
280 | rb_map = container_of(map, struct bpf_ringbuf_map, map); | |
281 | poll_wait(filp, &rb_map->rb->waitq, pts); | |
282 | ||
283 | if (ringbuf_avail_data_sz(rb_map->rb)) | |
284 | return EPOLLIN | EPOLLRDNORM; | |
285 | return 0; | |
286 | } | |
287 | ||
2872e9ac | 288 | static int ringbuf_map_btf_id; |
457f4436 AN |
289 | const struct bpf_map_ops ringbuf_map_ops = { |
290 | .map_alloc = ringbuf_map_alloc, | |
291 | .map_free = ringbuf_map_free, | |
292 | .map_mmap = ringbuf_map_mmap, | |
293 | .map_poll = ringbuf_map_poll, | |
294 | .map_lookup_elem = ringbuf_map_lookup_elem, | |
295 | .map_update_elem = ringbuf_map_update_elem, | |
296 | .map_delete_elem = ringbuf_map_delete_elem, | |
297 | .map_get_next_key = ringbuf_map_get_next_key, | |
2872e9ac AI |
298 | .map_btf_name = "bpf_ringbuf_map", |
299 | .map_btf_id = &ringbuf_map_btf_id, | |
457f4436 AN |
300 | }; |
301 | ||
302 | /* Given pointer to ring buffer record metadata and struct bpf_ringbuf itself, | |
303 | * calculate offset from record metadata to ring buffer in pages, rounded | |
304 | * down. This page offset is stored as part of record metadata and allows to | |
305 | * restore struct bpf_ringbuf * from record pointer. This page offset is | |
306 | * stored at offset 4 of record metadata header. | |
307 | */ | |
308 | static size_t bpf_ringbuf_rec_pg_off(struct bpf_ringbuf *rb, | |
309 | struct bpf_ringbuf_hdr *hdr) | |
310 | { | |
311 | return ((void *)hdr - (void *)rb) >> PAGE_SHIFT; | |
312 | } | |
313 | ||
314 | /* Given pointer to ring buffer record header, restore pointer to struct | |
315 | * bpf_ringbuf itself by using page offset stored at offset 4 | |
316 | */ | |
317 | static struct bpf_ringbuf * | |
318 | bpf_ringbuf_restore_from_rec(struct bpf_ringbuf_hdr *hdr) | |
319 | { | |
320 | unsigned long addr = (unsigned long)(void *)hdr; | |
321 | unsigned long off = (unsigned long)hdr->pg_off << PAGE_SHIFT; | |
322 | ||
323 | return (void*)((addr & PAGE_MASK) - off); | |
324 | } | |
325 | ||
326 | static void *__bpf_ringbuf_reserve(struct bpf_ringbuf *rb, u64 size) | |
327 | { | |
328 | unsigned long cons_pos, prod_pos, new_prod_pos, flags; | |
329 | u32 len, pg_off; | |
330 | struct bpf_ringbuf_hdr *hdr; | |
331 | ||
332 | if (unlikely(size > RINGBUF_MAX_RECORD_SZ)) | |
333 | return NULL; | |
334 | ||
335 | len = round_up(size + BPF_RINGBUF_HDR_SZ, 8); | |
336 | cons_pos = smp_load_acquire(&rb->consumer_pos); | |
337 | ||
338 | if (in_nmi()) { | |
339 | if (!spin_trylock_irqsave(&rb->spinlock, flags)) | |
340 | return NULL; | |
341 | } else { | |
342 | spin_lock_irqsave(&rb->spinlock, flags); | |
343 | } | |
344 | ||
345 | prod_pos = rb->producer_pos; | |
346 | new_prod_pos = prod_pos + len; | |
347 | ||
348 | /* check for out of ringbuf space by ensuring producer position | |
349 | * doesn't advance more than (ringbuf_size - 1) ahead | |
350 | */ | |
351 | if (new_prod_pos - cons_pos > rb->mask) { | |
352 | spin_unlock_irqrestore(&rb->spinlock, flags); | |
353 | return NULL; | |
354 | } | |
355 | ||
356 | hdr = (void *)rb->data + (prod_pos & rb->mask); | |
357 | pg_off = bpf_ringbuf_rec_pg_off(rb, hdr); | |
358 | hdr->len = size | BPF_RINGBUF_BUSY_BIT; | |
359 | hdr->pg_off = pg_off; | |
360 | ||
361 | /* pairs with consumer's smp_load_acquire() */ | |
362 | smp_store_release(&rb->producer_pos, new_prod_pos); | |
363 | ||
364 | spin_unlock_irqrestore(&rb->spinlock, flags); | |
365 | ||
366 | return (void *)hdr + BPF_RINGBUF_HDR_SZ; | |
367 | } | |
368 | ||
369 | BPF_CALL_3(bpf_ringbuf_reserve, struct bpf_map *, map, u64, size, u64, flags) | |
370 | { | |
371 | struct bpf_ringbuf_map *rb_map; | |
372 | ||
373 | if (unlikely(flags)) | |
374 | return 0; | |
375 | ||
376 | rb_map = container_of(map, struct bpf_ringbuf_map, map); | |
377 | return (unsigned long)__bpf_ringbuf_reserve(rb_map->rb, size); | |
378 | } | |
379 | ||
380 | const struct bpf_func_proto bpf_ringbuf_reserve_proto = { | |
381 | .func = bpf_ringbuf_reserve, | |
382 | .ret_type = RET_PTR_TO_ALLOC_MEM_OR_NULL, | |
383 | .arg1_type = ARG_CONST_MAP_PTR, | |
384 | .arg2_type = ARG_CONST_ALLOC_SIZE_OR_ZERO, | |
385 | .arg3_type = ARG_ANYTHING, | |
386 | }; | |
387 | ||
388 | static void bpf_ringbuf_commit(void *sample, u64 flags, bool discard) | |
389 | { | |
390 | unsigned long rec_pos, cons_pos; | |
391 | struct bpf_ringbuf_hdr *hdr; | |
392 | struct bpf_ringbuf *rb; | |
393 | u32 new_len; | |
394 | ||
395 | hdr = sample - BPF_RINGBUF_HDR_SZ; | |
396 | rb = bpf_ringbuf_restore_from_rec(hdr); | |
397 | new_len = hdr->len ^ BPF_RINGBUF_BUSY_BIT; | |
398 | if (discard) | |
399 | new_len |= BPF_RINGBUF_DISCARD_BIT; | |
400 | ||
401 | /* update record header with correct final size prefix */ | |
402 | xchg(&hdr->len, new_len); | |
403 | ||
404 | /* if consumer caught up and is waiting for our record, notify about | |
405 | * new data availability | |
406 | */ | |
407 | rec_pos = (void *)hdr - (void *)rb->data; | |
408 | cons_pos = smp_load_acquire(&rb->consumer_pos) & rb->mask; | |
409 | ||
410 | if (flags & BPF_RB_FORCE_WAKEUP) | |
411 | irq_work_queue(&rb->work); | |
412 | else if (cons_pos == rec_pos && !(flags & BPF_RB_NO_WAKEUP)) | |
413 | irq_work_queue(&rb->work); | |
414 | } | |
415 | ||
416 | BPF_CALL_2(bpf_ringbuf_submit, void *, sample, u64, flags) | |
417 | { | |
418 | bpf_ringbuf_commit(sample, flags, false /* discard */); | |
419 | return 0; | |
420 | } | |
421 | ||
422 | const struct bpf_func_proto bpf_ringbuf_submit_proto = { | |
423 | .func = bpf_ringbuf_submit, | |
424 | .ret_type = RET_VOID, | |
425 | .arg1_type = ARG_PTR_TO_ALLOC_MEM, | |
426 | .arg2_type = ARG_ANYTHING, | |
427 | }; | |
428 | ||
429 | BPF_CALL_2(bpf_ringbuf_discard, void *, sample, u64, flags) | |
430 | { | |
431 | bpf_ringbuf_commit(sample, flags, true /* discard */); | |
432 | return 0; | |
433 | } | |
434 | ||
435 | const struct bpf_func_proto bpf_ringbuf_discard_proto = { | |
436 | .func = bpf_ringbuf_discard, | |
437 | .ret_type = RET_VOID, | |
438 | .arg1_type = ARG_PTR_TO_ALLOC_MEM, | |
439 | .arg2_type = ARG_ANYTHING, | |
440 | }; | |
441 | ||
442 | BPF_CALL_4(bpf_ringbuf_output, struct bpf_map *, map, void *, data, u64, size, | |
443 | u64, flags) | |
444 | { | |
445 | struct bpf_ringbuf_map *rb_map; | |
446 | void *rec; | |
447 | ||
448 | if (unlikely(flags & ~(BPF_RB_NO_WAKEUP | BPF_RB_FORCE_WAKEUP))) | |
449 | return -EINVAL; | |
450 | ||
451 | rb_map = container_of(map, struct bpf_ringbuf_map, map); | |
452 | rec = __bpf_ringbuf_reserve(rb_map->rb, size); | |
453 | if (!rec) | |
454 | return -EAGAIN; | |
455 | ||
456 | memcpy(rec, data, size); | |
457 | bpf_ringbuf_commit(rec, flags, false /* discard */); | |
458 | return 0; | |
459 | } | |
460 | ||
461 | const struct bpf_func_proto bpf_ringbuf_output_proto = { | |
462 | .func = bpf_ringbuf_output, | |
463 | .ret_type = RET_INTEGER, | |
464 | .arg1_type = ARG_CONST_MAP_PTR, | |
465 | .arg2_type = ARG_PTR_TO_MEM, | |
466 | .arg3_type = ARG_CONST_SIZE_OR_ZERO, | |
467 | .arg4_type = ARG_ANYTHING, | |
468 | }; | |
469 | ||
470 | BPF_CALL_2(bpf_ringbuf_query, struct bpf_map *, map, u64, flags) | |
471 | { | |
472 | struct bpf_ringbuf *rb; | |
473 | ||
474 | rb = container_of(map, struct bpf_ringbuf_map, map)->rb; | |
475 | ||
476 | switch (flags) { | |
477 | case BPF_RB_AVAIL_DATA: | |
478 | return ringbuf_avail_data_sz(rb); | |
479 | case BPF_RB_RING_SIZE: | |
480 | return rb->mask + 1; | |
481 | case BPF_RB_CONS_POS: | |
482 | return smp_load_acquire(&rb->consumer_pos); | |
483 | case BPF_RB_PROD_POS: | |
484 | return smp_load_acquire(&rb->producer_pos); | |
485 | default: | |
486 | return 0; | |
487 | } | |
488 | } | |
489 | ||
490 | const struct bpf_func_proto bpf_ringbuf_query_proto = { | |
491 | .func = bpf_ringbuf_query, | |
492 | .ret_type = RET_INTEGER, | |
493 | .arg1_type = ARG_CONST_MAP_PTR, | |
494 | .arg2_type = ARG_ANYTHING, | |
495 | }; |