]>
Commit | Line | Data |
---|---|---|
457f4436 AN |
1 | #include <linux/bpf.h> |
2 | #include <linux/btf.h> | |
3 | #include <linux/err.h> | |
4 | #include <linux/irq_work.h> | |
5 | #include <linux/slab.h> | |
6 | #include <linux/filter.h> | |
7 | #include <linux/mm.h> | |
8 | #include <linux/vmalloc.h> | |
9 | #include <linux/wait.h> | |
10 | #include <linux/poll.h> | |
11 | #include <uapi/linux/btf.h> | |
12 | ||
/* Only the NUMA node flag may be passed when creating a ring buffer map */
#define RINGBUF_CREATE_FLAG_MASK (BPF_F_NUMA_NODE)

/* Number of pages at the start of struct bpf_ringbuf that are never exposed
 * through mmap(), i.e. everything that precedes the consumer position page.
 */
#define RINGBUF_PGOFF \
	(offsetof(struct bpf_ringbuf, consumer_pos) >> PAGE_SHIFT)
/* One page holds the consumer position, one page the producer position */
#define RINGBUF_POS_PAGES 2

/* Largest record payload a single reservation may ask for */
#define RINGBUF_MAX_RECORD_SZ (UINT_MAX/4)

/* The record header stores a 32-bit offset, counted in pages, back to the
 * start of the ring buffer, which bounds the size of the data area. 8 of
 * those bits are kept in reserve for future extensions, and the pages taken
 * by the consumer/producer positions and by the non-mmap()'able part are
 * subtracted. The resulting limit is 64GB, which seems plenty for a single
 * ring buffer.
 */
#define RINGBUF_MAX_DATA_SZ \
	(((1ULL << 24) - RINGBUF_POS_PAGES - RINGBUF_PGOFF) * PAGE_SIZE)
31 | ||
32 | struct bpf_ringbuf { | |
33 | wait_queue_head_t waitq; | |
34 | struct irq_work work; | |
35 | u64 mask; | |
36 | struct page **pages; | |
37 | int nr_pages; | |
38 | spinlock_t spinlock ____cacheline_aligned_in_smp; | |
39 | /* Consumer and producer counters are put into separate pages to allow | |
40 | * mapping consumer page as r/w, but restrict producer page to r/o. | |
41 | * This protects producer position from being modified by user-space | |
42 | * application and ruining in-kernel position tracking. | |
43 | */ | |
44 | unsigned long consumer_pos __aligned(PAGE_SIZE); | |
45 | unsigned long producer_pos __aligned(PAGE_SIZE); | |
46 | char data[] __aligned(PAGE_SIZE); | |
47 | }; | |
48 | ||
49 | struct bpf_ringbuf_map { | |
50 | struct bpf_map map; | |
457f4436 AN |
51 | struct bpf_ringbuf *rb; |
52 | }; | |
53 | ||
54 | /* 8-byte ring buffer record header structure */ | |
55 | struct bpf_ringbuf_hdr { | |
56 | u32 len; | |
57 | u32 pg_off; | |
58 | }; | |
59 | ||
60 | static struct bpf_ringbuf *bpf_ringbuf_area_alloc(size_t data_sz, int numa_node) | |
61 | { | |
be4035c7 RG |
62 | const gfp_t flags = GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL | |
63 | __GFP_NOWARN | __GFP_ZERO; | |
457f4436 AN |
64 | int nr_meta_pages = RINGBUF_PGOFF + RINGBUF_POS_PAGES; |
65 | int nr_data_pages = data_sz >> PAGE_SHIFT; | |
66 | int nr_pages = nr_meta_pages + nr_data_pages; | |
67 | struct page **pages, *page; | |
68 | struct bpf_ringbuf *rb; | |
69 | size_t array_size; | |
70 | int i; | |
71 | ||
72 | /* Each data page is mapped twice to allow "virtual" | |
73 | * continuous read of samples wrapping around the end of ring | |
74 | * buffer area: | |
75 | * ------------------------------------------------------ | |
76 | * | meta pages | real data pages | same data pages | | |
77 | * ------------------------------------------------------ | |
78 | * | | 1 2 3 4 5 6 7 8 9 | 1 2 3 4 5 6 7 8 9 | | |
79 | * ------------------------------------------------------ | |
80 | * | | TA DA | TA DA | | |
81 | * ------------------------------------------------------ | |
82 | * ^^^^^^^ | |
83 | * | | |
84 | * Here, no need to worry about special handling of wrapped-around | |
85 | * data due to double-mapped data pages. This works both in kernel and | |
86 | * when mmap()'ed in user-space, simplifying both kernel and | |
87 | * user-space implementations significantly. | |
88 | */ | |
89 | array_size = (nr_meta_pages + 2 * nr_data_pages) * sizeof(*pages); | |
be4035c7 | 90 | pages = bpf_map_area_alloc(array_size, numa_node); |
457f4436 AN |
91 | if (!pages) |
92 | return NULL; | |
93 | ||
94 | for (i = 0; i < nr_pages; i++) { | |
95 | page = alloc_pages_node(numa_node, flags, 0); | |
96 | if (!page) { | |
97 | nr_pages = i; | |
98 | goto err_free_pages; | |
99 | } | |
100 | pages[i] = page; | |
101 | if (i >= nr_meta_pages) | |
102 | pages[nr_data_pages + i] = page; | |
103 | } | |
104 | ||
105 | rb = vmap(pages, nr_meta_pages + 2 * nr_data_pages, | |
106 | VM_ALLOC | VM_USERMAP, PAGE_KERNEL); | |
107 | if (rb) { | |
108 | rb->pages = pages; | |
109 | rb->nr_pages = nr_pages; | |
110 | return rb; | |
111 | } | |
112 | ||
113 | err_free_pages: | |
114 | for (i = 0; i < nr_pages; i++) | |
115 | __free_page(pages[i]); | |
116 | kvfree(pages); | |
117 | return NULL; | |
118 | } | |
119 | ||
120 | static void bpf_ringbuf_notify(struct irq_work *work) | |
121 | { | |
122 | struct bpf_ringbuf *rb = container_of(work, struct bpf_ringbuf, work); | |
123 | ||
124 | wake_up_all(&rb->waitq); | |
125 | } | |
126 | ||
127 | static struct bpf_ringbuf *bpf_ringbuf_alloc(size_t data_sz, int numa_node) | |
128 | { | |
129 | struct bpf_ringbuf *rb; | |
130 | ||
457f4436 AN |
131 | rb = bpf_ringbuf_area_alloc(data_sz, numa_node); |
132 | if (!rb) | |
abbdd081 | 133 | return NULL; |
457f4436 AN |
134 | |
135 | spin_lock_init(&rb->spinlock); | |
136 | init_waitqueue_head(&rb->waitq); | |
137 | init_irq_work(&rb->work, bpf_ringbuf_notify); | |
138 | ||
139 | rb->mask = data_sz - 1; | |
140 | rb->consumer_pos = 0; | |
141 | rb->producer_pos = 0; | |
142 | ||
143 | return rb; | |
144 | } | |
145 | ||
146 | static struct bpf_map *ringbuf_map_alloc(union bpf_attr *attr) | |
147 | { | |
148 | struct bpf_ringbuf_map *rb_map; | |
457f4436 AN |
149 | |
150 | if (attr->map_flags & ~RINGBUF_CREATE_FLAG_MASK) | |
151 | return ERR_PTR(-EINVAL); | |
152 | ||
153 | if (attr->key_size || attr->value_size || | |
517bbe19 AN |
154 | !is_power_of_2(attr->max_entries) || |
155 | !PAGE_ALIGNED(attr->max_entries)) | |
457f4436 AN |
156 | return ERR_PTR(-EINVAL); |
157 | ||
517bbe19 AN |
158 | #ifdef CONFIG_64BIT |
159 | /* on 32-bit arch, it's impossible to overflow record's hdr->pgoff */ | |
160 | if (attr->max_entries > RINGBUF_MAX_DATA_SZ) | |
161 | return ERR_PTR(-E2BIG); | |
162 | #endif | |
163 | ||
be4035c7 | 164 | rb_map = kzalloc(sizeof(*rb_map), GFP_USER | __GFP_ACCOUNT); |
457f4436 AN |
165 | if (!rb_map) |
166 | return ERR_PTR(-ENOMEM); | |
167 | ||
168 | bpf_map_init_from_attr(&rb_map->map, attr); | |
169 | ||
457f4436 | 170 | rb_map->rb = bpf_ringbuf_alloc(attr->max_entries, rb_map->map.numa_node); |
abbdd081 RG |
171 | if (!rb_map->rb) { |
172 | kfree(rb_map); | |
173 | return ERR_PTR(-ENOMEM); | |
457f4436 AN |
174 | } |
175 | ||
176 | return &rb_map->map; | |
457f4436 AN |
177 | } |
178 | ||
179 | static void bpf_ringbuf_free(struct bpf_ringbuf *rb) | |
180 | { | |
181 | /* copy pages pointer and nr_pages to local variable, as we are going | |
182 | * to unmap rb itself with vunmap() below | |
183 | */ | |
184 | struct page **pages = rb->pages; | |
185 | int i, nr_pages = rb->nr_pages; | |
186 | ||
187 | vunmap(rb); | |
188 | for (i = 0; i < nr_pages; i++) | |
189 | __free_page(pages[i]); | |
190 | kvfree(pages); | |
191 | } | |
192 | ||
193 | static void ringbuf_map_free(struct bpf_map *map) | |
194 | { | |
195 | struct bpf_ringbuf_map *rb_map; | |
196 | ||
457f4436 AN |
197 | rb_map = container_of(map, struct bpf_ringbuf_map, map); |
198 | bpf_ringbuf_free(rb_map->rb); | |
199 | kfree(rb_map); | |
200 | } | |
201 | ||
202 | static void *ringbuf_map_lookup_elem(struct bpf_map *map, void *key) | |
203 | { | |
204 | return ERR_PTR(-ENOTSUPP); | |
205 | } | |
206 | ||
207 | static int ringbuf_map_update_elem(struct bpf_map *map, void *key, void *value, | |
208 | u64 flags) | |
209 | { | |
210 | return -ENOTSUPP; | |
211 | } | |
212 | ||
213 | static int ringbuf_map_delete_elem(struct bpf_map *map, void *key) | |
214 | { | |
215 | return -ENOTSUPP; | |
216 | } | |
217 | ||
218 | static int ringbuf_map_get_next_key(struct bpf_map *map, void *key, | |
219 | void *next_key) | |
220 | { | |
221 | return -ENOTSUPP; | |
222 | } | |
223 | ||
457f4436 AN |
224 | static int ringbuf_map_mmap(struct bpf_map *map, struct vm_area_struct *vma) |
225 | { | |
226 | struct bpf_ringbuf_map *rb_map; | |
457f4436 AN |
227 | |
228 | rb_map = container_of(map, struct bpf_ringbuf_map, map); | |
457f4436 | 229 | |
15735c6b AN |
230 | if (vma->vm_flags & VM_WRITE) { |
231 | /* allow writable mapping for the consumer_pos only */ | |
232 | if (vma->vm_pgoff != 0 || vma->vm_end - vma->vm_start != PAGE_SIZE) | |
233 | return -EPERM; | |
234 | } else { | |
235 | vma->vm_flags &= ~VM_MAYWRITE; | |
236 | } | |
237 | /* remap_vmalloc_range() checks size and offset constraints */ | |
457f4436 AN |
238 | return remap_vmalloc_range(vma, rb_map->rb, |
239 | vma->vm_pgoff + RINGBUF_PGOFF); | |
240 | } | |
241 | ||
242 | static unsigned long ringbuf_avail_data_sz(struct bpf_ringbuf *rb) | |
243 | { | |
244 | unsigned long cons_pos, prod_pos; | |
245 | ||
246 | cons_pos = smp_load_acquire(&rb->consumer_pos); | |
247 | prod_pos = smp_load_acquire(&rb->producer_pos); | |
248 | return prod_pos - cons_pos; | |
249 | } | |
250 | ||
251 | static __poll_t ringbuf_map_poll(struct bpf_map *map, struct file *filp, | |
252 | struct poll_table_struct *pts) | |
253 | { | |
254 | struct bpf_ringbuf_map *rb_map; | |
255 | ||
256 | rb_map = container_of(map, struct bpf_ringbuf_map, map); | |
257 | poll_wait(filp, &rb_map->rb->waitq, pts); | |
258 | ||
259 | if (ringbuf_avail_data_sz(rb_map->rb)) | |
260 | return EPOLLIN | EPOLLRDNORM; | |
261 | return 0; | |
262 | } | |
263 | ||
2872e9ac | 264 | static int ringbuf_map_btf_id; |
457f4436 | 265 | const struct bpf_map_ops ringbuf_map_ops = { |
f4d05259 | 266 | .map_meta_equal = bpf_map_meta_equal, |
457f4436 AN |
267 | .map_alloc = ringbuf_map_alloc, |
268 | .map_free = ringbuf_map_free, | |
269 | .map_mmap = ringbuf_map_mmap, | |
270 | .map_poll = ringbuf_map_poll, | |
271 | .map_lookup_elem = ringbuf_map_lookup_elem, | |
272 | .map_update_elem = ringbuf_map_update_elem, | |
273 | .map_delete_elem = ringbuf_map_delete_elem, | |
274 | .map_get_next_key = ringbuf_map_get_next_key, | |
2872e9ac AI |
275 | .map_btf_name = "bpf_ringbuf_map", |
276 | .map_btf_id = &ringbuf_map_btf_id, | |
457f4436 AN |
277 | }; |
278 | ||
279 | /* Given pointer to ring buffer record metadata and struct bpf_ringbuf itself, | |
280 | * calculate offset from record metadata to ring buffer in pages, rounded | |
281 | * down. This page offset is stored as part of record metadata and allows to | |
282 | * restore struct bpf_ringbuf * from record pointer. This page offset is | |
283 | * stored at offset 4 of record metadata header. | |
284 | */ | |
285 | static size_t bpf_ringbuf_rec_pg_off(struct bpf_ringbuf *rb, | |
286 | struct bpf_ringbuf_hdr *hdr) | |
287 | { | |
288 | return ((void *)hdr - (void *)rb) >> PAGE_SHIFT; | |
289 | } | |
290 | ||
291 | /* Given pointer to ring buffer record header, restore pointer to struct | |
292 | * bpf_ringbuf itself by using page offset stored at offset 4 | |
293 | */ | |
294 | static struct bpf_ringbuf * | |
295 | bpf_ringbuf_restore_from_rec(struct bpf_ringbuf_hdr *hdr) | |
296 | { | |
297 | unsigned long addr = (unsigned long)(void *)hdr; | |
298 | unsigned long off = (unsigned long)hdr->pg_off << PAGE_SHIFT; | |
299 | ||
300 | return (void*)((addr & PAGE_MASK) - off); | |
301 | } | |
302 | ||
303 | static void *__bpf_ringbuf_reserve(struct bpf_ringbuf *rb, u64 size) | |
304 | { | |
305 | unsigned long cons_pos, prod_pos, new_prod_pos, flags; | |
306 | u32 len, pg_off; | |
307 | struct bpf_ringbuf_hdr *hdr; | |
308 | ||
309 | if (unlikely(size > RINGBUF_MAX_RECORD_SZ)) | |
310 | return NULL; | |
311 | ||
312 | len = round_up(size + BPF_RINGBUF_HDR_SZ, 8); | |
ae6eef43 TLSC |
313 | if (len > rb->mask + 1) |
314 | return NULL; | |
315 | ||
457f4436 AN |
316 | cons_pos = smp_load_acquire(&rb->consumer_pos); |
317 | ||
318 | if (in_nmi()) { | |
319 | if (!spin_trylock_irqsave(&rb->spinlock, flags)) | |
320 | return NULL; | |
321 | } else { | |
322 | spin_lock_irqsave(&rb->spinlock, flags); | |
323 | } | |
324 | ||
325 | prod_pos = rb->producer_pos; | |
326 | new_prod_pos = prod_pos + len; | |
327 | ||
328 | /* check for out of ringbuf space by ensuring producer position | |
329 | * doesn't advance more than (ringbuf_size - 1) ahead | |
330 | */ | |
331 | if (new_prod_pos - cons_pos > rb->mask) { | |
332 | spin_unlock_irqrestore(&rb->spinlock, flags); | |
333 | return NULL; | |
334 | } | |
335 | ||
336 | hdr = (void *)rb->data + (prod_pos & rb->mask); | |
337 | pg_off = bpf_ringbuf_rec_pg_off(rb, hdr); | |
338 | hdr->len = size | BPF_RINGBUF_BUSY_BIT; | |
339 | hdr->pg_off = pg_off; | |
340 | ||
341 | /* pairs with consumer's smp_load_acquire() */ | |
342 | smp_store_release(&rb->producer_pos, new_prod_pos); | |
343 | ||
344 | spin_unlock_irqrestore(&rb->spinlock, flags); | |
345 | ||
346 | return (void *)hdr + BPF_RINGBUF_HDR_SZ; | |
347 | } | |
348 | ||
349 | BPF_CALL_3(bpf_ringbuf_reserve, struct bpf_map *, map, u64, size, u64, flags) | |
350 | { | |
351 | struct bpf_ringbuf_map *rb_map; | |
352 | ||
353 | if (unlikely(flags)) | |
354 | return 0; | |
355 | ||
356 | rb_map = container_of(map, struct bpf_ringbuf_map, map); | |
357 | return (unsigned long)__bpf_ringbuf_reserve(rb_map->rb, size); | |
358 | } | |
359 | ||
360 | const struct bpf_func_proto bpf_ringbuf_reserve_proto = { | |
361 | .func = bpf_ringbuf_reserve, | |
362 | .ret_type = RET_PTR_TO_ALLOC_MEM_OR_NULL, | |
363 | .arg1_type = ARG_CONST_MAP_PTR, | |
364 | .arg2_type = ARG_CONST_ALLOC_SIZE_OR_ZERO, | |
365 | .arg3_type = ARG_ANYTHING, | |
366 | }; | |
367 | ||
368 | static void bpf_ringbuf_commit(void *sample, u64 flags, bool discard) | |
369 | { | |
370 | unsigned long rec_pos, cons_pos; | |
371 | struct bpf_ringbuf_hdr *hdr; | |
372 | struct bpf_ringbuf *rb; | |
373 | u32 new_len; | |
374 | ||
375 | hdr = sample - BPF_RINGBUF_HDR_SZ; | |
376 | rb = bpf_ringbuf_restore_from_rec(hdr); | |
377 | new_len = hdr->len ^ BPF_RINGBUF_BUSY_BIT; | |
378 | if (discard) | |
379 | new_len |= BPF_RINGBUF_DISCARD_BIT; | |
380 | ||
381 | /* update record header with correct final size prefix */ | |
382 | xchg(&hdr->len, new_len); | |
383 | ||
384 | /* if consumer caught up and is waiting for our record, notify about | |
385 | * new data availability | |
386 | */ | |
387 | rec_pos = (void *)hdr - (void *)rb->data; | |
388 | cons_pos = smp_load_acquire(&rb->consumer_pos) & rb->mask; | |
389 | ||
390 | if (flags & BPF_RB_FORCE_WAKEUP) | |
391 | irq_work_queue(&rb->work); | |
392 | else if (cons_pos == rec_pos && !(flags & BPF_RB_NO_WAKEUP)) | |
393 | irq_work_queue(&rb->work); | |
394 | } | |
395 | ||
396 | BPF_CALL_2(bpf_ringbuf_submit, void *, sample, u64, flags) | |
397 | { | |
398 | bpf_ringbuf_commit(sample, flags, false /* discard */); | |
399 | return 0; | |
400 | } | |
401 | ||
402 | const struct bpf_func_proto bpf_ringbuf_submit_proto = { | |
403 | .func = bpf_ringbuf_submit, | |
404 | .ret_type = RET_VOID, | |
405 | .arg1_type = ARG_PTR_TO_ALLOC_MEM, | |
406 | .arg2_type = ARG_ANYTHING, | |
407 | }; | |
408 | ||
409 | BPF_CALL_2(bpf_ringbuf_discard, void *, sample, u64, flags) | |
410 | { | |
411 | bpf_ringbuf_commit(sample, flags, true /* discard */); | |
412 | return 0; | |
413 | } | |
414 | ||
415 | const struct bpf_func_proto bpf_ringbuf_discard_proto = { | |
416 | .func = bpf_ringbuf_discard, | |
417 | .ret_type = RET_VOID, | |
418 | .arg1_type = ARG_PTR_TO_ALLOC_MEM, | |
419 | .arg2_type = ARG_ANYTHING, | |
420 | }; | |
421 | ||
422 | BPF_CALL_4(bpf_ringbuf_output, struct bpf_map *, map, void *, data, u64, size, | |
423 | u64, flags) | |
424 | { | |
425 | struct bpf_ringbuf_map *rb_map; | |
426 | void *rec; | |
427 | ||
428 | if (unlikely(flags & ~(BPF_RB_NO_WAKEUP | BPF_RB_FORCE_WAKEUP))) | |
429 | return -EINVAL; | |
430 | ||
431 | rb_map = container_of(map, struct bpf_ringbuf_map, map); | |
432 | rec = __bpf_ringbuf_reserve(rb_map->rb, size); | |
433 | if (!rec) | |
434 | return -EAGAIN; | |
435 | ||
436 | memcpy(rec, data, size); | |
437 | bpf_ringbuf_commit(rec, flags, false /* discard */); | |
438 | return 0; | |
439 | } | |
440 | ||
441 | const struct bpf_func_proto bpf_ringbuf_output_proto = { | |
442 | .func = bpf_ringbuf_output, | |
443 | .ret_type = RET_INTEGER, | |
444 | .arg1_type = ARG_CONST_MAP_PTR, | |
445 | .arg2_type = ARG_PTR_TO_MEM, | |
446 | .arg3_type = ARG_CONST_SIZE_OR_ZERO, | |
447 | .arg4_type = ARG_ANYTHING, | |
448 | }; | |
449 | ||
450 | BPF_CALL_2(bpf_ringbuf_query, struct bpf_map *, map, u64, flags) | |
451 | { | |
452 | struct bpf_ringbuf *rb; | |
453 | ||
454 | rb = container_of(map, struct bpf_ringbuf_map, map)->rb; | |
455 | ||
456 | switch (flags) { | |
457 | case BPF_RB_AVAIL_DATA: | |
458 | return ringbuf_avail_data_sz(rb); | |
459 | case BPF_RB_RING_SIZE: | |
460 | return rb->mask + 1; | |
461 | case BPF_RB_CONS_POS: | |
462 | return smp_load_acquire(&rb->consumer_pos); | |
463 | case BPF_RB_PROD_POS: | |
464 | return smp_load_acquire(&rb->producer_pos); | |
465 | default: | |
466 | return 0; | |
467 | } | |
468 | } | |
469 | ||
470 | const struct bpf_func_proto bpf_ringbuf_query_proto = { | |
471 | .func = bpf_ringbuf_query, | |
472 | .ret_type = RET_INTEGER, | |
473 | .arg1_type = ARG_CONST_MAP_PTR, | |
474 | .arg2_type = ARG_ANYTHING, | |
475 | }; |