]> git.proxmox.com Git - systemd.git/blame - src/shared/bpf-program.c
New upstream version 249~rc1
[systemd.git] / src / shared / bpf-program.c
CommitLineData
a032b68d 1/* SPDX-License-Identifier: LGPL-2.1-or-later */
f5e65279
MB
2
3#include <fcntl.h>
4#include <sys/stat.h>
5#include <sys/types.h>
6#include <unistd.h>
7
8#include "alloc-util.h"
9#include "bpf-program.h"
8b3d4ff0 10#include "escape.h"
f5e65279 11#include "fd-util.h"
bb4f798a 12#include "memory-util.h"
e1f67bc7 13#include "missing_syscall.h"
98393f85 14#include "path-util.h"
8b3d4ff0
MB
15#include "serialize.h"
16#include "string-table.h"
17
18static const char *const bpf_cgroup_attach_type_table[__MAX_BPF_ATTACH_TYPE] = {
19 [BPF_CGROUP_INET_INGRESS] = "ingress",
20 [BPF_CGROUP_INET_EGRESS] = "egress",
21 [BPF_CGROUP_INET_SOCK_CREATE] = "sock_create",
22 [BPF_CGROUP_SOCK_OPS] = "sock_ops",
23 [BPF_CGROUP_DEVICE] = "device",
24 [BPF_CGROUP_INET4_BIND] = "bind4",
25 [BPF_CGROUP_INET6_BIND] = "bind6",
26 [BPF_CGROUP_INET4_CONNECT] = "connect4",
27 [BPF_CGROUP_INET6_CONNECT] = "connect6",
28 [BPF_CGROUP_INET4_POST_BIND] = "post_bind4",
29 [BPF_CGROUP_INET6_POST_BIND] = "post_bind6",
30 [BPF_CGROUP_UDP4_SENDMSG] = "sendmsg4",
31 [BPF_CGROUP_UDP6_SENDMSG] = "sendmsg6",
32 [BPF_CGROUP_SYSCTL] = "sysctl",
33 [BPF_CGROUP_UDP4_RECVMSG] = "recvmsg4",
34 [BPF_CGROUP_UDP6_RECVMSG] = "recvmsg6",
35 [BPF_CGROUP_GETSOCKOPT] = "getsockopt",
36 [BPF_CGROUP_SETSOCKOPT] = "setsockopt",
37};
38
39DEFINE_STRING_TABLE_LOOKUP(bpf_cgroup_attach_type, int);
40
41DEFINE_HASH_OPS_WITH_KEY_DESTRUCTOR(bpf_program_hash_ops, void, trivial_hash_func, trivial_compare_func, bpf_program_unref);
42
43 /* struct bpf_prog_info info must be initialized since its value is both input and output
44 * for BPF_OBJ_GET_INFO_BY_FD syscall. */
45static int bpf_program_get_info_by_fd(int prog_fd, struct bpf_prog_info *info, uint32_t info_len) {
46 union bpf_attr attr;
47
48 /* Explicitly memset to zero since some compilers may produce non-zero-initialized padding when
49 * structured initialization is used.
50 * Refer to https://github.com/systemd/systemd/issues/18164
51 */
52 zero(attr);
53 attr.info.bpf_fd = prog_fd;
54 attr.info.info_len = info_len;
55 attr.info.info = PTR_TO_UINT64(info);
56
57 if (bpf(BPF_OBJ_GET_INFO_BY_FD, &attr, sizeof(attr)) < 0)
58 return -errno;
59
60 return 0;
61}
f5e65279
MB
62
63int bpf_program_new(uint32_t prog_type, BPFProgram **ret) {
64 _cleanup_(bpf_program_unrefp) BPFProgram *p = NULL;
65
8b3d4ff0
MB
66 p = new(BPFProgram, 1);
67 if (!p)
68 return -ENOMEM;
69
70 *p = (BPFProgram) {
71 .n_ref = 1,
72 .prog_type = prog_type,
73 .kernel_fd = -1,
74 };
75
76 *ret = TAKE_PTR(p);
77
78 return 0;
79}
80
81int bpf_program_new_from_bpffs_path(const char *path, BPFProgram **ret) {
82 _cleanup_(bpf_program_unrefp) BPFProgram *p = NULL;
83 struct bpf_prog_info info = {};
84 int r;
85
86 assert(path);
87 assert(ret);
88
89 p = new(BPFProgram, 1);
f5e65279 90 if (!p)
9e294e28 91 return -ENOMEM;
f5e65279 92
8b3d4ff0
MB
93 *p = (BPFProgram) {
94 .prog_type = BPF_PROG_TYPE_UNSPEC,
95 .n_ref = 1,
96 .kernel_fd = -1,
97 };
98
99 r = bpf_program_load_from_bpf_fs(p, path);
100 if (r < 0)
101 return r;
102
103 r = bpf_program_get_info_by_fd(p->kernel_fd, &info, sizeof(info));
104 if (r < 0)
105 return r;
f5e65279 106
8b3d4ff0 107 p->prog_type = info.type;
b012e921
MB
108 *ret = TAKE_PTR(p);
109
f5e65279
MB
110 return 0;
111}
112
6e866b33
MB
113static BPFProgram *bpf_program_free(BPFProgram *p) {
114 assert(p);
98393f85
MB
115
116 /* Unfortunately, the kernel currently doesn't implicitly detach BPF programs from their cgroups when the last
117 * fd to the BPF program is closed. This has nasty side-effects since this means that abnormally terminated
118 * programs that attached one of their BPF programs to a cgroup will leave this programs pinned for good with
119 * zero chance of recovery, until the cgroup is removed. This is particularly problematic if the cgroup in
120 * question is the root cgroup (or any other cgroup belonging to a service that cannot be restarted during
121 * operation, such as dbus), as the memory for the BPF program can only be reclaimed through a reboot. To
122 * counter this, we track closely to which cgroup a program was attached to and will detach it on our own
123 * whenever we close the BPF fd. */
124 (void) bpf_program_cgroup_detach(p);
125
f5e65279
MB
126 safe_close(p->kernel_fd);
127 free(p->instructions);
98393f85 128 free(p->attached_path);
f5e65279
MB
129
130 return mfree(p);
131}
132
6e866b33
MB
133DEFINE_TRIVIAL_REF_UNREF_FUNC(BPFProgram, bpf_program, bpf_program_free);
134
f5e65279
MB
135int bpf_program_add_instructions(BPFProgram *p, const struct bpf_insn *instructions, size_t count) {
136
137 assert(p);
138
98393f85
MB
139 if (p->kernel_fd >= 0) /* don't allow modification after we uploaded things to the kernel */
140 return -EBUSY;
141
8b3d4ff0 142 if (!GREEDY_REALLOC(p->instructions, p->n_instructions + count))
f5e65279
MB
143 return -ENOMEM;
144
145 memcpy(p->instructions + p->n_instructions, instructions, sizeof(struct bpf_insn) * count);
146 p->n_instructions += count;
147
148 return 0;
149}
150
151int bpf_program_load_kernel(BPFProgram *p, char *log_buf, size_t log_size) {
152 union bpf_attr attr;
153
154 assert(p);
155
98393f85
MB
156 if (p->kernel_fd >= 0) { /* make this idempotent */
157 memzero(log_buf, log_size);
158 return 0;
159 }
f5e65279 160
1ce460ce
MB
161 // FIXME: Clang doesn't 0-pad with structured initialization, causing
162 // the kernel to reject the bpf_attr as invalid. See:
163 // https://github.com/torvalds/linux/blob/v5.9/kernel/bpf/syscall.c#L65
164 // Ideally it should behave like GCC, so that we can remove these workarounds.
165 zero(attr);
166 attr.prog_type = p->prog_type;
167 attr.insns = PTR_TO_UINT64(p->instructions);
168 attr.insn_cnt = p->n_instructions;
169 attr.license = PTR_TO_UINT64("GPL");
170 attr.log_buf = PTR_TO_UINT64(log_buf);
171 attr.log_level = !!log_buf;
172 attr.log_size = log_size;
f5e65279
MB
173
174 p->kernel_fd = bpf(BPF_PROG_LOAD, &attr, sizeof(attr));
175 if (p->kernel_fd < 0)
176 return -errno;
177
178 return 0;
179}
180
f2dec872
BR
181int bpf_program_load_from_bpf_fs(BPFProgram *p, const char *path) {
182 union bpf_attr attr;
183
184 assert(p);
185
186 if (p->kernel_fd >= 0) /* don't overwrite an assembled or loaded program */
187 return -EBUSY;
188
1ce460ce
MB
189 zero(attr);
190 attr.pathname = PTR_TO_UINT64(path);
f2dec872
BR
191
192 p->kernel_fd = bpf(BPF_OBJ_GET, &attr, sizeof(attr));
193 if (p->kernel_fd < 0)
194 return -errno;
195
196 return 0;
197}
198
f5e65279 199int bpf_program_cgroup_attach(BPFProgram *p, int type, const char *path, uint32_t flags) {
98393f85 200 _cleanup_free_ char *copy = NULL;
f5e65279
MB
201 _cleanup_close_ int fd = -1;
202 union bpf_attr attr;
98393f85 203 int r;
f5e65279
MB
204
205 assert(p);
206 assert(type >= 0);
207 assert(path);
208
98393f85
MB
209 if (!IN_SET(flags, 0, BPF_F_ALLOW_OVERRIDE, BPF_F_ALLOW_MULTI))
210 return -EINVAL;
211
212 /* We need to track which cgroup the program is attached to, and we can only track one attachment, hence let's
213 * refuse this early. */
214 if (p->attached_path) {
215 if (!path_equal(p->attached_path, path))
216 return -EBUSY;
217 if (p->attached_type != type)
218 return -EBUSY;
219 if (p->attached_flags != flags)
220 return -EBUSY;
221
222 /* Here's a shortcut: if we previously attached this program already, then we don't have to do so
223 * again. Well, with one exception: if we are in BPF_F_ALLOW_OVERRIDE mode then someone else might have
224 * replaced our program since the last time, hence let's reattach it again, just to be safe. In flags
225 * == 0 mode this is not an issue since nobody else can replace our program in that case, and in flags
226 * == BPF_F_ALLOW_MULTI mode any other's program would be installed in addition to ours hence ours
227 * would remain in effect. */
228 if (flags != BPF_F_ALLOW_OVERRIDE)
229 return 0;
230 }
231
232 /* Ensure we have a kernel object for this. */
233 r = bpf_program_load_kernel(p, NULL, 0);
234 if (r < 0)
235 return r;
236
237 copy = strdup(path);
238 if (!copy)
239 return -ENOMEM;
240
f5e65279
MB
241 fd = open(path, O_DIRECTORY|O_RDONLY|O_CLOEXEC);
242 if (fd < 0)
243 return -errno;
244
1ce460ce
MB
245 zero(attr);
246 attr.attach_type = type;
247 attr.target_fd = fd;
248 attr.attach_bpf_fd = p->kernel_fd;
249 attr.attach_flags = flags;
f5e65279
MB
250
251 if (bpf(BPF_PROG_ATTACH, &attr, sizeof(attr)) < 0)
252 return -errno;
253
98393f85
MB
254 free_and_replace(p->attached_path, copy);
255 p->attached_type = type;
256 p->attached_flags = flags;
257
f5e65279
MB
258 return 0;
259}
260
98393f85 261int bpf_program_cgroup_detach(BPFProgram *p) {
f5e65279 262 _cleanup_close_ int fd = -1;
f5e65279 263
98393f85 264 assert(p);
f5e65279 265
98393f85
MB
266 if (!p->attached_path)
267 return -EUNATCH;
f5e65279 268
98393f85
MB
269 fd = open(p->attached_path, O_DIRECTORY|O_RDONLY|O_CLOEXEC);
270 if (fd < 0) {
271 if (errno != ENOENT)
272 return -errno;
f5e65279 273
98393f85
MB
274 /* If the cgroup does not exist anymore, then we don't have to explicitly detach, it got detached
275 * implicitly by the removal, hence don't complain */
276
277 } else {
278 union bpf_attr attr;
279
1ce460ce
MB
280 zero(attr);
281 attr.attach_type = p->attached_type;
282 attr.target_fd = fd;
283 attr.attach_bpf_fd = p->kernel_fd;
98393f85
MB
284
285 if (bpf(BPF_PROG_DETACH, &attr, sizeof(attr)) < 0)
286 return -errno;
287 }
288
289 p->attached_path = mfree(p->attached_path);
f5e65279
MB
290
291 return 0;
292}
293
294int bpf_map_new(enum bpf_map_type type, size_t key_size, size_t value_size, size_t max_entries, uint32_t flags) {
1ce460ce 295 union bpf_attr attr;
f5e65279
MB
296 int fd;
297
1ce460ce
MB
298 zero(attr);
299 attr.map_type = type;
300 attr.key_size = key_size;
301 attr.value_size = value_size;
302 attr.max_entries = max_entries;
303 attr.map_flags = flags;
304
f5e65279
MB
305 fd = bpf(BPF_MAP_CREATE, &attr, sizeof(attr));
306 if (fd < 0)
307 return -errno;
308
309 return fd;
310}
311
312int bpf_map_update_element(int fd, const void *key, void *value) {
1ce460ce 313 union bpf_attr attr;
f5e65279 314
1ce460ce
MB
315 zero(attr);
316 attr.map_fd = fd;
317 attr.key = PTR_TO_UINT64(key);
318 attr.value = PTR_TO_UINT64(value);
f5e65279
MB
319
320 if (bpf(BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr)) < 0)
321 return -errno;
322
323 return 0;
324}
325
326int bpf_map_lookup_element(int fd, const void *key, void *value) {
1ce460ce 327 union bpf_attr attr;
f5e65279 328
1ce460ce
MB
329 zero(attr);
330 attr.map_fd = fd;
331 attr.key = PTR_TO_UINT64(key);
332 attr.value = PTR_TO_UINT64(value);
f5e65279
MB
333
334 if (bpf(BPF_MAP_LOOKUP_ELEM, &attr, sizeof(attr)) < 0)
335 return -errno;
336
337 return 0;
338}
8b3d4ff0
MB
339
340int bpf_program_pin(int prog_fd, const char *bpffs_path) {
341 union bpf_attr attr;
342
343 zero(attr);
344 attr.pathname = PTR_TO_UINT64((void *) bpffs_path);
345 attr.bpf_fd = prog_fd;
346
347 if (bpf(BPF_OBJ_PIN, &attr, sizeof(attr)) < 0)
348 return -errno;
349
350 return 0;
351}
352
353int bpf_program_get_id_by_fd(int prog_fd, uint32_t *ret_id) {
354 struct bpf_prog_info info = {};
355 int r;
356
357 assert(ret_id);
358
359 r = bpf_program_get_info_by_fd(prog_fd, &info, sizeof(info));
360 if (r < 0)
361 return r;
362
363 *ret_id = info.id;
364
365 return 0;
366};
367
368int bpf_program_serialize_attachment(
369 FILE *f,
370 FDSet *fds,
371 const char *key,
372 BPFProgram *p) {
373
374 _cleanup_free_ char *escaped = NULL;
375 int copy, r;
376
377 if (!p || !p->attached_path)
378 return 0;
379
380 assert(p->kernel_fd >= 0);
381
382 escaped = cescape(p->attached_path);
383 if (!escaped)
384 return -ENOMEM;
385
386 copy = fdset_put_dup(fds, p->kernel_fd);
387 if (copy < 0)
388 return log_error_errno(copy, "Failed to add BPF kernel fd to serialize: %m");
389
390 r = serialize_item_format(
391 f,
392 key,
393 "%i %s %s",
394 copy,
395 bpf_cgroup_attach_type_to_string(p->attached_type),
396 escaped);
397 if (r < 0)
398 return r;
399
400 /* After serialization, let's forget the fact that this program is attached. The attachment — if you
401 * so will — is now 'owned' by the serialization, and not us anymore. Why does that matter? Because
402 * of BPF's less-than-ideal lifecycle handling: to detach a program from a cgroup we have to
403 * explicitly do so, it's not done implicitly on close(). Now, since we are serializing here we don't
404 * want the program to be detached while freeing things, so that the attachment can be retained after
405 * deserializing again. bpf_program_free() implicitly detaches things, if attached_path is non-NULL,
406 * hence we set it to NULL here. */
407
408 p->attached_path = mfree(p->attached_path);
409 return 0;
410}
411
412int bpf_program_serialize_attachment_set(FILE *f, FDSet *fds, const char *key, Set *set) {
413 BPFProgram *p;
414 int r;
415
416 SET_FOREACH(p, set) {
417 r = bpf_program_serialize_attachment(f, fds, key, p);
418 if (r < 0)
419 return r;
420 }
421
422 return 0;
423}
424
425int bpf_program_deserialize_attachment(const char *v, FDSet *fds, BPFProgram **bpfp) {
426 _cleanup_free_ char *sfd = NULL, *sat = NULL, *unescaped = NULL;
427 _cleanup_(bpf_program_unrefp) BPFProgram *p = NULL;
428 _cleanup_close_ int fd = -1;
429 int ifd, at, r;
430
431 assert(v);
432 assert(bpfp);
433
434 /* Extract first word: the fd number */
435 r = extract_first_word(&v, &sfd, NULL, 0);
436 if (r < 0)
437 return r;
438 if (r == 0)
439 return -EINVAL;
440
441 r = safe_atoi(sfd, &ifd);
442 if (r < 0)
443 return r;
444 if (ifd < 0)
445 return -EBADF;
446
447 /* Extract second word: the attach type */
448 r = extract_first_word(&v, &sat, NULL, 0);
449 if (r < 0)
450 return r;
451 if (r == 0)
452 return -EINVAL;
453
454 at = bpf_cgroup_attach_type_from_string(sat);
455 if (at < 0)
456 return at;
457
458 /* The rest is the path */
459 r = cunescape(v, 0, &unescaped);
460 if (r < 0)
461 return r;
462
463 fd = fdset_remove(fds, ifd);
464 if (fd < 0)
465 return fd;
466
467 p = new(BPFProgram, 1);
468 if (!p)
469 return -ENOMEM;
470
471 *p = (BPFProgram) {
472 .n_ref = 1,
473 .kernel_fd = TAKE_FD(fd),
474 .prog_type = BPF_PROG_TYPE_UNSPEC,
475 .attached_path = TAKE_PTR(unescaped),
476 .attached_type = at,
477 };
478
479 if (*bpfp)
480 bpf_program_unref(*bpfp);
481
482 *bpfp = TAKE_PTR(p);
483 return 0;
484}
485
486int bpf_program_deserialize_attachment_set(const char *v, FDSet *fds, Set **bpfsetp) {
487 BPFProgram *p = NULL;
488 int r;
489
490 assert(v);
491 assert(bpfsetp);
492
493 r = bpf_program_deserialize_attachment(v, fds, &p);
494 if (r < 0)
495 return r;
496
497 r = set_ensure_consume(bpfsetp, &bpf_program_hash_ops, p);
498 if (r < 0)
499 return r;
500
501 return 0;
502}