]>
Commit | Line | Data |
---|---|---|
cc73685d | 1 | /* SPDX-License-Identifier: LGPL-2.1+ */ |
ccb4cabe SH |
2 | |
3 | /* | |
4 | * cgfs-ng.c: this is a new, simplified implementation of a filesystem | |
5 | * cgroup backend. The original cgfs.c was designed to be as flexible | |
6 | * as possible. It would try to find cgroup filesystems no matter where | |
7 | * or how you had them mounted, and deduce the most usable mount for | |
0e7ff52c | 8 | * each controller. |
ccb4cabe SH |
9 | * |
10 | * This new implementation assumes that cgroup filesystems are mounted | |
11 | * under /sys/fs/cgroup/clist where clist is either the controller, or | |
18406e5a | 12 | * a comma-separated list of controllers. |
ccb4cabe | 13 | */ |
a54694f8 | 14 | |
646b75b5 CB |
15 | #include "config.h" |
16 | ||
d38dd64a CB |
17 | #ifndef _GNU_SOURCE |
18 | #define _GNU_SOURCE 1 | |
19 | #endif | |
646b75b5 | 20 | |
a54694f8 CB |
21 | #include <ctype.h> |
22 | #include <dirent.h> | |
23 | #include <errno.h> | |
24 | #include <grp.h> | |
d38dd64a CB |
25 | #include <linux/kdev_t.h> |
26 | #include <linux/types.h> | |
942e193e CB |
27 | #include <poll.h> |
28 | #include <signal.h> | |
a54694f8 | 29 | #include <stdint.h> |
ccb4cabe SH |
30 | #include <stdio.h> |
31 | #include <stdlib.h> | |
a54694f8 | 32 | #include <string.h> |
385e58e8 | 33 | #include <sys/epoll.h> |
438c4581 | 34 | #include <sys/types.h> |
d38dd64a | 35 | #include <unistd.h> |
c8bf519d | 36 | |
d1783ef4 | 37 | #include "af_unix.h" |
b635e92d | 38 | #include "caps.h" |
ccb4cabe | 39 | #include "cgroup.h" |
bf651989 | 40 | #include "cgroup2_devices.h" |
6328fd9c | 41 | #include "cgroup_utils.h" |
ccb4cabe | 42 | #include "commands.h" |
c8af3332 | 43 | #include "commands_utils.h" |
43654d34 | 44 | #include "conf.h" |
38fa7e47 | 45 | #include "error_utils.h" |
a54694f8 | 46 | #include "log.h" |
c19ad94b | 47 | #include "macro.h" |
018051e3 | 48 | #include "mainloop.h" |
861cb8c2 | 49 | #include "memory_utils.h" |
74ed30d7 | 50 | #include "mount_utils.h" |
43654d34 | 51 | #include "storage/storage.h" |
600a0163 | 52 | #include "string_utils.h" |
315f8a4e | 53 | #include "syscall_wrappers.h" |
a54694f8 | 54 | #include "utils.h" |
ccb4cabe | 55 | |
64e82f8b | 56 | #ifndef HAVE_STRLCPY |
58db1a61 | 57 | #include "strlcpy.h" |
64e82f8b DJ |
58 | #endif |
59 | ||
3ebe2fbd | 60 | #ifndef HAVE_STRLCAT |
58db1a61 | 61 | #include "strlcat.h" |
3ebe2fbd DJ |
62 | #endif |
63 | ||
ac2cecc4 | 64 | lxc_log_define(cgfsng, cgroup); |
ccb4cabe | 65 | |
/*
 * Grow a NULL-terminated array of pointers by one usable slot.
 *
 * @list points to the array; *list may be NULL, which is treated as an empty
 * array. The array is reallocated so that it can hold one additional entry
 * plus the terminating NULL.
 *
 * On success the index of the newly available slot is returned. The slot is
 * initialized to NULL and the caller is expected to store its entry there.
 * On allocation failure *list is left untouched and the result of
 * ret_errno(ENOMEM) is returned (callers in this file test for "< 0").
 */
static int cg_list_add(void ***list)
{
	int idx = 0;
	void **p;

	if (*list)
		while ((*list)[idx])
			idx++;

	/* One slot for the new entry and one for the NULL terminator. */
	p = realloc(*list, (idx + 2) * sizeof(*p));
	if (!p)
		return ret_errno(ENOMEM);

	/*
	 * A freshly allocated array has no old terminator at @idx, so clear
	 * the new slot explicitly. This keeps the list well-formed even before
	 * the caller fills the slot in.
	 */
	p[idx] = NULL;
	p[idx + 1] = NULL;
	*list = p;

	return idx;
}
90 | ||
/*
 * Report whether @entry occurs in the NULL-terminated string array @list.
 * A NULL @list is treated as an empty list.
 */
static bool string_in_list(char **list, const char *entry)
{
	char **cur;

	if (!list)
		return false;

	for (cur = list; *cur; cur++) {
		if (strequal(*cur, entry))
			return true;
	}

	return false;
}
105 | ||
5ae0207c CB |
106 | /* Given a handler's cgroup data, return the struct hierarchy for the controller |
107 | * @c, or NULL if there is none. | |
ccb4cabe | 108 | */ |
abb6f657 | 109 | static struct hierarchy *get_hierarchy(const struct cgroup_ops *ops, const char *controller) |
ccb4cabe | 110 | { |
77c3e9a2 CB |
111 | if (!ops->hierarchies) |
112 | return log_trace_errno(NULL, errno, "There are no useable cgroup controllers"); | |
d6337a5f | 113 | |
77c3e9a2 | 114 | for (int i = 0; ops->hierarchies[i]; i++) { |
27a5132c | 115 | if (!controller) { |
d6337a5f | 116 | /* This is the empty unified hierarchy. */ |
09ed8992 | 117 | if (ops->hierarchies[i]->controllers && !ops->hierarchies[i]->controllers[0]) |
2202afc9 | 118 | return ops->hierarchies[i]; |
09ed8992 | 119 | |
106f1f38 | 120 | continue; |
6dcd6f02 | 121 | } |
09ed8992 | 122 | |
6dcd6f02 CB |
123 | /* |
124 | * Handle controllers with significant implementation changes | |
125 | * from cgroup to cgroup2. | |
126 | */ | |
127 | if (pure_unified_layout(ops)) { | |
8b99a20a | 128 | if (strequal(controller, "devices")) { |
ca72ccb5 | 129 | if (device_utility_controller(ops->unified)) |
6dcd6f02 CB |
130 | return ops->unified; |
131 | ||
132 | break; | |
8b99a20a | 133 | } else if (strequal(controller, "freezer")) { |
ca72ccb5 | 134 | if (freezer_utility_controller(ops->unified)) |
6dcd6f02 CB |
135 | return ops->unified; |
136 | ||
137 | break; | |
138 | } | |
d6337a5f CB |
139 | } |
140 | ||
27a5132c | 141 | if (string_in_list(ops->hierarchies[i]->controllers, controller)) |
2202afc9 | 142 | return ops->hierarchies[i]; |
ccb4cabe | 143 | } |
d6337a5f | 144 | |
27a5132c CB |
145 | if (controller) |
146 | WARN("There is no useable %s controller", controller); | |
147 | else | |
148 | WARN("There is no empty unified cgroup hierarchy"); | |
149 | ||
77c3e9a2 | 150 | return ret_set_errno(NULL, ENOENT); |
ccb4cabe SH |
151 | } |
152 | ||
abb6f657 CB |
153 | int prepare_cgroup_fd(const struct cgroup_ops *ops, struct cgroup_fd *fd, bool limit) |
154 | { | |
155 | int dfd; | |
156 | const struct hierarchy *h; | |
157 | ||
158 | h = get_hierarchy(ops, fd->controller); | |
159 | if (!h) | |
160 | return ret_errno(ENOENT); | |
161 | ||
162 | /* | |
163 | * The client requested that the controller must be in a specific | |
164 | * cgroup version. | |
165 | */ | |
166 | if (fd->type != 0 && fd->type != h->fs_type) | |
167 | return ret_errno(EINVAL); | |
168 | ||
169 | if (limit) | |
170 | dfd = h->dfd_con; | |
171 | else | |
172 | dfd = h->dfd_lim; | |
173 | if (dfd < 0) | |
174 | return ret_errno(EBADF); | |
175 | ||
176 | fd->layout = ops->cgroup_layout; | |
177 | fd->type = h->fs_type; | |
178 | if (fd->type == UNIFIED_HIERARCHY) | |
179 | fd->utilities = h->utilities; | |
180 | fd->fd = dfd; | |
181 | ||
182 | return 0; | |
183 | } | |
184 | ||
/* Taken over modified from the kernel sources. */
#define NBITS 32 /* bits in uint32_t */
#define DIV_ROUND_UP(n, d) (((n) + (d)-1) / (d))
#define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, NBITS)

/*
 * The helpers below treat @bitarr as a bitmap made of 32-bit words. The mask
 * is built from an unsigned 32-bit one: shifting a signed int 1 into bit 31
 * would be undefined behavior.
 */

/* Set bit number @bit in @bitarr. */
static void set_bit(unsigned bit, uint32_t *bitarr)
{
	bitarr[bit / NBITS] |= (UINT32_C(1) << (bit % NBITS));
}

/* Clear bit number @bit in @bitarr. */
static void clear_bit(unsigned bit, uint32_t *bitarr)
{
	bitarr[bit / NBITS] &= ~(UINT32_C(1) << (bit % NBITS));
}

/* Return true if bit number @bit is set in @bitarr. */
static bool is_set(unsigned bit, uint32_t *bitarr)
{
	return (bitarr[bit / NBITS] & (UINT32_C(1) << (bit % NBITS))) != 0;
}
204 | ||
205 | /* Create cpumask from cpulist aka turn: | |
206 | * | |
207 | * 0,2-3 | |
208 | * | |
d5d468f6 | 209 | * into bit array |
a54694f8 CB |
210 | * |
211 | * 1 0 1 1 | |
212 | */ | |
f5bc57d2 | 213 | static int lxc_cpumask(char *buf, uint32_t **bitarr, size_t *last_set_bit) |
a54694f8 | 214 | { |
f5bc57d2 CB |
215 | __do_free uint32_t *arr_u32 = NULL; |
216 | size_t cur_last_set_bit = 0, nbits = 256; | |
217 | size_t nr_u32; | |
a54694f8 | 218 | char *token; |
d5d468f6 | 219 | |
f5bc57d2 CB |
220 | nr_u32 = BITS_TO_LONGS(nbits); |
221 | arr_u32 = zalloc(nr_u32 * sizeof(uint32_t)); | |
222 | if (!arr_u32) | |
223 | return ret_errno(ENOMEM); | |
a54694f8 | 224 | |
0be0d78f | 225 | lxc_iterate_parts(token, buf, ",") { |
f5bc57d2 | 226 | unsigned last_bit, first_bit; |
d5d468f6 | 227 | char *range; |
a54694f8 | 228 | |
f5bc57d2 CB |
229 | errno = 0; |
230 | first_bit = strtoul(token, NULL, 0); | |
231 | last_bit = first_bit; | |
d5d468f6 | 232 | range = strchr(token, '-'); |
a54694f8 | 233 | if (range) |
f5bc57d2 CB |
234 | last_bit = strtoul(range + 1, NULL, 0); |
235 | ||
236 | if (!(first_bit <= last_bit)) | |
237 | return ret_errno(EINVAL); | |
d5d468f6 | 238 | |
f5bc57d2 CB |
239 | if (last_bit >= nbits) { |
240 | size_t add_bits = last_bit - nbits + 32; | |
241 | size_t new_nr_u32; | |
242 | uint32_t *p; | |
a54694f8 | 243 | |
f5bc57d2 CB |
244 | new_nr_u32 = BITS_TO_LONGS(nbits + add_bits); |
245 | p = realloc(arr_u32, new_nr_u32 * sizeof(uint32_t)); | |
246 | if (!p) | |
247 | return ret_errno(ENOMEM); | |
248 | arr_u32 = move_ptr(p); | |
a54694f8 | 249 | |
f5bc57d2 CB |
250 | memset(arr_u32 + nr_u32, 0, |
251 | (new_nr_u32 - nr_u32) * sizeof(uint32_t)); | |
252 | nbits += add_bits; | |
253 | } | |
254 | ||
255 | while (first_bit <= last_bit) | |
256 | set_bit(first_bit++, arr_u32); | |
257 | ||
258 | if (last_bit > cur_last_set_bit) | |
259 | cur_last_set_bit = last_bit; | |
a54694f8 CB |
260 | } |
261 | ||
f5bc57d2 CB |
262 | *last_set_bit = cur_last_set_bit; |
263 | *bitarr = move_ptr(arr_u32); | |
264 | return 0; | |
a54694f8 CB |
265 | } |
266 | ||
4d8f68fb CB |
267 | static int lxc_cpumask_update(char *buf, uint32_t *bitarr, size_t last_set_bit, |
268 | bool clear) | |
269 | { | |
270 | bool flipped = false; | |
271 | char *token; | |
272 | ||
273 | lxc_iterate_parts(token, buf, ",") { | |
274 | unsigned last_bit, first_bit; | |
275 | char *range; | |
276 | ||
277 | errno = 0; | |
278 | first_bit = strtoul(token, NULL, 0); | |
279 | last_bit = first_bit; | |
280 | range = strchr(token, '-'); | |
281 | if (range) | |
282 | last_bit = strtoul(range + 1, NULL, 0); | |
283 | ||
284 | if (!(first_bit <= last_bit)) { | |
285 | WARN("The cup range seems to be inverted: %u-%u", first_bit, last_bit); | |
286 | continue; | |
287 | } | |
288 | ||
289 | if (last_bit > last_set_bit) | |
290 | continue; | |
291 | ||
292 | while (first_bit <= last_bit) { | |
293 | if (clear && is_set(first_bit, bitarr)) { | |
294 | flipped = true; | |
295 | clear_bit(first_bit, bitarr); | |
296 | } else if (!clear && !is_set(first_bit, bitarr)) { | |
297 | flipped = true; | |
298 | set_bit(first_bit, bitarr); | |
299 | } | |
300 | ||
301 | first_bit++; | |
302 | } | |
303 | } | |
304 | ||
305 | if (flipped) | |
306 | return 1; | |
307 | ||
308 | return 0; | |
309 | } | |
310 | ||
a54694f8 | 311 | /* Turn cpumask into simple, comma-separated cpulist. */ |
f5bc57d2 | 312 | static char *lxc_cpumask_to_cpulist(uint32_t *bitarr, size_t last_set_bit) |
a54694f8 | 313 | { |
f761d24d | 314 | __do_free_string_list char **cpulist = NULL; |
c19ad94b | 315 | char numstr[INTTYPE_TO_STRLEN(size_t)] = {0}; |
77c3e9a2 | 316 | int ret; |
a54694f8 | 317 | |
f5bc57d2 | 318 | for (size_t i = 0; i <= last_set_bit; i++) { |
414c6719 CB |
319 | if (!is_set(i, bitarr)) |
320 | continue; | |
321 | ||
0bba27c1 CB |
322 | ret = strnprintf(numstr, sizeof(numstr), "%zu", i); |
323 | if (ret < 0) | |
414c6719 | 324 | return NULL; |
414c6719 CB |
325 | |
326 | ret = lxc_append_string(&cpulist, numstr); | |
f761d24d | 327 | if (ret < 0) |
c5b8049e | 328 | return ret_set_errno(NULL, ENOMEM); |
a54694f8 | 329 | } |
414c6719 CB |
330 | |
331 | if (!cpulist) | |
c5b8049e | 332 | return ret_set_errno(NULL, ENOMEM); |
414c6719 | 333 | |
f761d24d | 334 | return lxc_string_join(",", (const char **)cpulist, false); |
a54694f8 CB |
335 | } |
336 | ||
77c3e9a2 | 337 | static inline bool is_unified_hierarchy(const struct hierarchy *h) |
c04a6d4e | 338 | { |
b8572e8c | 339 | return h->fs_type == UNIFIED_HIERARCHY; |
c04a6d4e CB |
340 | } |
341 | ||
f57ac67f CB |
342 | /* Return true if the controller @entry is found in the null-terminated list of |
343 | * hierarchies @hlist. | |
ccb4cabe | 344 | */ |
c7a1f72a | 345 | static bool controller_available(struct hierarchy **hlist, char *entry) |
ccb4cabe | 346 | { |
ccb4cabe SH |
347 | if (!hlist) |
348 | return false; | |
349 | ||
77c3e9a2 | 350 | for (int i = 0; hlist[i]; i++) |
ccb4cabe SH |
351 | if (string_in_list(hlist[i]->controllers, entry)) |
352 | return true; | |
d6337a5f | 353 | |
ccb4cabe SH |
354 | return false; |
355 | } | |
356 | ||
c7a1f72a | 357 | static bool controllers_available(struct cgroup_ops *ops) |
ccb4cabe | 358 | { |
77c3e9a2 | 359 | struct hierarchy **hlist; |
ccb4cabe | 360 | |
2202afc9 | 361 | if (!ops->cgroup_use) |
ccb4cabe | 362 | return true; |
c2712f64 | 363 | |
77c3e9a2 CB |
364 | hlist = ops->hierarchies; |
365 | for (char **cur = ops->cgroup_use; cur && *cur; cur++) | |
c7a1f72a CB |
366 | if (!controller_available(hlist, *cur)) |
367 | return log_error(false, "The %s controller found", *cur); | |
c2712f64 | 368 | |
ccb4cabe SH |
369 | return true; |
370 | } | |
371 | ||
63ba9eaf | 372 | static char **list_new(void) |
ccb4cabe | 373 | { |
63ba9eaf CB |
374 | __do_free_string_list char **list = NULL; |
375 | int idx; | |
376 | ||
4780b5e7 | 377 | idx = cg_list_add((void ***)&list); |
63ba9eaf CB |
378 | if (idx < 0) |
379 | return NULL; | |
a55f31bd | 380 | |
63ba9eaf CB |
381 | list[idx] = NULL; |
382 | return move_ptr(list); | |
35ec1a38 | 383 | } |
d6337a5f | 384 | |
63ba9eaf | 385 | static int list_add_string(char ***list, char *entry) |
35ec1a38 | 386 | { |
63ba9eaf CB |
387 | __do_free char *dup = NULL; |
388 | int idx; | |
389 | ||
390 | dup = strdup(entry); | |
391 | if (!dup) | |
392 | return ret_errno(ENOMEM); | |
393 | ||
4780b5e7 | 394 | idx = cg_list_add((void ***)list); |
63ba9eaf CB |
395 | if (idx < 0) |
396 | return idx; | |
397 | ||
398 | (*list)[idx] = move_ptr(dup); | |
399 | return 0; | |
400 | } | |
401 | ||
402 | static char **list_add_controllers(char *controllers) | |
403 | { | |
404 | __do_free_string_list char **list = NULL; | |
35ec1a38 | 405 | char *it; |
6328fd9c | 406 | |
327baffe | 407 | lxc_iterate_parts(it, controllers, ", \t\n") { |
63ba9eaf | 408 | int ret; |
d97919ab | 409 | |
63ba9eaf CB |
410 | ret = list_add_string(&list, it); |
411 | if (ret < 0) | |
d6337a5f | 412 | return NULL; |
411ac6d8 | 413 | } |
f205f10c | 414 | |
63ba9eaf | 415 | return move_ptr(list); |
d6337a5f CB |
416 | } |
417 | ||
35ec1a38 | 418 | static char **unified_controllers(int dfd, const char *file) |
d6337a5f | 419 | { |
d97919ab | 420 | __do_free char *buf = NULL; |
d6337a5f | 421 | |
46bf13b7 | 422 | buf = read_file_at(dfd, file, PROTECT_OPEN, 0); |
d6337a5f | 423 | if (!buf) |
411ac6d8 | 424 | return NULL; |
6328fd9c | 425 | |
63ba9eaf | 426 | return list_add_controllers(buf); |
ccb4cabe SH |
427 | } |
428 | ||
35ec1a38 | 429 | static bool skip_hierarchy(const struct cgroup_ops *ops, char **controllers) |
060e54d6 CB |
430 | { |
431 | if (!ops->cgroup_use) | |
35ec1a38 | 432 | return false; |
060e54d6 CB |
433 | |
434 | for (char **cur_ctrl = controllers; cur_ctrl && *cur_ctrl; cur_ctrl++) { | |
435 | bool found = false; | |
436 | ||
437 | for (char **cur_use = ops->cgroup_use; cur_use && *cur_use; cur_use++) { | |
438 | if (!strequal(*cur_use, *cur_ctrl)) | |
439 | continue; | |
440 | ||
441 | found = true; | |
442 | break; | |
443 | } | |
444 | ||
445 | if (found) | |
446 | continue; | |
447 | ||
35ec1a38 | 448 | return true; |
060e54d6 CB |
449 | } |
450 | ||
35ec1a38 | 451 | return false; |
060e54d6 CB |
452 | } |
453 | ||
179754a2 CB |
454 | static int cgroup_hierarchy_add(struct cgroup_ops *ops, int dfd_mnt, char *mnt, |
455 | int dfd_base, char *base_cgroup, | |
b8572e8c | 456 | char **controllers, cgroupfs_type_magic_t fs_type) |
ccb4cabe | 457 | { |
600a0163 | 458 | __do_free struct hierarchy *new = NULL; |
701be30e | 459 | int idx; |
ccb4cabe | 460 | |
35ec1a38 | 461 | if (abspath(base_cgroup)) |
060aaa39 | 462 | return syserror_set(-EINVAL, "Container base path must be relative to controller mount"); |
060e54d6 | 463 | |
1973b62a | 464 | new = zalloc(sizeof(*new)); |
6e214b74 | 465 | if (!new) |
060e54d6 | 466 | return ret_errno(ENOMEM); |
c72e7cb5 | 467 | |
e33870e5 | 468 | new->dfd_con = -EBADF; |
c0af7b1c | 469 | new->dfd_lim = -EBADF; |
6a32c817 | 470 | new->dfd_mon = -EBADF; |
600a0163 | 471 | |
44585f1a CB |
472 | new->fs_type = fs_type; |
473 | new->controllers = controllers; | |
a58be2ad | 474 | new->at_mnt = mnt; |
44585f1a | 475 | new->at_base = base_cgroup; |
35ec1a38 | 476 | |
44585f1a CB |
477 | new->dfd_mnt = dfd_mnt; |
478 | new->dfd_base = dfd_base; | |
35ec1a38 CB |
479 | |
480 | TRACE("Adding cgroup hierarchy mounted at %s and base cgroup %s", | |
481 | mnt, maybe_empty(base_cgroup)); | |
060e54d6 | 482 | for (char *const *it = new->controllers; it && *it; it++) |
35ec1a38 | 483 | TRACE("The hierarchy contains the %s controller", *it); |
6328fd9c | 484 | |
4780b5e7 | 485 | idx = cg_list_add((void ***)&ops->hierarchies); |
63ba9eaf CB |
486 | if (idx < 0) |
487 | return ret_errno(idx); | |
488 | ||
b8572e8c | 489 | if (fs_type == UNIFIED_HIERARCHY) |
060e54d6 | 490 | ops->unified = new; |
701be30e | 491 | (ops->hierarchies)[idx] = move_ptr(new); |
ccb4cabe | 492 | |
63ba9eaf | 493 | return 0; |
ccb4cabe SH |
494 | } |
495 | ||
c55fe36d | 496 | static int cgroup_tree_remove(struct hierarchy **hierarchies, const char *path_prune) |
c71d83e1 | 497 | { |
c55fe36d | 498 | if (!path_prune || !hierarchies) |
2202afc9 | 499 | return 0; |
d6337a5f | 500 | |
8e64b673 | 501 | for (int i = 0; hierarchies[i]; i++) { |
2202afc9 | 502 | struct hierarchy *h = hierarchies[i]; |
77c3e9a2 | 503 | int ret; |
d6337a5f | 504 | |
c55fe36d | 505 | ret = cgroup_tree_prune(h->dfd_base, path_prune); |
2202afc9 | 506 | if (ret < 0) |
c55fe36d CB |
507 | SYSWARN("Failed to destroy %d(%s)", h->dfd_base, path_prune); |
508 | else | |
509 | TRACE("Removed cgroup tree %d(%s)", h->dfd_base, path_prune); | |
2202afc9 | 510 | |
b1b1a60f | 511 | free_equal(h->path_lim, h->path_con); |
2202afc9 | 512 | } |
d6337a5f | 513 | |
c71d83e1 | 514 | return 0; |
d6337a5f CB |
515 | } |
516 | ||
/*
 * Argument bundle handed to callbacks that take a single void pointer, such
 * as cgroup_tree_remove_wrapper().
 */
struct generic_userns_exec_data {
	struct hierarchy **hierarchies; /* NULL-terminated list of hierarchies to operate on */
	const char *path_prune; /* cgroup path to prune below each hierarchy's base */
	struct lxc_conf *conf; /* container config; provides the id mapping information */
	uid_t origuid; /* target uid in parent namespace */
	char *path; /* NOTE(review): not referenced in the visible part of this file */
};
d6337a5f | 524 | |
de6fe132 | 525 | static int cgroup_tree_remove_wrapper(void *data) |
2202afc9 | 526 | { |
2202afc9 CB |
527 | struct generic_userns_exec_data *arg = data; |
528 | uid_t nsuid = (arg->conf->root_nsuid_map != NULL) ? 0 : arg->conf->init_uid; | |
529 | gid_t nsgid = (arg->conf->root_nsgid_map != NULL) ? 0 : arg->conf->init_gid; | |
8e64b673 | 530 | int ret; |
d6337a5f | 531 | |
8917c382 | 532 | if (!lxc_drop_groups() && errno != EPERM) |
b58214ac CB |
533 | return log_error_errno(-1, errno, "Failed to setgroups(0, NULL)"); |
534 | ||
2202afc9 | 535 | ret = setresgid(nsgid, nsgid, nsgid); |
8e64b673 | 536 | if (ret < 0) |
77c3e9a2 | 537 | return log_error_errno(-1, errno, "Failed to setresgid(%d, %d, %d)", |
8e64b673 | 538 | (int)nsgid, (int)nsgid, (int)nsgid); |
d6337a5f | 539 | |
2202afc9 | 540 | ret = setresuid(nsuid, nsuid, nsuid); |
8e64b673 | 541 | if (ret < 0) |
77c3e9a2 | 542 | return log_error_errno(-1, errno, "Failed to setresuid(%d, %d, %d)", |
8e64b673 | 543 | (int)nsuid, (int)nsuid, (int)nsuid); |
d6337a5f | 544 | |
c55fe36d | 545 | return cgroup_tree_remove(arg->hierarchies, arg->path_prune); |
d6337a5f CB |
546 | } |
547 | ||
434c8e15 CB |
548 | __cgfsng_ops static void cgfsng_payload_destroy(struct cgroup_ops *ops, |
549 | struct lxc_handler *handler) | |
d6337a5f CB |
550 | { |
551 | int ret; | |
bd8ef4e4 | 552 | |
fc3b9533 CB |
553 | if (!ops) { |
554 | ERROR("Called with uninitialized cgroup operations"); | |
555 | return; | |
556 | } | |
fc1c3af9 | 557 | |
69b4a4bb CB |
558 | if (!ops->hierarchies) |
559 | return; | |
560 | ||
fc3b9533 CB |
561 | if (!handler) { |
562 | ERROR("Called with uninitialized handler"); | |
563 | return; | |
564 | } | |
fc1c3af9 | 565 | |
fc3b9533 CB |
566 | if (!handler->conf) { |
567 | ERROR("Called with uninitialized conf"); | |
568 | return; | |
569 | } | |
fc1c3af9 | 570 | |
a6aeb9f1 CB |
571 | if (!ops->container_limit_cgroup) { |
572 | WARN("Uninitialized limit cgroup"); | |
573 | return; | |
574 | } | |
575 | ||
31b84c7a | 576 | ret = bpf_program_cgroup_detach(handler->cgroup_ops->cgroup2_devices); |
bf651989 CB |
577 | if (ret < 0) |
578 | WARN("Failed to detach bpf program from cgroup"); | |
bf651989 | 579 | |
0589d744 | 580 | if (!list_empty(&handler->conf->id_map)) { |
8e64b673 | 581 | struct generic_userns_exec_data wrap = { |
77c3e9a2 | 582 | .conf = handler->conf, |
c55fe36d | 583 | .path_prune = ops->container_limit_cgroup, |
77c3e9a2 CB |
584 | .hierarchies = ops->hierarchies, |
585 | .origuid = 0, | |
8e64b673 | 586 | }; |
de6fe132 CB |
587 | ret = userns_exec_1(handler->conf, cgroup_tree_remove_wrapper, |
588 | &wrap, "cgroup_tree_remove_wrapper"); | |
8e64b673 | 589 | } else { |
c55fe36d | 590 | ret = cgroup_tree_remove(ops->hierarchies, ops->container_limit_cgroup); |
ccb4cabe | 591 | } |
8e64b673 | 592 | if (ret < 0) |
fc3b9533 | 593 | SYSWARN("Failed to destroy cgroups"); |
ccb4cabe SH |
594 | } |
595 | ||
033267c9 CB |
596 | #define __ISOL_CPUS "/sys/devices/system/cpu/isolated" |
597 | #define __OFFLINE_CPUS "/sys/devices/system/cpu/offline" | |
598 | static bool cpuset1_cpus_initialize(int dfd_parent, int dfd_child, | |
599 | bool am_initialized) | |
434c8e15 | 600 | { |
033267c9 CB |
601 | __do_free char *cpulist = NULL, *fpath = NULL, *isolcpus = NULL, |
602 | *offlinecpus = NULL, *posscpus = NULL; | |
4d8f68fb | 603 | __do_free uint32_t *possmask = NULL; |
033267c9 | 604 | int ret; |
4d8f68fb | 605 | size_t poss_last_set_bit = 0; |
b376d3d0 | 606 | |
033267c9 CB |
607 | posscpus = read_file_at(dfd_parent, "cpuset.cpus", PROTECT_OPEN, 0); |
608 | if (!posscpus) | |
609 | return log_error_errno(false, errno, "Failed to read file \"%s\"", fpath); | |
610 | ||
033267c9 CB |
611 | if (file_exists(__ISOL_CPUS)) { |
612 | isolcpus = read_file_at(-EBADF, __ISOL_CPUS, PROTECT_OPEN, 0); | |
613 | if (!isolcpus) | |
614 | return log_error_errno(false, errno, "Failed to read file \"%s\"", __ISOL_CPUS); | |
615 | ||
f5bc57d2 CB |
616 | if (!isdigit(isolcpus[0])) |
617 | free_disarm(isolcpus); | |
033267c9 CB |
618 | } else { |
619 | TRACE("The path \""__ISOL_CPUS"\" to read isolated cpus from does not exist"); | |
fc3b9533 | 620 | } |
434c8e15 | 621 | |
033267c9 CB |
622 | if (file_exists(__OFFLINE_CPUS)) { |
623 | offlinecpus = read_file_at(-EBADF, __OFFLINE_CPUS, PROTECT_OPEN, 0); | |
624 | if (!offlinecpus) | |
625 | return log_error_errno(false, errno, "Failed to read file \"%s\"", __OFFLINE_CPUS); | |
434c8e15 | 626 | |
f5bc57d2 CB |
627 | if (!isdigit(offlinecpus[0])) |
628 | free_disarm(offlinecpus); | |
033267c9 CB |
629 | } else { |
630 | TRACE("The path \""__OFFLINE_CPUS"\" to read offline cpus from does not exist"); | |
fc3b9533 | 631 | } |
b376d3d0 | 632 | |
f5bc57d2 | 633 | if (!isolcpus && !offlinecpus) { |
033267c9 CB |
634 | cpulist = move_ptr(posscpus); |
635 | goto copy_parent; | |
fc3b9533 | 636 | } |
1973b62a | 637 | |
f5bc57d2 CB |
638 | ret = lxc_cpumask(posscpus, &possmask, &poss_last_set_bit); |
639 | if (ret) | |
033267c9 | 640 | return log_error_errno(false, errno, "Failed to create cpumask for possible cpus"); |
434c8e15 | 641 | |
4d8f68fb CB |
642 | if (isolcpus) |
643 | ret = lxc_cpumask_update(isolcpus, possmask, poss_last_set_bit, true); | |
434c8e15 | 644 | |
4d8f68fb CB |
645 | if (offlinecpus) |
646 | ret |= lxc_cpumask_update(offlinecpus, possmask, poss_last_set_bit, true); | |
c468e4d4 | 647 | |
4d8f68fb | 648 | if (!ret) { |
f5bc57d2 | 649 | cpulist = lxc_cpumask_to_cpulist(possmask, poss_last_set_bit); |
033267c9 CB |
650 | TRACE("No isolated or offline cpus present in cpuset"); |
651 | } else { | |
652 | cpulist = move_ptr(posscpus); | |
653 | TRACE("Removed isolated or offline cpus from cpuset"); | |
654 | } | |
655 | if (!cpulist) | |
656 | return log_error_errno(false, errno, "Failed to create cpu list"); | |
1973b62a | 657 | |
033267c9 CB |
658 | copy_parent: |
659 | if (!am_initialized) { | |
660 | ret = lxc_writeat(dfd_child, "cpuset.cpus", cpulist, strlen(cpulist)); | |
661 | if (ret < 0) | |
662 | return log_error_errno(false, errno, "Failed to write cpu list to \"%d/cpuset.cpus\"", dfd_child); | |
77ffeed2 | 663 | |
033267c9 CB |
664 | TRACE("Copied cpu settings of parent cgroup"); |
665 | } | |
77ffeed2 | 666 | |
033267c9 CB |
667 | return true; |
668 | } | |
1973b62a | 669 | |
033267c9 CB |
670 | static bool cpuset1_initialize(int dfd_base, int dfd_next) |
671 | { | |
672 | char mems[PATH_MAX]; | |
673 | ssize_t bytes; | |
674 | char v; | |
434c8e15 | 675 | |
21e84b02 | 676 | /* Determine whether the base cgroup has cpuset inheritance turned on. */ |
033267c9 CB |
677 | bytes = lxc_readat(dfd_base, "cgroup.clone_children", &v, 1); |
678 | if (bytes < 0) | |
9fc21b2d | 679 | return syserror_ret(false, "Failed to read file %d(cgroup.clone_children)", dfd_base); |
033267c9 | 680 | |
21e84b02 | 681 | /* Initialize cpuset.cpus removing any isolated and offline cpus. */ |
033267c9 | 682 | if (!cpuset1_cpus_initialize(dfd_base, dfd_next, v == '1')) |
9fc21b2d | 683 | return syserror_ret(false, "Failed to initialize cpuset.cpus"); |
033267c9 CB |
684 | |
685 | /* Read cpuset.mems from parent... */ | |
686 | bytes = lxc_readat(dfd_base, "cpuset.mems", mems, sizeof(mems)); | |
687 | if (bytes < 0) | |
9fc21b2d | 688 | return syserror_ret(false, "Failed to read file %d(cpuset.mems)", dfd_base); |
033267c9 | 689 | |
21e84b02 | 690 | /* and copy to first cgroup in the tree... */ |
033267c9 CB |
691 | bytes = lxc_writeat(dfd_next, "cpuset.mems", mems, bytes); |
692 | if (bytes < 0) | |
9fc21b2d | 693 | return syserror_ret(false, "Failed to write %d(cpuset.mems)", dfd_next); |
033267c9 | 694 | |
21e84b02 | 695 | /* and finally turn on cpuset inheritance. */ |
033267c9 CB |
696 | bytes = lxc_writeat(dfd_next, "cgroup.clone_children", "1", 1); |
697 | if (bytes < 0) | |
9fc21b2d | 698 | return syserror_ret(false, "Failed to write %d(cgroup.clone_children)", dfd_next); |
033267c9 CB |
699 | |
700 | return log_trace(true, "Initialized cpuset in the legacy hierarchy"); | |
434c8e15 CB |
701 | } |
702 | ||
033267c9 CB |
703 | static int __cgroup_tree_create(int dfd_base, const char *path, mode_t mode, |
704 | bool cpuset_v1, bool eexist_ignore) | |
6099dd5a | 705 | { |
da42ac7b CB |
706 | __do_close int dfd_final = -EBADF; |
707 | int dfd_cur = dfd_base; | |
708 | int ret = 0; | |
709 | size_t len; | |
710 | char *cur; | |
711 | char buf[PATH_MAX]; | |
6099dd5a | 712 | |
da42ac7b | 713 | if (is_empty_string(path)) |
bce04069 | 714 | return ret_errno(EINVAL); |
6099dd5a | 715 | |
da42ac7b CB |
716 | len = strlcpy(buf, path, sizeof(buf)); |
717 | if (len >= sizeof(buf)) | |
bce04069 | 718 | return ret_errno(E2BIG); |
6099dd5a | 719 | |
da42ac7b CB |
720 | lxc_iterate_parts(cur, buf, "/") { |
721 | /* | |
722 | * Even though we vetted the paths when we parsed the config | |
723 | * we're paranoid here and check that the path is neither | |
724 | * absolute nor walks upwards. | |
725 | */ | |
e4db08ed | 726 | if (abspath(cur)) |
060aaa39 | 727 | return syserror_set(-EINVAL, "No absolute paths allowed"); |
6099dd5a | 728 | |
e4db08ed | 729 | if (strnequal(cur, "..", STRLITERALLEN(".."))) |
060aaa39 | 730 | return syserror_set(-EINVAL, "No upward walking paths allowed"); |
6099dd5a | 731 | |
da42ac7b CB |
732 | ret = mkdirat(dfd_cur, cur, mode); |
733 | if (ret < 0) { | |
734 | if (errno != EEXIST) | |
2d7b0895 | 735 | return syserror("Failed to create %d(%s)", dfd_cur, cur); |
da42ac7b CB |
736 | |
737 | ret = -EEXIST; | |
738 | } | |
739 | TRACE("%s %d(%s) cgroup", !ret ? "Created" : "Reusing", dfd_cur, cur); | |
740 | ||
741 | dfd_final = open_at(dfd_cur, cur, PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH, 0); | |
742 | if (dfd_final < 0) | |
2d7b0895 | 743 | return syserror("Fail to open%s directory %d(%s)", |
da42ac7b CB |
744 | !ret ? " newly created" : "", dfd_base, cur); |
745 | if (dfd_cur != dfd_base) | |
746 | close(dfd_cur); | |
033267c9 | 747 | else if (cpuset_v1 && !cpuset1_initialize(dfd_base, dfd_final)) |
2d7b0895 | 748 | return syserror_set(-EINVAL, "Failed to initialize cpuset controller in the legacy hierarchy"); |
da42ac7b | 749 | /* |
033267c9 CB |
750 | * Leave dfd_final pointing to the last fd we opened so |
751 | * it will be automatically zapped if we return early. | |
da42ac7b CB |
752 | */ |
753 | dfd_cur = dfd_final; | |
754 | } | |
755 | ||
756 | /* The final cgroup must be succesfully creatd by us. */ | |
033267c9 CB |
757 | if (ret) { |
758 | if (ret != -EEXIST || !eexist_ignore) | |
0d8d13be | 759 | return syswarn_set(ret, "Creating the final cgroup %d(%s) failed", dfd_base, path); |
033267c9 | 760 | } |
da42ac7b CB |
761 | |
762 | return move_fd(dfd_final); | |
6099dd5a CB |
763 | } |
764 | ||
/*
 * Create the cgroup(s) for a payload or monitor process in hierarchy @h.
 *
 * @ops               cgroup operations; setup_limits_legacy() is called for
 *                    hierarchies that contain the "devices" controller
 * @conf              container config passed on to setup_limits_legacy()
 * @h                 hierarchy to create the cgroups in; its dfd_lim, dfd_con,
 *                    dfd_mon, path_lim and path_con members are updated
 * @cgroup_limit_dir  cgroup path relative to h->dfd_base
 * @cgroup_leaf       optional leaf cgroup below @cgroup_limit_dir
 * @payload           true for the payload, false for the monitor
 *
 * Returns true on success and false on failure.
 */
static bool cgroup_tree_create(struct cgroup_ops *ops, struct lxc_conf *conf,
			       struct hierarchy *h, const char *cgroup_limit_dir,
			       const char *cgroup_leaf, bool payload)
{
	__do_close int fd_limit = -EBADF, fd_final = -EBADF;
	bool cpuset_v1 = false;

	/*
	 * The legacy cpuset controller needs massaging in case inheriting
	 * settings from its immediate ancestor cgroup hasn't been turned on.
	 */
	cpuset_v1 = !is_unified_hierarchy(h) && string_in_list(h->controllers, "cpuset");

	if (payload && cgroup_leaf) {
		/* With isolation both parts need to not already exist. */
		fd_limit = __cgroup_tree_create(h->dfd_base, cgroup_limit_dir, 0755, cpuset_v1, false);
		if (fd_limit < 0)
			return syswarn_ret(false, "Failed to create limiting cgroup %d(%s)", h->dfd_base, cgroup_limit_dir);

		h->path_lim = make_cgroup_path(h, h->at_base, cgroup_limit_dir, NULL);
		h->dfd_lim = move_fd(fd_limit);

		TRACE("Created limit cgroup %d->%d(%s)",
		      h->dfd_lim, h->dfd_base, cgroup_limit_dir);

		/*
		 * With isolation the devices legacy cgroup needs to be
		 * initialized early, as it typically contains an 'a' (all)
		 * line, which is not possible once a subdirectory has been
		 * created.
		 */
		if (string_in_list(h->controllers, "devices") &&
		    !ops->setup_limits_legacy(ops, conf, true))
			return log_warn(false, "Failed to setup legacy device limits");

		/*
		 * If we use a separate limit cgroup, the leaf cgroup, i.e. the
		 * cgroup the container actually resides in, is below fd_limit.
		 */
		fd_final = __cgroup_tree_create(h->dfd_lim, cgroup_leaf, 0755, cpuset_v1, false);
		if (fd_final < 0) {
			/* Ensure we don't leave any garbage behind. */
			if (cgroup_tree_prune(h->dfd_base, cgroup_limit_dir))
				SYSWARN("Failed to destroy %d(%s)", h->dfd_base, cgroup_limit_dir);
			else
				TRACE("Removed cgroup tree %d(%s)", h->dfd_base, cgroup_limit_dir);
			/*
			 * NOTE(review): the cgroup that failed to be created is
			 * cgroup_leaf below h->dfd_lim, but the message reports
			 * h->dfd_base and cgroup_limit_dir - confirm whether
			 * that is intended.
			 */
			return syswarn_ret(false, "Failed to create %s cgroup %d(%s)", payload ? "payload" : "monitor", h->dfd_base, cgroup_limit_dir);
		}
		h->dfd_con = move_fd(fd_final);
		h->path_con = must_make_path(h->path_lim, cgroup_leaf, NULL);

	} else {
		/* No separate leaf: a single cgroup is created. */
		fd_final = __cgroup_tree_create(h->dfd_base, cgroup_limit_dir, 0755, cpuset_v1, false);
		if (fd_final < 0)
			return syswarn_ret(false, "Failed to create %s cgroup %d(%s)", payload ? "payload" : "monitor", h->dfd_base, cgroup_limit_dir);

		if (payload) {
			/* Limit and container cgroup share one fd and one path here. */
			h->dfd_con = move_fd(fd_final);
			h->dfd_lim = h->dfd_con;
			h->path_con = make_cgroup_path(h, h->at_base, cgroup_limit_dir, NULL);

			h->path_lim = h->path_con;
		} else {
			h->dfd_mon = move_fd(fd_final);
		}
	}

	return true;
}
834 | ||
6c880cdf CB |
835 | static void cgroup_tree_prune_leaf(struct hierarchy *h, const char *path_prune, |
836 | bool payload) | |
ccb4cabe | 837 | { |
c1ece895 | 838 | bool prune = true; |
72068e74 | 839 | |
1973b62a | 840 | if (payload) { |
c1ece895 | 841 | /* Check whether we actually created the cgroup to prune. */ |
c0af7b1c | 842 | if (h->dfd_lim < 0) |
c1ece895 CB |
843 | prune = false; |
844 | ||
b1b1a60f | 845 | free_equal(h->path_con, h->path_lim); |
c0af7b1c | 846 | close_equal(h->dfd_con, h->dfd_lim); |
1973b62a | 847 | } else { |
c1ece895 | 848 | /* Check whether we actually created the cgroup to prune. */ |
6a32c817 | 849 | if (h->dfd_mon < 0) |
c1ece895 CB |
850 | prune = false; |
851 | ||
6a32c817 | 852 | close_prot_errno_disarm(h->dfd_mon); |
1973b62a | 853 | } |
e56639fb | 854 | |
c1ece895 CB |
855 | /* We didn't create this cgroup. */ |
856 | if (!prune) | |
857 | return; | |
858 | ||
859 | if (cgroup_tree_prune(h->dfd_base, path_prune)) | |
cb423bd3 CB |
860 | SYSWARN("Failed to destroy %d(%s)", h->dfd_base, path_prune); |
861 | else | |
862 | TRACE("Removed cgroup tree %d(%s)", h->dfd_base, path_prune); | |
a900cbaf WB |
863 | } |
864 | ||
033267c9 CB |
865 | __cgfsng_ops static void cgfsng_monitor_destroy(struct cgroup_ops *ops, |
866 | struct lxc_handler *handler) | |
867 | { | |
868 | int len; | |
869 | char pidstr[INTTYPE_TO_STRLEN(pid_t)]; | |
870 | const struct lxc_conf *conf; | |
871 | ||
872 | if (!ops) { | |
873 | ERROR("Called with uninitialized cgroup operations"); | |
874 | return; | |
875 | } | |
876 | ||
877 | if (!ops->hierarchies) | |
878 | return; | |
879 | ||
880 | if (!handler) { | |
881 | ERROR("Called with uninitialized handler"); | |
882 | return; | |
883 | } | |
884 | ||
885 | if (!handler->conf) { | |
886 | ERROR("Called with uninitialized conf"); | |
887 | return; | |
888 | } | |
889 | conf = handler->conf; | |
890 | ||
1e058855 CB |
891 | if (!ops->monitor_cgroup) { |
892 | WARN("Uninitialized monitor cgroup"); | |
893 | return; | |
894 | } | |
895 | ||
033267c9 CB |
896 | len = strnprintf(pidstr, sizeof(pidstr), "%d", handler->monitor_pid); |
897 | if (len < 0) | |
898 | return; | |
899 | ||
900 | for (int i = 0; ops->hierarchies[i]; i++) { | |
901 | __do_close int fd_pivot = -EBADF; | |
902 | __do_free char *pivot_path = NULL; | |
903 | struct hierarchy *h = ops->hierarchies[i]; | |
904 | bool cpuset_v1 = false; | |
905 | int ret; | |
906 | ||
033267c9 CB |
907 | /* Monitor might have died before we entered the cgroup. */ |
908 | if (handler->monitor_pid <= 0) { | |
909 | WARN("No valid monitor process found while destroying cgroups"); | |
c55fe36d | 910 | goto cgroup_prune_tree; |
033267c9 CB |
911 | } |
912 | ||
913 | if (conf->cgroup_meta.monitor_pivot_dir) | |
914 | pivot_path = must_make_path(conf->cgroup_meta.monitor_pivot_dir, CGROUP_PIVOT, NULL); | |
033267c9 CB |
915 | else if (conf->cgroup_meta.dir) |
916 | pivot_path = must_make_path(conf->cgroup_meta.dir, CGROUP_PIVOT, NULL); | |
917 | else | |
918 | pivot_path = must_make_path(CGROUP_PIVOT, NULL); | |
919 | ||
920 | cpuset_v1 = !is_unified_hierarchy(h) && string_in_list(h->controllers, "cpuset"); | |
921 | ||
922 | fd_pivot = __cgroup_tree_create(h->dfd_base, pivot_path, 0755, cpuset_v1, true); | |
923 | if (fd_pivot < 0) { | |
924 | SYSWARN("Failed to create pivot cgroup %d(%s)", h->dfd_base, pivot_path); | |
925 | continue; | |
926 | } | |
927 | ||
928 | ret = lxc_writeat(fd_pivot, "cgroup.procs", pidstr, len); | |
929 | if (ret != 0) { | |
930 | SYSWARN("Failed to move monitor %s to \"%s\"", pidstr, pivot_path); | |
931 | continue; | |
932 | } | |
933 | ||
c55fe36d CB |
934 | cgroup_prune_tree: |
935 | ret = cgroup_tree_prune(h->dfd_base, ops->monitor_cgroup); | |
033267c9 | 936 | if (ret < 0) |
c55fe36d CB |
937 | SYSWARN("Failed to destroy %d(%s)", h->dfd_base, ops->monitor_cgroup); |
938 | else | |
939 | TRACE("Removed cgroup tree %d(%s)", h->dfd_base, ops->monitor_cgroup); | |
033267c9 CB |
940 | } |
941 | } | |
942 | ||
a900cbaf WB |
943 | /* |
944 | * Check that lxc.cgroup.dir is not combined with the monitor, payload or | |
945 | * namespace directory options, and that monitor and payload are set together. | |
946 | * | |
947 | * Returns true if the configuration is consistent, false otherwise. | |
948 | */ | |
949 | static bool check_cgroup_dir_config(struct lxc_conf *conf) | |
950 | { | |
951 | const char *monitor_dir = conf->cgroup_meta.monitor_dir, | |
952 | *container_dir = conf->cgroup_meta.container_dir, | |
953 | *namespace_dir = conf->cgroup_meta.namespace_dir; | |
a900cbaf WB |
954 | |
955 | /* none of the new options are set, all is fine */ | |
956 | if (!monitor_dir && !container_dir && !namespace_dir) | |
957 | return true; | |
958 | ||
959 | /* some are set, make sure lxc.cgroup.dir is not also set */ | |
960 | if (conf->cgroup_meta.dir) | |
961 | return log_error_errno(false, EINVAL, | |
962 | "lxc.cgroup.dir conflicts with lxc.cgroup.dir.payload/monitor"); | |
963 | ||
964 | /* make sure both monitor and payload are set */ | |
965 | if (!monitor_dir || !container_dir) | |
966 | return log_error_errno(false, EINVAL, | |
967 | "lxc.cgroup.dir.payload and lxc.cgroup.dir.monitor must both be set"); | |
968 | ||
969 | /* namespace_dir may be empty */ | |
970 | return true; | |
72068e74 CB |
971 | } |
972 | ||
59eac805 | 973 | __cgfsng_ops static bool cgfsng_monitor_create(struct cgroup_ops *ops, struct lxc_handler *handler) |
72068e74 | 974 | { |
dcf6a5c7 | 975 | __do_free char *monitor_cgroup = NULL; |
fe70edee CB |
976 | int idx = 0; |
977 | int i; | |
5ce03bc0 | 978 | size_t len; |
a900cbaf | 979 | char *suffix = NULL; |
0d66e29a | 980 | struct lxc_conf *conf; |
72068e74 | 981 | |
0d66e29a CB |
982 | if (!ops) |
983 | return ret_set_errno(false, ENOENT); | |
e56639fb | 984 | |
69b4a4bb CB |
985 | if (!ops->hierarchies) |
986 | return true; | |
987 | ||
0d66e29a CB |
988 | if (ops->monitor_cgroup) |
989 | return ret_set_errno(false, EEXIST); | |
990 | ||
991 | if (!handler || !handler->conf) | |
992 | return ret_set_errno(false, EINVAL); | |
993 | ||
994 | conf = handler->conf; | |
995 | ||
a900cbaf WB |
996 | if (!check_cgroup_dir_config(conf)) |
997 | return false; | |
998 | ||
999 | if (conf->cgroup_meta.monitor_dir) { | |
a900cbaf WB |
1000 | monitor_cgroup = strdup(conf->cgroup_meta.monitor_dir); |
1001 | } else if (conf->cgroup_meta.dir) { | |
fe70edee CB |
1002 | monitor_cgroup = must_concat(&len, conf->cgroup_meta.dir, "/", |
1003 | DEFAULT_MONITOR_CGROUP_PREFIX, | |
1004 | handler->name, | |
1005 | CGROUP_CREATE_RETRY, NULL); | |
b3ed2061 | 1006 | } else if (ops->cgroup_pattern) { |
dcf6a5c7 CB |
1007 | __do_free char *cgroup_tree = NULL; |
1008 | ||
1009 | cgroup_tree = lxc_string_replace("%n", handler->name, ops->cgroup_pattern); | |
1010 | if (!cgroup_tree) | |
d6bdd182 CB |
1011 | return ret_set_errno(false, ENOMEM); |
1012 | ||
d6bdd182 CB |
1013 | monitor_cgroup = must_concat(&len, cgroup_tree, "/", |
1014 | DEFAULT_MONITOR_CGROUP, | |
b3ed2061 CB |
1015 | CGROUP_CREATE_RETRY, NULL); |
1016 | } else { | |
fe70edee CB |
1017 | monitor_cgroup = must_concat(&len, DEFAULT_MONITOR_CGROUP_PREFIX, |
1018 | handler->name, | |
1019 | CGROUP_CREATE_RETRY, NULL); | |
b3ed2061 | 1020 | } |
fe70edee | 1021 | if (!monitor_cgroup) |
0d66e29a | 1022 | return ret_set_errno(false, ENOMEM); |
72068e74 | 1023 | |
a900cbaf WB |
1024 | if (!conf->cgroup_meta.monitor_dir) { |
1025 | suffix = monitor_cgroup + len - CGROUP_CREATE_RETRY_LEN; | |
1026 | *suffix = '\0'; | |
1027 | } | |
5ce03bc0 | 1028 | do { |
a900cbaf | 1029 | if (idx && suffix) |
fe70edee | 1030 | sprintf(suffix, "-%d", idx); |
72068e74 | 1031 | |
ebc10afe | 1032 | for (i = 0; ops->hierarchies[i]; i++) { |
432faf20 | 1033 | if (cgroup_tree_create(ops, handler->conf, |
dcf6a5c7 | 1034 | ops->hierarchies[i], |
6fec4327 | 1035 | monitor_cgroup, NULL, false)) |
fe70edee CB |
1036 | continue; |
1037 | ||
7064ee3a | 1038 | DEBUG("Failed to create cgroup %s)", monitor_cgroup); |
6c880cdf CB |
1039 | for (int j = 0; j <= i; j++) |
1040 | cgroup_tree_prune_leaf(ops->hierarchies[j], | |
1041 | monitor_cgroup, false); | |
fe70edee CB |
1042 | |
1043 | idx++; | |
1044 | break; | |
5ce03bc0 | 1045 | } |
a900cbaf | 1046 | } while (ops->hierarchies[i] && idx > 0 && idx < 1000 && suffix); |
5ce03bc0 | 1047 | |
a900cbaf | 1048 | if (idx == 1000 || (!suffix && idx != 0)) |
04a49a14 | 1049 | return log_error_errno(false, ERANGE, "Failed to create monitor cgroup"); |
72068e74 | 1050 | |
c581d2a6 | 1051 | ops->monitor_cgroup = move_ptr(monitor_cgroup); |
6e8703a4 | 1052 | return log_info(true, "The monitor process uses \"%s\" as cgroup", ops->monitor_cgroup); |
ccb4cabe SH |
1053 | } |
1054 | ||
fe70edee CB |
1055 | /* |
1056 | * Try to create the same cgroup in all hierarchies. Start with cgroup_pattern; | |
cecad0c1 | 1057 | * next cgroup_pattern-1, -2, ..., -999. |
ccb4cabe | 1058 | */ |
59eac805 | 1059 | __cgfsng_ops static bool cgfsng_payload_create(struct cgroup_ops *ops, struct lxc_handler *handler) |
ccb4cabe | 1060 | { |
a6aeb9f1 CB |
1061 | __do_free char *container_cgroup = NULL, *__limit_cgroup = NULL; |
1062 | char *limit_cgroup; | |
f3839f12 | 1063 | int idx = 0; |
fe70edee | 1064 | int i; |
ccb4cabe | 1065 | size_t len; |
a900cbaf | 1066 | char *suffix = NULL; |
f3839f12 | 1067 | struct lxc_conf *conf; |
43654d34 | 1068 | |
f3839f12 CB |
1069 | if (!ops) |
1070 | return ret_set_errno(false, ENOENT); | |
ccb4cabe | 1071 | |
69b4a4bb CB |
1072 | if (!ops->hierarchies) |
1073 | return true; | |
1074 | ||
471929c6 | 1075 | if (ops->container_cgroup || ops->container_limit_cgroup) |
f3839f12 CB |
1076 | return ret_set_errno(false, EEXIST); |
1077 | ||
1078 | if (!handler || !handler->conf) | |
1079 | return ret_set_errno(false, EINVAL); | |
1080 | ||
1081 | conf = handler->conf; | |
1082 | ||
a900cbaf WB |
1083 | if (!check_cgroup_dir_config(conf)) |
1084 | return false; | |
1085 | ||
1086 | if (conf->cgroup_meta.container_dir) { | |
a6aeb9f1 CB |
1087 | __limit_cgroup = strdup(conf->cgroup_meta.container_dir); |
1088 | if (!__limit_cgroup) | |
a900cbaf WB |
1089 | return ret_set_errno(false, ENOMEM); |
1090 | ||
432faf20 | 1091 | if (conf->cgroup_meta.namespace_dir) { |
a6aeb9f1 | 1092 | container_cgroup = must_make_path(__limit_cgroup, |
432faf20 WB |
1093 | conf->cgroup_meta.namespace_dir, |
1094 | NULL); | |
a6aeb9f1 | 1095 | limit_cgroup = __limit_cgroup; |
432faf20 WB |
1096 | } else { |
1097 | /* explicit paths but without isolation */ | |
a6aeb9f1 CB |
1098 | limit_cgroup = move_ptr(__limit_cgroup); |
1099 | container_cgroup = limit_cgroup; | |
432faf20 | 1100 | } |
a900cbaf | 1101 | } else if (conf->cgroup_meta.dir) { |
a6aeb9f1 CB |
1102 | limit_cgroup = must_concat(&len, conf->cgroup_meta.dir, "/", |
1103 | DEFAULT_PAYLOAD_CGROUP_PREFIX, | |
1104 | handler->name, | |
1105 | CGROUP_CREATE_RETRY, NULL); | |
1106 | container_cgroup = limit_cgroup; | |
b3ed2061 | 1107 | } else if (ops->cgroup_pattern) { |
dcf6a5c7 CB |
1108 | __do_free char *cgroup_tree = NULL; |
1109 | ||
1110 | cgroup_tree = lxc_string_replace("%n", handler->name, ops->cgroup_pattern); | |
1111 | if (!cgroup_tree) | |
d6bdd182 CB |
1112 | return ret_set_errno(false, ENOMEM); |
1113 | ||
a6aeb9f1 CB |
1114 | limit_cgroup = must_concat(&len, cgroup_tree, "/", |
1115 | DEFAULT_PAYLOAD_CGROUP, | |
1116 | CGROUP_CREATE_RETRY, NULL); | |
1117 | container_cgroup = limit_cgroup; | |
b3ed2061 | 1118 | } else { |
a6aeb9f1 CB |
1119 | limit_cgroup = must_concat(&len, DEFAULT_PAYLOAD_CGROUP_PREFIX, |
1120 | handler->name, | |
1121 | CGROUP_CREATE_RETRY, NULL); | |
1122 | container_cgroup = limit_cgroup; | |
b3ed2061 | 1123 | } |
a6aeb9f1 | 1124 | if (!limit_cgroup) |
fe70edee | 1125 | return ret_set_errno(false, ENOMEM); |
ccb4cabe | 1126 | |
a900cbaf WB |
1127 | if (!conf->cgroup_meta.container_dir) { |
1128 | suffix = container_cgroup + len - CGROUP_CREATE_RETRY_LEN; | |
1129 | *suffix = '\0'; | |
1130 | } | |
d97919ab | 1131 | do { |
a900cbaf | 1132 | if (idx && suffix) |
fe70edee | 1133 | sprintf(suffix, "-%d", idx); |
bb30b52a | 1134 | |
d97919ab | 1135 | for (i = 0; ops->hierarchies[i]; i++) { |
432faf20 | 1136 | if (cgroup_tree_create(ops, handler->conf, |
a6aeb9f1 CB |
1137 | ops->hierarchies[i], limit_cgroup, |
1138 | conf->cgroup_meta.namespace_dir, | |
6fec4327 | 1139 | true)) |
fe70edee CB |
1140 | continue; |
1141 | ||
67ed60ce | 1142 | DEBUG("Failed to create cgroup \"%s\"", ops->hierarchies[i]->path_con ?: "(null)"); |
6c880cdf CB |
1143 | for (int j = 0; j <= i; j++) |
1144 | cgroup_tree_prune_leaf(ops->hierarchies[j], | |
a6aeb9f1 | 1145 | limit_cgroup, true); |
fe70edee CB |
1146 | |
1147 | idx++; | |
1148 | break; | |
66b66624 | 1149 | } |
a900cbaf | 1150 | } while (ops->hierarchies[i] && idx > 0 && idx < 1000 && suffix); |
cecad0c1 | 1151 | |
a900cbaf | 1152 | if (idx == 1000 || (!suffix && idx != 0)) |
04a49a14 | 1153 | return log_error_errno(false, ERANGE, "Failed to create container cgroup"); |
cecad0c1 | 1154 | |
fe70edee | 1155 | ops->container_cgroup = move_ptr(container_cgroup); |
a6aeb9f1 CB |
1156 | if (__limit_cgroup) |
1157 | ops->container_limit_cgroup = move_ptr(__limit_cgroup); | |
c55fe36d CB |
1158 | else |
1159 | ops->container_limit_cgroup = ops->container_cgroup; | |
a6aeb9f1 CB |
1160 | INFO("The container process uses \"%s\" as inner and \"%s\" as limit cgroup", |
1161 | ops->container_cgroup, ops->container_limit_cgroup); | |
ccb4cabe | 1162 | return true; |
ccb4cabe SH |
1163 | } |
1164 | ||
c581d2a6 CB |
1165 | __cgfsng_ops static bool cgfsng_monitor_enter(struct cgroup_ops *ops, |
1166 | struct lxc_handler *handler) | |
ccb4cabe | 1167 | { |
fdb0b8ab | 1168 | int monitor_len, transient_len = 0; |
c581d2a6 CB |
1169 | char monitor[INTTYPE_TO_STRLEN(pid_t)], |
1170 | transient[INTTYPE_TO_STRLEN(pid_t)]; | |
ccb4cabe | 1171 | |
797fa65e CB |
1172 | if (!ops) |
1173 | return ret_set_errno(false, ENOENT); | |
1174 | ||
69b4a4bb CB |
1175 | if (!ops->hierarchies) |
1176 | return true; | |
1177 | ||
797fa65e CB |
1178 | if (!ops->monitor_cgroup) |
1179 | return ret_set_errno(false, ENOENT); | |
1180 | ||
1181 | if (!handler || !handler->conf) | |
1182 | return ret_set_errno(false, EINVAL); | |
1183 | ||
0bba27c1 CB |
1184 | monitor_len = strnprintf(monitor, sizeof(monitor), "%d", handler->monitor_pid); |
1185 | if (monitor_len < 0) | |
1186 | return false; | |
1187 | ||
1188 | if (handler->transient_pid > 0) { | |
1189 | transient_len = strnprintf(transient, sizeof(transient), "%d", handler->transient_pid); | |
1190 | if (transient_len < 0) | |
1191 | return false; | |
1192 | } | |
ccb4cabe | 1193 | |
eeef32bb | 1194 | for (int i = 0; ops->hierarchies[i]; i++) { |
1973b62a | 1195 | struct hierarchy *h = ops->hierarchies[i]; |
c581d2a6 | 1196 | int ret; |
08768001 | 1197 | |
6a32c817 | 1198 | ret = lxc_writeat(h->dfd_mon, "cgroup.procs", monitor, monitor_len); |
1973b62a | 1199 | if (ret) |
6a32c817 | 1200 | return log_error_errno(false, errno, "Failed to enter cgroup %d", h->dfd_mon); |
c581d2a6 | 1201 | |
6a32c817 | 1202 | TRACE("Moved monitor into cgroup %d", h->dfd_mon); |
ebf88e5b | 1203 | |
34683042 | 1204 | if (handler->transient_pid <= 0) |
d1ee8719 | 1205 | continue; |
c581d2a6 | 1206 | |
6a32c817 | 1207 | ret = lxc_writeat(h->dfd_mon, "cgroup.procs", transient, transient_len); |
1973b62a | 1208 | if (ret) |
6a32c817 | 1209 | return log_error_errno(false, errno, "Failed to enter cgroup %d", h->dfd_mon); |
1973b62a | 1210 | |
6a32c817 | 1211 | TRACE("Moved transient process into cgroup %d", h->dfd_mon); |
ebf88e5b | 1212 | |
1973b62a | 1213 | /* |
78eb6aa6 | 1214 | * we don't keep the fds for non-unified hierarchies around |
1973b62a | 1215 | * mainly because we don't make use of them anymore after the |
78eb6aa6 | 1216 | * core cgroup setup is done but also because there are quite a |
1973b62a CB |
1217 | * lot of them. |
1218 | */ | |
1219 | if (!is_unified_hierarchy(h)) | |
6a32c817 | 1220 | close_prot_errno_disarm(h->dfd_mon); |
ccb4cabe | 1221 | } |
c581d2a6 | 1222 | handler->transient_pid = -1; |
ccb4cabe SH |
1223 | |
1224 | return true; | |
1225 | } | |
1226 | ||
c581d2a6 CB |
1227 | __cgfsng_ops static bool cgfsng_payload_enter(struct cgroup_ops *ops, |
1228 | struct lxc_handler *handler) | |
eeef32bb | 1229 | { |
c581d2a6 CB |
1230 | int len; |
1231 | char pidstr[INTTYPE_TO_STRLEN(pid_t)]; | |
eeef32bb | 1232 | |
4490328e CB |
1233 | if (!ops) |
1234 | return ret_set_errno(false, ENOENT); | |
1235 | ||
c581d2a6 CB |
1236 | if (!ops->hierarchies) |
1237 | return true; | |
1238 | ||
4490328e CB |
1239 | if (!ops->container_cgroup) |
1240 | return ret_set_errno(false, ENOENT); | |
1241 | ||
1242 | if (!handler || !handler->conf) | |
1243 | return ret_set_errno(false, EINVAL); | |
1244 | ||
0bba27c1 CB |
1245 | len = strnprintf(pidstr, sizeof(pidstr), "%d", handler->pid); |
1246 | if (len < 0) | |
1247 | return false; | |
c581d2a6 CB |
1248 | |
1249 | for (int i = 0; ops->hierarchies[i]; i++) { | |
1973b62a | 1250 | struct hierarchy *h = ops->hierarchies[i]; |
c581d2a6 CB |
1251 | int ret; |
1252 | ||
b3a42865 CB |
1253 | if (is_unified_hierarchy(h) && |
1254 | (handler->clone_flags & CLONE_INTO_CGROUP)) | |
f7176c3e CB |
1255 | continue; |
1256 | ||
e33870e5 | 1257 | ret = lxc_writeat(h->dfd_con, "cgroup.procs", pidstr, len); |
c581d2a6 | 1258 | if (ret != 0) |
67ed60ce | 1259 | return log_error_errno(false, errno, "Failed to enter cgroup \"%s\"", h->path_con); |
25db3f94 | 1260 | |
67ed60ce | 1261 | TRACE("Moved container into %s cgroup via %d", h->path_con, h->dfd_con); |
c581d2a6 CB |
1262 | } |
1263 | ||
1264 | return true; | |
eeef32bb CB |
1265 | } |
1266 | ||
1973b62a CB |
1267 | static int fchowmodat(int dirfd, const char *path, uid_t chown_uid, |
1268 | gid_t chown_gid, mode_t chmod_mode) | |
6efacf80 CB |
1269 | { |
1270 | int ret; | |
1271 | ||
1973b62a CB |
1272 | ret = fchownat(dirfd, path, chown_uid, chown_gid, |
1273 | AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW); | |
1274 | if (ret < 0) | |
1275 | return log_warn_errno(-1, | |
1276 | errno, "Failed to fchownat(%d, %s, %d, %d, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW )", | |
1277 | dirfd, path, (int)chown_uid, | |
1278 | (int)chown_gid); | |
6efacf80 | 1279 | |
1973b62a CB |
1280 | ret = fchmodat(dirfd, (*path != '\0') ? path : ".", chmod_mode, 0); |
1281 | if (ret < 0) | |
1282 | return log_warn_errno(-1, errno, "Failed to fchmodat(%d, %s, %d, AT_SYMLINK_NOFOLLOW)", | |
1283 | dirfd, path, (int)chmod_mode); | |
6efacf80 CB |
1284 | |
1285 | return 0; | |
1286 | } | |
1287 | ||
1288 | /* chgrp the container cgroups to container group. We leave | |
c0888dfe SH |
1289 | * the container owner as cgroup owner. So we must make the |
1290 | * directories 775 so that the container can create sub-cgroups. | |
43647298 SH |
1291 | * |
1292 | * Also chown the tasks and cgroup.procs files. Those may not | |
1293 | * exist depending on kernel version. | |
c0888dfe | 1294 | */ |
ccb4cabe SH |
1295 | static int chown_cgroup_wrapper(void *data) |
1296 | { | |
6a720d74 | 1297 | int ret; |
4160c3a0 CB |
1298 | uid_t destuid; |
1299 | struct generic_userns_exec_data *arg = data; | |
1300 | uid_t nsuid = (arg->conf->root_nsuid_map != NULL) ? 0 : arg->conf->init_uid; | |
1301 | gid_t nsgid = (arg->conf->root_nsgid_map != NULL) ? 0 : arg->conf->init_gid; | |
ccb4cabe | 1302 | |
8917c382 | 1303 | if (!lxc_drop_groups() && errno != EPERM) |
b58214ac CB |
1304 | return log_error_errno(-1, errno, "Failed to setgroups(0, NULL)"); |
1305 | ||
6efacf80 | 1306 | ret = setresgid(nsgid, nsgid, nsgid); |
803e4123 | 1307 | if (ret < 0) |
77c3e9a2 | 1308 | return log_error_errno(-1, errno, "Failed to setresgid(%d, %d, %d)", |
803e4123 | 1309 | (int)nsgid, (int)nsgid, (int)nsgid); |
6efacf80 CB |
1310 | |
1311 | ret = setresuid(nsuid, nsuid, nsuid); | |
803e4123 | 1312 | if (ret < 0) |
77c3e9a2 | 1313 | return log_error_errno(-1, errno, "Failed to setresuid(%d, %d, %d)", |
803e4123 | 1314 | (int)nsuid, (int)nsuid, (int)nsuid); |
6efacf80 | 1315 | |
ccb4cabe | 1316 | destuid = get_ns_uid(arg->origuid); |
b962868f CB |
1317 | if (destuid == LXC_INVALID_UID) |
1318 | destuid = 0; | |
ccb4cabe | 1319 | |
6a720d74 | 1320 | for (int i = 0; arg->hierarchies[i]; i++) { |
e33870e5 | 1321 | int dirfd = arg->hierarchies[i]->dfd_con; |
43647298 | 1322 | |
7f02fd24 | 1323 | if (dirfd < 0) |
060aaa39 | 1324 | return syserror_set(-EBADF, "Invalid cgroup file descriptor"); |
7f02fd24 | 1325 | |
1973b62a | 1326 | (void)fchowmodat(dirfd, "", destuid, nsgid, 0775); |
c0888dfe | 1327 | |
1973b62a CB |
1328 | /* |
1329 | * Failures to chown() these are inconvenient but not | |
6efacf80 CB |
1330 | * detrimental. We leave these owned by the container launcher, | |
1331 | * so that container root can write to the files to attach. We | |
1332 | * chmod() them 664 so that container systemd can write to the | |
1333 | * files (which systemd in wily insists on doing). | |
ab8f5424 | 1334 | */ |
6efacf80 | 1335 | |
b8572e8c | 1336 | if (arg->hierarchies[i]->fs_type == LEGACY_HIERARCHY) |
1973b62a | 1337 | (void)fchowmodat(dirfd, "tasks", destuid, nsgid, 0664); |
43647298 | 1338 | |
1973b62a | 1339 | (void)fchowmodat(dirfd, "cgroup.procs", destuid, nsgid, 0664); |
0e17357c | 1340 | |
b8572e8c | 1341 | if (arg->hierarchies[i]->fs_type != UNIFIED_HIERARCHY) |
0e17357c CB |
1342 | continue; |
1343 | ||
042f9e9c | 1344 | for (char **p = arg->hierarchies[i]->delegate; p && *p; p++) |
1973b62a | 1345 | (void)fchowmodat(dirfd, *p, destuid, nsgid, 0664); |
ccb4cabe SH |
1346 | } |
1347 | ||
1348 | return 0; | |
1349 | } | |
1350 | ||
b857f4be | 1351 | __cgfsng_ops static bool cgfsng_chown(struct cgroup_ops *ops, |
c98bbf71 | 1352 | struct lxc_conf *conf) |
ccb4cabe | 1353 | { |
4160c3a0 | 1354 | struct generic_userns_exec_data wrap; |
ccb4cabe | 1355 | |
c98bbf71 CB |
1356 | if (!ops) |
1357 | return ret_set_errno(false, ENOENT); | |
ccb4cabe | 1358 | |
69b4a4bb CB |
1359 | if (!ops->hierarchies) |
1360 | return true; | |
1361 | ||
c98bbf71 CB |
1362 | if (!ops->container_cgroup) |
1363 | return ret_set_errno(false, ENOENT); | |
1364 | ||
1365 | if (!conf) | |
1366 | return ret_set_errno(false, EINVAL); | |
1367 | ||
0589d744 | 1368 | if (list_empty(&conf->id_map)) |
c98bbf71 CB |
1369 | return true; |
1370 | ||
ccb4cabe | 1371 | wrap.origuid = geteuid(); |
4160c3a0 | 1372 | wrap.path = NULL; |
2202afc9 | 1373 | wrap.hierarchies = ops->hierarchies; |
4160c3a0 | 1374 | wrap.conf = conf; |
ccb4cabe | 1375 | |
c98bbf71 CB |
1376 | if (userns_exec_1(conf, chown_cgroup_wrapper, &wrap, "chown_cgroup_wrapper") < 0) |
1377 | return log_error_errno(false, errno, "Error requesting cgroup chown in new user namespace"); | |
ccb4cabe SH |
1378 | |
1379 | return true; | |
1380 | } | |
1381 | ||
840eec19 | 1382 | __cgfsng_ops static void cgfsng_finalize(struct cgroup_ops *ops) |
78eb6aa6 CB |
1383 | { |
1384 | if (!ops) | |
1385 | return; | |
1386 | ||
1387 | if (!ops->hierarchies) | |
1388 | return; | |
1389 | ||
840eec19 CB |
1390 | for (int i = 0; ops->hierarchies[i]; i++) { |
1391 | struct hierarchy *h = ops->hierarchies[i]; | |
1392 | ||
1393 | /* Close all monitor cgroup file descriptors. */ | |
1394 | close_prot_errno_disarm(h->dfd_mon); | |
1395 | } | |
1396 | /* Close the cgroup root file descriptor. */ | |
1397 | close_prot_errno_disarm(ops->dfd_mnt); | |
1398 | ||
6dcd6f02 CB |
1399 | /* |
1400 | * The checking for freezer support should obviously be done at cgroup | |
1401 | * initialization time but that doesn't work reliably. The freezer | |
1402 | * controller has been demoted (rightly so) to a simple file located in | |
1403 | * each non-root cgroup. At the time when the container is created we | |
1404 | * might still be located in /sys/fs/cgroup and so checking for | |
1405 | * cgroup.freeze won't tell us anything because this file doesn't exist | |
1406 | * in the root cgroup. We could then iterate through /sys/fs/cgroup and | |
1407 | * find an already existing cgroup and then check within that cgroup | |
1408 | * for the existence of cgroup.freeze but that will only work on | |
1409 | * systemd based hosts. Other init systems might not manage cgroups and | |
1410 | * so no cgroup will exist. So we defer until we have created cgroups | |
1411 | * for our container which means we check here. | |
1412 | */ | |
1413 | if (pure_unified_layout(ops) && | |
e33870e5 | 1414 | !faccessat(ops->unified->dfd_con, "cgroup.freeze", F_OK, |
6dcd6f02 CB |
1415 | AT_SYMLINK_NOFOLLOW)) { |
1416 | TRACE("Unified hierarchy supports freezer"); | |
ca72ccb5 | 1417 | ops->unified->utilities |= FREEZER_CONTROLLER; |
6dcd6f02 | 1418 | } |
78eb6aa6 CB |
1419 | } |
1420 | ||
8aa1044f | 1421 | /* cgroup-full:* is done, no need to create subdirs */ |
bd09ee98 | 1422 | static inline bool cg_mount_needs_subdirs(int cgroup_automount_type) |
8aa1044f | 1423 | { |
bd09ee98 | 1424 | switch (cgroup_automount_type) { |
51feb8db CB |
1425 | case LXC_AUTO_CGROUP_RO: |
1426 | return true; | |
1427 | case LXC_AUTO_CGROUP_RW: | |
1428 | return true; | |
1429 | case LXC_AUTO_CGROUP_MIXED: | |
1430 | return true; | |
1431 | } | |
1432 | ||
1433 | return false; | |
8aa1044f SH |
1434 | } |
1435 | ||
886cac86 CB |
1436 | /* After $rootfs/sys/fs/container/controller/the/cg/path has been created, |
1437 | * remount controller ro if needed and bindmount the cgroupfs onto | |
25fa6f8c | 1438 | * control/the/cg/path. |
8aa1044f | 1439 | */ |
bd09ee98 | 1440 | static int cg_legacy_mount_controllers(int cgroup_automount_type, struct hierarchy *h, |
a9db9474 | 1441 | char *hierarchy_mnt, char *cgpath, |
6812d833 | 1442 | const char *container_cgroup) |
8aa1044f | 1443 | { |
d97919ab | 1444 | __do_free char *sourcepath = NULL; |
5285689c | 1445 | int ret, remount_flags; |
886cac86 CB |
1446 | int flags = MS_BIND; |
1447 | ||
bd09ee98 CB |
1448 | if ((cgroup_automount_type == LXC_AUTO_CGROUP_RO) || |
1449 | (cgroup_automount_type == LXC_AUTO_CGROUP_MIXED)) { | |
a9db9474 | 1450 | ret = mount(hierarchy_mnt, hierarchy_mnt, "cgroup", MS_BIND, NULL); |
77c3e9a2 CB |
1451 | if (ret < 0) |
1452 | return log_error_errno(-1, errno, "Failed to bind mount \"%s\" onto \"%s\"", | |
a9db9474 | 1453 | hierarchy_mnt, hierarchy_mnt); |
886cac86 | 1454 | |
a9db9474 CB |
1455 | remount_flags = add_required_remount_flags(hierarchy_mnt, |
1456 | hierarchy_mnt, | |
5285689c | 1457 | flags | MS_REMOUNT); |
a9db9474 | 1458 | ret = mount(hierarchy_mnt, hierarchy_mnt, "cgroup", |
8186c5c7 CB |
1459 | remount_flags | MS_REMOUNT | MS_BIND | MS_RDONLY, |
1460 | NULL); | |
77c3e9a2 | 1461 | if (ret < 0) |
a9db9474 | 1462 | return log_error_errno(-1, errno, "Failed to remount \"%s\" ro", hierarchy_mnt); |
886cac86 | 1463 | |
a9db9474 | 1464 | INFO("Remounted %s read-only", hierarchy_mnt); |
8aa1044f | 1465 | } |
886cac86 | 1466 | |
44585f1a | 1467 | sourcepath = make_cgroup_path(h, h->at_base, container_cgroup, NULL); |
bd09ee98 | 1468 | if (cgroup_automount_type == LXC_AUTO_CGROUP_RO) |
8aa1044f | 1469 | flags |= MS_RDONLY; |
886cac86 CB |
1470 | |
1471 | ret = mount(sourcepath, cgpath, "cgroup", flags, NULL); | |
77c3e9a2 CB |
1472 | if (ret < 0) |
1473 | return log_error_errno(-1, errno, "Failed to mount \"%s\" onto \"%s\"", | |
1474 | h->controllers[0], cgpath); | |
886cac86 | 1475 | INFO("Mounted \"%s\" onto \"%s\"", h->controllers[0], cgpath); |
f8c40ffa L |
1476 | |
1477 | if (flags & MS_RDONLY) { | |
5285689c CB |
1478 | remount_flags = add_required_remount_flags(sourcepath, cgpath, |
1479 | flags | MS_REMOUNT); | |
1480 | ret = mount(sourcepath, cgpath, "cgroup", remount_flags, NULL); | |
77c3e9a2 CB |
1481 | if (ret < 0) |
1482 | return log_error_errno(-1, errno, "Failed to remount \"%s\" ro", cgpath); | |
5285689c | 1483 | INFO("Remounted %s read-only", cgpath); |
f8c40ffa L |
1484 | } |
1485 | ||
886cac86 | 1486 | INFO("Completed second stage cgroup automounts for \"%s\"", cgpath); |
8aa1044f SH |
1487 | return 0; |
1488 | } | |
1489 | ||
44234ae1 | 1490 | /* __cgroupfs_mount |
6812d833 CB |
1491 | * |
1492 | * Mount cgroup hierarchies directly without using bind-mounts. The main | |
1493 | * use-cases are mounting cgroup hierarchies in cgroup namespaces and mounting | |
1494 | * cgroups for the LXC_AUTO_CGROUP_FULL option. | |
1495 | */ | |
bd09ee98 | 1496 | static int __cgroupfs_mount(int cgroup_automount_type, struct hierarchy *h, |
44234ae1 CB |
1497 | struct lxc_rootfs *rootfs, int dfd_mnt_cgroupfs, |
1498 | const char *hierarchy_mnt) | |
b635e92d | 1499 | { |
a099c5db CB |
1500 | __do_close int fd_fs = -EBADF; |
1501 | unsigned int flags = 0; | |
02efd041 CB |
1502 | char *fstype; |
1503 | int ret; | |
1504 | ||
1505 | if (dfd_mnt_cgroupfs < 0) | |
1506 | return ret_errno(EINVAL); | |
1507 | ||
a099c5db CB |
1508 | flags |= MOUNT_ATTR_NOSUID; |
1509 | flags |= MOUNT_ATTR_NOEXEC; | |
1510 | flags |= MOUNT_ATTR_NODEV; | |
1511 | flags |= MOUNT_ATTR_RELATIME; | |
02efd041 | 1512 | |
bd09ee98 CB |
1513 | if ((cgroup_automount_type == LXC_AUTO_CGROUP_RO) || |
1514 | (cgroup_automount_type == LXC_AUTO_CGROUP_FULL_RO)) | |
a099c5db | 1515 | flags |= MOUNT_ATTR_RDONLY; |
02efd041 | 1516 | |
bd09ee98 | 1517 | if (is_unified_hierarchy(h)) |
02efd041 | 1518 | fstype = "cgroup2"; |
bd09ee98 | 1519 | else |
02efd041 | 1520 | fstype = "cgroup"; |
b635e92d | 1521 | |
de7f9f33 | 1522 | if (can_use_mount_api()) { |
635e7bac CB |
1523 | fd_fs = fs_prepare(fstype, -EBADF, "", 0, 0); |
1524 | if (fd_fs < 0) | |
1525 | return log_error_errno(-errno, errno, "Failed to prepare filesystem context for %s", fstype); | |
1526 | ||
1527 | if (!is_unified_hierarchy(h)) { | |
1528 | for (const char **it = (const char **)h->controllers; it && *it; it++) { | |
aa72fbe7 | 1529 | if (strnequal(*it, "name=", STRLITERALLEN("name="))) |
635e7bac CB |
1530 | ret = fs_set_property(fd_fs, "name", *it + STRLITERALLEN("name=")); |
1531 | else | |
1532 | ret = fs_set_property(fd_fs, *it, ""); | |
1533 | if (ret < 0) | |
1534 | return log_error_errno(-errno, errno, "Failed to add %s controller to cgroup filesystem context %d(dev)", *it, fd_fs); | |
1535 | } | |
1536 | } | |
1537 | ||
1538 | ret = fs_attach(fd_fs, dfd_mnt_cgroupfs, hierarchy_mnt, | |
1539 | PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH, | |
1540 | flags); | |
1541 | } else { | |
a099c5db CB |
1542 | __do_free char *controllers = NULL, *target = NULL; |
1543 | unsigned int old_flags = 0; | |
02efd041 CB |
1544 | const char *rootfs_mnt; |
1545 | ||
a099c5db CB |
1546 | if (!is_unified_hierarchy(h)) { |
1547 | controllers = lxc_string_join(",", (const char **)h->controllers, false); | |
1548 | if (!controllers) | |
1549 | return ret_errno(ENOMEM); | |
1550 | } | |
1551 | ||
02efd041 | 1552 | rootfs_mnt = get_rootfs_mnt(rootfs); |
a099c5db CB |
1553 | ret = mnt_attributes_old(flags, &old_flags); |
1554 | if (ret) | |
1555 | return log_error_errno(-EINVAL, EINVAL, "Unsupported mount properties specified"); | |
1556 | ||
02efd041 | 1557 | target = must_make_path(rootfs_mnt, DEFAULT_CGROUP_MOUNTPOINT, hierarchy_mnt, NULL); |
a099c5db | 1558 | ret = safe_mount(NULL, target, fstype, old_flags, controllers, rootfs_mnt); |
02efd041 | 1559 | } |
77c3e9a2 | 1560 | if (ret < 0) |
02efd041 CB |
1561 | return log_error_errno(ret, errno, "Failed to mount %s filesystem onto %d(%s)", |
1562 | fstype, dfd_mnt_cgroupfs, maybe_empty(hierarchy_mnt)); | |
b635e92d | 1563 | |
02efd041 CB |
1564 | DEBUG("Mounted cgroup filesystem %s onto %d(%s)", |
1565 | fstype, dfd_mnt_cgroupfs, maybe_empty(hierarchy_mnt)); | |
b635e92d CB |
1566 | return 0; |
1567 | } | |
1568 | ||
/*
 * Mount the cgroup filesystem for hierarchy @h at @hierarchy_mnt beneath the
 * directory fd @dfd_mnt_cgroupfs.
 *
 * This simply forwards every argument to __cgroupfs_mount() and hands its
 * return value back to the caller unchanged.
 */
static inline int cgroupfs_mount(int cgroup_automount_type, struct hierarchy *h,
				 struct lxc_rootfs *rootfs,
				 int dfd_mnt_cgroupfs, const char *hierarchy_mnt)
{
	int ret;

	ret = __cgroupfs_mount(cgroup_automount_type, h, rootfs,
			       dfd_mnt_cgroupfs, hierarchy_mnt);

	return ret;
}
1576 | ||
bd09ee98 | 1577 | static inline int cgroupfs_bind_mount(int cgroup_automount_type, struct hierarchy *h, |
14111650 CB |
1578 | struct lxc_rootfs *rootfs, |
1579 | int dfd_mnt_cgroupfs, | |
1580 | const char *hierarchy_mnt) | |
6812d833 | 1581 | { |
bd09ee98 | 1582 | switch (cgroup_automount_type) { |
51feb8db CB |
1583 | case LXC_AUTO_CGROUP_FULL_RO: |
1584 | break; | |
1585 | case LXC_AUTO_CGROUP_FULL_RW: | |
1586 | break; | |
1587 | case LXC_AUTO_CGROUP_FULL_MIXED: | |
1588 | break; | |
1589 | default: | |
6812d833 | 1590 | return 0; |
51feb8db | 1591 | } |
6812d833 | 1592 | |
bd09ee98 CB |
1593 | return __cgroupfs_mount(cgroup_automount_type, h, rootfs, |
1594 | dfd_mnt_cgroupfs, hierarchy_mnt); | |
6812d833 CB |
1595 | } |
1596 | ||
b857f4be | 1597 | __cgfsng_ops static bool cgfsng_mount(struct cgroup_ops *ops, |
cdd3b77d | 1598 | struct lxc_handler *handler, int cg_flags) |
ccb4cabe | 1599 | { |
9bca62b3 | 1600 | __do_close int dfd_mnt_tmpfs = -EBADF, fd_fs = -EBADF; |
6607d6e9 | 1601 | __do_free char *cgroup_root = NULL; |
bd09ee98 | 1602 | int cgroup_automount_type; |
937a3af9 | 1603 | bool in_cgroup_ns = false, wants_force_mount = false; |
ab8cd5d9 | 1604 | struct lxc_conf *conf = handler->conf; |
315f8a4e | 1605 | struct lxc_rootfs *rootfs = &conf->rootfs; |
02efd041 | 1606 | const char *rootfs_mnt = get_rootfs_mnt(rootfs); |
dfa835ac | 1607 | int ret; |
8aa1044f | 1608 | |
9585ccb3 CB |
1609 | if (!ops) |
1610 | return ret_set_errno(false, ENOENT); | |
1611 | ||
69b4a4bb CB |
1612 | if (!ops->hierarchies) |
1613 | return true; | |
1614 | ||
315f8a4e | 1615 | if (!conf) |
9585ccb3 CB |
1616 | return ret_set_errno(false, EINVAL); |
1617 | ||
cdd3b77d | 1618 | if ((cg_flags & LXC_AUTO_CGROUP_MASK) == 0) |
c581c8a3 | 1619 | return log_trace(true, "No cgroup mounts requested"); |
8aa1044f | 1620 | |
69c29673 CB |
1621 | if (cg_flags & LXC_AUTO_CGROUP_FORCE) { |
1622 | cg_flags &= ~LXC_AUTO_CGROUP_FORCE; | |
3f69fb12 | 1623 | wants_force_mount = true; |
69c29673 CB |
1624 | } |
1625 | ||
1626 | switch (cg_flags) { | |
1627 | case LXC_AUTO_CGROUP_RO: | |
1628 | TRACE("Read-only cgroup mounts requested"); | |
1629 | break; | |
1630 | case LXC_AUTO_CGROUP_RW: | |
1631 | TRACE("Read-write cgroup mounts requested"); | |
1632 | break; | |
1633 | case LXC_AUTO_CGROUP_MIXED: | |
1634 | TRACE("Mixed cgroup mounts requested"); | |
1635 | break; | |
1636 | case LXC_AUTO_CGROUP_FULL_RO: | |
1637 | TRACE("Full read-only cgroup mounts requested"); | |
1638 | break; | |
1639 | case LXC_AUTO_CGROUP_FULL_RW: | |
1640 | TRACE("Full read-write cgroup mounts requested"); | |
1641 | break; | |
1642 | case LXC_AUTO_CGROUP_FULL_MIXED: | |
1643 | TRACE("Full mixed cgroup mounts requested"); | |
1644 | break; | |
1645 | default: | |
1646 | return log_error_errno(false, EINVAL, "Invalid cgroup mount options specified"); | |
1647 | } | |
bd09ee98 | 1648 | cgroup_automount_type = cg_flags; |
b635e92d | 1649 | |
4547e73e | 1650 | if (!wants_force_mount) { |
315f8a4e | 1651 | wants_force_mount = !lxc_wants_cap(CAP_SYS_ADMIN, conf); |
4547e73e CB |
1652 | |
1653 | /* | |
1654 | * Most recent distro versions currently have init system that | |
1655 | * do support cgroup2 but do not mount it by default unless | |
1656 | * explicitly told so even if the host is cgroup2 only. That | |
1657 | * means they often will fail to boot. Fix this by pre-mounting | |
1658 | * cgroup2 by default. We will likely need to be doing this a | |
1659 | * few years until all distros have switched over to cgroup2 at | |
1660 | * which point we can safely assume that their init systems | |
1661 | * will mount it themselves. | |
1662 | */ | |
1663 | if (pure_unified_layout(ops)) | |
1664 | wants_force_mount = true; | |
3f69fb12 | 1665 | } |
8aa1044f | 1666 | |
2c4348bd | 1667 | if (cgns_supported() && container_uses_namespace(handler, CLONE_NEWCGROUP)) |
937a3af9 | 1668 | in_cgroup_ns = true; |
6768700d | 1669 | |
937a3af9 | 1670 | if (in_cgroup_ns && !wants_force_mount) |
3a86fb37 | 1671 | return log_trace(true, "Mounting cgroups not requested or needed"); |
8aa1044f | 1672 | |
02efd041 CB |
1673 | /* This is really the codepath that we want. */ |
1674 | if (pure_unified_layout(ops)) { | |
9bca62b3 CB |
1675 | __do_close int dfd_mnt_unified = -EBADF; |
1676 | ||
1677 | dfd_mnt_unified = open_at(rootfs->dfd_mnt, DEFAULT_CGROUP_MOUNTPOINT_RELATIVE, | |
1678 | PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH_XDEV, 0); | |
1679 | if (dfd_mnt_unified < 0) | |
9fc21b2d CB |
1680 | return syserror_ret(false, "Failed to open %d(%s)", |
1681 | rootfs->dfd_mnt, DEFAULT_CGROUP_MOUNTPOINT_RELATIVE); | |
e7e45fdf CB |
1682 | /* |
1683 | * If cgroup namespaces are supported but the container will | |
1684 | * not have CAP_SYS_ADMIN after it has started we need to mount | |
1685 | * the cgroups manually. | |
a3e5ec26 CB |
1686 | * |
1687 | * Note that here we know that wants_force_mount is true. | |
1688 | * Otherwise we would've returned early above. | |
e7e45fdf | 1689 | */ |
a3e5ec26 CB |
1690 | if (in_cgroup_ns) { |
1691 | /* | |
1692 | * 1. cgroup:rw:force -> Mount the cgroup2 filesystem. | |
1693 | * 2. cgroup:ro:force -> Mount the cgroup2 filesystem read-only. | |
1694 | * 3. cgroup:mixed:force -> See comment above how this | |
1695 | * does not apply so | |
1696 | * cgroup:mixed is equal to | |
1697 | * cgroup:rw when cgroup | |
1698 | * namespaces are supported. | |
1699 | ||
1700 | * 4. cgroup:rw -> No-op; init system responsible for mounting. | |
1701 | * 5. cgroup:ro -> No-op; init system responsible for mounting. | |
1702 | * 6. cgroup:mixed -> No-op; init system responsible for mounting. | |
1703 | * | |
1704 | * 7. cgroup-full:rw -> Not supported. | |
1705 | * 8. cgroup-full:ro -> Not supported. | |
1706 | * 9. cgroup-full:mixed -> Not supported. | |
1707 | ||
1708 | * 10. cgroup-full:rw:force -> Not supported. | |
1709 | * 11. cgroup-full:ro:force -> Not supported. | |
1710 | * 12. cgroup-full:mixed:force -> Not supported. | |
1711 | */ | |
bd09ee98 | 1712 | ret = cgroupfs_mount(cgroup_automount_type, ops->unified, rootfs, dfd_mnt_unified, ""); |
a3e5ec26 | 1713 | if (ret < 0) |
9fc21b2d | 1714 | return syserror_ret(false, "Failed to force mount cgroup filesystem in cgroup namespace"); |
a3e5ec26 CB |
1715 | |
1716 | return log_trace(true, "Force mounted cgroup filesystem in new cgroup namespace"); | |
1717 | } else { | |
1718 | /* | |
1719 | * Either no cgroup namespace supported (highly | |
1720 | * unlikely unless we're dealing with a Frankenkernel. | |
1721 | * Or the user requested to keep the cgroup namespace | |
1722 | * of the host or another container. | |
1723 | */ | |
1724 | if (wants_force_mount) { | |
1725 | /* | |
1726 | * 1. cgroup:rw:force -> Bind-mount the cgroup2 filesystem writable. | |
1727 | * 2. cgroup:ro:force -> Bind-mount the cgroup2 filesystem read-only. | |
1728 | * 3. cgroup:mixed:force -> bind-mount the cgroup2 filesystem and | |
1729 | * and make the parent directory of the | |
1730 | * container's cgroup read-only but the | |
1731 | * container's cgroup writable. | |
1732 | * | |
1733 | * 10. cgroup-full:rw:force -> | |
1734 | * 11. cgroup-full:ro:force -> | |
1735 | * 12. cgroup-full:mixed:force -> | |
1736 | */ | |
1737 | errno = EOPNOTSUPP; | |
1738 | SYSWARN("Force-mounting the unified cgroup hierarchy without cgroup namespace support is currently not supported"); | |
1739 | } else { | |
1740 | errno = EOPNOTSUPP; | |
1741 | SYSWARN("Mounting the unified cgroup hierarchy without cgroup namespace support is currently not supported"); | |
1742 | } | |
1743 | } | |
8d661d38 | 1744 | |
9fc21b2d | 1745 | return syserror_ret(false, "Failed to mount cgroups"); |
8d661d38 CB |
1746 | } |
1747 | ||
e6d4df78 CB |
1748 | /* |
1749 | * Mount a tmpfs over DEFAULT_CGROUP_MOUNTPOINT. Note that we're | |
1750 | * relying on RESOLVE_BENEATH so we need to skip the leading "/" in the | |
1751 | * DEFAULT_CGROUP_MOUNTPOINT define. | |
1752 | */ | |
de7f9f33 | 1753 | if (can_use_mount_api()) { |
635e7bac CB |
1754 | fd_fs = fs_prepare("tmpfs", -EBADF, "", 0, 0); |
1755 | if (fd_fs < 0) | |
1756 | return log_error_errno(-errno, errno, "Failed to create new filesystem context for tmpfs"); | |
1757 | ||
23a20dbe CB |
1758 | ret = fs_set_property(fd_fs, "mode", "0755"); |
1759 | if (ret < 0) | |
1760 | return log_error_errno(-errno, errno, "Failed to mount tmpfs onto %d(dev)", fd_fs); | |
1761 | ||
1762 | ret = fs_set_property(fd_fs, "size", "10240k"); | |
1763 | if (ret < 0) | |
1764 | return log_error_errno(-errno, errno, "Failed to mount tmpfs onto %d(dev)", fd_fs); | |
1765 | ||
1766 | ret = fs_attach(fd_fs, rootfs->dfd_mnt, DEFAULT_CGROUP_MOUNTPOINT_RELATIVE, | |
1767 | PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH_XDEV, | |
1768 | MOUNT_ATTR_NOSUID | MOUNT_ATTR_NODEV | | |
1769 | MOUNT_ATTR_NOEXEC | MOUNT_ATTR_RELATIME); | |
635e7bac CB |
1770 | } else { |
1771 | cgroup_root = must_make_path(rootfs_mnt, DEFAULT_CGROUP_MOUNTPOINT, NULL); | |
1772 | ret = safe_mount(NULL, cgroup_root, "tmpfs", | |
1773 | MS_NOSUID | MS_NODEV | MS_NOEXEC | MS_RELATIME, | |
1774 | "size=10240k,mode=755", rootfs_mnt); | |
8b1f4dd9 | 1775 | } |
3f69fb12 | 1776 | if (ret < 0) |
02efd041 CB |
1777 | return log_error_errno(false, errno, "Failed to mount tmpfs on %s", |
1778 | DEFAULT_CGROUP_MOUNTPOINT_RELATIVE); | |
8aa1044f | 1779 | |
9bca62b3 CB |
1780 | dfd_mnt_tmpfs = open_at(rootfs->dfd_mnt, DEFAULT_CGROUP_MOUNTPOINT_RELATIVE, |
1781 | PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH_XDEV, 0); | |
1782 | if (dfd_mnt_tmpfs < 0) | |
9fc21b2d CB |
1783 | return syserror_ret(false, "Failed to open %d(%s)", |
1784 | rootfs->dfd_mnt, DEFAULT_CGROUP_MOUNTPOINT_RELATIVE); | |
9bca62b3 | 1785 | |
dfa835ac | 1786 | for (int i = 0; ops->hierarchies[i]; i++) { |
a9db9474 | 1787 | __do_free char *hierarchy_mnt = NULL, *path2 = NULL; |
2202afc9 | 1788 | struct hierarchy *h = ops->hierarchies[i]; |
8aa1044f | 1789 | |
a58be2ad | 1790 | ret = mkdirat(dfd_mnt_tmpfs, h->at_mnt, 0000); |
d7314671 | 1791 | if (ret < 0) |
9fc21b2d | 1792 | return syserror_ret(false, "Failed to create cgroup at_mnt %d(%s)", dfd_mnt_tmpfs, h->at_mnt); |
b635e92d | 1793 | |
937a3af9 | 1794 | if (in_cgroup_ns && wants_force_mount) { |
02efd041 CB |
1795 | /* |
1796 | * If cgroup namespaces are supported but the container | |
b635e92d CB |
1797 | * will not have CAP_SYS_ADMIN after it has started we |
1798 | * need to mount the cgroups manually. | |
1799 | */ | |
a9db9474 | 1800 | ret = cgroupfs_mount(cgroup_automount_type, h, rootfs, |
a58be2ad | 1801 | dfd_mnt_tmpfs, h->at_mnt); |
3f69fb12 | 1802 | if (ret < 0) |
d7314671 | 1803 | return false; |
3f69fb12 | 1804 | |
b635e92d CB |
1805 | continue; |
1806 | } | |
1807 | ||
02efd041 | 1808 | /* Here is where the ancient kernel section begins. */ |
a9db9474 | 1809 | ret = cgroupfs_bind_mount(cgroup_automount_type, h, rootfs, |
a58be2ad | 1810 | dfd_mnt_tmpfs, h->at_mnt); |
d97919ab | 1811 | if (ret < 0) |
d7314671 | 1812 | return false; |
3f69fb12 | 1813 | |
bd09ee98 | 1814 | if (!cg_mount_needs_subdirs(cgroup_automount_type)) |
8aa1044f | 1815 | continue; |
3f69fb12 | 1816 | |
f1921f35 CB |
1817 | if (!cgroup_root) |
1818 | cgroup_root = must_make_path(rootfs_mnt, DEFAULT_CGROUP_MOUNTPOINT, NULL); | |
1819 | ||
a58be2ad | 1820 | hierarchy_mnt = must_make_path(cgroup_root, h->at_mnt, NULL); |
44585f1a | 1821 | path2 = must_make_path(hierarchy_mnt, h->at_base, |
a9db9474 | 1822 | ops->container_cgroup, NULL); |
3f69fb12 | 1823 | ret = mkdir_p(path2, 0755); |
77410c98 | 1824 | if (ret < 0 && (errno != EEXIST)) |
d7314671 | 1825 | return false; |
2f62fb00 | 1826 | |
a9db9474 CB |
1827 | ret = cg_legacy_mount_controllers(cgroup_automount_type, h, |
1828 | hierarchy_mnt, path2, | |
1829 | ops->container_cgroup); | |
3f69fb12 | 1830 | if (ret < 0) |
d7314671 | 1831 | return false; |
8aa1044f | 1832 | } |
8aa1044f | 1833 | |
d7314671 | 1834 | return true; |
ccb4cabe SH |
1835 | } |
1836 | ||
11c23867 | 1837 | /* Only root needs to escape to the cgroup of its init. */ |
ff9edd2d CB |
1838 | __cgfsng_ops static bool cgfsng_criu_escape(const struct cgroup_ops *ops, |
1839 | struct lxc_conf *conf) | |
ccb4cabe | 1840 | { |
52d08ab0 CB |
1841 | if (!ops) |
1842 | return ret_set_errno(false, ENOENT); | |
1843 | ||
1844 | if (!ops->hierarchies) | |
1845 | return true; | |
1846 | ||
1847 | if (!conf) | |
1848 | return ret_set_errno(false, EINVAL); | |
1849 | ||
1850 | if (conf->cgroup_meta.relative || geteuid()) | |
ccb4cabe SH |
1851 | return true; |
1852 | ||
779b3d82 | 1853 | for (int i = 0; ops->hierarchies[i]; i++) { |
88396101 | 1854 | __do_free char *fullpath = NULL; |
52d08ab0 | 1855 | int ret; |
11c23867 | 1856 | |
35ec1a38 | 1857 | fullpath = make_cgroup_path(ops->hierarchies[i], |
44585f1a | 1858 | ops->hierarchies[i]->at_base, |
35ec1a38 | 1859 | "cgroup.procs", NULL); |
7cea5905 | 1860 | ret = lxc_write_to_file(fullpath, "0", 2, false, 0666); |
52d08ab0 | 1861 | if (ret != 0) |
77c3e9a2 | 1862 | return log_error_errno(false, errno, "Failed to escape to cgroup \"%s\"", fullpath); |
ccb4cabe SH |
1863 | } |
1864 | ||
6df334d1 | 1865 | return true; |
ccb4cabe SH |
1866 | } |
1867 | ||
ff9edd2d | 1868 | __cgfsng_ops static int cgfsng_criu_num_hierarchies(struct cgroup_ops *ops) |
36662416 | 1869 | { |
69b4a4bb CB |
1870 | int i = 0; |
1871 | ||
e3ffb28b CB |
1872 | if (!ops) |
1873 | return ret_set_errno(-1, ENOENT); | |
1874 | ||
69b4a4bb CB |
1875 | if (!ops->hierarchies) |
1876 | return 0; | |
36662416 | 1877 | |
69b4a4bb | 1878 | for (; ops->hierarchies[i]; i++) |
36662416 TA |
1879 | ; |
1880 | ||
1881 | return i; | |
1882 | } | |
1883 | ||
ff9edd2d CB |
1884 | __cgfsng_ops static bool cgfsng_criu_get_hierarchies(struct cgroup_ops *ops, |
1885 | int n, char ***out) | |
36662416 TA |
1886 | { |
1887 | int i; | |
1888 | ||
aa48a34f CB |
1889 | if (!ops) |
1890 | return ret_set_errno(false, ENOENT); | |
1891 | ||
69b4a4bb | 1892 | if (!ops->hierarchies) |
77c3e9a2 | 1893 | return ret_set_errno(false, ENOENT); |
69b4a4bb | 1894 | |
b7b227cc | 1895 | /* consistency check n */ |
6b38e644 | 1896 | for (i = 0; i < n; i++) |
2202afc9 | 1897 | if (!ops->hierarchies[i]) |
aa48a34f | 1898 | return ret_set_errno(false, ENOENT); |
36662416 | 1899 | |
2202afc9 | 1900 | *out = ops->hierarchies[i]->controllers; |
36662416 TA |
1901 | |
1902 | return true; | |
1903 | } | |
1904 | ||
b8a4fe12 | 1905 | static int cg_legacy_freeze(struct cgroup_ops *ops) |
ccb4cabe | 1906 | { |
d6337a5f | 1907 | struct hierarchy *h; |
ccb4cabe | 1908 | |
ee3a7775 CB |
1909 | h = get_hierarchy(ops, "freezer"); |
1910 | if (!h) | |
d2203230 | 1911 | return ret_set_errno(-1, ENOENT); |
81468ea7 | 1912 | |
67ed60ce | 1913 | return lxc_write_openat(h->path_con, "freezer.state", |
c04a6d4e | 1914 | "FROZEN", STRLITERALLEN("FROZEN")); |
ee3a7775 | 1915 | } |
942e193e | 1916 | |
018051e3 | 1917 | static int freezer_cgroup_events_cb(int fd, uint32_t events, void *cbdata, |
3298b37d | 1918 | struct lxc_async_descr *descr) |
ee3a7775 | 1919 | { |
018051e3 | 1920 | __do_free char *line = NULL; |
ee3a7775 | 1921 | __do_fclose FILE *f = NULL; |
018051e3 CB |
1922 | int state = PTR_TO_INT(cbdata); |
1923 | size_t len; | |
1924 | const char *state_string; | |
1925 | ||
c8af3332 | 1926 | f = fdopen_at(fd, "", "re", PROTECT_OPEN, PROTECT_LOOKUP_BENEATH); |
018051e3 CB |
1927 | if (!f) |
1928 | return LXC_MAINLOOP_ERROR; | |
018051e3 CB |
1929 | |
1930 | if (state == 1) | |
1931 | state_string = "frozen 1"; | |
1932 | else | |
1933 | state_string = "frozen 0"; | |
1934 | ||
1935 | while (getline(&line, &len, f) != -1) | |
aa72fbe7 | 1936 | if (strnequal(line, state_string, STRLITERALLEN("frozen") + 2)) |
018051e3 CB |
1937 | return LXC_MAINLOOP_CLOSE; |
1938 | ||
281c3645 CB |
1939 | rewind(f); |
1940 | ||
018051e3 CB |
1941 | return LXC_MAINLOOP_CONTINUE; |
1942 | } | |
1943 | ||
443be565 WB |
1944 | static int cg_unified_freeze_do(struct cgroup_ops *ops, int timeout, |
1945 | const char *state_string, | |
1946 | int state_num, | |
1947 | const char *epoll_error, | |
1948 | const char *wait_error) | |
018051e3 | 1949 | { |
f62cf1d4 | 1950 | __do_close int fd = -EBADF; |
3298b37d | 1951 | call_cleaner(lxc_mainloop_close) struct lxc_async_descr *descr_ptr = NULL; |
018051e3 | 1952 | int ret; |
3298b37d | 1953 | struct lxc_async_descr descr; |
ee3a7775 | 1954 | struct hierarchy *h; |
942e193e CB |
1955 | |
1956 | h = ops->unified; | |
457ca9aa | 1957 | if (!h) |
d2203230 | 1958 | return ret_set_errno(-1, ENOENT); |
d6337a5f | 1959 | |
67ed60ce | 1960 | if (!h->path_con) |
d2203230 | 1961 | return ret_set_errno(-1, EEXIST); |
d6337a5f | 1962 | |
018051e3 CB |
1963 | if (timeout != 0) { |
1964 | __do_free char *events_file = NULL; | |
942e193e | 1965 | |
67ed60ce | 1966 | events_file = must_make_path(h->path_con, "cgroup.events", NULL); |
018051e3 CB |
1967 | fd = open(events_file, O_RDONLY | O_CLOEXEC); |
1968 | if (fd < 0) | |
d2203230 | 1969 | return log_error_errno(-1, errno, "Failed to open cgroup.events file"); |
942e193e | 1970 | |
018051e3 CB |
1971 | ret = lxc_mainloop_open(&descr); |
1972 | if (ret) | |
443be565 | 1973 | return log_error_errno(-1, errno, "%s", epoll_error); |
942e193e | 1974 | |
018051e3 CB |
1975 | /* automatically cleaned up now */ |
1976 | descr_ptr = &descr; | |
942e193e | 1977 | |
543d2f83 CB |
1978 | ret = lxc_mainloop_add_handler_events(&descr, fd, EPOLLPRI, |
1979 | freezer_cgroup_events_cb, | |
1980 | default_cleanup_handler, | |
1981 | INT_TO_PTR(state_num), | |
1982 | "freezer_cgroup_events_cb"); | |
018051e3 | 1983 | if (ret < 0) |
d2203230 | 1984 | return log_error_errno(-1, errno, "Failed to add cgroup.events fd handler to mainloop"); |
018051e3 | 1985 | } |
942e193e | 1986 | |
67ed60ce | 1987 | ret = lxc_write_openat(h->path_con, "cgroup.freeze", state_string, 1); |
018051e3 | 1988 | if (ret < 0) |
d2203230 | 1989 | return log_error_errno(-1, errno, "Failed to open cgroup.freeze file"); |
018051e3 CB |
1990 | |
1991 | if (timeout != 0 && lxc_mainloop(&descr, timeout)) | |
443be565 | 1992 | return log_error_errno(-1, errno, "%s", wait_error); |
018051e3 CB |
1993 | |
1994 | return 0; | |
942e193e CB |
1995 | } |
1996 | ||
/*
 * Freeze on the unified hierarchy: write "1" to cgroup.freeze and, for a
 * non-zero @timeout, wait for the frozen state via cg_unified_freeze_do().
 */
static int cg_unified_freeze(struct cgroup_ops *ops, int timeout)
{
	static const char epoll_error[] = "Failed to create epoll instance to wait for container freeze";
	static const char wait_error[] = "Failed to wait for container to be frozen";

	return cg_unified_freeze_do(ops, timeout, "1", 1, epoll_error, wait_error);
}
2003 | ||
018051e3 | 2004 | __cgfsng_ops static int cgfsng_freeze(struct cgroup_ops *ops, int timeout) |
942e193e | 2005 | { |
81468ea7 | 2006 | if (!ops->hierarchies) |
d2203230 | 2007 | return ret_set_errno(-1, ENOENT); |
81468ea7 | 2008 | |
ee3a7775 CB |
2009 | if (ops->cgroup_layout != CGROUP_LAYOUT_UNIFIED) |
2010 | return cg_legacy_freeze(ops); | |
942e193e | 2011 | |
018051e3 | 2012 | return cg_unified_freeze(ops, timeout); |
ee3a7775 CB |
2013 | } |
2014 | ||
018051e3 | 2015 | static int cg_legacy_unfreeze(struct cgroup_ops *ops) |
ee3a7775 | 2016 | { |
ee3a7775 CB |
2017 | struct hierarchy *h; |
2018 | ||
2019 | h = get_hierarchy(ops, "freezer"); | |
2020 | if (!h) | |
d2203230 | 2021 | return ret_set_errno(-1, ENOENT); |
ee3a7775 | 2022 | |
67ed60ce | 2023 | return lxc_write_openat(h->path_con, "freezer.state", |
c04a6d4e | 2024 | "THAWED", STRLITERALLEN("THAWED")); |
ee3a7775 CB |
2025 | } |
2026 | ||
/*
 * Unfreeze on the unified hierarchy: write "0" to cgroup.freeze and, for a
 * non-zero @timeout, wait for the unfrozen state via cg_unified_freeze_do().
 */
static int cg_unified_unfreeze(struct cgroup_ops *ops, int timeout)
{
	static const char epoll_error[] = "Failed to create epoll instance to wait for container unfreeze";
	static const char wait_error[] = "Failed to wait for container to be unfrozen";

	return cg_unified_freeze_do(ops, timeout, "0", 0, epoll_error, wait_error);
}
2033 | ||
018051e3 | 2034 | __cgfsng_ops static int cgfsng_unfreeze(struct cgroup_ops *ops, int timeout) |
ee3a7775 CB |
2035 | { |
2036 | if (!ops->hierarchies) | |
d2203230 | 2037 | return ret_set_errno(-1, ENOENT); |
ee3a7775 CB |
2038 | |
2039 | if (ops->cgroup_layout != CGROUP_LAYOUT_UNIFIED) | |
2040 | return cg_legacy_unfreeze(ops); | |
2041 | ||
018051e3 | 2042 | return cg_unified_unfreeze(ops, timeout); |
ccb4cabe SH |
2043 | } |
2044 | ||
a900cbaf WB |
2045 | static const char *cgfsng_get_cgroup_do(struct cgroup_ops *ops, |
2046 | const char *controller, bool limiting) | |
ccb4cabe | 2047 | { |
d6337a5f | 2048 | struct hierarchy *h; |
35ec1a38 CB |
2049 | size_t len; |
2050 | const char *path; | |
d6337a5f | 2051 | |
2202afc9 | 2052 | h = get_hierarchy(ops, controller); |
6bdf9691 | 2053 | if (!h) |
35ec1a38 CB |
2054 | return log_warn_errno(NULL, ENOENT, |
2055 | "Failed to find hierarchy for controller \"%s\"", maybe_empty(controller)); | |
ccb4cabe | 2056 | |
a900cbaf | 2057 | if (limiting) |
b1b1a60f | 2058 | path = h->path_lim; |
35ec1a38 | 2059 | else |
67ed60ce | 2060 | path = h->path_con; |
35ec1a38 CB |
2061 | if (!path) |
2062 | return NULL; | |
a900cbaf | 2063 | |
a58be2ad CB |
2064 | len = strlen(h->at_mnt); |
2065 | if (!strnequal(h->at_mnt, DEFAULT_CGROUP_MOUNTPOINT, | |
35ec1a38 CB |
2066 | STRLITERALLEN(DEFAULT_CGROUP_MOUNTPOINT))) { |
2067 | path += STRLITERALLEN(DEFAULT_CGROUP_MOUNTPOINT); | |
2068 | path += strspn(path, "/"); | |
2069 | } | |
2070 | return path += len; | |
371f834d SH |
2071 | } |
2072 | ||
a900cbaf WB |
2073 | __cgfsng_ops static const char *cgfsng_get_cgroup(struct cgroup_ops *ops, |
2074 | const char *controller) | |
2075 | { | |
2076 | return cgfsng_get_cgroup_do(ops, controller, false); | |
2077 | } | |
2078 | ||
a9b642ee CB |
2079 | __cgfsng_ops static const char *cgfsng_get_limit_cgroup(struct cgroup_ops *ops, |
2080 | const char *controller) | |
a900cbaf WB |
2081 | { |
2082 | return cgfsng_get_cgroup_do(ops, controller, true); | |
2083 | } | |
2084 | ||
c40c8209 CB |
2085 | /* Given a cgroup path returned from lxc_cmd_get_cgroup_path, build a full path, |
2086 | * which must be freed by the caller. | |
371f834d | 2087 | */ |
c40c8209 CB |
2088 | static inline char *build_full_cgpath_from_monitorpath(struct hierarchy *h, |
2089 | const char *inpath, | |
2090 | const char *filename) | |
371f834d | 2091 | { |
35ec1a38 | 2092 | return make_cgroup_path(h, inpath, filename, NULL); |
ccb4cabe SH |
2093 | } |
2094 | ||
4b86fefd | 2095 | static int cgroup_attach_leaf(const struct lxc_conf *conf, int unified_fd, pid_t pid) |
c2aed66d | 2096 | { |
ad275c16 | 2097 | int idx = 1; |
c2aed66d | 2098 | int ret; |
900b6606 | 2099 | char pidstr[INTTYPE_TO_STRLEN(int64_t) + 1]; |
6e2078de | 2100 | ssize_t pidstr_len; |
c2aed66d | 2101 | |
ad275c16 | 2102 | /* Create leaf cgroup. */ |
275e8ef8 | 2103 | ret = mkdirat(unified_fd, ".lxc", 0755); |
ad275c16 | 2104 | if (ret < 0 && errno != EEXIST) |
6e2078de CB |
2105 | return log_error_errno(-errno, errno, "Failed to create leaf cgroup \".lxc\""); |
2106 | ||
0bba27c1 CB |
2107 | pidstr_len = strnprintf(pidstr, sizeof(pidstr), INT64_FMT, (int64_t)pid); |
2108 | if (pidstr_len < 0) | |
2109 | return pidstr_len; | |
ad275c16 | 2110 | |
275e8ef8 | 2111 | ret = lxc_writeat(unified_fd, ".lxc/cgroup.procs", pidstr, pidstr_len); |
ad275c16 CB |
2112 | if (ret < 0) |
2113 | ret = lxc_writeat(unified_fd, "cgroup.procs", pidstr, pidstr_len); | |
c2aed66d | 2114 | if (ret == 0) |
6e2078de | 2115 | return log_trace(0, "Moved process %s into cgroup %d(.lxc)", pidstr, unified_fd); |
ad275c16 | 2116 | |
bad788b0 CB |
2117 | /* this is a non-leaf node */ |
2118 | if (errno != EBUSY) | |
6e2078de | 2119 | return log_error_errno(-errno, errno, "Failed to attach to unified cgroup"); |
c2aed66d | 2120 | |
c2aed66d | 2121 | do { |
7581a82f | 2122 | bool rm = false; |
c80c9a70 | 2123 | char attach_cgroup[STRLITERALLEN(".lxc-/cgroup.procs") + INTTYPE_TO_STRLEN(int) + 1]; |
9fd047d1 | 2124 | char *slash = attach_cgroup; |
c2aed66d | 2125 | |
0bba27c1 CB |
2126 | ret = strnprintf(attach_cgroup, sizeof(attach_cgroup), ".lxc-%d/cgroup.procs", idx); |
2127 | if (ret < 0) | |
2128 | return ret; | |
5045306b | 2129 | |
c80c9a70 CB |
2130 | /* |
2131 | * This shouldn't really happen but the compiler might complain | |
2132 | * that a short write would cause a buffer overrun. So be on | |
2133 | * the safe side. | |
2134 | */ | |
2135 | if (ret < STRLITERALLEN(".lxc-/cgroup.procs")) | |
2136 | return log_error_errno(-EINVAL, EINVAL, "Unexpected short write would cause buffer-overrun"); | |
2137 | ||
9fd047d1 | 2138 | slash += (ret - STRLITERALLEN("/cgroup.procs")); |
bad788b0 | 2139 | *slash = '\0'; |
ad275c16 | 2140 | |
bad788b0 | 2141 | ret = mkdirat(unified_fd, attach_cgroup, 0755); |
c2aed66d | 2142 | if (ret < 0 && errno != EEXIST) |
d2203230 | 2143 | return log_error_errno(-1, errno, "Failed to create cgroup %s", attach_cgroup); |
7581a82f CB |
2144 | if (ret == 0) |
2145 | rm = true; | |
c2aed66d | 2146 | |
bad788b0 | 2147 | *slash = '/'; |
ad275c16 | 2148 | |
bad788b0 | 2149 | ret = lxc_writeat(unified_fd, attach_cgroup, pidstr, pidstr_len); |
c2aed66d | 2150 | if (ret == 0) |
6e2078de | 2151 | return log_trace(0, "Moved process %s into cgroup %d(%s)", pidstr, unified_fd, attach_cgroup); |
c2aed66d | 2152 | |
7581a82f CB |
2153 | if (rm && unlinkat(unified_fd, attach_cgroup, AT_REMOVEDIR)) |
2154 | SYSERROR("Failed to remove cgroup \"%d(%s)\"", unified_fd, attach_cgroup); | |
2155 | ||
c2aed66d CB |
2156 | /* this is a non-leaf node */ |
2157 | if (errno != EBUSY) | |
d2203230 | 2158 | return log_error_errno(-1, errno, "Failed to attach to unified cgroup"); |
c2aed66d | 2159 | |
edae86e9 CB |
2160 | idx++; |
2161 | } while (idx < 1000); | |
c2aed66d | 2162 | |
ad275c16 | 2163 | return log_error_errno(-1, errno, "Failed to attach to unified cgroup"); |
c2aed66d CB |
2164 | } |
2165 | ||
d1783ef4 CB |
2166 | static int cgroup_attach_create_leaf(const struct lxc_conf *conf, |
2167 | int unified_fd, int *sk_fd) | |
2168 | { | |
7d849163 CB |
2169 | __do_close int sk = *sk_fd, target_fd0 = -EBADF, target_fd1 = -EBADF; |
2170 | int target_fds[2]; | |
d1783ef4 CB |
2171 | ssize_t ret; |
2172 | ||
2173 | /* Create leaf cgroup. */ | |
2174 | ret = mkdirat(unified_fd, ".lxc", 0755); | |
2175 | if (ret < 0 && errno != EEXIST) | |
2176 | return log_error_errno(-1, errno, "Failed to create leaf cgroup \".lxc\""); | |
2177 | ||
7043e2b4 | 2178 | target_fd0 = open_at(unified_fd, ".lxc/cgroup.procs", PROTECT_OPEN_W, PROTECT_LOOKUP_BENEATH, 0); |
7d849163 | 2179 | if (target_fd0 < 0) |
d1783ef4 | 2180 | return log_error_errno(-errno, errno, "Failed to open \".lxc/cgroup.procs\""); |
7d849163 | 2181 | target_fds[0] = target_fd0; |
d1783ef4 | 2182 | |
7043e2b4 | 2183 | target_fd1 = open_at(unified_fd, "cgroup.procs", PROTECT_OPEN_W, PROTECT_LOOKUP_BENEATH, 0); |
7d849163 | 2184 | if (target_fd1 < 0) |
49df620b | 2185 | return log_error_errno(-errno, errno, "Failed to open \".lxc/cgroup.procs\""); |
7d849163 | 2186 | target_fds[1] = target_fd1; |
49df620b CB |
2187 | |
2188 | ret = lxc_abstract_unix_send_fds(sk, target_fds, 2, NULL, 0); | |
d1783ef4 | 2189 | if (ret <= 0) |
49df620b | 2190 | return log_error_errno(-errno, errno, "Failed to send \".lxc/cgroup.procs\" fds %d and %d", |
7d849163 | 2191 | target_fd0, target_fd1); |
d1783ef4 | 2192 | |
7d849163 | 2193 | return log_debug(0, "Sent target cgroup fds %d and %d", target_fd0, target_fd1); |
d1783ef4 CB |
2194 | } |
2195 | ||
2196 | static int cgroup_attach_move_into_leaf(const struct lxc_conf *conf, | |
2197 | int *sk_fd, pid_t pid) | |
2198 | { | |
7d849163 | 2199 | __do_close int sk = *sk_fd, target_fd0 = -EBADF, target_fd1 = -EBADF; |
d1783ef4 CB |
2200 | char pidstr[INTTYPE_TO_STRLEN(int64_t) + 1]; |
2201 | size_t pidstr_len; | |
2202 | ssize_t ret; | |
2203 | ||
1b82d721 | 2204 | ret = lxc_abstract_unix_recv_two_fds(sk, &target_fd0, &target_fd1); |
d17c815d | 2205 | if (ret < 0) |
d1783ef4 CB |
2206 | return log_error_errno(-1, errno, "Failed to receive target cgroup fd"); |
2207 | ||
2208 | pidstr_len = sprintf(pidstr, INT64_FMT, (int64_t)pid); | |
2209 | ||
7d849163 CB |
2210 | ret = lxc_write_nointr(target_fd0, pidstr, pidstr_len); |
2211 | if (ret > 0 && ret == pidstr_len) | |
2212 | return log_debug(0, "Moved process into target cgroup via fd %d", target_fd0); | |
2213 | ||
49df620b | 2214 | ret = lxc_write_nointr(target_fd1, pidstr, pidstr_len); |
7d849163 CB |
2215 | if (ret > 0 && ret == pidstr_len) |
2216 | return log_debug(0, "Moved process into target cgroup via fd %d", target_fd1); | |
d1783ef4 | 2217 | |
7d849163 CB |
2218 | return log_debug_errno(-1, errno, "Failed to move process into target cgroup via fd %d and %d", |
2219 | target_fd0, target_fd1); | |
d1783ef4 CB |
2220 | } |
2221 | ||
/*
 * Argument bundle shared by cgroup_unified_attach_parent_wrapper() and
 * cgroup_unified_attach_child_wrapper().
 */
struct userns_exec_unified_attach_data {
	/* Container config; forwarded to the attach helpers. */
	const struct lxc_conf *conf;
	/* Directory fd of the cgroup below which the ".lxc" leaf is created. */
	int unified_fd;
	/*
	 * Socket pair used to pass the cgroup.procs fds: the child wrapper
	 * sends over [1], the parent wrapper receives over [0].
	 */
	int sk_pair[2];
	/* Process to move into the target cgroup. */
	pid_t pid;
};
2228 | ||
d1783ef4 CB |
2229 | static int cgroup_unified_attach_child_wrapper(void *data) |
2230 | { | |
2231 | struct userns_exec_unified_attach_data *args = data; | |
2232 | ||
2233 | if (!args->conf || args->unified_fd < 0 || args->pid <= 0 || | |
2234 | args->sk_pair[0] < 0 || args->sk_pair[1] < 0) | |
2235 | return ret_errno(EINVAL); | |
2236 | ||
2237 | close_prot_errno_disarm(args->sk_pair[0]); | |
2238 | return cgroup_attach_create_leaf(args->conf, args->unified_fd, | |
2239 | &args->sk_pair[1]); | |
2240 | } | |
2241 | ||
2242 | static int cgroup_unified_attach_parent_wrapper(void *data) | |
4b86fefd CB |
2243 | { |
2244 | struct userns_exec_unified_attach_data *args = data; | |
4b86fefd | 2245 | |
d1783ef4 CB |
2246 | if (!args->conf || args->unified_fd < 0 || args->pid <= 0 || |
2247 | args->sk_pair[0] < 0 || args->sk_pair[1] < 0) | |
4b86fefd CB |
2248 | return ret_errno(EINVAL); |
2249 | ||
d1783ef4 CB |
2250 | close_prot_errno_disarm(args->sk_pair[1]); |
2251 | return cgroup_attach_move_into_leaf(args->conf, &args->sk_pair[0], | |
2252 | args->pid); | |
4b86fefd CB |
2253 | } |
2254 | ||
900b6606 CB |
2255 | /* Technically, we're always at a delegation boundary here (This is especially |
2256 | * true when cgroup namespaces are available.). The reasoning is that in order | |
2257 | * for us to have been able to start a container in the first place the root | |
2258 | * cgroup must have been a leaf node. Now, either the container's init system | |
2259 | * has populated the cgroup and kept it as a leaf node or it has created | |
2260 | * subtrees. In the former case we will simply attach to the leaf node we | |
2261 | * created when we started the container in the latter case we create our own | |
2262 | * cgroup for the attaching process. | |
2263 | */ | |
7581a82f CB |
2264 | static int __cg_unified_attach(const struct hierarchy *h, |
2265 | const struct lxc_conf *conf, const char *name, | |
900b6606 CB |
2266 | const char *lxcpath, pid_t pid, |
2267 | const char *controller) | |
2268 | { | |
f62cf1d4 | 2269 | __do_close int unified_fd = -EBADF; |
32908bfd | 2270 | __do_free char *path = NULL, *cgroup = NULL; |
900b6606 CB |
2271 | int ret; |
2272 | ||
7581a82f CB |
2273 | if (!conf || !name || !lxcpath || pid <= 0) |
2274 | return ret_errno(EINVAL); | |
2275 | ||
2276 | ret = cgroup_attach(conf, name, lxcpath, pid); | |
32908bfd CB |
2277 | if (ret == 0) |
2278 | return log_trace(0, "Attached to unified cgroup via command handler"); | |
112ccbc9 | 2279 | if (!ERRNO_IS_NOT_SUPPORTED(ret) && ret != -ENOCGROUP2) |
32908bfd CB |
2280 | return log_error_errno(ret, errno, "Failed to attach to unified cgroup"); |
2281 | ||
2282 | /* Fall back to retrieving the path for the unified cgroup. */ | |
2283 | cgroup = lxc_cmd_get_cgroup_path(name, lxcpath, controller); | |
2284 | /* not running */ | |
2285 | if (!cgroup) | |
2286 | return 0; | |
900b6606 | 2287 | |
35ec1a38 | 2288 | path = make_cgroup_path(h, cgroup, NULL); |
900b6606 | 2289 | |
32908bfd | 2290 | unified_fd = open(path, O_PATH | O_DIRECTORY | O_CLOEXEC); |
900b6606 | 2291 | if (unified_fd < 0) |
7581a82f CB |
2292 | return ret_errno(EBADF); |
2293 | ||
0589d744 | 2294 | if (!list_empty(&conf->id_map)) { |
4b86fefd CB |
2295 | struct userns_exec_unified_attach_data args = { |
2296 | .conf = conf, | |
2297 | .unified_fd = unified_fd, | |
2298 | .pid = pid, | |
2299 | }; | |
2300 | ||
d1783ef4 CB |
2301 | ret = socketpair(PF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, args.sk_pair); |
2302 | if (ret < 0) | |
2303 | return -errno; | |
2304 | ||
2305 | ret = userns_exec_minimal(conf, | |
2306 | cgroup_unified_attach_parent_wrapper, | |
2307 | &args, | |
2308 | cgroup_unified_attach_child_wrapper, | |
2309 | &args); | |
4b86fefd CB |
2310 | } else { |
2311 | ret = cgroup_attach_leaf(conf, unified_fd, pid); | |
2312 | } | |
2313 | ||
2314 | return ret; | |
900b6606 CB |
2315 | } |
2316 | ||
7581a82f CB |
2317 | __cgfsng_ops static bool cgfsng_attach(struct cgroup_ops *ops, |
2318 | const struct lxc_conf *conf, | |
2319 | const char *name, const char *lxcpath, | |
2320 | pid_t pid) | |
ccb4cabe | 2321 | { |
81b5d48a | 2322 | int len, ret; |
a3650c0c | 2323 | char pidstr[INTTYPE_TO_STRLEN(pid_t)]; |
ccb4cabe | 2324 | |
ab9a452d CB |
2325 | if (!ops) |
2326 | return ret_set_errno(false, ENOENT); | |
2327 | ||
69b4a4bb CB |
2328 | if (!ops->hierarchies) |
2329 | return true; | |
2330 | ||
0bba27c1 CB |
2331 | len = strnprintf(pidstr, sizeof(pidstr), "%d", pid); |
2332 | if (len < 0) | |
ccb4cabe SH |
2333 | return false; |
2334 | ||
81b5d48a | 2335 | for (int i = 0; ops->hierarchies[i]; i++) { |
c05b17bd | 2336 | __do_free char *fullpath = NULL, *path = NULL; |
2202afc9 | 2337 | struct hierarchy *h = ops->hierarchies[i]; |
ccb4cabe | 2338 | |
b8572e8c | 2339 | if (h->fs_type == UNIFIED_HIERARCHY) { |
7581a82f | 2340 | ret = __cg_unified_attach(h, conf, name, lxcpath, pid, |
a3926f6a | 2341 | h->controllers[0]); |
c2aed66d CB |
2342 | if (ret < 0) |
2343 | return false; | |
2344 | ||
2345 | continue; | |
2346 | } | |
2347 | ||
ccb4cabe | 2348 | path = lxc_cmd_get_cgroup_path(name, lxcpath, h->controllers[0]); |
6159413b CB |
2349 | if (!path) { |
2350 | /* | |
2351 | * Someone might have created a name=<controller> | |
2352 | * controller after the container has started and so | |
2353 | * the container doesn't make use of this controller. | |
2354 | * | |
2355 | * Link: https://github.com/lxc/lxd/issues/8577 | |
2356 | */ | |
2357 | TRACE("Skipping unused %s controller", maybe_empty(h->controllers[0])); | |
2358 | continue; | |
2359 | } | |
ccb4cabe | 2360 | |
371f834d | 2361 | fullpath = build_full_cgpath_from_monitorpath(h, path, "cgroup.procs"); |
7cea5905 | 2362 | ret = lxc_write_to_file(fullpath, pidstr, len, false, 0666); |
ab9a452d | 2363 | if (ret < 0) |
77c3e9a2 | 2364 | return log_error_errno(false, errno, "Failed to attach %d to %s", |
ab9a452d | 2365 | (int)pid, fullpath); |
ccb4cabe SH |
2366 | } |
2367 | ||
ccb4cabe SH |
2368 | return true; |
2369 | } | |
2370 | ||
e2bd2b13 CB |
2371 | /* Called externally (i.e. from 'lxc-cgroup') to query cgroup limits. Here we |
2372 | * don't have a cgroup_data set up, so we ask the running container through the | |
2373 | * commands API for the cgroup path. | |
ccb4cabe | 2374 | */ |
b857f4be | 2375 | __cgfsng_ops static int cgfsng_get(struct cgroup_ops *ops, const char *filename, |
fb55e009 CB |
2376 | char *value, size_t len, const char *name, |
2377 | const char *lxcpath) | |
ccb4cabe | 2378 | { |
d97919ab | 2379 | __do_free char *path = NULL; |
88396101 | 2380 | __do_free char *controller = NULL; |
d97919ab | 2381 | char *p; |
0069cc61 | 2382 | struct hierarchy *h; |
861cb8c2 | 2383 | int ret = -1; |
ccb4cabe | 2384 | |
a358028a CB |
2385 | if (!ops) |
2386 | return ret_set_errno(-1, ENOENT); | |
2387 | ||
63ba9eaf CB |
2388 | controller = strdup(filename); |
2389 | if (!controller) | |
2390 | return ret_errno(ENOMEM); | |
2391 | ||
0069cc61 CB |
2392 | p = strchr(controller, '.'); |
2393 | if (p) | |
ccb4cabe SH |
2394 | *p = '\0'; |
2395 | ||
a9b642ee | 2396 | path = lxc_cmd_get_limit_cgroup_path(name, lxcpath, controller); |
0069cc61 CB |
2397 | /* not running */ |
2398 | if (!path) | |
ccb4cabe SH |
2399 | return -1; |
2400 | ||
2202afc9 | 2401 | h = get_hierarchy(ops, controller); |
ccb4cabe | 2402 | if (h) { |
88396101 | 2403 | __do_free char *fullpath = NULL; |
0069cc61 CB |
2404 | |
2405 | fullpath = build_full_cgpath_from_monitorpath(h, path, filename); | |
ccb4cabe | 2406 | ret = lxc_read_from_file(fullpath, value, len); |
ccb4cabe | 2407 | } |
ccb4cabe SH |
2408 | |
2409 | return ret; | |
2410 | } | |
2411 | ||
cb3fc90c CB |
2412 | static int device_cgroup_parse_access(struct device_item *device, const char *val) |
2413 | { | |
2414 | for (int count = 0; count < 3; count++, val++) { | |
2415 | switch (*val) { | |
2416 | case 'r': | |
2417 | device->access[count] = *val; | |
2418 | break; | |
2419 | case 'w': | |
2420 | device->access[count] = *val; | |
2421 | break; | |
2422 | case 'm': | |
2423 | device->access[count] = *val; | |
2424 | break; | |
2425 | case '\n': | |
2426 | case '\0': | |
2427 | count = 3; | |
2428 | break; | |
2429 | default: | |
2430 | return ret_errno(EINVAL); | |
2431 | } | |
2432 | } | |
2433 | ||
2434 | return 0; | |
2435 | } | |
2436 | ||
2a63b5cb CB |
2437 | static int device_cgroup_rule_parse(struct device_item *device, const char *key, |
2438 | const char *val) | |
2439 | { | |
2440 | int count, ret; | |
2441 | char temp[50]; | |
2442 | ||
8b99a20a | 2443 | if (strequal("devices.allow", key)) |
69885a76 | 2444 | device->allow = 1; /* allow the device */ |
2a63b5cb | 2445 | else |
69885a76 | 2446 | device->allow = 0; /* deny the device */ |
2a63b5cb | 2447 | |
8b99a20a | 2448 | if (strequal(val, "a")) { |
2a63b5cb CB |
2449 | /* global rule */ |
2450 | device->type = 'a'; | |
2451 | device->major = -1; | |
2452 | device->minor = -1; | |
2a63b5cb | 2453 | return 0; |
2a63b5cb CB |
2454 | } |
2455 | ||
2456 | switch (*val) { | |
2457 | case 'a': | |
2458 | __fallthrough; | |
2459 | case 'b': | |
2460 | __fallthrough; | |
2461 | case 'c': | |
2462 | device->type = *val; | |
2463 | break; | |
2464 | default: | |
2465 | return -1; | |
2466 | } | |
2467 | ||
2468 | val++; | |
2469 | if (!isspace(*val)) | |
2470 | return -1; | |
2471 | val++; | |
2472 | if (*val == '*') { | |
2473 | device->major = -1; | |
2474 | val++; | |
2475 | } else if (isdigit(*val)) { | |
2476 | memset(temp, 0, sizeof(temp)); | |
2477 | for (count = 0; count < sizeof(temp) - 1; count++) { | |
2478 | temp[count] = *val; | |
2479 | val++; | |
2480 | if (!isdigit(*val)) | |
2481 | break; | |
2482 | } | |
2483 | ret = lxc_safe_int(temp, &device->major); | |
2484 | if (ret) | |
2485 | return -1; | |
2486 | } else { | |
2487 | return -1; | |
2488 | } | |
2489 | if (*val != ':') | |
2490 | return -1; | |
2491 | val++; | |
2492 | ||
2493 | /* read minor */ | |
2494 | if (*val == '*') { | |
2495 | device->minor = -1; | |
2496 | val++; | |
2497 | } else if (isdigit(*val)) { | |
2498 | memset(temp, 0, sizeof(temp)); | |
2499 | for (count = 0; count < sizeof(temp) - 1; count++) { | |
2500 | temp[count] = *val; | |
2501 | val++; | |
2502 | if (!isdigit(*val)) | |
2503 | break; | |
2504 | } | |
2505 | ret = lxc_safe_int(temp, &device->minor); | |
2506 | if (ret) | |
2507 | return -1; | |
2508 | } else { | |
2509 | return -1; | |
2510 | } | |
2511 | if (!isspace(*val)) | |
2512 | return -1; | |
2a63b5cb | 2513 | |
cb3fc90c | 2514 | return device_cgroup_parse_access(device, ++val); |
2a63b5cb CB |
2515 | } |
2516 | ||
eec533e3 CB |
2517 | /* Called externally (i.e. from 'lxc-cgroup') to set new cgroup limits. Here we |
2518 | * don't have a cgroup_data set up, so we ask the running container through the | |
2519 | * commands API for the cgroup path. | |
ccb4cabe | 2520 | */ |
b857f4be | 2521 | __cgfsng_ops static int cgfsng_set(struct cgroup_ops *ops, |
2a63b5cb | 2522 | const char *key, const char *value, |
fb55e009 | 2523 | const char *name, const char *lxcpath) |
ccb4cabe | 2524 | { |
d97919ab | 2525 | __do_free char *path = NULL; |
88396101 | 2526 | __do_free char *controller = NULL; |
d97919ab | 2527 | char *p; |
87777968 | 2528 | struct hierarchy *h; |
861cb8c2 | 2529 | int ret = -1; |
ccb4cabe | 2530 | |
b7aeda96 CB |
2531 | if (!ops || is_empty_string(key) || is_empty_string(value) || |
2532 | is_empty_string(name) || is_empty_string(lxcpath)) | |
2533 | return ret_errno(EINVAL); | |
a358028a | 2534 | |
63ba9eaf CB |
2535 | controller = strdup(key); |
2536 | if (!controller) | |
2537 | return ret_errno(ENOMEM); | |
2538 | ||
87777968 CB |
2539 | p = strchr(controller, '.'); |
2540 | if (p) | |
ccb4cabe SH |
2541 | *p = '\0'; |
2542 | ||
8b99a20a | 2543 | if (pure_unified_layout(ops) && strequal(controller, "devices")) { |
50329f28 | 2544 | struct device_item device = {}; |
2a63b5cb CB |
2545 | |
2546 | ret = device_cgroup_rule_parse(&device, key, value); | |
2547 | if (ret < 0) | |
d2203230 | 2548 | return log_error_errno(-1, EINVAL, "Failed to parse device string %s=%s", |
2a63b5cb CB |
2549 | key, value); |
2550 | ||
2551 | ret = lxc_cmd_add_bpf_device_cgroup(name, lxcpath, &device); | |
2552 | if (ret < 0) | |
2553 | return -1; | |
2554 | ||
2555 | return 0; | |
2556 | } | |
2557 | ||
a9b642ee | 2558 | path = lxc_cmd_get_limit_cgroup_path(name, lxcpath, controller); |
87777968 CB |
2559 | /* not running */ |
2560 | if (!path) | |
ccb4cabe SH |
2561 | return -1; |
2562 | ||
2202afc9 | 2563 | h = get_hierarchy(ops, controller); |
ccb4cabe | 2564 | if (h) { |
88396101 | 2565 | __do_free char *fullpath = NULL; |
87777968 | 2566 | |
2a63b5cb | 2567 | fullpath = build_full_cgpath_from_monitorpath(h, path, key); |
7cea5905 | 2568 | ret = lxc_write_to_file(fullpath, value, strlen(value), false, 0666); |
ccb4cabe | 2569 | } |
ccb4cabe SH |
2570 | |
2571 | return ret; | |
2572 | } | |
2573 | ||
91d1a13a | 2574 | /* take devices cgroup line |
72add155 SH |
2575 | * /dev/foo rwx |
2576 | * and convert it to a valid | |
2577 | * type major:minor mode | |
91d1a13a CB |
2578 | * line. Return <0 on error. Dest is a preallocated buffer long enough to hold |
2579 | * the output. | |
72add155 | 2580 | */ |
cb3fc90c CB |
2581 | static int device_cgroup_rule_parse_devpath(struct device_item *device, |
2582 | const char *devpath) | |
72add155 | 2583 | { |
88396101 | 2584 | __do_free char *path = NULL; |
2a06d041 | 2585 | char *mode = NULL; |
cb3fc90c CB |
2586 | int n_parts, ret; |
2587 | char *p; | |
2588 | struct stat sb; | |
72add155 | 2589 | |
63ba9eaf CB |
2590 | path = strdup(devpath); |
2591 | if (!path) | |
2592 | return ret_errno(ENOMEM); | |
72add155 | 2593 | |
cb3fc90c CB |
2594 | /* |
2595 | * Read path followed by mode. Ignore any trailing text. | |
91d1a13a CB |
2596 | * A ' # comment' would be legal. Technically other text is not |
2597 | * legal, we could check for that if we cared to. | |
72add155 | 2598 | */ |
0dbdb99e | 2599 | for (n_parts = 1, p = path; *p; p++) { |
2c2d6c49 SH |
2600 | if (*p != ' ') |
2601 | continue; | |
2602 | *p = '\0'; | |
91d1a13a | 2603 | |
2c2d6c49 SH |
2604 | if (n_parts != 1) |
2605 | break; | |
2606 | p++; | |
2607 | n_parts++; | |
91d1a13a | 2608 | |
2c2d6c49 SH |
2609 | while (*p == ' ') |
2610 | p++; | |
91d1a13a | 2611 | |
2c2d6c49 | 2612 | mode = p; |
91d1a13a | 2613 | |
2c2d6c49 | 2614 | if (*p == '\0') |
cb3fc90c | 2615 | return ret_set_errno(-1, EINVAL); |
72add155 | 2616 | } |
2c2d6c49 | 2617 | |
83b25c4d CB |
2618 | if (!mode) |
2619 | return ret_errno(EINVAL); | |
2620 | ||
cb3fc90c CB |
2621 | if (device_cgroup_parse_access(device, mode) < 0) |
2622 | return -1; | |
2623 | ||
72add155 SH |
2624 | ret = stat(path, &sb); |
2625 | if (ret < 0) | |
cb3fc90c | 2626 | return ret_set_errno(-1, errno); |
72add155 | 2627 | |
72add155 SH |
2628 | mode_t m = sb.st_mode & S_IFMT; |
2629 | switch (m) { | |
2630 | case S_IFBLK: | |
cb3fc90c | 2631 | device->type = 'b'; |
72add155 SH |
2632 | break; |
2633 | case S_IFCHR: | |
cb3fc90c | 2634 | device->type = 'c'; |
72add155 | 2635 | break; |
2c2d6c49 | 2636 | default: |
77c3e9a2 | 2637 | return log_error_errno(-1, EINVAL, "Unsupported device type %i for \"%s\"", m, path); |
72add155 | 2638 | } |
2c2d6c49 | 2639 | |
cb3fc90c CB |
2640 | device->major = MAJOR(sb.st_rdev); |
2641 | device->minor = MINOR(sb.st_rdev); | |
2642 | device->allow = 1; | |
72add155 | 2643 | |
cb3fc90c CB |
2644 | return 0; |
2645 | } | |
2646 | ||
2647 | static int convert_devpath(const char *invalue, char *dest) | |
2648 | { | |
50329f28 | 2649 | struct device_item device = {}; |
cb3fc90c CB |
2650 | int ret; |
2651 | ||
2652 | ret = device_cgroup_rule_parse_devpath(&device, invalue); | |
2653 | if (ret < 0) | |
2654 | return -1; | |
2655 | ||
0bba27c1 CB |
2656 | ret = strnprintf(dest, 50, "%c %d:%d %s", device.type, device.major, |
2657 | device.minor, device.access); | |
2658 | if (ret < 0) | |
2659 | return log_error_errno(ret, -ret, | |
2660 | "Error on configuration value \"%c %d:%d %s\" (max 50 chars)", | |
2661 | device.type, device.major, device.minor, | |
2662 | device.access); | |
cb3fc90c CB |
2663 | |
2664 | return 0; | |
72add155 SH |
2665 | } |
2666 | ||
90e97284 CB |
2667 | /* Called from setup_limits - here we have the container's cgroup_data because |
2668 | * we created the cgroups. | |
ccb4cabe | 2669 | */ |
2202afc9 | 2670 | static int cg_legacy_set_data(struct cgroup_ops *ops, const char *filename, |
a900cbaf | 2671 | const char *value, bool is_cpuset) |
ccb4cabe | 2672 | { |
88396101 | 2673 | __do_free char *controller = NULL; |
d97919ab | 2674 | char *p; |
1a0e70ac CB |
2675 | /* "b|c <2^64-1>:<2^64-1> r|w|m" = 47 chars max */ |
2676 | char converted_value[50]; | |
b3646d7e | 2677 | struct hierarchy *h; |
64e82f8b | 2678 | |
63ba9eaf CB |
2679 | controller = strdup(filename); |
2680 | if (!controller) | |
2681 | return ret_errno(ENOMEM); | |
2682 | ||
ab1a6cac CB |
2683 | p = strchr(controller, '.'); |
2684 | if (p) | |
ccb4cabe SH |
2685 | *p = '\0'; |
2686 | ||
8b99a20a | 2687 | if (strequal("devices.allow", filename) && value[0] == '/') { |
c04a6d4e CB |
2688 | int ret; |
2689 | ||
72add155 SH |
2690 | ret = convert_devpath(value, converted_value); |
2691 | if (ret < 0) | |
c8bf519d | 2692 | return ret; |
72add155 | 2693 | value = converted_value; |
c8bf519d | 2694 | } |
2695 | ||
2202afc9 | 2696 | h = get_hierarchy(ops, controller); |
77c3e9a2 CB |
2697 | if (!h) |
2698 | return log_error_errno(-ENOENT, ENOENT, "Failed to setup limits for the \"%s\" controller. The controller seems to be unused by \"cgfsng\" cgroup driver or not enabled on the cgroup hierarchy", controller); | |
b3646d7e | 2699 | |
a900cbaf | 2700 | if (is_cpuset) { |
67ed60ce | 2701 | int ret = lxc_write_openat(h->path_con, filename, value, strlen(value)); |
a900cbaf WB |
2702 | if (ret) |
2703 | return ret; | |
2704 | } | |
b1b1a60f | 2705 | return lxc_write_openat(h->path_lim, filename, value, strlen(value)); |
ccb4cabe SH |
2706 | } |
2707 | ||
bca286f2 CB |
2708 | /* |
2709 | * Return the list of cgroup_settings sorted according to the following rules | |
2710 | * 1. Put memory.limit_in_bytes before memory.memsw.limit_in_bytes | |
2711 | */ | |
2712 | static void sort_cgroup_settings(struct lxc_conf *conf) | |
2713 | { | |
2714 | LIST_HEAD(memsw_list); | |
2715 | struct lxc_cgroup *cgroup, *ncgroup; | |
2716 | ||
2717 | /* Iterate over the cgroup settings and copy them to the output list. */ | |
2718 | list_for_each_entry_safe(cgroup, ncgroup, &conf->cgroup, head) { | |
2719 | if (!strequal(cgroup->subsystem, "memory.memsw.limit_in_bytes")) | |
2720 | continue; | |
2721 | ||
2722 | /* Move the memsw entry from the cgroup settings list. */ | |
2723 | list_move_tail(&cgroup->head, &memsw_list); | |
2724 | } | |
2725 | ||
2726 | /* | |
2727 | * Append all the memsw entries to the end of the cgroup settings list | |
2728 | * to make sure they are applied after all memory limit settings. | |
2729 | */ | |
2730 | list_splice_tail(&memsw_list, &conf->cgroup); | |
2731 | ||
2732 | } | |
2733 | ||
c581d2a6 CB |
2734 | __cgfsng_ops static bool cgfsng_setup_limits_legacy(struct cgroup_ops *ops, |
2735 | struct lxc_conf *conf, | |
2736 | bool do_devices) | |
ccb4cabe | 2737 | { |
c9dbb8ed CB |
2738 | struct list_head *cgroup_settings; |
2739 | struct lxc_cgroup *cgroup; | |
ccb4cabe | 2740 | |
92ca7eb5 CB |
2741 | if (!ops) |
2742 | return ret_set_errno(false, ENOENT); | |
2743 | ||
2744 | if (!conf) | |
2745 | return ret_set_errno(false, EINVAL); | |
2746 | ||
2747 | cgroup_settings = &conf->cgroup; | |
c9dbb8ed | 2748 | if (list_empty(cgroup_settings)) |
ccb4cabe SH |
2749 | return true; |
2750 | ||
69b4a4bb | 2751 | if (!ops->hierarchies) |
92ca7eb5 | 2752 | return ret_set_errno(false, EINVAL); |
69b4a4bb | 2753 | |
92afbe74 | 2754 | if (pure_unified_layout(ops)) |
b96aa96f CB |
2755 | return log_warn_errno(true, EINVAL, "Ignoring legacy cgroup limits on pure cgroup2 system"); |
2756 | ||
c9dbb8ed CB |
2757 | sort_cgroup_settings(conf); |
2758 | list_for_each_entry(cgroup, cgroup_settings, head) { | |
2759 | if (do_devices == strnequal("devices", cgroup->subsystem, 7)) { | |
2760 | if (cg_legacy_set_data(ops, cgroup->subsystem, cgroup->value, strnequal("cpuset", cgroup->subsystem, 6))) { | |
fc3b9533 | 2761 | if (do_devices && (errno == EACCES || errno == EPERM)) { |
c9dbb8ed | 2762 | SYSWARN("Failed to set \"%s\" to \"%s\"", cgroup->subsystem, cgroup->value); |
fc3b9533 CB |
2763 | continue; |
2764 | } | |
c9dbb8ed CB |
2765 | SYSERROR("Failed to set \"%s\" to \"%s\"", cgroup->subsystem, cgroup->value); |
2766 | return false; | |
ccb4cabe | 2767 | } |
c9dbb8ed | 2768 | DEBUG("Set controller \"%s\" set to \"%s\"", cgroup->subsystem, cgroup->value); |
ccb4cabe | 2769 | } |
ccb4cabe SH |
2770 | } |
2771 | ||
6b38e644 | 2772 | INFO("Limits for the legacy cgroup hierarchies have been setup"); |
c9dbb8ed | 2773 | return true; |
ccb4cabe SH |
2774 | } |
2775 | ||
bf651989 CB |
2776 | /* |
2777 | * Some of the parsing logic comes from the original cgroup device v1 | |
2778 | * implementation in the kernel. | |
2779 | */ | |
4bfb655e CB |
2780 | static int bpf_device_cgroup_prepare(struct cgroup_ops *ops, |
2781 | struct lxc_conf *conf, const char *key, | |
bf651989 CB |
2782 | const char *val) |
2783 | { | |
50329f28 | 2784 | struct device_item device_item = {}; |
2a63b5cb | 2785 | int ret; |
bf651989 | 2786 | |
30bfbd3f | 2787 | if (strequal("devices.allow", key) && abspath(val)) |
cb3fc90c CB |
2788 | ret = device_cgroup_rule_parse_devpath(&device_item, val); |
2789 | else | |
2790 | ret = device_cgroup_rule_parse(&device_item, key, val); | |
2a63b5cb | 2791 | if (ret < 0) |
060aaa39 | 2792 | return syserror_set(EINVAL, "Failed to parse device rule %s=%s", key, val); |
4bfb655e | 2793 | |
60532b18 | 2794 | /* |
15970277 CB |
2795 | * Note that bpf_list_add_device() returns 1 if it altered the device |
2796 | * list and 0 if it didn't; both return values indicate success. | |
2797 | * Only a negative return value indicates an error. | |
60532b18 | 2798 | */ |
a134099d | 2799 | ret = bpf_list_add_device(&conf->bpf_devices, &device_item); |
2a63b5cb | 2800 | if (ret < 0) |
4bfb655e | 2801 | return -1; |
a134099d | 2802 | |
bf651989 CB |
2803 | return 0; |
2804 | } | |
2805 | ||
c581d2a6 CB |
2806 | __cgfsng_ops static bool cgfsng_setup_limits(struct cgroup_ops *ops, |
2807 | struct lxc_handler *handler) | |
6b38e644 | 2808 | { |
c9dbb8ed | 2809 | struct list_head *cgroup_settings; |
7e31931f CB |
2810 | struct hierarchy *h; |
2811 | struct lxc_conf *conf; | |
c9dbb8ed | 2812 | struct lxc_cgroup *cgroup; |
6b38e644 | 2813 | |
7e31931f CB |
2814 | if (!ops) |
2815 | return ret_set_errno(false, ENOENT); | |
2816 | ||
2817 | if (!ops->hierarchies) | |
6b38e644 CB |
2818 | return true; |
2819 | ||
7e31931f CB |
2820 | if (!ops->container_cgroup) |
2821 | return ret_set_errno(false, EINVAL); | |
2822 | ||
2823 | if (!handler || !handler->conf) | |
2824 | return ret_set_errno(false, EINVAL); | |
2825 | conf = handler->conf; | |
2826 | ||
7e31931f | 2827 | cgroup_settings = &conf->cgroup2; |
c9dbb8ed | 2828 | if (list_empty(cgroup_settings)) |
0e7a013e CB |
2829 | return true; |
2830 | ||
2831 | if (!pure_unified_layout(ops)) | |
2832 | return log_warn_errno(true, EINVAL, "Ignoring cgroup2 limits on legacy cgroup system"); | |
7e31931f CB |
2833 | |
2834 | if (!ops->unified) | |
6b38e644 | 2835 | return false; |
7e31931f | 2836 | h = ops->unified; |
6b38e644 | 2837 | |
c9dbb8ed | 2838 | list_for_each_entry(cgroup, cgroup_settings, head) { |
c04a6d4e | 2839 | int ret; |
6b38e644 | 2840 | |
c9dbb8ed CB |
2841 | if (strnequal("devices", cgroup->subsystem, 7)) |
2842 | ret = bpf_device_cgroup_prepare(ops, conf, cgroup->subsystem, cgroup->value); | |
ee9d3ef0 | 2843 | else |
c9dbb8ed | 2844 | ret = lxc_write_openat(h->path_lim, cgroup->subsystem, cgroup->value, strlen(cgroup->value)); |
ee9d3ef0 | 2845 | if (ret < 0) |
c9dbb8ed | 2846 | return log_error_errno(false, errno, "Failed to set \"%s\" to \"%s\"", cgroup->subsystem, cgroup->value); |
ee9d3ef0 | 2847 | |
c9dbb8ed | 2848 | TRACE("Set \"%s\" to \"%s\"", cgroup->subsystem, cgroup->value); |
6b38e644 CB |
2849 | } |
2850 | ||
7e31931f | 2851 | return log_info(true, "Limits for the unified cgroup hierarchy have been setup"); |
6b38e644 CB |
2852 | } |
2853 | ||
59eac805 | 2854 | __cgfsng_ops static bool cgfsng_devices_activate(struct cgroup_ops *ops, struct lxc_handler *handler) |
bf651989 | 2855 | { |
e552bd1a CB |
2856 | struct lxc_conf *conf; |
2857 | struct hierarchy *unified; | |
bf651989 | 2858 | |
e552bd1a CB |
2859 | if (!ops) |
2860 | return ret_set_errno(false, ENOENT); | |
2861 | ||
2862 | if (!ops->hierarchies) | |
2863 | return true; | |
2864 | ||
2865 | if (!ops->container_cgroup) | |
2866 | return ret_set_errno(false, EEXIST); | |
2867 | ||
2868 | if (!handler || !handler->conf) | |
2869 | return ret_set_errno(false, EINVAL); | |
2870 | conf = handler->conf; | |
2871 | ||
2872 | unified = ops->unified; | |
ca72ccb5 | 2873 | if (!unified || !device_utility_controller(unified) || |
93de768e | 2874 | !unified->path_con || list_empty(&(conf->bpf_devices).devices)) |
bf651989 CB |
2875 | return true; |
2876 | ||
a134099d | 2877 | return bpf_cgroup_devices_attach(ops, &conf->bpf_devices); |
bf651989 CB |
2878 | } |
2879 | ||
59eac805 | 2880 | static bool __cgfsng_delegate_controllers(struct cgroup_ops *ops, const char *cgroup) |
6b38e644 | 2881 | { |
95ab26af CB |
2882 | __do_close int dfd_final = -EBADF; |
2883 | __do_free char *add_controllers = NULL, *copy = NULL; | |
c581d2a6 | 2884 | size_t full_len = 0; |
0954f6ce CB |
2885 | struct hierarchy *unified; |
2886 | int dfd_cur, ret; | |
95ab26af CB |
2887 | char *cur; |
2888 | char **it; | |
6b38e644 | 2889 | |
0954f6ce CB |
2890 | if (!ops->hierarchies || !pure_unified_layout(ops)) |
2891 | return true; | |
2892 | ||
2893 | unified = ops->unified; | |
2894 | if (!unified->controllers[0]) | |
bf651989 CB |
2895 | return true; |
2896 | ||
c581d2a6 CB |
2897 | /* For now we simply enable all controllers that we have detected by |
2898 | * creating a string like "+memory +pids +cpu +io". | |
2899 | * TODO: In the near future we might want to support "-<controller>" | |
2900 | * etc. but whether supporting semantics like this make sense will need | |
2901 | * some thinking. | |
2902 | */ | |
2903 | for (it = unified->controllers; it && *it; it++) { | |
2904 | full_len += strlen(*it) + 2; | |
2905 | add_controllers = must_realloc(add_controllers, full_len + 1); | |
2906 | ||
2907 | if (unified->controllers[0] == *it) | |
2908 | add_controllers[0] = '\0'; | |
2909 | ||
2910 | (void)strlcat(add_controllers, "+", full_len + 1); | |
2911 | (void)strlcat(add_controllers, *it, full_len + 1); | |
2912 | ||
2913 | if ((it + 1) && *(it + 1)) | |
2914 | (void)strlcat(add_controllers, " ", full_len + 1); | |
2915 | } | |
2916 | ||
95ab26af CB |
2917 | copy = strdup(cgroup); |
2918 | if (!copy) | |
f761d24d | 2919 | return false; |
c581d2a6 | 2920 | |
95ab26af CB |
2921 | /* |
2922 | * Placing the write to cgroup.subtree_control before the open() is | |
2923 | * intentional because of the cgroup2 delegation model. It enforces | |
2924 | * that leaf cgroups don't have any controllers enabled for delegation. | |
2925 | */ | |
0954f6ce | 2926 | dfd_cur = unified->dfd_base; |
95ab26af CB |
2927 | lxc_iterate_parts(cur, copy, "/") { |
2928 | /* | |
2929 | * Even though we vetted the paths when we parsed the config | |
2930 | * we're paranoid here and check that the path is neither | |
2931 | * absolute nor walks upwards. | |
2932 | */ | |
2933 | if (abspath(cur)) | |
060aaa39 | 2934 | return syserror_set(-EINVAL, "No absolute paths allowed"); |
ac01a9b8 | 2935 | |
95ab26af | 2936 | if (strnequal(cur, "..", STRLITERALLEN(".."))) |
060aaa39 | 2937 | return syserror_set(-EINVAL, "No upward walking paths allowed"); |
ac01a9b8 | 2938 | |
95ab26af | 2939 | ret = lxc_writeat(dfd_cur, "cgroup.subtree_control", add_controllers, full_len); |
61fbc369 | 2940 | if (ret < 0) |
2d7b0895 | 2941 | return syserror("Could not enable \"%s\" controllers in the unified cgroup %d", add_controllers, dfd_cur); |
95ab26af CB |
2942 | |
2943 | TRACE("Enabled \"%s\" controllers in the unified cgroup %d", add_controllers, dfd_cur); | |
ac01a9b8 | 2944 | |
95ab26af CB |
2945 | dfd_final = open_at(dfd_cur, cur, PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH, 0); |
2946 | if (dfd_final < 0) | |
2d7b0895 | 2947 | return syserror("Fail to open directory %d(%s)", dfd_cur, cur); |
95ab26af CB |
2948 | if (dfd_cur != unified->dfd_base) |
2949 | close(dfd_cur); | |
2950 | /* | |
2951 | * Leave dfd_final pointing to the last fd we opened so | |
2952 | * it will be automatically zapped if we return early. | |
2953 | */ | |
2954 | dfd_cur = dfd_final; | |
c581d2a6 CB |
2955 | } |
2956 | ||
f761d24d | 2957 | return true; |
c581d2a6 CB |
2958 | } |
2959 | ||
59eac805 | 2960 | __cgfsng_ops static bool cgfsng_monitor_delegate_controllers(struct cgroup_ops *ops) |
c581d2a6 | 2961 | { |
61fbc369 CB |
2962 | if (!ops) |
2963 | return ret_set_errno(false, ENOENT); | |
2964 | ||
c581d2a6 CB |
2965 | return __cgfsng_delegate_controllers(ops, ops->monitor_cgroup); |
2966 | } | |
2967 | ||
59eac805 | 2968 | __cgfsng_ops static bool cgfsng_payload_delegate_controllers(struct cgroup_ops *ops) |
c581d2a6 | 2969 | { |
61fbc369 CB |
2970 | if (!ops) |
2971 | return ret_set_errno(false, ENOENT); | |
2972 | ||
c581d2a6 | 2973 | return __cgfsng_delegate_controllers(ops, ops->container_cgroup); |
2202afc9 CB |
2974 | } |
2975 | ||
0da35ac7 CB |
/* Report whether @line starts with the character '0'. */
static inline bool unified_cgroup(const char *line)
{
	return line[0] == '0';
}
2980 | ||
/*
 * Extract the cgroup path from a "0::<path>" line.
 *
 * The "0::" prefix is skipped without being checked. The remainder must be an
 * absolute path. Unless @relative is set it is passed through
 * prune_init_scope(), and in either case through deabs(), before being
 * duplicated.
 *
 * Returns a newly allocated string the caller owns, ERR_PTR(-EINVAL) if the
 * path is not absolute, or ERR_PTR(-ENOMEM) on allocation failure.
 */
static inline char *current_unified_cgroup(bool relative, char *line)
{
	char *cgroup_path = line + STRLITERALLEN("0::");
	char *copy;

	if (!abspath(cgroup_path))
		return ERR_PTR(-EINVAL);

	/* remove init.scope */
	if (!relative)
		cgroup_path = prune_init_scope(cgroup_path);

	/* create a relative path */
	cgroup_path = deabs(cgroup_path);

	copy = strdup(cgroup_path);
	if (!copy)
		return ERR_PTR(-ENOMEM);

	return copy;
}
3003 | ||
/*
 * Skip a leading "name=" in @controllers, if present.
 * The returned pointer points into @controllers.
 */
static inline const char *unprefix(const char *controllers)
{
	const size_t prefix_len = STRLITERALLEN("name=");

	if (!strnequal(controllers, "name=", prefix_len))
		return controllers;

	return controllers + prefix_len;
}
3010 | ||
3011 | static int __list_cgroup_delegate(char ***delegate) | |
a6ca2ed8 | 3012 | { |
63ba9eaf | 3013 | __do_free char **list = NULL; |
d606c4e9 | 3014 | __do_free char *buf = NULL; |
35ec1a38 CB |
3015 | char *standard[] = { |
3016 | "cgroup.procs", | |
3017 | "cgroup.threads", | |
3018 | "cgroup.subtree_control", | |
3019 | "memory.oom.group", | |
3020 | NULL, | |
3021 | }; | |
d606c4e9 | 3022 | char *token; |
63ba9eaf | 3023 | int ret; |
a6ca2ed8 | 3024 | |
46bf13b7 | 3025 | buf = read_file_at(-EBADF, "/sys/kernel/cgroup/delegate", PROTECT_OPEN, 0); |
d606c4e9 | 3026 | if (!buf) { |
a6ca2ed8 | 3027 | for (char **p = standard; p && *p; p++) { |
63ba9eaf CB |
3028 | ret = list_add_string(&list, *p); |
3029 | if (ret < 0) | |
3030 | return ret; | |
a6ca2ed8 | 3031 | } |
35ec1a38 | 3032 | |
63ba9eaf | 3033 | *delegate = move_ptr(list); |
6d95e0b7 | 3034 | return syswarn_ret(0, "Failed to read /sys/kernel/cgroup/delegate"); |
d606c4e9 | 3035 | } |
a6ca2ed8 | 3036 | |
257f04ec | 3037 | lxc_iterate_parts(token, buf, " \t\n") { |
d606c4e9 CB |
3038 | /* |
3039 | * We always need to chown this for both cgroup and | |
3040 | * cgroup2. | |
3041 | */ | |
8b99a20a | 3042 | if (strequal(token, "cgroup.procs")) |
d606c4e9 CB |
3043 | continue; |
3044 | ||
63ba9eaf CB |
3045 | ret = list_add_string(&list, token); |
3046 | if (ret < 0) | |
3047 | return ret; | |
a6ca2ed8 | 3048 | } |
2202afc9 | 3049 | |
63ba9eaf | 3050 | *delegate = move_ptr(list); |
341e6516 | 3051 | return 0; |
2202afc9 CB |
3052 | } |
3053 | ||
0da35ac7 | 3054 | static bool unified_hierarchy_delegated(int dfd_base, char ***ret_files) |
0e3af26b | 3055 | { |
0da35ac7 CB |
3056 | __do_free_string_list char **list = NULL; |
3057 | int ret; | |
0e3af26b | 3058 | |
0da35ac7 CB |
3059 | ret = __list_cgroup_delegate(&list); |
3060 | if (ret < 0) | |
9fc21b2d | 3061 | return syserror_ret(ret, "Failed to determine unified cgroup delegation requirements"); |
0e3af26b | 3062 | |
0da35ac7 CB |
3063 | for (char *const *s = list; s && *s; s++) { |
3064 | if (!faccessat(dfd_base, *s, W_OK, 0) || errno == ENOENT) | |
3065 | continue; | |
0e3af26b | 3066 | |
815c378b | 3067 | return sysinfo_ret(false, "The %s file is not writable, skipping unified hierarchy", *s); |
0da35ac7 | 3068 | } |
0e3af26b | 3069 | |
0da35ac7 CB |
3070 | *ret_files = move_ptr(list); |
3071 | return true; | |
0e3af26b CB |
3072 | } |
3073 | ||
/*
 * Check whether the legacy hierarchy below @dfd_base is usable: the directory
 * itself must be writable. A failure with ENOENT is not treated as an error.
 */
static bool legacy_hierarchy_delegated(int dfd_base)
{
	if (faccessat(dfd_base, ".", W_OK, 0) >= 0)
		return true;

	if (errno == ENOENT)
		return true;

	return sysinfo_ret(false, "Legacy hierarchy not writable, skipping");
}
3084 | ||
/**
 * systemd guarantees that the order of co-mounted controllers is stable. On
 * some systems the order in which the kernel reports the controllers might be
 * reversed though.
 *
 * For example, this is how the order is mismatched on CentOS 7:
 *
 * [root@localhost ~]# cat /proc/self/cgroup
 * 11:perf_event:/
 * 10:pids:/
 * 9:freezer:/
 * >>>> 8:cpuacct,cpu:/
 * 7:memory:/
 * 6:blkio:/
 * 5:devices:/
 * 4:hugetlb:/
 * >>>> 3:net_prio,net_cls:/
 * 2:cpuset:/
 * 1:name=systemd:/user.slice/user-0.slice/session-c1.scope
 *
 * whereas the mountpoint:
 *
 * | |-/sys/fs/cgroup tmpfs tmpfs ro,nosuid,nodev,noexec,mode=755
 * | | |-/sys/fs/cgroup/systemd cgroup cgroup rw,nosuid,nodev,noexec,relatime,xattr,release_agent=/usr/lib/systemd/systemd-cgroups-agent,name=systemd
 * | | |-/sys/fs/cgroup/cpuset cgroup cgroup rw,nosuid,nodev,noexec,relatime,cpuset
 * >>>> | | |-/sys/fs/cgroup/net_cls,net_prio cgroup cgroup rw,nosuid,nodev,noexec,relatime,net_prio,net_cls
 * | | |-/sys/fs/cgroup/hugetlb cgroup cgroup rw,nosuid,nodev,noexec,relatime,hugetlb
 * | | |-/sys/fs/cgroup/devices cgroup cgroup rw,nosuid,nodev,noexec,relatime,devices
 * | | |-/sys/fs/cgroup/blkio cgroup cgroup rw,nosuid,nodev,noexec,relatime,blkio
 * | | |-/sys/fs/cgroup/memory cgroup cgroup rw,nosuid,nodev,noexec,relatime,memory
 * >>>> | | |-/sys/fs/cgroup/cpu,cpuacct cgroup cgroup rw,nosuid,nodev,noexec,relatime,cpuacct,cpu
 * | | |-/sys/fs/cgroup/freezer cgroup cgroup rw,nosuid,nodev,noexec,relatime,freezer
 * | | |-/sys/fs/cgroup/pids cgroup cgroup rw,nosuid,nodev,noexec,relatime,pids
 * | | `-/sys/fs/cgroup/perf_event cgroup cgroup rw,nosuid,nodev,noexec,relatime,perf_event
 *
 * Ensure that we always use the systemd-guaranteed stable order when checking
 * for the mountpoint: the two known reversed pairs are mapped to their stable
 * spelling, every other controller string is handed to unprefix().
 */
__attribute__((returns_nonnull)) __attribute__((nonnull))
static const char *stable_order(const char *controllers)
{
	const char *ordered;

	if (strequal(controllers, "cpuacct,cpu"))
		ordered = "cpu,cpuacct";
	else if (strequal(controllers, "net_prio,net_cls"))
		ordered = "net_cls,net_prio";
	else
		ordered = unprefix(controllers);

	return ordered;
}
3133 | ||
35ec1a38 CB |
3134 | static int __initialize_cgroups(struct cgroup_ops *ops, bool relative, |
3135 | bool unprivileged) | |
2202afc9 | 3136 | { |
8033666c CB |
3137 | __do_free char *cgroup_info = NULL; |
3138 | char *it; | |
2202afc9 | 3139 | |
35ec1a38 CB |
3140 | /* |
3141 | * Root spawned containers escape the current cgroup, so use init's | |
3142 | * cgroups as our base in that case. | |
3143 | */ | |
9caee129 | 3144 | if (!relative && (geteuid() == 0)) |
8033666c | 3145 | cgroup_info = read_file_at(-EBADF, "/proc/1/cgroup", PROTECT_OPEN, 0); |
2202afc9 | 3146 | else |
8033666c CB |
3147 | cgroup_info = read_file_at(-EBADF, "/proc/self/cgroup", PROTECT_OPEN, 0); |
3148 | if (!cgroup_info) | |
35ec1a38 | 3149 | return ret_errno(ENOMEM); |
2202afc9 | 3150 | |
8033666c | 3151 | lxc_iterate_parts(it, cgroup_info, "\n") { |
35ec1a38 CB |
3152 | __do_close int dfd_base = -EBADF, dfd_mnt = -EBADF; |
3153 | __do_free char *controllers = NULL, *current_cgroup = NULL; | |
3154 | __do_free_string_list char **controller_list = NULL, | |
3155 | **delegate = NULL; | |
3156 | char *line; | |
3157 | int dfd, ret, type; | |
3158 | ||
3159 | /* Handle the unified cgroup hierarchy. */ | |
3160 | line = it; | |
3161 | if (unified_cgroup(line)) { | |
3162 | char *unified_mnt; | |
3163 | ||
b8572e8c CB |
3164 | type = UNIFIED_HIERARCHY; |
3165 | ||
35ec1a38 CB |
3166 | current_cgroup = current_unified_cgroup(relative, line); |
3167 | if (IS_ERR(current_cgroup)) | |
3168 | return PTR_ERR(current_cgroup); | |
3169 | ||
e18e9053 CB |
3170 | if (unified_cgroup_fd(ops->dfd_mnt)) { |
3171 | dfd_mnt = dup_cloexec(ops->dfd_mnt); | |
35ec1a38 CB |
3172 | unified_mnt = ""; |
3173 | } else { | |
e18e9053 | 3174 | dfd_mnt = open_at(ops->dfd_mnt, |
35ec1a38 CB |
3175 | "unified", |
3176 | PROTECT_OPATH_DIRECTORY, | |
3177 | PROTECT_LOOKUP_ABSOLUTE_XDEV, 0); | |
3178 | unified_mnt = "unified"; | |
3179 | } | |
3180 | if (dfd_mnt < 0) { | |
3181 | if (errno != ENOENT) | |
2d7b0895 | 3182 | return syserror("Failed to open %d/unified", ops->dfd_mnt); |
2202afc9 | 3183 | |
35ec1a38 CB |
3184 | SYSTRACE("Unified cgroup not mounted"); |
3185 | continue; | |
3186 | } | |
3187 | dfd = dfd_mnt; | |
3188 | ||
3189 | if (!is_empty_string(current_cgroup)) { | |
3190 | dfd_base = open_at(dfd_mnt, current_cgroup, | |
3191 | PROTECT_OPATH_DIRECTORY, | |
3192 | PROTECT_LOOKUP_BENEATH_XDEV, 0); | |
f4afdfbe CB |
3193 | if (dfd_base < 0) { |
3194 | if (errno != ENOENT) | |
3195 | return syserror("Failed to open %d/%s", | |
3196 | dfd_mnt, current_cgroup); | |
3197 | ||
3198 | SYSTRACE("Current cgroup %d/%s does not exist (funky cgroup layout?)", | |
3199 | dfd_mnt, current_cgroup); | |
3200 | continue; | |
3201 | } | |
35ec1a38 CB |
3202 | dfd = dfd_base; |
3203 | } | |
8033666c | 3204 | |
0da35ac7 CB |
3205 | if (!unified_hierarchy_delegated(dfd, &delegate)) |
3206 | continue; | |
3207 | ||
35ec1a38 CB |
3208 | controller_list = unified_controllers(dfd, "cgroup.controllers"); |
3209 | if (!controller_list) { | |
3210 | TRACE("No controllers are enabled for delegation in the unified hierarchy"); | |
63ba9eaf CB |
3211 | controller_list = list_new(); |
3212 | if (!controller_list) | |
2d7b0895 | 3213 | return syserror_set(-ENOMEM, "Failed to create empty controller list"); |
35ec1a38 | 3214 | } |
8033666c | 3215 | |
35ec1a38 CB |
3216 | controllers = strdup(unified_mnt); |
3217 | if (!controllers) | |
3218 | return ret_errno(ENOMEM); | |
3219 | } else { | |
3220 | char *__controllers, *__current_cgroup; | |
2202afc9 | 3221 | |
b8572e8c CB |
3222 | type = LEGACY_HIERARCHY; |
3223 | ||
35ec1a38 CB |
3224 | __controllers = strchr(line, ':'); |
3225 | if (!__controllers) | |
3226 | return ret_errno(EINVAL); | |
3227 | __controllers++; | |
3228 | ||
3229 | __current_cgroup = strchr(__controllers, ':'); | |
3230 | if (!__current_cgroup) | |
3231 | return ret_errno(EINVAL); | |
3232 | *__current_cgroup = '\0'; | |
3233 | __current_cgroup++; | |
3234 | ||
91d0151d | 3235 | controllers = strdup(stable_order(__controllers)); |
35ec1a38 CB |
3236 | if (!controllers) |
3237 | return ret_errno(ENOMEM); | |
3238 | ||
e18e9053 | 3239 | dfd_mnt = open_at(ops->dfd_mnt, |
91d0151d CB |
3240 | controllers, |
3241 | PROTECT_OPATH_DIRECTORY, | |
35ec1a38 CB |
3242 | PROTECT_LOOKUP_ABSOLUTE_XDEV, 0); |
3243 | if (dfd_mnt < 0) { | |
3244 | if (errno != ENOENT) | |
2d7b0895 | 3245 | return syserror("Failed to open %d/%s", |
e18e9053 | 3246 | ops->dfd_mnt, controllers); |
2202afc9 | 3247 | |
35ec1a38 CB |
3248 | SYSTRACE("%s not mounted", controllers); |
3249 | continue; | |
3250 | } | |
3251 | dfd = dfd_mnt; | |
3252 | ||
3253 | if (!abspath(__current_cgroup)) | |
3254 | return ret_errno(EINVAL); | |
3255 | ||
3256 | /* remove init.scope */ | |
3257 | if (!relative) | |
3258 | __current_cgroup = prune_init_scope(__current_cgroup); | |
3259 | ||
3260 | /* create a relative path */ | |
3261 | __current_cgroup = deabs(__current_cgroup); | |
6e214b74 | 3262 | |
35ec1a38 CB |
3263 | current_cgroup = strdup(__current_cgroup); |
3264 | if (!current_cgroup) | |
3265 | return ret_errno(ENOMEM); | |
2202afc9 | 3266 | |
35ec1a38 CB |
3267 | if (!is_empty_string(current_cgroup)) { |
3268 | dfd_base = open_at(dfd_mnt, current_cgroup, | |
3269 | PROTECT_OPATH_DIRECTORY, | |
3270 | PROTECT_LOOKUP_BENEATH_XDEV, 0); | |
f4afdfbe CB |
3271 | if (dfd_base < 0) { |
3272 | if (errno != ENOENT) | |
3273 | return syserror("Failed to open %d/%s", | |
3274 | dfd_mnt, current_cgroup); | |
3275 | ||
3276 | SYSTRACE("Current cgroup %d/%s does not exist (funky cgroup layout?)", | |
3277 | dfd_mnt, current_cgroup); | |
3278 | continue; | |
3279 | } | |
35ec1a38 CB |
3280 | dfd = dfd_base; |
3281 | } | |
2a63b5cb | 3282 | |
0da35ac7 CB |
3283 | if (!legacy_hierarchy_delegated(dfd)) |
3284 | continue; | |
35ec1a38 CB |
3285 | |
3286 | /* | |
3287 | * We intentionally pass __current_cgroup here and not | |
3288 | * controllers because we would otherwise chop the | |
3289 | * mountpoint. | |
3290 | */ | |
63ba9eaf CB |
3291 | controller_list = list_add_controllers(__controllers); |
3292 | if (!controller_list) | |
2d7b0895 | 3293 | return syserror_set(-ENOMEM, "Failed to create controller list from %s", __controllers); |
35ec1a38 CB |
3294 | |
3295 | if (skip_hierarchy(ops, controller_list)) | |
3296 | continue; | |
3297 | ||
35ec1a38 CB |
3298 | ops->cgroup_layout = CGROUP_LAYOUT_LEGACY; |
3299 | } | |
3300 | ||
179754a2 CB |
3301 | ret = cgroup_hierarchy_add(ops, dfd_mnt, controllers, dfd, |
3302 | current_cgroup, controller_list, type); | |
35ec1a38 | 3303 | if (ret < 0) |
9fc21b2d | 3304 | return syserror_ret(ret, "Failed to add %s hierarchy", controllers); |
35ec1a38 CB |
3305 | |
3306 | /* Transfer ownership. */ | |
3307 | move_fd(dfd_mnt); | |
3308 | move_fd(dfd_base); | |
3309 | move_ptr(current_cgroup); | |
3310 | move_ptr(controllers); | |
3311 | move_ptr(controller_list); | |
b8572e8c | 3312 | if (type == UNIFIED_HIERARCHY) |
042f9e9c | 3313 | ops->unified->delegate = move_ptr(delegate); |
35ec1a38 CB |
3314 | } |
3315 | ||
3316 | /* determine cgroup layout */ | |
3317 | if (ops->unified) { | |
3318 | if (ops->cgroup_layout == CGROUP_LAYOUT_LEGACY) { | |
3319 | ops->cgroup_layout = CGROUP_LAYOUT_HYBRID; | |
3320 | } else { | |
3321 | if (bpf_devices_cgroup_supported()) | |
ca72ccb5 | 3322 | ops->unified->utilities |= DEVICES_CONTROLLER; |
35ec1a38 CB |
3323 | ops->cgroup_layout = CGROUP_LAYOUT_UNIFIED; |
3324 | } | |
3325 | } | |
3326 | ||
c7a1f72a | 3327 | if (!controllers_available(ops)) |
060aaa39 | 3328 | return syserror_set(-ENOENT, "One or more requested controllers unavailable or not delegated"); |
c7a1f72a | 3329 | |
35ec1a38 | 3330 | return 0; |
2202afc9 CB |
3331 | } |
3332 | ||
35ec1a38 | 3333 | static int initialize_cgroups(struct cgroup_ops *ops, struct lxc_conf *conf) |
2202afc9 | 3334 | { |
d4cff352 | 3335 | __do_close int dfd = -EBADF; |
2202afc9 | 3336 | int ret; |
0fbf99d6 | 3337 | const char *controllers_use; |
d4cff352 | 3338 | |
e18e9053 | 3339 | if (ops->dfd_mnt >= 0) |
a96be3c3 | 3340 | return ret_errno(EBUSY); |
d4cff352 CB |
3341 | |
3342 | /* | |
3343 | * I don't see the need for allowing symlinks here. If users want to | |
3344 | * have their hierarchy available in different locations I strongly | |
3345 | * suggest bind-mounts. | |
3346 | */ | |
3347 | dfd = open_at(-EBADF, DEFAULT_CGROUP_MOUNTPOINT, | |
3348 | PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_ABSOLUTE_XDEV, 0); | |
3349 | if (dfd < 0) | |
2d7b0895 | 3350 | return syserror("Failed to open " DEFAULT_CGROUP_MOUNTPOINT); |
2202afc9 | 3351 | |
0fbf99d6 CB |
3352 | controllers_use = lxc_global_config_value("lxc.cgroup.use"); |
3353 | if (controllers_use) { | |
3354 | __do_free char *dup = NULL; | |
3355 | char *it; | |
b7b18fc5 | 3356 | |
0fbf99d6 CB |
3357 | dup = strdup(controllers_use); |
3358 | if (!dup) | |
7a0c8ed3 | 3359 | return -errno; |
b7b18fc5 | 3360 | |
63ba9eaf CB |
3361 | lxc_iterate_parts(it, dup, ",") { |
3362 | ret = list_add_string(&ops->cgroup_use, it); | |
3363 | if (ret < 0) | |
3364 | return ret; | |
3365 | } | |
b7b18fc5 | 3366 | } |
2202afc9 | 3367 | |
d4cff352 CB |
3368 | /* |
3369 | * Keep dfd referenced by the cleanup function and actually move the fd | |
3370 | * once we know the initialization succeeded. So if we fail we clean up | |
3371 | * the dfd. | |
3372 | */ | |
e18e9053 | 3373 | ops->dfd_mnt = dfd; |
2202afc9 | 3374 | |
0589d744 | 3375 | ret = __initialize_cgroups(ops, conf->cgroup_meta.relative, !list_empty(&conf->id_map)); |
d4cff352 | 3376 | if (ret < 0) |
9fc21b2d | 3377 | return syserror_ret(ret, "Failed to initialize cgroups"); |
2202afc9 | 3378 | |
d4cff352 CB |
3379 | /* Transfer ownership to cgroup_ops. */ |
3380 | move_fd(dfd); | |
3381 | return 0; | |
2202afc9 CB |
3382 | } |
3383 | ||
341e6516 | 3384 | __cgfsng_ops static int cgfsng_data_init(struct cgroup_ops *ops) |
2202afc9 CB |
3385 | { |
3386 | const char *cgroup_pattern; | |
3387 | ||
341e6516 CB |
3388 | if (!ops) |
3389 | return ret_set_errno(-1, ENOENT); | |
3390 | ||
2202afc9 CB |
3391 | /* copy system-wide cgroup information */ |
3392 | cgroup_pattern = lxc_global_config_value("lxc.cgroup.pattern"); | |
63ba9eaf CB |
3393 | if (cgroup_pattern && !strequal(cgroup_pattern, "")) { |
3394 | ops->cgroup_pattern = strdup(cgroup_pattern); | |
3395 | if (!ops->cgroup_pattern) | |
3396 | return ret_errno(ENOMEM); | |
3397 | } | |
2202afc9 | 3398 | |
341e6516 | 3399 | return 0; |
2202afc9 CB |
3400 | } |
3401 | ||
35ec1a38 | 3402 | struct cgroup_ops *cgroup_ops_init(struct lxc_conf *conf) |
2202afc9 | 3403 | { |
e3d78fdc | 3404 | __cleanup_cgroup_ops struct cgroup_ops *cgfsng_ops = NULL; |
2202afc9 | 3405 | |
c5d0238a | 3406 | cgfsng_ops = zalloc(sizeof(struct cgroup_ops)); |
2202afc9 | 3407 | if (!cgfsng_ops) |
341e6516 | 3408 | return ret_set_errno(NULL, ENOMEM); |
2202afc9 | 3409 | |
e3d78fdc CB |
3410 | cgfsng_ops->cgroup_layout = CGROUP_LAYOUT_UNKNOWN; |
3411 | cgfsng_ops->dfd_mnt = -EBADF; | |
2202afc9 | 3412 | |
35ec1a38 | 3413 | if (initialize_cgroups(cgfsng_ops, conf)) |
2202afc9 | 3414 | return NULL; |
2202afc9 | 3415 | |
ca76baed CB |
3416 | cgfsng_ops->data_init = cgfsng_data_init; |
3417 | cgfsng_ops->payload_destroy = cgfsng_payload_destroy; | |
3418 | cgfsng_ops->monitor_destroy = cgfsng_monitor_destroy; | |
3419 | cgfsng_ops->monitor_create = cgfsng_monitor_create; | |
3420 | cgfsng_ops->monitor_enter = cgfsng_monitor_enter; | |
3421 | cgfsng_ops->monitor_delegate_controllers = cgfsng_monitor_delegate_controllers; | |
3422 | cgfsng_ops->payload_delegate_controllers = cgfsng_payload_delegate_controllers; | |
3423 | cgfsng_ops->payload_create = cgfsng_payload_create; | |
3424 | cgfsng_ops->payload_enter = cgfsng_payload_enter; | |
840eec19 | 3425 | cgfsng_ops->finalize = cgfsng_finalize; |
ca76baed CB |
3426 | cgfsng_ops->get_cgroup = cgfsng_get_cgroup; |
3427 | cgfsng_ops->get = cgfsng_get; | |
3428 | cgfsng_ops->set = cgfsng_set; | |
3429 | cgfsng_ops->freeze = cgfsng_freeze; | |
3430 | cgfsng_ops->unfreeze = cgfsng_unfreeze; | |
3431 | cgfsng_ops->setup_limits_legacy = cgfsng_setup_limits_legacy; | |
3432 | cgfsng_ops->setup_limits = cgfsng_setup_limits; | |
3433 | cgfsng_ops->driver = "cgfsng"; | |
3434 | cgfsng_ops->version = "1.0.0"; | |
3435 | cgfsng_ops->attach = cgfsng_attach; | |
3436 | cgfsng_ops->chown = cgfsng_chown; | |
3437 | cgfsng_ops->mount = cgfsng_mount; | |
3438 | cgfsng_ops->devices_activate = cgfsng_devices_activate; | |
a9b642ee | 3439 | cgfsng_ops->get_limit_cgroup = cgfsng_get_limit_cgroup; |
2202afc9 | 3440 | |
ff9edd2d CB |
3441 | cgfsng_ops->criu_escape = cgfsng_criu_escape; |
3442 | cgfsng_ops->criu_num_hierarchies = cgfsng_criu_num_hierarchies; | |
3443 | cgfsng_ops->criu_get_hierarchies = cgfsng_criu_get_hierarchies; | |
3444 | ||
a64edc1c | 3445 | return move_ptr(cgfsng_ops); |
2202afc9 | 3446 | } |
be835470 | 3447 | |
2092492c | 3448 | static int __unified_attach_fd(const struct lxc_conf *conf, int fd_unified, pid_t pid) |
029d8e88 | 3449 | { |
029d8e88 CB |
3450 | int ret; |
3451 | ||
0589d744 | 3452 | if (!list_empty(&conf->id_map)) { |
029d8e88 CB |
3453 | struct userns_exec_unified_attach_data args = { |
3454 | .conf = conf, | |
2092492c | 3455 | .unified_fd = fd_unified, |
029d8e88 CB |
3456 | .pid = pid, |
3457 | }; | |
3458 | ||
3459 | ret = socketpair(PF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, args.sk_pair); | |
3460 | if (ret < 0) | |
3461 | return -errno; | |
3462 | ||
3463 | ret = userns_exec_minimal(conf, | |
3464 | cgroup_unified_attach_parent_wrapper, | |
3465 | &args, | |
3466 | cgroup_unified_attach_child_wrapper, | |
3467 | &args); | |
3468 | } else { | |
2092492c CB |
3469 | ret = cgroup_attach_leaf(conf, fd_unified, pid); |
3470 | } | |
3471 | ||
3472 | return ret; | |
3473 | } | |
3474 | ||
3475 | static int __cgroup_attach_many(const struct lxc_conf *conf, const char *name, | |
3476 | const char *lxcpath, pid_t pid) | |
3477 | { | |
c071c112 | 3478 | call_cleaner(put_cgroup_ctx) struct cgroup_ctx *ctx = &(struct cgroup_ctx){}; |
2092492c | 3479 | int ret; |
c071c112 | 3480 | size_t idx; |
2092492c | 3481 | ssize_t pidstr_len; |
9d3480da | 3482 | char pidstr[INTTYPE_TO_STRLEN(pid_t)]; |
2092492c | 3483 | |
bce2970f | 3484 | ret = lxc_cmd_get_cgroup_ctx(name, lxcpath, sizeof(struct cgroup_ctx), ctx); |
2092492c CB |
3485 | if (ret < 0) |
3486 | return ret_errno(ENOSYS); | |
3487 | ||
3488 | pidstr_len = strnprintf(pidstr, sizeof(pidstr), "%d", pid); | |
3489 | if (pidstr_len < 0) | |
3490 | return pidstr_len; | |
3491 | ||
c071c112 CB |
3492 | for (idx = 0; idx < ctx->fd_len; idx++) { |
3493 | int dfd_con = ctx->fd[idx]; | |
2092492c CB |
3494 | |
3495 | if (unified_cgroup_fd(dfd_con)) | |
3496 | ret = __unified_attach_fd(conf, dfd_con, pid); | |
3497 | else | |
3498 | ret = lxc_writeat(dfd_con, "cgroup.procs", pidstr, pidstr_len); | |
3499 | if (ret) | |
9fc21b2d | 3500 | return syserror_ret(ret, "Failed to attach to cgroup fd %d", dfd_con); |
2092492c CB |
3501 | else |
3502 | TRACE("Attached to cgroup fd %d", dfd_con); | |
3503 | } | |
3504 | ||
c071c112 | 3505 | if (idx == 0) |
060aaa39 | 3506 | return syserror_set(-ENOENT, "Failed to attach to cgroups"); |
c071c112 | 3507 | |
61983e15 | 3508 | TRACE("Attached to %s cgroup layout", cgroup_layout_name(ctx->layout)); |
c071c112 | 3509 | return 0; |
2092492c CB |
3510 | } |
3511 | ||
3512 | static int __cgroup_attach_unified(const struct lxc_conf *conf, const char *name, | |
3513 | const char *lxcpath, pid_t pid) | |
3514 | { | |
3515 | __do_close int dfd_unified = -EBADF; | |
3516 | ||
3517 | if (!conf || is_empty_string(name) || is_empty_string(lxcpath) || pid <= 0) | |
3518 | return ret_errno(EINVAL); | |
3519 | ||
3520 | dfd_unified = lxc_cmd_get_cgroup2_fd(name, lxcpath); | |
3521 | if (dfd_unified < 0) | |
f740bc63 | 3522 | return ret_errno(ENOSYS); |
2092492c CB |
3523 | |
3524 | return __unified_attach_fd(conf, dfd_unified, pid); | |
3525 | } | |
3526 | ||
/*
 * Attach @pid to the cgroups of the container @name in @lxcpath.
 *
 * __cgroup_attach_many() is tried first. Only if it fails with a
 * "not supported" error is __cgroup_attach_unified() used as a fallback; a
 * "not supported" failure of the fallback is reported as ENOSYS.
 *
 * Returns 0 on success and a negative value on failure.
 */
int cgroup_attach(const struct lxc_conf *conf, const char *name,
		  const char *lxcpath, pid_t pid)
{
	int ret;

	ret = __cgroup_attach_many(conf, name, lxcpath, pid);
	if (ret >= 0)
		return ret;

	/* Genuine failures are passed on unchanged. */
	if (!ERRNO_IS_NOT_SUPPORTED(ret))
		return ret;

	ret = __cgroup_attach_unified(conf, name, lxcpath, pid);
	if (ret < 0 && ERRNO_IS_NOT_SUPPORTED(ret))
		return ret_errno(ENOSYS);

	return ret;
}
3544 | ||
751a624f | 3545 | /* Connects to command socket therefore isn't callable from command handler. */ |
abb6f657 | 3546 | int cgroup_get(const char *name, const char *lxcpath, const char *key, char *buf, size_t len) |
be835470 | 3547 | { |
abb6f657 CB |
3548 | __do_close int dfd = -EBADF; |
3549 | struct cgroup_fd fd = { | |
3550 | .fd = -EBADF, | |
3551 | }; | |
3552 | size_t len_controller; | |
3553 | int ret; | |
be835470 | 3554 | |
abb6f657 CB |
3555 | if (is_empty_string(name) || is_empty_string(lxcpath) || |
3556 | is_empty_string(key)) | |
be835470 CB |
3557 | return ret_errno(EINVAL); |
3558 | ||
3559 | if ((buf && !len) || (len && !buf)) | |
3560 | return ret_errno(EINVAL); | |
3561 | ||
abb6f657 CB |
3562 | len_controller = strcspn(key, "."); |
3563 | len_controller++; /* Don't forget the \0 byte. */ | |
3564 | if (len_controller >= MAX_CGROUP_ROOT_NAMELEN) | |
3565 | return ret_errno(EINVAL); | |
3566 | (void)strlcpy(fd.controller, key, len_controller); | |
be835470 | 3567 | |
abb6f657 CB |
3568 | ret = lxc_cmd_get_limit_cgroup_fd(name, lxcpath, sizeof(struct cgroup_fd), &fd); |
3569 | if (ret < 0) { | |
3570 | if (!ERRNO_IS_NOT_SUPPORTED(ret)) | |
3571 | return ret; | |
3572 | ||
3573 | dfd = lxc_cmd_get_limit_cgroup2_fd(name, lxcpath); | |
3574 | if (dfd < 0) { | |
3575 | if (!ERRNO_IS_NOT_SUPPORTED(ret)) | |
3576 | return ret; | |
3577 | ||
3578 | return ret_errno(ENOSYS); | |
3579 | } | |
3580 | fd.type = UNIFIED_HIERARCHY; | |
3581 | fd.fd = move_fd(dfd); | |
3582 | } | |
3583 | dfd = move_fd(fd.fd); | |
3584 | ||
3585 | TRACE("Reading %s from %s cgroup hierarchy", key, cgroup_hierarchy_name(fd.type)); | |
3586 | ||
3587 | if (fd.type == UNIFIED_HIERARCHY && strequal(fd.controller, "devices")) | |
3588 | return ret_errno(EOPNOTSUPP); | |
3589 | else | |
3590 | ret = lxc_read_try_buf_at(dfd, key, buf, len); | |
be835470 CB |
3591 | |
3592 | return ret; | |
3593 | } | |
3594 | ||
751a624f | 3595 | /* Connects to command socket therefore isn't callable from command handler. */ |
abb6f657 | 3596 | int cgroup_set(const char *name, const char *lxcpath, const char *key, const char *value) |
be835470 | 3597 | { |
abb6f657 CB |
3598 | __do_close int dfd = -EBADF; |
3599 | struct cgroup_fd fd = { | |
3600 | .fd = -EBADF, | |
3601 | }; | |
3602 | size_t len_controller; | |
3603 | int ret; | |
be835470 | 3604 | |
abb6f657 CB |
3605 | if (is_empty_string(name) || is_empty_string(lxcpath) || |
3606 | is_empty_string(key) || is_empty_string(value)) | |
be835470 CB |
3607 | return ret_errno(EINVAL); |
3608 | ||
abb6f657 CB |
3609 | len_controller = strcspn(key, "."); |
3610 | len_controller++; /* Don't forget the \0 byte. */ | |
3611 | if (len_controller >= MAX_CGROUP_ROOT_NAMELEN) | |
3612 | return ret_errno(EINVAL); | |
3613 | (void)strlcpy(fd.controller, key, len_controller); | |
3614 | ||
3615 | ret = lxc_cmd_get_limit_cgroup_fd(name, lxcpath, sizeof(struct cgroup_fd), &fd); | |
3616 | if (ret < 0) { | |
3617 | if (!ERRNO_IS_NOT_SUPPORTED(ret)) | |
3618 | return ret; | |
3619 | ||
3620 | dfd = lxc_cmd_get_limit_cgroup2_fd(name, lxcpath); | |
3621 | if (dfd < 0) { | |
3622 | if (!ERRNO_IS_NOT_SUPPORTED(ret)) | |
3623 | return ret; | |
be835470 | 3624 | |
abb6f657 CB |
3625 | return ret_errno(ENOSYS); |
3626 | } | |
3627 | fd.type = UNIFIED_HIERARCHY; | |
3628 | fd.fd = move_fd(dfd); | |
3629 | } | |
3630 | dfd = move_fd(fd.fd); | |
3631 | ||
3632 | TRACE("Setting %s to %s in %s cgroup hierarchy", key, value, cgroup_hierarchy_name(fd.type)); | |
3633 | ||
3634 | if (fd.type == UNIFIED_HIERARCHY && strequal(fd.controller, "devices")) { | |
be835470 CB |
3635 | struct device_item device = {}; |
3636 | ||
abb6f657 | 3637 | ret = device_cgroup_rule_parse(&device, key, value); |
be835470 | 3638 | if (ret < 0) |
abb6f657 CB |
3639 | return log_error_errno(-1, EINVAL, "Failed to parse device string %s=%s", |
3640 | key, value); | |
be835470 CB |
3641 | |
3642 | ret = lxc_cmd_add_bpf_device_cgroup(name, lxcpath, &device); | |
3643 | } else { | |
abb6f657 | 3644 | ret = lxc_writeat(dfd, key, value, strlen(value)); |
be835470 CB |
3645 | } |
3646 | ||
3647 | return ret; | |
3648 | } | |
c8af3332 | 3649 | |
c9c814f4 CB |
3650 | static int do_cgroup_freeze(int unified_fd, |
3651 | const char *state_string, | |
3652 | int state_num, | |
3653 | int timeout, | |
3654 | const char *epoll_error, | |
3655 | const char *wait_error) | |
c8af3332 CB |
3656 | { |
3657 | __do_close int events_fd = -EBADF; | |
3298b37d | 3658 | call_cleaner(lxc_mainloop_close) struct lxc_async_descr *descr_ptr = NULL; |
c8af3332 | 3659 | int ret; |
3298b37d | 3660 | struct lxc_async_descr descr = {}; |
c8af3332 CB |
3661 | |
3662 | if (timeout != 0) { | |
3663 | ret = lxc_mainloop_open(&descr); | |
3664 | if (ret) | |
3665 | return log_error_errno(-1, errno, "%s", epoll_error); | |
3666 | ||
3667 | /* automatically cleaned up now */ | |
3668 | descr_ptr = &descr; | |
3669 | ||
3670 | events_fd = open_at(unified_fd, "cgroup.events", PROTECT_OPEN, PROTECT_LOOKUP_BENEATH, 0); | |
3671 | if (events_fd < 0) | |
3672 | return log_error_errno(-errno, errno, "Failed to open cgroup.events file"); | |
3673 | ||
543d2f83 CB |
3674 | ret = lxc_mainloop_add_handler_events(&descr, events_fd, EPOLLPRI, |
3675 | freezer_cgroup_events_cb, | |
3676 | default_cleanup_handler, | |
3677 | INT_TO_PTR(state_num), | |
3678 | "freezer_cgroup_events_cb"); | |
c8af3332 CB |
3679 | if (ret < 0) |
3680 | return log_error_errno(-1, errno, "Failed to add cgroup.events fd handler to mainloop"); | |
3681 | } | |
3682 | ||
3683 | ret = lxc_writeat(unified_fd, "cgroup.freeze", state_string, 1); | |
3684 | if (ret < 0) | |
3685 | return log_error_errno(-1, errno, "Failed to open cgroup.freeze file"); | |
3686 | ||
3687 | if (timeout != 0) { | |
3688 | ret = lxc_mainloop(&descr, timeout); | |
3689 | if (ret) | |
3690 | return log_error_errno(-1, errno, "%s", wait_error); | |
3691 | } | |
3692 | ||
3693 | return log_trace(0, "Container now %s", (state_num == 1) ? "frozen" : "unfrozen"); | |
3694 | } | |
3695 | ||
/*
 * Freeze the cgroup behind @unified_fd by writing "1" to it through
 * do_cgroup_freeze(); @timeout is forwarded unchanged.
 */
static inline int __cgroup_freeze(int unified_fd, int timeout)
{
	const char *epoll_error = "Failed to create epoll instance to wait for container freeze";
	const char *wait_error = "Failed to wait for container to be frozen";

	return do_cgroup_freeze(unified_fd, "1", 1, timeout, epoll_error, wait_error);
}
3702 | ||
5ef7547f | 3703 | int cgroup_freeze(const char *name, const char *lxcpath, int timeout) |
c8af3332 CB |
3704 | { |
3705 | __do_close int unified_fd = -EBADF; | |
3706 | int ret; | |
3707 | ||
b57f9b13 CB |
3708 | if (is_empty_string(name) || is_empty_string(lxcpath)) |
3709 | return ret_errno(EINVAL); | |
3710 | ||
a9b642ee | 3711 | unified_fd = lxc_cmd_get_limit_cgroup2_fd(name, lxcpath); |
c8af3332 CB |
3712 | if (unified_fd < 0) |
3713 | return ret_errno(ENOCGROUP2); | |
3714 | ||
3715 | lxc_cmd_notify_state_listeners(name, lxcpath, FREEZING); | |
c9c814f4 | 3716 | ret = __cgroup_freeze(unified_fd, timeout); |
c8af3332 | 3717 | lxc_cmd_notify_state_listeners(name, lxcpath, !ret ? FROZEN : RUNNING); |
5ef7547f | 3718 | return ret; |
c8af3332 CB |
3719 | } |
3720 | ||
/*
 * Unfreeze the cgroup behind @unified_fd by writing "0" to it through
 * do_cgroup_freeze(); @timeout is forwarded unchanged.
 *
 * The error messages name the unfreeze operation so that log output can be
 * told apart from failures of __cgroup_freeze().
 */
int __cgroup_unfreeze(int unified_fd, int timeout)
{
	return do_cgroup_freeze(unified_fd, "0", 0, timeout,
				"Failed to create epoll instance to wait for container unfreeze",
				"Failed to wait for container to be unfrozen");
}
3727 | ||
5ef7547f | 3728 | int cgroup_unfreeze(const char *name, const char *lxcpath, int timeout) |
c8af3332 CB |
3729 | { |
3730 | __do_close int unified_fd = -EBADF; | |
3731 | int ret; | |
3732 | ||
b57f9b13 CB |
3733 | if (is_empty_string(name) || is_empty_string(lxcpath)) |
3734 | return ret_errno(EINVAL); | |
3735 | ||
a9b642ee | 3736 | unified_fd = lxc_cmd_get_limit_cgroup2_fd(name, lxcpath); |
c8af3332 CB |
3737 | if (unified_fd < 0) |
3738 | return ret_errno(ENOCGROUP2); | |
3739 | ||
3740 | lxc_cmd_notify_state_listeners(name, lxcpath, THAWED); | |
c9c814f4 | 3741 | ret = __cgroup_unfreeze(unified_fd, timeout); |
c8af3332 | 3742 | lxc_cmd_notify_state_listeners(name, lxcpath, !ret ? RUNNING : FROZEN); |
5ef7547f | 3743 | return ret; |
c8af3332 | 3744 | } |