]>
Commit | Line | Data |
---|---|---|
cc73685d | 1 | /* SPDX-License-Identifier: LGPL-2.1+ */ |
ccb4cabe SH |
2 | |
3 | /* | |
4 | * cgfs-ng.c: this is a new, simplified implementation of a filesystem | |
5 | * cgroup backend. The original cgfs.c was designed to be as flexible | |
6 | * as possible. It would try to find cgroup filesystems no matter where | |
7 | * or how you had them mounted, and deduce the most usable mount for | |
0e7ff52c | 8 | * each controller. |
ccb4cabe SH |
9 | * |
10 | * This new implementation assumes that cgroup filesystems are mounted | |
11 | * under /sys/fs/cgroup/clist where clist is either the controller, or | |
18406e5a | 12 | * a comma-separated list of controllers. |
ccb4cabe | 13 | */ |
a54694f8 | 14 | |
d38dd64a CB |
15 | #ifndef _GNU_SOURCE |
16 | #define _GNU_SOURCE 1 | |
17 | #endif | |
a54694f8 CB |
18 | #include <ctype.h> |
19 | #include <dirent.h> | |
20 | #include <errno.h> | |
21 | #include <grp.h> | |
d38dd64a CB |
22 | #include <linux/kdev_t.h> |
23 | #include <linux/types.h> | |
942e193e CB |
24 | #include <poll.h> |
25 | #include <signal.h> | |
a54694f8 | 26 | #include <stdint.h> |
ccb4cabe SH |
27 | #include <stdio.h> |
28 | #include <stdlib.h> | |
a54694f8 | 29 | #include <string.h> |
385e58e8 | 30 | #include <sys/epoll.h> |
438c4581 | 31 | #include <sys/types.h> |
d38dd64a | 32 | #include <unistd.h> |
c8bf519d | 33 | |
d1783ef4 | 34 | #include "af_unix.h" |
b635e92d | 35 | #include "caps.h" |
ccb4cabe | 36 | #include "cgroup.h" |
bf651989 | 37 | #include "cgroup2_devices.h" |
6328fd9c | 38 | #include "cgroup_utils.h" |
ccb4cabe | 39 | #include "commands.h" |
c8af3332 | 40 | #include "commands_utils.h" |
43654d34 | 41 | #include "conf.h" |
d38dd64a | 42 | #include "config.h" |
a54694f8 | 43 | #include "log.h" |
c19ad94b | 44 | #include "macro.h" |
018051e3 | 45 | #include "mainloop.h" |
861cb8c2 | 46 | #include "memory_utils.h" |
74ed30d7 | 47 | #include "mount_utils.h" |
43654d34 | 48 | #include "storage/storage.h" |
600a0163 | 49 | #include "string_utils.h" |
315f8a4e | 50 | #include "syscall_wrappers.h" |
a54694f8 | 51 | #include "utils.h" |
ccb4cabe | 52 | |
64e82f8b DJ |
53 | #ifndef HAVE_STRLCPY |
54 | #include "include/strlcpy.h" | |
55 | #endif | |
56 | ||
3ebe2fbd DJ |
57 | #ifndef HAVE_STRLCAT |
58 | #include "include/strlcat.h" | |
59 | #endif | |
60 | ||
ac2cecc4 | 61 | lxc_log_define(cgfsng, cgroup); |
ccb4cabe | 62 | |
/*
 * Grow a NULL-terminated pointer array by one slot.
 *
 * @list points at the array pointer, which may itself be NULL (empty list).
 * On success the array is reallocated, the new last element is set to NULL
 * and the index of the slot just before it - the one the caller may now
 * fill - is returned. The returned slot itself is not initialized here.
 *
 * If the allocation fails, *list is left untouched and the result of
 * ret_errno(ENOMEM) is returned; callers in this file treat a value < 0 as
 * failure.
 */
static int list_add(void ***list)
{
	void **grown;
	int used = 0;

	/* Count the entries that are currently in use. */
	while (*list && (*list)[used])
		used++;

	/* One slot for the new entry plus one for the terminator. */
	grown = realloc(*list, (used + 2) * sizeof(void **));
	if (!grown)
		return ret_errno(ENOMEM);

	grown[used + 1] = NULL;
	*list = grown;

	return used;
}
87 | ||
/*
 * Report whether @entry occurs in the NULL-terminated string array @list.
 * A NULL @list is treated as an empty list.
 */
static bool string_in_list(char **list, const char *entry)
{
	char **cur;

	for (cur = list; cur && *cur; cur++) {
		if (strequal(*cur, entry))
			return true;
	}

	return false;
}
102 | ||
5ae0207c CB |
103 | /* Given a handler's cgroup data, return the struct hierarchy for the controller |
104 | * @c, or NULL if there is none. | |
ccb4cabe | 105 | */ |
59eac805 | 106 | static struct hierarchy *get_hierarchy(struct cgroup_ops *ops, const char *controller) |
ccb4cabe | 107 | { |
77c3e9a2 CB |
108 | if (!ops->hierarchies) |
109 | return log_trace_errno(NULL, errno, "There are no useable cgroup controllers"); | |
d6337a5f | 110 | |
77c3e9a2 | 111 | for (int i = 0; ops->hierarchies[i]; i++) { |
27a5132c | 112 | if (!controller) { |
d6337a5f | 113 | /* This is the empty unified hierarchy. */ |
09ed8992 | 114 | if (ops->hierarchies[i]->controllers && !ops->hierarchies[i]->controllers[0]) |
2202afc9 | 115 | return ops->hierarchies[i]; |
09ed8992 | 116 | |
106f1f38 | 117 | continue; |
6dcd6f02 | 118 | } |
09ed8992 | 119 | |
6dcd6f02 CB |
120 | /* |
121 | * Handle controllers with significant implementation changes | |
122 | * from cgroup to cgroup2. | |
123 | */ | |
124 | if (pure_unified_layout(ops)) { | |
8b99a20a | 125 | if (strequal(controller, "devices")) { |
ca72ccb5 | 126 | if (device_utility_controller(ops->unified)) |
6dcd6f02 CB |
127 | return ops->unified; |
128 | ||
129 | break; | |
8b99a20a | 130 | } else if (strequal(controller, "freezer")) { |
ca72ccb5 | 131 | if (freezer_utility_controller(ops->unified)) |
6dcd6f02 CB |
132 | return ops->unified; |
133 | ||
134 | break; | |
135 | } | |
d6337a5f CB |
136 | } |
137 | ||
27a5132c | 138 | if (string_in_list(ops->hierarchies[i]->controllers, controller)) |
2202afc9 | 139 | return ops->hierarchies[i]; |
ccb4cabe | 140 | } |
d6337a5f | 141 | |
27a5132c CB |
142 | if (controller) |
143 | WARN("There is no useable %s controller", controller); | |
144 | else | |
145 | WARN("There is no empty unified cgroup hierarchy"); | |
146 | ||
77c3e9a2 | 147 | return ret_set_errno(NULL, ENOENT); |
ccb4cabe SH |
148 | } |
149 | ||
/* Taken over modified from the kernel sources. */
#define NBITS 32 /* bits in uint32_t */
#define DIV_ROUND_UP(n, d) (((n) + (d)-1) / (d))
#define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, NBITS)

/*
 * The helpers below address bit @bit inside an array of 32-bit words. The
 * mask is built from an unsigned 32-bit one: shifting the signed constant 1
 * by 31 would move a bit into the sign position, which is undefined behaviour.
 * The caller must make sure @bitarr has at least BITS_TO_LONGS(bit + 1) words.
 */

/* Set bit number @bit in @bitarr. */
static void set_bit(unsigned bit, uint32_t *bitarr)
{
	bitarr[bit / NBITS] |= (UINT32_C(1) << (bit % NBITS));
}

/* Clear bit number @bit in @bitarr. */
static void clear_bit(unsigned bit, uint32_t *bitarr)
{
	bitarr[bit / NBITS] &= ~(UINT32_C(1) << (bit % NBITS));
}

/* Return true if bit number @bit is set in @bitarr. */
static bool is_set(unsigned bit, uint32_t *bitarr)
{
	return (bitarr[bit / NBITS] & (UINT32_C(1) << (bit % NBITS))) != 0;
}
169 | ||
170 | /* Create cpumask from cpulist aka turn: | |
171 | * | |
172 | * 0,2-3 | |
173 | * | |
d5d468f6 | 174 | * into bit array |
a54694f8 CB |
175 | * |
176 | * 1 0 1 1 | |
177 | */ | |
178 | static uint32_t *lxc_cpumask(char *buf, size_t nbits) | |
179 | { | |
77c3e9a2 | 180 | __do_free uint32_t *bitarr = NULL; |
a54694f8 | 181 | char *token; |
d5d468f6 | 182 | size_t arrlen; |
d5d468f6 CB |
183 | |
184 | arrlen = BITS_TO_LONGS(nbits); | |
185 | bitarr = calloc(arrlen, sizeof(uint32_t)); | |
a54694f8 | 186 | if (!bitarr) |
c5b8049e | 187 | return ret_set_errno(NULL, ENOMEM); |
a54694f8 | 188 | |
0be0d78f | 189 | lxc_iterate_parts(token, buf, ",") { |
a54694f8 | 190 | errno = 0; |
d5d468f6 CB |
191 | unsigned end, start; |
192 | char *range; | |
a54694f8 | 193 | |
d5d468f6 CB |
194 | start = strtoul(token, NULL, 0); |
195 | end = start; | |
196 | range = strchr(token, '-'); | |
a54694f8 CB |
197 | if (range) |
198 | end = strtoul(range + 1, NULL, 0); | |
d5d468f6 | 199 | |
c5b8049e CB |
200 | if (!(start <= end)) |
201 | return ret_set_errno(NULL, EINVAL); | |
a54694f8 | 202 | |
c5b8049e CB |
203 | if (end >= nbits) |
204 | return ret_set_errno(NULL, EINVAL); | |
a54694f8 CB |
205 | |
206 | while (start <= end) | |
207 | set_bit(start++, bitarr); | |
208 | } | |
209 | ||
c5b8049e | 210 | return move_ptr(bitarr); |
a54694f8 CB |
211 | } |
212 | ||
a54694f8 CB |
213 | /* Turn cpumask into simple, comma-separated cpulist. */ |
214 | static char *lxc_cpumask_to_cpulist(uint32_t *bitarr, size_t nbits) | |
215 | { | |
f761d24d | 216 | __do_free_string_list char **cpulist = NULL; |
c19ad94b | 217 | char numstr[INTTYPE_TO_STRLEN(size_t)] = {0}; |
77c3e9a2 | 218 | int ret; |
a54694f8 | 219 | |
77c3e9a2 | 220 | for (size_t i = 0; i <= nbits; i++) { |
414c6719 CB |
221 | if (!is_set(i, bitarr)) |
222 | continue; | |
223 | ||
0bba27c1 CB |
224 | ret = strnprintf(numstr, sizeof(numstr), "%zu", i); |
225 | if (ret < 0) | |
414c6719 | 226 | return NULL; |
414c6719 CB |
227 | |
228 | ret = lxc_append_string(&cpulist, numstr); | |
f761d24d | 229 | if (ret < 0) |
c5b8049e | 230 | return ret_set_errno(NULL, ENOMEM); |
a54694f8 | 231 | } |
414c6719 CB |
232 | |
233 | if (!cpulist) | |
c5b8049e | 234 | return ret_set_errno(NULL, ENOMEM); |
414c6719 | 235 | |
f761d24d | 236 | return lxc_string_join(",", (const char **)cpulist, false); |
a54694f8 CB |
237 | } |
238 | ||
/*
 * Return the last cpu number that appears in the cpulist string @cpulist
 * (e.g. 3 for "0,2-3"). This is the number that follows the final ',' or '-'
 * separator, or the whole string if there is no separator.
 *
 * The string is not modified. Returns -1 if strtoul() reports an error
 * through errno.
 */
static ssize_t get_max_cpus(char *cpulist)
{
	char *last_comma, *last_dash;
	char *start = cpulist;
	size_t cpus = 0;

	last_comma = strrchr(cpulist, ',');
	last_dash = strrchr(cpulist, '-');

	/*
	 * Pick whichever separator comes last. The pointers are only compared
	 * with each other once both are known to be non-NULL, since ordering
	 * a NULL pointer against a valid one is undefined.
	 */
	if (last_comma && last_dash)
		start = (last_comma > last_dash ? last_comma : last_dash) + 1;
	else if (last_comma)
		start = last_comma + 1;
	else if (last_dash)
		start = last_dash + 1;

	errno = 0;
	cpus = strtoul(start, NULL, 0);
	if (errno != 0)
		return -1;

	return cpus;
}
269 | ||
77c3e9a2 | 270 | static inline bool is_unified_hierarchy(const struct hierarchy *h) |
c04a6d4e | 271 | { |
b8572e8c | 272 | return h->fs_type == UNIFIED_HIERARCHY; |
c04a6d4e CB |
273 | } |
274 | ||
f57ac67f CB |
275 | /* Return true if the controller @entry is found in the null-terminated list of |
276 | * hierarchies @hlist. | |
ccb4cabe | 277 | */ |
c7a1f72a | 278 | static bool controller_available(struct hierarchy **hlist, char *entry) |
ccb4cabe | 279 | { |
ccb4cabe SH |
280 | if (!hlist) |
281 | return false; | |
282 | ||
77c3e9a2 | 283 | for (int i = 0; hlist[i]; i++) |
ccb4cabe SH |
284 | if (string_in_list(hlist[i]->controllers, entry)) |
285 | return true; | |
d6337a5f | 286 | |
ccb4cabe SH |
287 | return false; |
288 | } | |
289 | ||
c7a1f72a | 290 | static bool controllers_available(struct cgroup_ops *ops) |
ccb4cabe | 291 | { |
77c3e9a2 | 292 | struct hierarchy **hlist; |
ccb4cabe | 293 | |
2202afc9 | 294 | if (!ops->cgroup_use) |
ccb4cabe | 295 | return true; |
c2712f64 | 296 | |
77c3e9a2 CB |
297 | hlist = ops->hierarchies; |
298 | for (char **cur = ops->cgroup_use; cur && *cur; cur++) | |
c7a1f72a CB |
299 | if (!controller_available(hlist, *cur)) |
300 | return log_error(false, "The %s controller found", *cur); | |
c2712f64 | 301 | |
ccb4cabe SH |
302 | return true; |
303 | } | |
304 | ||
63ba9eaf | 305 | static char **list_new(void) |
ccb4cabe | 306 | { |
63ba9eaf CB |
307 | __do_free_string_list char **list = NULL; |
308 | int idx; | |
309 | ||
310 | idx = list_add((void ***)&list); | |
311 | if (idx < 0) | |
312 | return NULL; | |
a55f31bd | 313 | |
63ba9eaf CB |
314 | list[idx] = NULL; |
315 | return move_ptr(list); | |
35ec1a38 | 316 | } |
d6337a5f | 317 | |
63ba9eaf | 318 | static int list_add_string(char ***list, char *entry) |
35ec1a38 | 319 | { |
63ba9eaf CB |
320 | __do_free char *dup = NULL; |
321 | int idx; | |
322 | ||
323 | dup = strdup(entry); | |
324 | if (!dup) | |
325 | return ret_errno(ENOMEM); | |
326 | ||
327 | idx = list_add((void ***)list); | |
328 | if (idx < 0) | |
329 | return idx; | |
330 | ||
331 | (*list)[idx] = move_ptr(dup); | |
332 | return 0; | |
333 | } | |
334 | ||
335 | static char **list_add_controllers(char *controllers) | |
336 | { | |
337 | __do_free_string_list char **list = NULL; | |
35ec1a38 | 338 | char *it; |
6328fd9c | 339 | |
35ec1a38 | 340 | lxc_iterate_parts(it, controllers, " \t\n") { |
63ba9eaf | 341 | int ret; |
d97919ab | 342 | |
63ba9eaf CB |
343 | ret = list_add_string(&list, it); |
344 | if (ret < 0) | |
d6337a5f | 345 | return NULL; |
411ac6d8 | 346 | } |
f205f10c | 347 | |
63ba9eaf | 348 | return move_ptr(list); |
d6337a5f CB |
349 | } |
350 | ||
35ec1a38 | 351 | static char **unified_controllers(int dfd, const char *file) |
d6337a5f | 352 | { |
d97919ab | 353 | __do_free char *buf = NULL; |
d6337a5f | 354 | |
46bf13b7 | 355 | buf = read_file_at(dfd, file, PROTECT_OPEN, 0); |
d6337a5f | 356 | if (!buf) |
411ac6d8 | 357 | return NULL; |
6328fd9c | 358 | |
63ba9eaf | 359 | return list_add_controllers(buf); |
ccb4cabe SH |
360 | } |
361 | ||
35ec1a38 | 362 | static bool skip_hierarchy(const struct cgroup_ops *ops, char **controllers) |
060e54d6 CB |
363 | { |
364 | if (!ops->cgroup_use) | |
35ec1a38 | 365 | return false; |
060e54d6 CB |
366 | |
367 | for (char **cur_ctrl = controllers; cur_ctrl && *cur_ctrl; cur_ctrl++) { | |
368 | bool found = false; | |
369 | ||
370 | for (char **cur_use = ops->cgroup_use; cur_use && *cur_use; cur_use++) { | |
371 | if (!strequal(*cur_use, *cur_ctrl)) | |
372 | continue; | |
373 | ||
374 | found = true; | |
375 | break; | |
376 | } | |
377 | ||
378 | if (found) | |
379 | continue; | |
380 | ||
35ec1a38 | 381 | return true; |
060e54d6 CB |
382 | } |
383 | ||
35ec1a38 | 384 | return false; |
060e54d6 CB |
385 | } |
386 | ||
179754a2 CB |
387 | static int cgroup_hierarchy_add(struct cgroup_ops *ops, int dfd_mnt, char *mnt, |
388 | int dfd_base, char *base_cgroup, | |
b8572e8c | 389 | char **controllers, cgroupfs_type_magic_t fs_type) |
ccb4cabe | 390 | { |
600a0163 | 391 | __do_free struct hierarchy *new = NULL; |
701be30e | 392 | int idx; |
ccb4cabe | 393 | |
35ec1a38 | 394 | if (abspath(base_cgroup)) |
fc4612cb | 395 | return syserrno_set(-EINVAL, "Container base path must be relative to controller mount"); |
060e54d6 | 396 | |
1973b62a | 397 | new = zalloc(sizeof(*new)); |
6e214b74 | 398 | if (!new) |
060e54d6 | 399 | return ret_errno(ENOMEM); |
c72e7cb5 | 400 | |
e33870e5 | 401 | new->dfd_con = -EBADF; |
c0af7b1c | 402 | new->dfd_lim = -EBADF; |
6a32c817 | 403 | new->dfd_mon = -EBADF; |
600a0163 | 404 | |
44585f1a CB |
405 | new->fs_type = fs_type; |
406 | new->controllers = controllers; | |
a58be2ad | 407 | new->at_mnt = mnt; |
44585f1a | 408 | new->at_base = base_cgroup; |
35ec1a38 | 409 | |
44585f1a CB |
410 | new->dfd_mnt = dfd_mnt; |
411 | new->dfd_base = dfd_base; | |
35ec1a38 CB |
412 | |
413 | TRACE("Adding cgroup hierarchy mounted at %s and base cgroup %s", | |
414 | mnt, maybe_empty(base_cgroup)); | |
060e54d6 | 415 | for (char *const *it = new->controllers; it && *it; it++) |
35ec1a38 | 416 | TRACE("The hierarchy contains the %s controller", *it); |
6328fd9c | 417 | |
35ec1a38 | 418 | idx = list_add((void ***)&ops->hierarchies); |
63ba9eaf CB |
419 | if (idx < 0) |
420 | return ret_errno(idx); | |
421 | ||
b8572e8c | 422 | if (fs_type == UNIFIED_HIERARCHY) |
060e54d6 | 423 | ops->unified = new; |
701be30e | 424 | (ops->hierarchies)[idx] = move_ptr(new); |
ccb4cabe | 425 | |
63ba9eaf | 426 | return 0; |
ccb4cabe SH |
427 | } |
428 | ||
c55fe36d | 429 | static int cgroup_tree_remove(struct hierarchy **hierarchies, const char *path_prune) |
c71d83e1 | 430 | { |
c55fe36d | 431 | if (!path_prune || !hierarchies) |
2202afc9 | 432 | return 0; |
d6337a5f | 433 | |
8e64b673 | 434 | for (int i = 0; hierarchies[i]; i++) { |
2202afc9 | 435 | struct hierarchy *h = hierarchies[i]; |
77c3e9a2 | 436 | int ret; |
d6337a5f | 437 | |
c55fe36d | 438 | ret = cgroup_tree_prune(h->dfd_base, path_prune); |
2202afc9 | 439 | if (ret < 0) |
c55fe36d CB |
440 | SYSWARN("Failed to destroy %d(%s)", h->dfd_base, path_prune); |
441 | else | |
442 | TRACE("Removed cgroup tree %d(%s)", h->dfd_base, path_prune); | |
2202afc9 | 443 | |
b1b1a60f | 444 | free_equal(h->path_lim, h->path_con); |
2202afc9 | 445 | } |
d6337a5f | 446 | |
c71d83e1 | 447 | return 0; |
d6337a5f CB |
448 | } |
449 | ||
/*
 * Argument bundle handed as the opaque data pointer to callbacks that are run
 * through userns_exec_1(), e.g. cgroup_tree_remove_wrapper().
 */
struct generic_userns_exec_data {
	/* NULL-terminated list of hierarchies to operate on. */
	struct hierarchy **hierarchies;
	/* Cgroup path that cgroup_tree_remove() prunes in each hierarchy. */
	const char *path_prune;
	/* Container configuration; supplies the id mapping information. */
	struct lxc_conf *conf;
	uid_t origuid; /* target uid in parent namespace */
	/* Not used by the code visible in this part of the file. */
	char *path;
};
d6337a5f | 457 | |
de6fe132 | 458 | static int cgroup_tree_remove_wrapper(void *data) |
2202afc9 | 459 | { |
2202afc9 CB |
460 | struct generic_userns_exec_data *arg = data; |
461 | uid_t nsuid = (arg->conf->root_nsuid_map != NULL) ? 0 : arg->conf->init_uid; | |
462 | gid_t nsgid = (arg->conf->root_nsgid_map != NULL) ? 0 : arg->conf->init_gid; | |
8e64b673 | 463 | int ret; |
d6337a5f | 464 | |
8917c382 | 465 | if (!lxc_drop_groups() && errno != EPERM) |
b58214ac CB |
466 | return log_error_errno(-1, errno, "Failed to setgroups(0, NULL)"); |
467 | ||
2202afc9 | 468 | ret = setresgid(nsgid, nsgid, nsgid); |
8e64b673 | 469 | if (ret < 0) |
77c3e9a2 | 470 | return log_error_errno(-1, errno, "Failed to setresgid(%d, %d, %d)", |
8e64b673 | 471 | (int)nsgid, (int)nsgid, (int)nsgid); |
d6337a5f | 472 | |
2202afc9 | 473 | ret = setresuid(nsuid, nsuid, nsuid); |
8e64b673 | 474 | if (ret < 0) |
77c3e9a2 | 475 | return log_error_errno(-1, errno, "Failed to setresuid(%d, %d, %d)", |
8e64b673 | 476 | (int)nsuid, (int)nsuid, (int)nsuid); |
d6337a5f | 477 | |
c55fe36d | 478 | return cgroup_tree_remove(arg->hierarchies, arg->path_prune); |
d6337a5f CB |
479 | } |
480 | ||
434c8e15 CB |
481 | __cgfsng_ops static void cgfsng_payload_destroy(struct cgroup_ops *ops, |
482 | struct lxc_handler *handler) | |
d6337a5f CB |
483 | { |
484 | int ret; | |
bd8ef4e4 | 485 | |
fc3b9533 CB |
486 | if (!ops) { |
487 | ERROR("Called with uninitialized cgroup operations"); | |
488 | return; | |
489 | } | |
fc1c3af9 | 490 | |
69b4a4bb CB |
491 | if (!ops->hierarchies) |
492 | return; | |
493 | ||
fc3b9533 CB |
494 | if (!handler) { |
495 | ERROR("Called with uninitialized handler"); | |
496 | return; | |
497 | } | |
fc1c3af9 | 498 | |
fc3b9533 CB |
499 | if (!handler->conf) { |
500 | ERROR("Called with uninitialized conf"); | |
501 | return; | |
502 | } | |
fc1c3af9 | 503 | |
a6aeb9f1 CB |
504 | if (!ops->container_limit_cgroup) { |
505 | WARN("Uninitialized limit cgroup"); | |
506 | return; | |
507 | } | |
508 | ||
31b84c7a | 509 | ret = bpf_program_cgroup_detach(handler->cgroup_ops->cgroup2_devices); |
bf651989 CB |
510 | if (ret < 0) |
511 | WARN("Failed to detach bpf program from cgroup"); | |
bf651989 | 512 | |
bb6dbaf0 | 513 | if (!lxc_list_empty(&handler->conf->id_map)) { |
8e64b673 | 514 | struct generic_userns_exec_data wrap = { |
77c3e9a2 | 515 | .conf = handler->conf, |
c55fe36d | 516 | .path_prune = ops->container_limit_cgroup, |
77c3e9a2 CB |
517 | .hierarchies = ops->hierarchies, |
518 | .origuid = 0, | |
8e64b673 | 519 | }; |
de6fe132 CB |
520 | ret = userns_exec_1(handler->conf, cgroup_tree_remove_wrapper, |
521 | &wrap, "cgroup_tree_remove_wrapper"); | |
8e64b673 | 522 | } else { |
c55fe36d | 523 | ret = cgroup_tree_remove(ops->hierarchies, ops->container_limit_cgroup); |
ccb4cabe | 524 | } |
8e64b673 | 525 | if (ret < 0) |
fc3b9533 | 526 | SYSWARN("Failed to destroy cgroups"); |
ccb4cabe SH |
527 | } |
528 | ||
033267c9 CB |
529 | #define __ISOL_CPUS "/sys/devices/system/cpu/isolated" |
530 | #define __OFFLINE_CPUS "/sys/devices/system/cpu/offline" | |
531 | static bool cpuset1_cpus_initialize(int dfd_parent, int dfd_child, | |
532 | bool am_initialized) | |
434c8e15 | 533 | { |
033267c9 CB |
534 | __do_free char *cpulist = NULL, *fpath = NULL, *isolcpus = NULL, |
535 | *offlinecpus = NULL, *posscpus = NULL; | |
536 | __do_free uint32_t *isolmask = NULL, *offlinemask = NULL, | |
537 | *possmask = NULL; | |
538 | int ret; | |
539 | ssize_t i; | |
540 | ssize_t maxisol = 0, maxoffline = 0, maxposs = 0; | |
541 | bool flipped_bit = false; | |
b376d3d0 | 542 | |
033267c9 CB |
543 | posscpus = read_file_at(dfd_parent, "cpuset.cpus", PROTECT_OPEN, 0); |
544 | if (!posscpus) | |
545 | return log_error_errno(false, errno, "Failed to read file \"%s\"", fpath); | |
546 | ||
547 | /* Get maximum number of cpus found in possible cpuset. */ | |
548 | maxposs = get_max_cpus(posscpus); | |
549 | if (maxposs < 0 || maxposs >= INT_MAX - 1) | |
550 | return false; | |
551 | ||
552 | if (file_exists(__ISOL_CPUS)) { | |
553 | isolcpus = read_file_at(-EBADF, __ISOL_CPUS, PROTECT_OPEN, 0); | |
554 | if (!isolcpus) | |
555 | return log_error_errno(false, errno, "Failed to read file \"%s\"", __ISOL_CPUS); | |
556 | ||
557 | if (isdigit(isolcpus[0])) { | |
558 | /* Get maximum number of cpus found in isolated cpuset. */ | |
559 | maxisol = get_max_cpus(isolcpus); | |
560 | if (maxisol < 0 || maxisol >= INT_MAX - 1) | |
561 | return false; | |
562 | } | |
563 | ||
564 | if (maxposs < maxisol) | |
565 | maxposs = maxisol; | |
566 | maxposs++; | |
567 | } else { | |
568 | TRACE("The path \""__ISOL_CPUS"\" to read isolated cpus from does not exist"); | |
fc3b9533 | 569 | } |
434c8e15 | 570 | |
033267c9 CB |
571 | if (file_exists(__OFFLINE_CPUS)) { |
572 | offlinecpus = read_file_at(-EBADF, __OFFLINE_CPUS, PROTECT_OPEN, 0); | |
573 | if (!offlinecpus) | |
574 | return log_error_errno(false, errno, "Failed to read file \"%s\"", __OFFLINE_CPUS); | |
434c8e15 | 575 | |
033267c9 CB |
576 | if (isdigit(offlinecpus[0])) { |
577 | /* Get maximum number of cpus found in offline cpuset. */ | |
578 | maxoffline = get_max_cpus(offlinecpus); | |
579 | if (maxoffline < 0 || maxoffline >= INT_MAX - 1) | |
580 | return false; | |
581 | } | |
582 | ||
583 | if (maxposs < maxoffline) | |
584 | maxposs = maxoffline; | |
585 | maxposs++; | |
586 | } else { | |
587 | TRACE("The path \""__OFFLINE_CPUS"\" to read offline cpus from does not exist"); | |
fc3b9533 | 588 | } |
b376d3d0 | 589 | |
033267c9 CB |
590 | if ((maxisol == 0) && (maxoffline == 0)) { |
591 | cpulist = move_ptr(posscpus); | |
592 | goto copy_parent; | |
fc3b9533 | 593 | } |
1973b62a | 594 | |
033267c9 CB |
595 | possmask = lxc_cpumask(posscpus, maxposs); |
596 | if (!possmask) | |
597 | return log_error_errno(false, errno, "Failed to create cpumask for possible cpus"); | |
434c8e15 | 598 | |
033267c9 CB |
599 | if (maxisol > 0) { |
600 | isolmask = lxc_cpumask(isolcpus, maxposs); | |
601 | if (!isolmask) | |
602 | return log_error_errno(false, errno, "Failed to create cpumask for isolated cpus"); | |
603 | } | |
434c8e15 | 604 | |
033267c9 CB |
605 | if (maxoffline > 0) { |
606 | offlinemask = lxc_cpumask(offlinecpus, maxposs); | |
607 | if (!offlinemask) | |
608 | return log_error_errno(false, errno, "Failed to create cpumask for offline cpus"); | |
609 | } | |
610 | ||
611 | for (i = 0; i <= maxposs; i++) { | |
612 | if ((isolmask && !is_set(i, isolmask)) || | |
613 | (offlinemask && !is_set(i, offlinemask)) || | |
614 | !is_set(i, possmask)) | |
434c8e15 CB |
615 | continue; |
616 | ||
033267c9 CB |
617 | flipped_bit = true; |
618 | clear_bit(i, possmask); | |
619 | } | |
c468e4d4 | 620 | |
033267c9 CB |
621 | if (!flipped_bit) { |
622 | cpulist = lxc_cpumask_to_cpulist(possmask, maxposs); | |
623 | TRACE("No isolated or offline cpus present in cpuset"); | |
624 | } else { | |
625 | cpulist = move_ptr(posscpus); | |
626 | TRACE("Removed isolated or offline cpus from cpuset"); | |
627 | } | |
628 | if (!cpulist) | |
629 | return log_error_errno(false, errno, "Failed to create cpu list"); | |
1973b62a | 630 | |
033267c9 CB |
631 | copy_parent: |
632 | if (!am_initialized) { | |
633 | ret = lxc_writeat(dfd_child, "cpuset.cpus", cpulist, strlen(cpulist)); | |
634 | if (ret < 0) | |
635 | return log_error_errno(false, errno, "Failed to write cpu list to \"%d/cpuset.cpus\"", dfd_child); | |
77ffeed2 | 636 | |
033267c9 CB |
637 | TRACE("Copied cpu settings of parent cgroup"); |
638 | } | |
77ffeed2 | 639 | |
033267c9 CB |
640 | return true; |
641 | } | |
1973b62a | 642 | |
033267c9 CB |
643 | static bool cpuset1_initialize(int dfd_base, int dfd_next) |
644 | { | |
645 | char mems[PATH_MAX]; | |
646 | ssize_t bytes; | |
647 | char v; | |
434c8e15 | 648 | |
033267c9 CB |
649 | /* |
650 | * Determine whether the base cgroup has cpuset | |
651 | * inheritance turned on. | |
652 | */ | |
653 | bytes = lxc_readat(dfd_base, "cgroup.clone_children", &v, 1); | |
654 | if (bytes < 0) | |
655 | return syserrno(false, "Failed to read file %d(cgroup.clone_children)", dfd_base); | |
656 | ||
657 | /* | |
658 | * Initialize cpuset.cpus and make remove any isolated | |
659 | * and offline cpus. | |
660 | */ | |
661 | if (!cpuset1_cpus_initialize(dfd_base, dfd_next, v == '1')) | |
662 | return syserrno(false, "Failed to initialize cpuset.cpus"); | |
663 | ||
664 | /* Read cpuset.mems from parent... */ | |
665 | bytes = lxc_readat(dfd_base, "cpuset.mems", mems, sizeof(mems)); | |
666 | if (bytes < 0) | |
667 | return syserrno(false, "Failed to read file %d(cpuset.mems)", dfd_base); | |
668 | ||
669 | /* ... and copy to first cgroup in the tree... */ | |
670 | bytes = lxc_writeat(dfd_next, "cpuset.mems", mems, bytes); | |
671 | if (bytes < 0) | |
672 | return syserrno(false, "Failed to write %d(cpuset.mems)", dfd_next); | |
673 | ||
674 | /* ... and finally turn on cpuset inheritance. */ | |
675 | bytes = lxc_writeat(dfd_next, "cgroup.clone_children", "1", 1); | |
676 | if (bytes < 0) | |
677 | return syserrno(false, "Failed to write %d(cgroup.clone_children)", dfd_next); | |
678 | ||
679 | return log_trace(true, "Initialized cpuset in the legacy hierarchy"); | |
434c8e15 CB |
680 | } |
681 | ||
033267c9 CB |
682 | static int __cgroup_tree_create(int dfd_base, const char *path, mode_t mode, |
683 | bool cpuset_v1, bool eexist_ignore) | |
6099dd5a | 684 | { |
da42ac7b CB |
685 | __do_close int dfd_final = -EBADF; |
686 | int dfd_cur = dfd_base; | |
687 | int ret = 0; | |
688 | size_t len; | |
689 | char *cur; | |
690 | char buf[PATH_MAX]; | |
6099dd5a | 691 | |
da42ac7b | 692 | if (is_empty_string(path)) |
bce04069 | 693 | return ret_errno(EINVAL); |
6099dd5a | 694 | |
da42ac7b CB |
695 | len = strlcpy(buf, path, sizeof(buf)); |
696 | if (len >= sizeof(buf)) | |
bce04069 | 697 | return ret_errno(E2BIG); |
6099dd5a | 698 | |
da42ac7b CB |
699 | lxc_iterate_parts(cur, buf, "/") { |
700 | /* | |
701 | * Even though we vetted the paths when we parsed the config | |
702 | * we're paranoid here and check that the path is neither | |
703 | * absolute nor walks upwards. | |
704 | */ | |
e4db08ed | 705 | if (abspath(cur)) |
da42ac7b | 706 | return syserrno_set(-EINVAL, "No absolute paths allowed"); |
6099dd5a | 707 | |
e4db08ed | 708 | if (strnequal(cur, "..", STRLITERALLEN(".."))) |
da42ac7b | 709 | return syserrno_set(-EINVAL, "No upward walking paths allowed"); |
6099dd5a | 710 | |
da42ac7b CB |
711 | ret = mkdirat(dfd_cur, cur, mode); |
712 | if (ret < 0) { | |
713 | if (errno != EEXIST) | |
714 | return syserrno(-errno, "Failed to create %d(%s)", dfd_cur, cur); | |
715 | ||
716 | ret = -EEXIST; | |
717 | } | |
718 | TRACE("%s %d(%s) cgroup", !ret ? "Created" : "Reusing", dfd_cur, cur); | |
719 | ||
720 | dfd_final = open_at(dfd_cur, cur, PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH, 0); | |
721 | if (dfd_final < 0) | |
722 | return syserrno(-errno, "Fail to open%s directory %d(%s)", | |
723 | !ret ? " newly created" : "", dfd_base, cur); | |
724 | if (dfd_cur != dfd_base) | |
725 | close(dfd_cur); | |
033267c9 CB |
726 | else if (cpuset_v1 && !cpuset1_initialize(dfd_base, dfd_final)) |
727 | return syserrno(-EINVAL, "Failed to initialize cpuset controller in the legacy hierarchy"); | |
da42ac7b | 728 | /* |
033267c9 CB |
729 | * Leave dfd_final pointing to the last fd we opened so |
730 | * it will be automatically zapped if we return early. | |
da42ac7b CB |
731 | */ |
732 | dfd_cur = dfd_final; | |
733 | } | |
734 | ||
735 | /* The final cgroup must be succesfully creatd by us. */ | |
033267c9 CB |
736 | if (ret) { |
737 | if (ret != -EEXIST || !eexist_ignore) | |
738 | return syserrno_set(ret, "Creating the final cgroup %d(%s) failed", dfd_base, path); | |
739 | } | |
da42ac7b CB |
740 | |
741 | return move_fd(dfd_final); | |
6099dd5a CB |
742 | } |
743 | ||
/*
 * Create the monitor or payload cgroup for hierarchy @h.
 *
 * @cgroup_limit_dir is created below h->dfd_base. For a payload with a
 * separate @cgroup_leaf the leaf is then created below the limit cgroup;
 * otherwise @cgroup_limit_dir itself is the final cgroup.
 *
 * On success the resulting fds and paths are stored in @h: dfd_con, dfd_lim,
 * path_con and path_lim for a payload, dfd_mon for the monitor. Without a
 * separate leaf the limit fd/path alias the container fd/path.
 *
 * Returns true on success and false on failure.
 */
static bool cgroup_tree_create(struct cgroup_ops *ops, struct lxc_conf *conf,
			       struct hierarchy *h, const char *cgroup_limit_dir,
			       const char *cgroup_leaf, bool payload)
{
	__do_close int fd_limit = -EBADF, fd_final = -EBADF;
	__do_free char *path = NULL, *limit_path = NULL;
	bool cpuset_v1 = false;

	/*
	 * The legacy cpuset controller needs massaging in case inheriting
	 * settings from its immediate ancestor cgroup hasn't been turned on.
	 */
	cpuset_v1 = !is_unified_hierarchy(h) && string_in_list(h->controllers, "cpuset");

	if (payload && cgroup_leaf) {
		/* With isolation both parts need to not already exist. */
		fd_limit = __cgroup_tree_create(h->dfd_base, cgroup_limit_dir, 0755, cpuset_v1, false);
		if (fd_limit < 0)
			return syserrno(false, "Failed to create limiting cgroup %d(%s)", h->dfd_base, cgroup_limit_dir);

		TRACE("Created limit cgroup %d->%d(%s)",
		      fd_limit, h->dfd_base, cgroup_limit_dir);

		/*
		 * With isolation the devices legacy cgroup needs to be
		 * initialized early, as it typically contains an 'a' (all)
		 * line, which is not possible once a subdirectory has been
		 * created.
		 */
		if (string_in_list(h->controllers, "devices") &&
		    !ops->setup_limits_legacy(ops, conf, true))
			return log_error(false, "Failed to setup legacy device limits");

		limit_path = make_cgroup_path(h, h->at_base, cgroup_limit_dir, NULL);
		path = must_make_path(limit_path, cgroup_leaf, NULL);

		/*
		 * If we use a separate limit cgroup, the leaf cgroup, i.e. the
		 * cgroup the container actually resides in, is below fd_limit.
		 */
		fd_final = __cgroup_tree_create(fd_limit, cgroup_leaf, 0755, cpuset_v1, false);
		if (fd_final < 0) {
			/* Ensure we don't leave any garbage behind. */
			if (cgroup_tree_prune(h->dfd_base, cgroup_limit_dir))
				SYSWARN("Failed to destroy %d(%s)", h->dfd_base, cgroup_limit_dir);
			else
				TRACE("Removed cgroup tree %d(%s)", h->dfd_base, cgroup_limit_dir);
		}
	} else {
		path = make_cgroup_path(h, h->at_base, cgroup_limit_dir, NULL);

		fd_final = __cgroup_tree_create(h->dfd_base, cgroup_limit_dir, 0755, cpuset_v1, false);
	}
	/* Common failure exit for both branches above. */
	if (fd_final < 0)
		return syserrno(false, "Failed to create %s cgroup %d(%s)", payload ? "payload" : "monitor", h->dfd_base, cgroup_limit_dir);

	if (payload) {
		/* Hand the fd and path over to the hierarchy. */
		h->dfd_con = move_fd(fd_final);
		h->path_con = move_ptr(path);

		/* Without a separate limit cgroup the limit fd aliases dfd_con. */
		if (fd_limit < 0)
			h->dfd_lim = h->dfd_con;
		else
			h->dfd_lim = move_fd(fd_limit);

		/* Likewise the limit path aliases path_con if none was built. */
		if (limit_path)
			h->path_lim = move_ptr(limit_path);
		else
			h->path_lim = h->path_con;
	} else {
		h->dfd_mon = move_fd(fd_final);
	}

	return true;
}
819 | ||
6c880cdf CB |
820 | static void cgroup_tree_prune_leaf(struct hierarchy *h, const char *path_prune, |
821 | bool payload) | |
ccb4cabe | 822 | { |
c1ece895 | 823 | bool prune = true; |
72068e74 | 824 | |
1973b62a | 825 | if (payload) { |
c1ece895 | 826 | /* Check whether we actually created the cgroup to prune. */ |
c0af7b1c | 827 | if (h->dfd_lim < 0) |
c1ece895 CB |
828 | prune = false; |
829 | ||
b1b1a60f | 830 | free_equal(h->path_con, h->path_lim); |
c0af7b1c | 831 | close_equal(h->dfd_con, h->dfd_lim); |
1973b62a | 832 | } else { |
c1ece895 | 833 | /* Check whether we actually created the cgroup to prune. */ |
6a32c817 | 834 | if (h->dfd_mon < 0) |
c1ece895 CB |
835 | prune = false; |
836 | ||
6a32c817 | 837 | close_prot_errno_disarm(h->dfd_mon); |
1973b62a | 838 | } |
e56639fb | 839 | |
c1ece895 CB |
840 | /* We didn't create this cgroup. */ |
841 | if (!prune) | |
842 | return; | |
843 | ||
844 | if (cgroup_tree_prune(h->dfd_base, path_prune)) | |
cb423bd3 CB |
845 | SYSWARN("Failed to destroy %d(%s)", h->dfd_base, path_prune); |
846 | else | |
847 | TRACE("Removed cgroup tree %d(%s)", h->dfd_base, path_prune); | |
a900cbaf WB |
848 | } |
849 | ||
033267c9 CB |
850 | __cgfsng_ops static void cgfsng_monitor_destroy(struct cgroup_ops *ops, |
851 | struct lxc_handler *handler) | |
852 | { | |
853 | int len; | |
854 | char pidstr[INTTYPE_TO_STRLEN(pid_t)]; | |
855 | const struct lxc_conf *conf; | |
856 | ||
857 | if (!ops) { | |
858 | ERROR("Called with uninitialized cgroup operations"); | |
859 | return; | |
860 | } | |
861 | ||
862 | if (!ops->hierarchies) | |
863 | return; | |
864 | ||
865 | if (!handler) { | |
866 | ERROR("Called with uninitialized handler"); | |
867 | return; | |
868 | } | |
869 | ||
870 | if (!handler->conf) { | |
871 | ERROR("Called with uninitialized conf"); | |
872 | return; | |
873 | } | |
874 | conf = handler->conf; | |
875 | ||
1e058855 CB |
876 | if (!ops->monitor_cgroup) { |
877 | WARN("Uninitialized monitor cgroup"); | |
878 | return; | |
879 | } | |
880 | ||
033267c9 CB |
881 | len = strnprintf(pidstr, sizeof(pidstr), "%d", handler->monitor_pid); |
882 | if (len < 0) | |
883 | return; | |
884 | ||
885 | for (int i = 0; ops->hierarchies[i]; i++) { | |
886 | __do_close int fd_pivot = -EBADF; | |
887 | __do_free char *pivot_path = NULL; | |
888 | struct hierarchy *h = ops->hierarchies[i]; | |
889 | bool cpuset_v1 = false; | |
890 | int ret; | |
891 | ||
033267c9 CB |
892 | /* Monitor might have died before we entered the cgroup. */ |
893 | if (handler->monitor_pid <= 0) { | |
894 | WARN("No valid monitor process found while destroying cgroups"); | |
c55fe36d | 895 | goto cgroup_prune_tree; |
033267c9 CB |
896 | } |
897 | ||
898 | if (conf->cgroup_meta.monitor_pivot_dir) | |
899 | pivot_path = must_make_path(conf->cgroup_meta.monitor_pivot_dir, CGROUP_PIVOT, NULL); | |
033267c9 CB |
900 | else if (conf->cgroup_meta.dir) |
901 | pivot_path = must_make_path(conf->cgroup_meta.dir, CGROUP_PIVOT, NULL); | |
902 | else | |
903 | pivot_path = must_make_path(CGROUP_PIVOT, NULL); | |
904 | ||
905 | cpuset_v1 = !is_unified_hierarchy(h) && string_in_list(h->controllers, "cpuset"); | |
906 | ||
907 | fd_pivot = __cgroup_tree_create(h->dfd_base, pivot_path, 0755, cpuset_v1, true); | |
908 | if (fd_pivot < 0) { | |
909 | SYSWARN("Failed to create pivot cgroup %d(%s)", h->dfd_base, pivot_path); | |
910 | continue; | |
911 | } | |
912 | ||
913 | ret = lxc_writeat(fd_pivot, "cgroup.procs", pidstr, len); | |
914 | if (ret != 0) { | |
915 | SYSWARN("Failed to move monitor %s to \"%s\"", pidstr, pivot_path); | |
916 | continue; | |
917 | } | |
918 | ||
c55fe36d CB |
919 | cgroup_prune_tree: |
920 | ret = cgroup_tree_prune(h->dfd_base, ops->monitor_cgroup); | |
033267c9 | 921 | if (ret < 0) |
c55fe36d CB |
922 | SYSWARN("Failed to destroy %d(%s)", h->dfd_base, ops->monitor_cgroup); |
923 | else | |
924 | TRACE("Removed cgroup tree %d(%s)", h->dfd_base, ops->monitor_cgroup); | |
033267c9 CB |
925 | } |
926 | } | |
927 | ||
a900cbaf WB |
928 | /* |
929 | * Check that lxc.cgroup.dir is not set alongside the monitor/payload/namespace | |
930 | * directory options, and that monitor and payload are then both set. | |
931 | * | |
932 | * Returns true if the configuration is consistent, false otherwise. | |
933 | */ | |
934 | static bool check_cgroup_dir_config(struct lxc_conf *conf) | |
935 | { | |
936 | const char *monitor_dir = conf->cgroup_meta.monitor_dir, | |
937 | *container_dir = conf->cgroup_meta.container_dir, | |
938 | *namespace_dir = conf->cgroup_meta.namespace_dir; | |
a900cbaf WB |
939 | |
940 | /* none of the new options are set, all is fine */ | |
941 | if (!monitor_dir && !container_dir && !namespace_dir) | |
942 | return true; | |
943 | ||
944 | /* some are set, make sure lxc.cgroup.dir is not also set */ |
945 | if (conf->cgroup_meta.dir) | |
946 | return log_error_errno(false, EINVAL, | |
947 | "lxc.cgroup.dir conflicts with lxc.cgroup.dir.payload/monitor"); | |
948 | ||
949 | /* make sure both monitor and payload are set */ | |
950 | if (!monitor_dir || !container_dir) | |
951 | return log_error_errno(false, EINVAL, | |
952 | "lxc.cgroup.dir.payload and lxc.cgroup.dir.monitor must both be set"); | |
953 | ||
954 | /* namespace_dir may be empty */ | |
955 | return true; | |
72068e74 CB |
956 | } |
957 | ||
59eac805 | 958 | __cgfsng_ops static bool cgfsng_monitor_create(struct cgroup_ops *ops, struct lxc_handler *handler) |
72068e74 | 959 | { |
dcf6a5c7 | 960 | __do_free char *monitor_cgroup = NULL; |
fe70edee CB |
961 | int idx = 0; |
962 | int i; | |
5ce03bc0 | 963 | size_t len; |
a900cbaf | 964 | char *suffix = NULL; |
0d66e29a | 965 | struct lxc_conf *conf; |
72068e74 | 966 | |
0d66e29a CB |
967 | if (!ops) |
968 | return ret_set_errno(false, ENOENT); | |
e56639fb | 969 | |
69b4a4bb CB |
970 | if (!ops->hierarchies) |
971 | return true; | |
972 | ||
0d66e29a CB |
973 | if (ops->monitor_cgroup) |
974 | return ret_set_errno(false, EEXIST); | |
975 | ||
976 | if (!handler || !handler->conf) | |
977 | return ret_set_errno(false, EINVAL); | |
978 | ||
979 | conf = handler->conf; | |
980 | ||
a900cbaf WB |
981 | if (!check_cgroup_dir_config(conf)) |
982 | return false; | |
983 | ||
984 | if (conf->cgroup_meta.monitor_dir) { | |
a900cbaf WB |
985 | monitor_cgroup = strdup(conf->cgroup_meta.monitor_dir); |
986 | } else if (conf->cgroup_meta.dir) { | |
fe70edee CB |
987 | monitor_cgroup = must_concat(&len, conf->cgroup_meta.dir, "/", |
988 | DEFAULT_MONITOR_CGROUP_PREFIX, | |
989 | handler->name, | |
990 | CGROUP_CREATE_RETRY, NULL); | |
b3ed2061 | 991 | } else if (ops->cgroup_pattern) { |
dcf6a5c7 CB |
992 | __do_free char *cgroup_tree = NULL; |
993 | ||
994 | cgroup_tree = lxc_string_replace("%n", handler->name, ops->cgroup_pattern); | |
995 | if (!cgroup_tree) | |
d6bdd182 CB |
996 | return ret_set_errno(false, ENOMEM); |
997 | ||
d6bdd182 CB |
998 | monitor_cgroup = must_concat(&len, cgroup_tree, "/", |
999 | DEFAULT_MONITOR_CGROUP, | |
b3ed2061 CB |
1000 | CGROUP_CREATE_RETRY, NULL); |
1001 | } else { | |
fe70edee CB |
1002 | monitor_cgroup = must_concat(&len, DEFAULT_MONITOR_CGROUP_PREFIX, |
1003 | handler->name, | |
1004 | CGROUP_CREATE_RETRY, NULL); | |
b3ed2061 | 1005 | } |
fe70edee | 1006 | if (!monitor_cgroup) |
0d66e29a | 1007 | return ret_set_errno(false, ENOMEM); |
72068e74 | 1008 | |
a900cbaf WB |
1009 | if (!conf->cgroup_meta.monitor_dir) { |
1010 | suffix = monitor_cgroup + len - CGROUP_CREATE_RETRY_LEN; | |
1011 | *suffix = '\0'; | |
1012 | } | |
5ce03bc0 | 1013 | do { |
a900cbaf | 1014 | if (idx && suffix) |
fe70edee | 1015 | sprintf(suffix, "-%d", idx); |
72068e74 | 1016 | |
ebc10afe | 1017 | for (i = 0; ops->hierarchies[i]; i++) { |
432faf20 | 1018 | if (cgroup_tree_create(ops, handler->conf, |
dcf6a5c7 | 1019 | ops->hierarchies[i], |
6fec4327 | 1020 | monitor_cgroup, NULL, false)) |
fe70edee CB |
1021 | continue; |
1022 | ||
7064ee3a | 1023 | DEBUG("Failed to create cgroup %s)", monitor_cgroup); |
6c880cdf CB |
1024 | for (int j = 0; j <= i; j++) |
1025 | cgroup_tree_prune_leaf(ops->hierarchies[j], | |
1026 | monitor_cgroup, false); | |
fe70edee CB |
1027 | |
1028 | idx++; | |
1029 | break; | |
5ce03bc0 | 1030 | } |
a900cbaf | 1031 | } while (ops->hierarchies[i] && idx > 0 && idx < 1000 && suffix); |
5ce03bc0 | 1032 | |
a900cbaf | 1033 | if (idx == 1000 || (!suffix && idx != 0)) |
04a49a14 | 1034 | return log_error_errno(false, ERANGE, "Failed to create monitor cgroup"); |
72068e74 | 1035 | |
c581d2a6 | 1036 | ops->monitor_cgroup = move_ptr(monitor_cgroup); |
6e8703a4 | 1037 | return log_info(true, "The monitor process uses \"%s\" as cgroup", ops->monitor_cgroup); |
ccb4cabe SH |
1038 | } |
1039 | ||
fe70edee CB |
1040 | /* |
1041 | * Try to create the same cgroup in all hierarchies. Start with cgroup_pattern; | |
cecad0c1 | 1042 | * next cgroup_pattern-1, -2, ..., -999. |
ccb4cabe | 1043 | */ |
59eac805 | 1044 | __cgfsng_ops static bool cgfsng_payload_create(struct cgroup_ops *ops, struct lxc_handler *handler) |
ccb4cabe | 1045 | { |
a6aeb9f1 CB |
1046 | __do_free char *container_cgroup = NULL, *__limit_cgroup = NULL; |
1047 | char *limit_cgroup; | |
f3839f12 | 1048 | int idx = 0; |
fe70edee | 1049 | int i; |
ccb4cabe | 1050 | size_t len; |
a900cbaf | 1051 | char *suffix = NULL; |
f3839f12 | 1052 | struct lxc_conf *conf; |
43654d34 | 1053 | |
f3839f12 CB |
1054 | if (!ops) |
1055 | return ret_set_errno(false, ENOENT); | |
ccb4cabe | 1056 | |
69b4a4bb CB |
1057 | if (!ops->hierarchies) |
1058 | return true; | |
1059 | ||
471929c6 | 1060 | if (ops->container_cgroup || ops->container_limit_cgroup) |
f3839f12 CB |
1061 | return ret_set_errno(false, EEXIST); |
1062 | ||
1063 | if (!handler || !handler->conf) | |
1064 | return ret_set_errno(false, EINVAL); | |
1065 | ||
1066 | conf = handler->conf; | |
1067 | ||
a900cbaf WB |
1068 | if (!check_cgroup_dir_config(conf)) |
1069 | return false; | |
1070 | ||
1071 | if (conf->cgroup_meta.container_dir) { | |
a6aeb9f1 CB |
1072 | __limit_cgroup = strdup(conf->cgroup_meta.container_dir); |
1073 | if (!__limit_cgroup) | |
a900cbaf WB |
1074 | return ret_set_errno(false, ENOMEM); |
1075 | ||
432faf20 | 1076 | if (conf->cgroup_meta.namespace_dir) { |
a6aeb9f1 | 1077 | container_cgroup = must_make_path(__limit_cgroup, |
432faf20 WB |
1078 | conf->cgroup_meta.namespace_dir, |
1079 | NULL); | |
a6aeb9f1 | 1080 | limit_cgroup = __limit_cgroup; |
432faf20 WB |
1081 | } else { |
1082 | /* explicit paths but without isolation */ | |
a6aeb9f1 CB |
1083 | limit_cgroup = move_ptr(__limit_cgroup); |
1084 | container_cgroup = limit_cgroup; | |
432faf20 | 1085 | } |
a900cbaf | 1086 | } else if (conf->cgroup_meta.dir) { |
a6aeb9f1 CB |
1087 | limit_cgroup = must_concat(&len, conf->cgroup_meta.dir, "/", |
1088 | DEFAULT_PAYLOAD_CGROUP_PREFIX, | |
1089 | handler->name, | |
1090 | CGROUP_CREATE_RETRY, NULL); | |
1091 | container_cgroup = limit_cgroup; | |
b3ed2061 | 1092 | } else if (ops->cgroup_pattern) { |
dcf6a5c7 CB |
1093 | __do_free char *cgroup_tree = NULL; |
1094 | ||
1095 | cgroup_tree = lxc_string_replace("%n", handler->name, ops->cgroup_pattern); | |
1096 | if (!cgroup_tree) | |
d6bdd182 CB |
1097 | return ret_set_errno(false, ENOMEM); |
1098 | ||
a6aeb9f1 CB |
1099 | limit_cgroup = must_concat(&len, cgroup_tree, "/", |
1100 | DEFAULT_PAYLOAD_CGROUP, | |
1101 | CGROUP_CREATE_RETRY, NULL); | |
1102 | container_cgroup = limit_cgroup; | |
b3ed2061 | 1103 | } else { |
a6aeb9f1 CB |
1104 | limit_cgroup = must_concat(&len, DEFAULT_PAYLOAD_CGROUP_PREFIX, |
1105 | handler->name, | |
1106 | CGROUP_CREATE_RETRY, NULL); | |
1107 | container_cgroup = limit_cgroup; | |
b3ed2061 | 1108 | } |
a6aeb9f1 | 1109 | if (!limit_cgroup) |
fe70edee | 1110 | return ret_set_errno(false, ENOMEM); |
ccb4cabe | 1111 | |
a900cbaf WB |
1112 | if (!conf->cgroup_meta.container_dir) { |
1113 | suffix = container_cgroup + len - CGROUP_CREATE_RETRY_LEN; | |
1114 | *suffix = '\0'; | |
1115 | } | |
d97919ab | 1116 | do { |
a900cbaf | 1117 | if (idx && suffix) |
fe70edee | 1118 | sprintf(suffix, "-%d", idx); |
bb30b52a | 1119 | |
d97919ab | 1120 | for (i = 0; ops->hierarchies[i]; i++) { |
432faf20 | 1121 | if (cgroup_tree_create(ops, handler->conf, |
a6aeb9f1 CB |
1122 | ops->hierarchies[i], limit_cgroup, |
1123 | conf->cgroup_meta.namespace_dir, | |
6fec4327 | 1124 | true)) |
fe70edee CB |
1125 | continue; |
1126 | ||
67ed60ce | 1127 | DEBUG("Failed to create cgroup \"%s\"", ops->hierarchies[i]->path_con ?: "(null)"); |
6c880cdf CB |
1128 | for (int j = 0; j <= i; j++) |
1129 | cgroup_tree_prune_leaf(ops->hierarchies[j], | |
a6aeb9f1 | 1130 | limit_cgroup, true); |
fe70edee CB |
1131 | |
1132 | idx++; | |
1133 | break; | |
66b66624 | 1134 | } |
a900cbaf | 1135 | } while (ops->hierarchies[i] && idx > 0 && idx < 1000 && suffix); |
cecad0c1 | 1136 | |
a900cbaf | 1137 | if (idx == 1000 || (!suffix && idx != 0)) |
04a49a14 | 1138 | return log_error_errno(false, ERANGE, "Failed to create container cgroup"); |
cecad0c1 | 1139 | |
fe70edee | 1140 | ops->container_cgroup = move_ptr(container_cgroup); |
a6aeb9f1 CB |
1141 | if (__limit_cgroup) |
1142 | ops->container_limit_cgroup = move_ptr(__limit_cgroup); | |
c55fe36d CB |
1143 | else |
1144 | ops->container_limit_cgroup = ops->container_cgroup; | |
a6aeb9f1 CB |
1145 | INFO("The container process uses \"%s\" as inner and \"%s\" as limit cgroup", |
1146 | ops->container_cgroup, ops->container_limit_cgroup); | |
ccb4cabe | 1147 | return true; |
ccb4cabe SH |
1148 | } |
1149 | ||
c581d2a6 CB |
1150 | __cgfsng_ops static bool cgfsng_monitor_enter(struct cgroup_ops *ops, |
1151 | struct lxc_handler *handler) | |
ccb4cabe | 1152 | { |
fdb0b8ab | 1153 | int monitor_len, transient_len = 0; |
c581d2a6 CB |
1154 | char monitor[INTTYPE_TO_STRLEN(pid_t)], |
1155 | transient[INTTYPE_TO_STRLEN(pid_t)]; | |
ccb4cabe | 1156 | |
797fa65e CB |
1157 | if (!ops) |
1158 | return ret_set_errno(false, ENOENT); | |
1159 | ||
69b4a4bb CB |
1160 | if (!ops->hierarchies) |
1161 | return true; | |
1162 | ||
797fa65e CB |
1163 | if (!ops->monitor_cgroup) |
1164 | return ret_set_errno(false, ENOENT); | |
1165 | ||
1166 | if (!handler || !handler->conf) | |
1167 | return ret_set_errno(false, EINVAL); | |
1168 | ||
0bba27c1 CB |
1169 | monitor_len = strnprintf(monitor, sizeof(monitor), "%d", handler->monitor_pid); |
1170 | if (monitor_len < 0) | |
1171 | return false; | |
1172 | ||
1173 | if (handler->transient_pid > 0) { | |
1174 | transient_len = strnprintf(transient, sizeof(transient), "%d", handler->transient_pid); | |
1175 | if (transient_len < 0) | |
1176 | return false; | |
1177 | } | |
ccb4cabe | 1178 | |
eeef32bb | 1179 | for (int i = 0; ops->hierarchies[i]; i++) { |
1973b62a | 1180 | struct hierarchy *h = ops->hierarchies[i]; |
c581d2a6 | 1181 | int ret; |
08768001 | 1182 | |
6a32c817 | 1183 | ret = lxc_writeat(h->dfd_mon, "cgroup.procs", monitor, monitor_len); |
1973b62a | 1184 | if (ret) |
6a32c817 | 1185 | return log_error_errno(false, errno, "Failed to enter cgroup %d", h->dfd_mon); |
c581d2a6 | 1186 | |
6a32c817 | 1187 | TRACE("Moved monitor into cgroup %d", h->dfd_mon); |
ebf88e5b | 1188 | |
34683042 | 1189 | if (handler->transient_pid <= 0) |
d1ee8719 | 1190 | continue; |
c581d2a6 | 1191 | |
6a32c817 | 1192 | ret = lxc_writeat(h->dfd_mon, "cgroup.procs", transient, transient_len); |
1973b62a | 1193 | if (ret) |
6a32c817 | 1194 | return log_error_errno(false, errno, "Failed to enter cgroup %d", h->dfd_mon); |
1973b62a | 1195 | |
6a32c817 | 1196 | TRACE("Moved transient process into cgroup %d", h->dfd_mon); |
ebf88e5b | 1197 | |
1973b62a | 1198 | /* |
78eb6aa6 | 1199 | * we don't keep the fds for non-unified hierarchies around |
1973b62a | 1200 | * mainly because we don't make use of them anymore after the |
78eb6aa6 | 1201 | * core cgroup setup is done but also because there are quite a |
1973b62a CB |
1202 | * lot of them. |
1203 | */ | |
1204 | if (!is_unified_hierarchy(h)) | |
6a32c817 | 1205 | close_prot_errno_disarm(h->dfd_mon); |
ccb4cabe | 1206 | } |
c581d2a6 | 1207 | handler->transient_pid = -1; |
ccb4cabe SH |
1208 | |
1209 | return true; | |
1210 | } | |
1211 | ||
c581d2a6 CB |
1212 | __cgfsng_ops static bool cgfsng_payload_enter(struct cgroup_ops *ops, |
1213 | struct lxc_handler *handler) | |
eeef32bb | 1214 | { |
c581d2a6 CB |
1215 | int len; |
1216 | char pidstr[INTTYPE_TO_STRLEN(pid_t)]; | |
eeef32bb | 1217 | |
4490328e CB |
1218 | if (!ops) |
1219 | return ret_set_errno(false, ENOENT); | |
1220 | ||
c581d2a6 CB |
1221 | if (!ops->hierarchies) |
1222 | return true; | |
1223 | ||
4490328e CB |
1224 | if (!ops->container_cgroup) |
1225 | return ret_set_errno(false, ENOENT); | |
1226 | ||
1227 | if (!handler || !handler->conf) | |
1228 | return ret_set_errno(false, EINVAL); | |
1229 | ||
0bba27c1 CB |
1230 | len = strnprintf(pidstr, sizeof(pidstr), "%d", handler->pid); |
1231 | if (len < 0) | |
1232 | return false; | |
c581d2a6 CB |
1233 | |
1234 | for (int i = 0; ops->hierarchies[i]; i++) { | |
1973b62a | 1235 | struct hierarchy *h = ops->hierarchies[i]; |
c581d2a6 CB |
1236 | int ret; |
1237 | ||
b3a42865 CB |
1238 | if (is_unified_hierarchy(h) && |
1239 | (handler->clone_flags & CLONE_INTO_CGROUP)) | |
f7176c3e CB |
1240 | continue; |
1241 | ||
e33870e5 | 1242 | ret = lxc_writeat(h->dfd_con, "cgroup.procs", pidstr, len); |
c581d2a6 | 1243 | if (ret != 0) |
67ed60ce | 1244 | return log_error_errno(false, errno, "Failed to enter cgroup \"%s\"", h->path_con); |
25db3f94 | 1245 | |
67ed60ce | 1246 | TRACE("Moved container into %s cgroup via %d", h->path_con, h->dfd_con); |
c581d2a6 CB |
1247 | } |
1248 | ||
1249 | return true; | |
eeef32bb CB |
1250 | } |
1251 | ||
1973b62a CB |
1252 | static int fchowmodat(int dirfd, const char *path, uid_t chown_uid, |
1253 | gid_t chown_gid, mode_t chmod_mode) | |
6efacf80 CB |
1254 | { |
1255 | int ret; | |
1256 | ||
1973b62a CB |
1257 | ret = fchownat(dirfd, path, chown_uid, chown_gid, |
1258 | AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW); | |
1259 | if (ret < 0) | |
1260 | return log_warn_errno(-1, | |
1261 | errno, "Failed to fchownat(%d, %s, %d, %d, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW )", | |
1262 | dirfd, path, (int)chown_uid, | |
1263 | (int)chown_gid); | |
6efacf80 | 1264 | |
1973b62a CB |
1265 | ret = fchmodat(dirfd, (*path != '\0') ? path : ".", chmod_mode, 0); |
1266 | if (ret < 0) | |
1267 | return log_warn_errno(-1, errno, "Failed to fchmodat(%d, %s, %d, AT_SYMLINK_NOFOLLOW)", | |
1268 | dirfd, path, (int)chmod_mode); | |
6efacf80 CB |
1269 | |
1270 | return 0; | |
1271 | } | |
1272 | ||
1273 | /* chgrp the container cgroups to container group. We leave | |
c0888dfe SH |
1274 | * the container owner as cgroup owner. So we must make the |
1275 | * directories 775 so that the container can create sub-cgroups. | |
43647298 SH |
1276 | * |
1277 | * Also chown the tasks and cgroup.procs files. Those may not | |
1278 | * exist depending on kernel version. | |
c0888dfe | 1279 | */ |
ccb4cabe SH |
1280 | static int chown_cgroup_wrapper(void *data) |
1281 | { | |
6a720d74 | 1282 | int ret; |
4160c3a0 CB |
1283 | uid_t destuid; |
1284 | struct generic_userns_exec_data *arg = data; | |
1285 | uid_t nsuid = (arg->conf->root_nsuid_map != NULL) ? 0 : arg->conf->init_uid; | |
1286 | gid_t nsgid = (arg->conf->root_nsgid_map != NULL) ? 0 : arg->conf->init_gid; | |
ccb4cabe | 1287 | |
8917c382 | 1288 | if (!lxc_drop_groups() && errno != EPERM) |
b58214ac CB |
1289 | return log_error_errno(-1, errno, "Failed to setgroups(0, NULL)"); |
1290 | ||
6efacf80 | 1291 | ret = setresgid(nsgid, nsgid, nsgid); |
803e4123 | 1292 | if (ret < 0) |
77c3e9a2 | 1293 | return log_error_errno(-1, errno, "Failed to setresgid(%d, %d, %d)", |
803e4123 | 1294 | (int)nsgid, (int)nsgid, (int)nsgid); |
6efacf80 CB |
1295 | |
1296 | ret = setresuid(nsuid, nsuid, nsuid); | |
803e4123 | 1297 | if (ret < 0) |
77c3e9a2 | 1298 | return log_error_errno(-1, errno, "Failed to setresuid(%d, %d, %d)", |
803e4123 | 1299 | (int)nsuid, (int)nsuid, (int)nsuid); |
6efacf80 | 1300 | |
ccb4cabe | 1301 | destuid = get_ns_uid(arg->origuid); |
b962868f CB |
1302 | if (destuid == LXC_INVALID_UID) |
1303 | destuid = 0; | |
ccb4cabe | 1304 | |
6a720d74 | 1305 | for (int i = 0; arg->hierarchies[i]; i++) { |
e33870e5 | 1306 | int dirfd = arg->hierarchies[i]->dfd_con; |
43647298 | 1307 | |
7f02fd24 CB |
1308 | if (dirfd < 0) |
1309 | return syserrno_set(-EBADF, "Invalid cgroup file descriptor"); | |
1310 | ||
1973b62a | 1311 | (void)fchowmodat(dirfd, "", destuid, nsgid, 0775); |
c0888dfe | 1312 | |
1973b62a CB |
1313 | /* |
1314 | * Failures to chown() these are inconvenient but not | |
6efacf80 CB |
1315 | * detrimental. We leave these owned by the container launcher, | |
1316 | * so that container root can write to the files to attach. We | |
1317 | * chmod() them 664 so that container systemd can write to the | |
1318 | * files (which systemd in wily insists on doing). | |
ab8f5424 | 1319 | */ |
6efacf80 | 1320 | |
b8572e8c | 1321 | if (arg->hierarchies[i]->fs_type == LEGACY_HIERARCHY) |
1973b62a | 1322 | (void)fchowmodat(dirfd, "tasks", destuid, nsgid, 0664); |
43647298 | 1323 | |
1973b62a | 1324 | (void)fchowmodat(dirfd, "cgroup.procs", destuid, nsgid, 0664); |
0e17357c | 1325 | |
b8572e8c | 1326 | if (arg->hierarchies[i]->fs_type != UNIFIED_HIERARCHY) |
0e17357c CB |
1327 | continue; |
1328 | ||
042f9e9c | 1329 | for (char **p = arg->hierarchies[i]->delegate; p && *p; p++) |
1973b62a | 1330 | (void)fchowmodat(dirfd, *p, destuid, nsgid, 0664); |
ccb4cabe SH |
1331 | } |
1332 | ||
1333 | return 0; | |
1334 | } | |
1335 | ||
b857f4be | 1336 | __cgfsng_ops static bool cgfsng_chown(struct cgroup_ops *ops, |
c98bbf71 | 1337 | struct lxc_conf *conf) |
ccb4cabe | 1338 | { |
4160c3a0 | 1339 | struct generic_userns_exec_data wrap; |
ccb4cabe | 1340 | |
c98bbf71 CB |
1341 | if (!ops) |
1342 | return ret_set_errno(false, ENOENT); | |
ccb4cabe | 1343 | |
69b4a4bb CB |
1344 | if (!ops->hierarchies) |
1345 | return true; | |
1346 | ||
c98bbf71 CB |
1347 | if (!ops->container_cgroup) |
1348 | return ret_set_errno(false, ENOENT); | |
1349 | ||
1350 | if (!conf) | |
1351 | return ret_set_errno(false, EINVAL); | |
1352 | ||
1353 | if (lxc_list_empty(&conf->id_map)) | |
1354 | return true; | |
1355 | ||
ccb4cabe | 1356 | wrap.origuid = geteuid(); |
4160c3a0 | 1357 | wrap.path = NULL; |
2202afc9 | 1358 | wrap.hierarchies = ops->hierarchies; |
4160c3a0 | 1359 | wrap.conf = conf; |
ccb4cabe | 1360 | |
c98bbf71 CB |
1361 | if (userns_exec_1(conf, chown_cgroup_wrapper, &wrap, "chown_cgroup_wrapper") < 0) |
1362 | return log_error_errno(false, errno, "Error requesting cgroup chown in new user namespace"); | |
ccb4cabe SH |
1363 | |
1364 | return true; | |
1365 | } | |
1366 | ||
840eec19 | 1367 | __cgfsng_ops static void cgfsng_finalize(struct cgroup_ops *ops) |
78eb6aa6 CB |
1368 | { |
1369 | if (!ops) | |
1370 | return; | |
1371 | ||
1372 | if (!ops->hierarchies) | |
1373 | return; | |
1374 | ||
840eec19 CB |
1375 | for (int i = 0; ops->hierarchies[i]; i++) { |
1376 | struct hierarchy *h = ops->hierarchies[i]; | |
1377 | ||
1378 | /* Close all monitor cgroup file descriptors. */ | |
1379 | close_prot_errno_disarm(h->dfd_mon); | |
1380 | } | |
1381 | /* Close the cgroup root file descriptor. */ | |
1382 | close_prot_errno_disarm(ops->dfd_mnt); | |
1383 | ||
6dcd6f02 CB |
1384 | /* |
1385 | * The checking for freezer support should obviously be done at cgroup | |
1386 | * initialization time but that doesn't work reliably. The freezer | |
1387 | * controller has been demoted (rightly so) to a simple file located in | |
1388 | * each non-root cgroup. At the time when the container is created we | |
1389 | * might still be located in /sys/fs/cgroup and so checking for | |
1390 | * cgroup.freeze won't tell us anything because this file doesn't exist | |
1391 | * in the root cgroup. We could then iterate through /sys/fs/cgroup and | |
1392 | * find an already existing cgroup and then check within that cgroup | |
1393 | * for the existence of cgroup.freeze but that will only work on | |
1394 | * systemd based hosts. Other init systems might not manage cgroups and | |
1395 | * so no cgroup will exist. So we defer until we have created cgroups | |
1396 | * for our container which means we check here. | |
1397 | */ | |
1398 | if (pure_unified_layout(ops) && | |
e33870e5 | 1399 | !faccessat(ops->unified->dfd_con, "cgroup.freeze", F_OK, |
6dcd6f02 CB |
1400 | AT_SYMLINK_NOFOLLOW)) { |
1401 | TRACE("Unified hierarchy supports freezer"); | |
ca72ccb5 | 1402 | ops->unified->utilities |= FREEZER_CONTROLLER; |
6dcd6f02 | 1403 | } |
78eb6aa6 CB |
1404 | } |
1405 | ||
8aa1044f | 1406 | /* cgroup-full:* is done, no need to create subdirs */ |
bd09ee98 | 1407 | static inline bool cg_mount_needs_subdirs(int cgroup_automount_type) |
8aa1044f | 1408 | { |
bd09ee98 | 1409 | switch (cgroup_automount_type) { |
51feb8db CB |
1410 | case LXC_AUTO_CGROUP_RO: |
1411 | return true; | |
1412 | case LXC_AUTO_CGROUP_RW: | |
1413 | return true; | |
1414 | case LXC_AUTO_CGROUP_MIXED: | |
1415 | return true; | |
1416 | } | |
1417 | ||
1418 | return false; | |
8aa1044f SH |
1419 | } |
1420 | ||
886cac86 CB |
1421 | /* After $rootfs/sys/fs/container/controller/the/cg/path has been created, |
1422 | * remount controller ro if needed and bindmount the cgroupfs onto | |
25fa6f8c | 1423 | * controller/the/cg/path. |
8aa1044f | 1424 | */ |
bd09ee98 | 1425 | static int cg_legacy_mount_controllers(int cgroup_automount_type, struct hierarchy *h, |
a9db9474 | 1426 | char *hierarchy_mnt, char *cgpath, |
6812d833 | 1427 | const char *container_cgroup) |
8aa1044f | 1428 | { |
d97919ab | 1429 | __do_free char *sourcepath = NULL; |
5285689c | 1430 | int ret, remount_flags; |
886cac86 CB |
1431 | int flags = MS_BIND; |
1432 | ||
bd09ee98 CB |
1433 | if ((cgroup_automount_type == LXC_AUTO_CGROUP_RO) || |
1434 | (cgroup_automount_type == LXC_AUTO_CGROUP_MIXED)) { | |
a9db9474 | 1435 | ret = mount(hierarchy_mnt, hierarchy_mnt, "cgroup", MS_BIND, NULL); |
77c3e9a2 CB |
1436 | if (ret < 0) |
1437 | return log_error_errno(-1, errno, "Failed to bind mount \"%s\" onto \"%s\"", | |
a9db9474 | 1438 | hierarchy_mnt, hierarchy_mnt); |
886cac86 | 1439 | |
a9db9474 CB |
1440 | remount_flags = add_required_remount_flags(hierarchy_mnt, |
1441 | hierarchy_mnt, | |
5285689c | 1442 | flags | MS_REMOUNT); |
a9db9474 | 1443 | ret = mount(hierarchy_mnt, hierarchy_mnt, "cgroup", |
8186c5c7 CB |
1444 | remount_flags | MS_REMOUNT | MS_BIND | MS_RDONLY, |
1445 | NULL); | |
77c3e9a2 | 1446 | if (ret < 0) |
a9db9474 | 1447 | return log_error_errno(-1, errno, "Failed to remount \"%s\" ro", hierarchy_mnt); |
886cac86 | 1448 | |
a9db9474 | 1449 | INFO("Remounted %s read-only", hierarchy_mnt); |
8aa1044f | 1450 | } |
886cac86 | 1451 | |
44585f1a | 1452 | sourcepath = make_cgroup_path(h, h->at_base, container_cgroup, NULL); |
bd09ee98 | 1453 | if (cgroup_automount_type == LXC_AUTO_CGROUP_RO) |
8aa1044f | 1454 | flags |= MS_RDONLY; |
886cac86 CB |
1455 | |
1456 | ret = mount(sourcepath, cgpath, "cgroup", flags, NULL); | |
77c3e9a2 CB |
1457 | if (ret < 0) |
1458 | return log_error_errno(-1, errno, "Failed to mount \"%s\" onto \"%s\"", | |
1459 | h->controllers[0], cgpath); | |
886cac86 | 1460 | INFO("Mounted \"%s\" onto \"%s\"", h->controllers[0], cgpath); |
f8c40ffa L |
1461 | |
1462 | if (flags & MS_RDONLY) { | |
5285689c CB |
1463 | remount_flags = add_required_remount_flags(sourcepath, cgpath, |
1464 | flags | MS_REMOUNT); | |
1465 | ret = mount(sourcepath, cgpath, "cgroup", remount_flags, NULL); | |
77c3e9a2 CB |
1466 | if (ret < 0) |
1467 | return log_error_errno(-1, errno, "Failed to remount \"%s\" ro", cgpath); | |
5285689c | 1468 | INFO("Remounted %s read-only", cgpath); |
f8c40ffa L |
1469 | } |
1470 | ||
886cac86 | 1471 | INFO("Completed second stage cgroup automounts for \"%s\"", cgpath); |
8aa1044f SH |
1472 | return 0; |
1473 | } | |
1474 | ||
44234ae1 | 1475 | /* __cgroupfs_mount |
6812d833 CB |
1476 | * |
1477 | * Mount cgroup hierarchies directly without using bind-mounts. The main | |
1478 | * use cases are mounting cgroup hierarchies in cgroup namespaces and mounting | |
1479 | * cgroups for the LXC_AUTO_CGROUP_FULL option. | |
1480 | */ | |
bd09ee98 | 1481 | static int __cgroupfs_mount(int cgroup_automount_type, struct hierarchy *h, |
44234ae1 CB |
1482 | struct lxc_rootfs *rootfs, int dfd_mnt_cgroupfs, |
1483 | const char *hierarchy_mnt) | |
b635e92d | 1484 | { |
a099c5db CB |
1485 | __do_close int fd_fs = -EBADF; |
1486 | unsigned int flags = 0; | |
02efd041 CB |
1487 | char *fstype; |
1488 | int ret; | |
1489 | ||
1490 | if (dfd_mnt_cgroupfs < 0) | |
1491 | return ret_errno(EINVAL); | |
1492 | ||
a099c5db CB |
1493 | flags |= MOUNT_ATTR_NOSUID; |
1494 | flags |= MOUNT_ATTR_NOEXEC; | |
1495 | flags |= MOUNT_ATTR_NODEV; | |
1496 | flags |= MOUNT_ATTR_RELATIME; | |
02efd041 | 1497 | |
bd09ee98 CB |
1498 | if ((cgroup_automount_type == LXC_AUTO_CGROUP_RO) || |
1499 | (cgroup_automount_type == LXC_AUTO_CGROUP_FULL_RO)) | |
a099c5db | 1500 | flags |= MOUNT_ATTR_RDONLY; |
02efd041 | 1501 | |
bd09ee98 | 1502 | if (is_unified_hierarchy(h)) |
02efd041 | 1503 | fstype = "cgroup2"; |
bd09ee98 | 1504 | else |
02efd041 | 1505 | fstype = "cgroup"; |
b635e92d | 1506 | |
de7f9f33 | 1507 | if (can_use_mount_api()) { |
635e7bac CB |
1508 | fd_fs = fs_prepare(fstype, -EBADF, "", 0, 0); |
1509 | if (fd_fs < 0) | |
1510 | return log_error_errno(-errno, errno, "Failed to prepare filesystem context for %s", fstype); | |
1511 | ||
1512 | if (!is_unified_hierarchy(h)) { | |
1513 | for (const char **it = (const char **)h->controllers; it && *it; it++) { | |
aa72fbe7 | 1514 | if (strnequal(*it, "name=", STRLITERALLEN("name="))) |
635e7bac CB |
1515 | ret = fs_set_property(fd_fs, "name", *it + STRLITERALLEN("name=")); |
1516 | else | |
1517 | ret = fs_set_property(fd_fs, *it, ""); | |
1518 | if (ret < 0) | |
1519 | return log_error_errno(-errno, errno, "Failed to add %s controller to cgroup filesystem context %d(dev)", *it, fd_fs); | |
1520 | } | |
1521 | } | |
1522 | ||
1523 | ret = fs_attach(fd_fs, dfd_mnt_cgroupfs, hierarchy_mnt, | |
1524 | PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH, | |
1525 | flags); | |
1526 | } else { | |
a099c5db CB |
1527 | __do_free char *controllers = NULL, *target = NULL; |
1528 | unsigned int old_flags = 0; | |
02efd041 CB |
1529 | const char *rootfs_mnt; |
1530 | ||
a099c5db CB |
1531 | if (!is_unified_hierarchy(h)) { |
1532 | controllers = lxc_string_join(",", (const char **)h->controllers, false); | |
1533 | if (!controllers) | |
1534 | return ret_errno(ENOMEM); | |
1535 | } | |
1536 | ||
02efd041 | 1537 | rootfs_mnt = get_rootfs_mnt(rootfs); |
a099c5db CB |
1538 | ret = mnt_attributes_old(flags, &old_flags); |
1539 | if (ret) | |
1540 | return log_error_errno(-EINVAL, EINVAL, "Unsupported mount properties specified"); | |
1541 | ||
02efd041 | 1542 | target = must_make_path(rootfs_mnt, DEFAULT_CGROUP_MOUNTPOINT, hierarchy_mnt, NULL); |
a099c5db | 1543 | ret = safe_mount(NULL, target, fstype, old_flags, controllers, rootfs_mnt); |
02efd041 | 1544 | } |
77c3e9a2 | 1545 | if (ret < 0) |
02efd041 CB |
1546 | return log_error_errno(ret, errno, "Failed to mount %s filesystem onto %d(%s)", |
1547 | fstype, dfd_mnt_cgroupfs, maybe_empty(hierarchy_mnt)); | |
b635e92d | 1548 | |
02efd041 CB |
1549 | DEBUG("Mounted cgroup filesystem %s onto %d(%s)", |
1550 | fstype, dfd_mnt_cgroupfs, maybe_empty(hierarchy_mnt)); | |
b635e92d CB |
1551 | return 0; |
1552 | } | |
1553 | ||
bd09ee98 | 1554 | static inline int cgroupfs_mount(int cgroup_automount_type, struct hierarchy *h, |
074af890 CB |
1555 | struct lxc_rootfs *rootfs, |
1556 | int dfd_mnt_cgroupfs, const char *hierarchy_mnt) | |
6812d833 | 1557 | { |
bd09ee98 CB |
1558 | return __cgroupfs_mount(cgroup_automount_type, h, rootfs, |
1559 | dfd_mnt_cgroupfs, hierarchy_mnt); | |
6812d833 CB |
1560 | } |
1561 | ||
bd09ee98 | 1562 | static inline int cgroupfs_bind_mount(int cgroup_automount_type, struct hierarchy *h, |
14111650 CB |
1563 | struct lxc_rootfs *rootfs, |
1564 | int dfd_mnt_cgroupfs, | |
1565 | const char *hierarchy_mnt) | |
6812d833 | 1566 | { |
bd09ee98 | 1567 | switch (cgroup_automount_type) { |
51feb8db CB |
1568 | case LXC_AUTO_CGROUP_FULL_RO: |
1569 | break; | |
1570 | case LXC_AUTO_CGROUP_FULL_RW: | |
1571 | break; | |
1572 | case LXC_AUTO_CGROUP_FULL_MIXED: | |
1573 | break; | |
1574 | default: | |
6812d833 | 1575 | return 0; |
51feb8db | 1576 | } |
6812d833 | 1577 | |
bd09ee98 CB |
1578 | return __cgroupfs_mount(cgroup_automount_type, h, rootfs, |
1579 | dfd_mnt_cgroupfs, hierarchy_mnt); | |
6812d833 CB |
1580 | } |
1581 | ||
b857f4be | 1582 | __cgfsng_ops static bool cgfsng_mount(struct cgroup_ops *ops, |
cdd3b77d | 1583 | struct lxc_handler *handler, int cg_flags) |
ccb4cabe | 1584 | { |
9bca62b3 | 1585 | __do_close int dfd_mnt_tmpfs = -EBADF, fd_fs = -EBADF; |
6607d6e9 | 1586 | __do_free char *cgroup_root = NULL; |
bd09ee98 | 1587 | int cgroup_automount_type; |
937a3af9 | 1588 | bool in_cgroup_ns = false, wants_force_mount = false; |
ab8cd5d9 | 1589 | struct lxc_conf *conf = handler->conf; |
315f8a4e | 1590 | struct lxc_rootfs *rootfs = &conf->rootfs; |
02efd041 | 1591 | const char *rootfs_mnt = get_rootfs_mnt(rootfs); |
dfa835ac | 1592 | int ret; |
8aa1044f | 1593 | |
9585ccb3 CB |
1594 | if (!ops) |
1595 | return ret_set_errno(false, ENOENT); | |
1596 | ||
69b4a4bb CB |
1597 | if (!ops->hierarchies) |
1598 | return true; | |
1599 | ||
315f8a4e | 1600 | if (!conf) |
9585ccb3 CB |
1601 | return ret_set_errno(false, EINVAL); |
1602 | ||
cdd3b77d | 1603 | if ((cg_flags & LXC_AUTO_CGROUP_MASK) == 0) |
c581c8a3 | 1604 | return log_trace(true, "No cgroup mounts requested"); |
8aa1044f | 1605 | |
69c29673 CB |
1606 | if (cg_flags & LXC_AUTO_CGROUP_FORCE) { |
1607 | cg_flags &= ~LXC_AUTO_CGROUP_FORCE; | |
3f69fb12 | 1608 | wants_force_mount = true; |
69c29673 CB |
1609 | } |
1610 | ||
1611 | switch (cg_flags) { | |
1612 | case LXC_AUTO_CGROUP_RO: | |
1613 | TRACE("Read-only cgroup mounts requested"); | |
1614 | break; | |
1615 | case LXC_AUTO_CGROUP_RW: | |
1616 | TRACE("Read-write cgroup mounts requested"); | |
1617 | break; | |
1618 | case LXC_AUTO_CGROUP_MIXED: | |
1619 | TRACE("Mixed cgroup mounts requested"); | |
1620 | break; | |
1621 | case LXC_AUTO_CGROUP_FULL_RO: | |
1622 | TRACE("Full read-only cgroup mounts requested"); | |
1623 | break; | |
1624 | case LXC_AUTO_CGROUP_FULL_RW: | |
1625 | TRACE("Full read-write cgroup mounts requested"); | |
1626 | break; | |
1627 | case LXC_AUTO_CGROUP_FULL_MIXED: | |
1628 | TRACE("Full mixed cgroup mounts requested"); | |
1629 | break; | |
1630 | default: | |
1631 | return log_error_errno(false, EINVAL, "Invalid cgroup mount options specified"); | |
1632 | } | |
bd09ee98 | 1633 | cgroup_automount_type = cg_flags; |
b635e92d | 1634 | |
4547e73e | 1635 | if (!wants_force_mount) { |
315f8a4e | 1636 | wants_force_mount = !lxc_wants_cap(CAP_SYS_ADMIN, conf); |
4547e73e CB |
1637 | |
1638 | /* | |
1639 | * Most recent distro versions currently have init system that | |
1640 | * do support cgroup2 but do not mount it by default unless | |
1641 | * explicitly told so even if the host is cgroup2 only. That | |
1642 | * means they often will fail to boot. Fix this by pre-mounting | |
1643 | * cgroup2 by default. We will likely need to be doing this a | |
1644 | * few years until all distros have switched over to cgroup2 at | |
1645 | * which point we can safely assume that their init systems | |
1646 | * will mount it themselves. | |
1647 | */ | |
1648 | if (pure_unified_layout(ops)) | |
1649 | wants_force_mount = true; | |
3f69fb12 | 1650 | } |
8aa1044f | 1651 | |
2c4348bd | 1652 | if (cgns_supported() && container_uses_namespace(handler, CLONE_NEWCGROUP)) |
937a3af9 | 1653 | in_cgroup_ns = true; |
6768700d | 1654 | |
937a3af9 | 1655 | if (in_cgroup_ns && !wants_force_mount) |
3a86fb37 | 1656 | return log_trace(true, "Mounting cgroups not requested or needed"); |
8aa1044f | 1657 | |
02efd041 CB |
1658 | /* This is really the codepath that we want. */ |
1659 | if (pure_unified_layout(ops)) { | |
9bca62b3 CB |
1660 | __do_close int dfd_mnt_unified = -EBADF; |
1661 | ||
1662 | dfd_mnt_unified = open_at(rootfs->dfd_mnt, DEFAULT_CGROUP_MOUNTPOINT_RELATIVE, | |
1663 | PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH_XDEV, 0); | |
1664 | if (dfd_mnt_unified < 0) | |
1665 | return syserrno(-errno, "Failed to open %d(%s)", rootfs->dfd_mnt, | |
1666 | DEFAULT_CGROUP_MOUNTPOINT_RELATIVE); | |
e7e45fdf CB |
1667 | /* |
1668 | * If cgroup namespaces are supported but the container will | |
1669 | * not have CAP_SYS_ADMIN after it has started we need to mount | |
1670 | * the cgroups manually. | |
a3e5ec26 CB |
1671 | * |
1672 | * Note that here we know that wants_force_mount is true. | |
1673 | * Otherwise we would've returned early above. | |
e7e45fdf | 1674 | */ |
a3e5ec26 CB |
1675 | if (in_cgroup_ns) { |
1676 | /* | |
1677 | * 1. cgroup:rw:force -> Mount the cgroup2 filesystem. | |
1678 | * 2. cgroup:ro:force -> Mount the cgroup2 filesystem read-only. | |
1679 | * 3. cgroup:mixed:force -> See comment above how this | |
1680 | * does not apply so | |
1681 | * cgroup:mixed is equal to | |
1682 | * cgroup:rw when cgroup | |
1683 | * namespaces are supported. | |
1684 | ||
1685 | * 4. cgroup:rw -> No-op; init system responsible for mounting. | |
1686 | * 5. cgroup:ro -> No-op; init system responsible for mounting. | |
1687 | * 6. cgroup:mixed -> No-op; init system responsible for mounting. | |
1688 | * | |
1689 | * 7. cgroup-full:rw -> Not supported. | |
1690 | * 8. cgroup-full:ro -> Not supported. | |
1691 | * 9. cgroup-full:mixed -> Not supported. | |
1692 | ||
1693 | * 10. cgroup-full:rw:force -> Not supported. | |
1694 | * 11. cgroup-full:ro:force -> Not supported. | |
1695 | * 12. cgroup-full:mixed:force -> Not supported. | |
1696 | */ | |
bd09ee98 | 1697 | ret = cgroupfs_mount(cgroup_automount_type, ops->unified, rootfs, dfd_mnt_unified, ""); |
a3e5ec26 CB |
1698 | if (ret < 0) |
1699 | return syserrno(false, "Failed to force mount cgroup filesystem in cgroup namespace"); | |
1700 | ||
1701 | return log_trace(true, "Force mounted cgroup filesystem in new cgroup namespace"); | |
1702 | } else { | |
1703 | /* | |
1704 | * Either no cgroup namespace supported (highly | |
1705 | * unlikely unless we're dealing with a Frankenkernel. | |
1706 | * Or the user requested to keep the cgroup namespace | |
1707 | * of the host or another container. | |
1708 | */ | |
1709 | if (wants_force_mount) { | |
1710 | /* | |
1711 | * 1. cgroup:rw:force -> Bind-mount the cgroup2 filesystem writable. | |
1712 | * 2. cgroup:ro:force -> Bind-mount the cgroup2 filesystem read-only. | |
1713 | * 3. cgroup:mixed:force -> bind-mount the cgroup2 filesystem and | |
1714 | * and make the parent directory of the | |
1715 | * container's cgroup read-only but the | |
1716 | * container's cgroup writable. | |
1717 | * | |
1718 | * 10. cgroup-full:rw:force -> | |
1719 | * 11. cgroup-full:ro:force -> | |
1720 | * 12. cgroup-full:mixed:force -> | |
1721 | */ | |
1722 | errno = EOPNOTSUPP; | |
1723 | SYSWARN("Force-mounting the unified cgroup hierarchy without cgroup namespace support is currently not supported"); | |
1724 | } else { | |
1725 | errno = EOPNOTSUPP; | |
1726 | SYSWARN("Mounting the unified cgroup hierarchy without cgroup namespace support is currently not supported"); | |
1727 | } | |
1728 | } | |
8d661d38 | 1729 | |
a3e5ec26 | 1730 | return syserrno(false, "Failed to mount cgroups"); |
8d661d38 CB |
1731 | } |
1732 | ||
e6d4df78 CB |
1733 | /* |
1734 | * Mount a tmpfs over DEFAULT_CGROUP_MOUNTPOINT. Note that we're | |
1735 | * relying on RESOLVE_BENEATH so we need to skip the leading "/" in the | |
1736 | * DEFAULT_CGROUP_MOUNTPOINT define. | |
1737 | */ | |
de7f9f33 | 1738 | if (can_use_mount_api()) { |
635e7bac CB |
1739 | fd_fs = fs_prepare("tmpfs", -EBADF, "", 0, 0); |
1740 | if (fd_fs < 0) | |
1741 | return log_error_errno(-errno, errno, "Failed to create new filesystem context for tmpfs"); | |
1742 | ||
23a20dbe CB |
1743 | ret = fs_set_property(fd_fs, "mode", "0755"); |
1744 | if (ret < 0) | |
1745 | return log_error_errno(-errno, errno, "Failed to mount tmpfs onto %d(dev)", fd_fs); | |
1746 | ||
1747 | ret = fs_set_property(fd_fs, "size", "10240k"); | |
1748 | if (ret < 0) | |
1749 | return log_error_errno(-errno, errno, "Failed to mount tmpfs onto %d(dev)", fd_fs); | |
1750 | ||
1751 | ret = fs_attach(fd_fs, rootfs->dfd_mnt, DEFAULT_CGROUP_MOUNTPOINT_RELATIVE, | |
1752 | PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH_XDEV, | |
1753 | MOUNT_ATTR_NOSUID | MOUNT_ATTR_NODEV | | |
1754 | MOUNT_ATTR_NOEXEC | MOUNT_ATTR_RELATIME); | |
635e7bac CB |
1755 | } else { |
1756 | cgroup_root = must_make_path(rootfs_mnt, DEFAULT_CGROUP_MOUNTPOINT, NULL); | |
1757 | ret = safe_mount(NULL, cgroup_root, "tmpfs", | |
1758 | MS_NOSUID | MS_NODEV | MS_NOEXEC | MS_RELATIME, | |
1759 | "size=10240k,mode=755", rootfs_mnt); | |
8b1f4dd9 | 1760 | } |
3f69fb12 | 1761 | if (ret < 0) |
02efd041 CB |
1762 | return log_error_errno(false, errno, "Failed to mount tmpfs on %s", |
1763 | DEFAULT_CGROUP_MOUNTPOINT_RELATIVE); | |
8aa1044f | 1764 | |
9bca62b3 CB |
1765 | dfd_mnt_tmpfs = open_at(rootfs->dfd_mnt, DEFAULT_CGROUP_MOUNTPOINT_RELATIVE, |
1766 | PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH_XDEV, 0); | |
1767 | if (dfd_mnt_tmpfs < 0) | |
1768 | return syserrno(-errno, "Failed to open %d(%s)", rootfs->dfd_mnt, | |
1769 | DEFAULT_CGROUP_MOUNTPOINT_RELATIVE); | |
1770 | ||
dfa835ac | 1771 | for (int i = 0; ops->hierarchies[i]; i++) { |
a9db9474 | 1772 | __do_free char *hierarchy_mnt = NULL, *path2 = NULL; |
2202afc9 | 1773 | struct hierarchy *h = ops->hierarchies[i]; |
8aa1044f | 1774 | |
a58be2ad | 1775 | ret = mkdirat(dfd_mnt_tmpfs, h->at_mnt, 0000); |
d7314671 | 1776 | if (ret < 0) |
a58be2ad | 1777 | return syserrno(false, "Failed to create cgroup at_mnt %d(%s)", dfd_mnt_tmpfs, h->at_mnt); |
b635e92d | 1778 | |
937a3af9 | 1779 | if (in_cgroup_ns && wants_force_mount) { |
02efd041 CB |
1780 | /* |
1781 | * If cgroup namespaces are supported but the container | |
b635e92d CB |
1782 | * will not have CAP_SYS_ADMIN after it has started we |
1783 | * need to mount the cgroups manually. | |
1784 | */ | |
a9db9474 | 1785 | ret = cgroupfs_mount(cgroup_automount_type, h, rootfs, |
a58be2ad | 1786 | dfd_mnt_tmpfs, h->at_mnt); |
3f69fb12 | 1787 | if (ret < 0) |
d7314671 | 1788 | return false; |
3f69fb12 | 1789 | |
b635e92d CB |
1790 | continue; |
1791 | } | |
1792 | ||
02efd041 | 1793 | /* Here is where the ancient kernel section begins. */ |
a9db9474 | 1794 | ret = cgroupfs_bind_mount(cgroup_automount_type, h, rootfs, |
a58be2ad | 1795 | dfd_mnt_tmpfs, h->at_mnt); |
d97919ab | 1796 | if (ret < 0) |
d7314671 | 1797 | return false; |
3f69fb12 | 1798 | |
bd09ee98 | 1799 | if (!cg_mount_needs_subdirs(cgroup_automount_type)) |
8aa1044f | 1800 | continue; |
3f69fb12 | 1801 | |
f1921f35 CB |
1802 | if (!cgroup_root) |
1803 | cgroup_root = must_make_path(rootfs_mnt, DEFAULT_CGROUP_MOUNTPOINT, NULL); | |
1804 | ||
a58be2ad | 1805 | hierarchy_mnt = must_make_path(cgroup_root, h->at_mnt, NULL); |
44585f1a | 1806 | path2 = must_make_path(hierarchy_mnt, h->at_base, |
a9db9474 | 1807 | ops->container_cgroup, NULL); |
3f69fb12 | 1808 | ret = mkdir_p(path2, 0755); |
77410c98 | 1809 | if (ret < 0 && (errno != EEXIST)) |
d7314671 | 1810 | return false; |
2f62fb00 | 1811 | |
a9db9474 CB |
1812 | ret = cg_legacy_mount_controllers(cgroup_automount_type, h, |
1813 | hierarchy_mnt, path2, | |
1814 | ops->container_cgroup); | |
3f69fb12 | 1815 | if (ret < 0) |
d7314671 | 1816 | return false; |
8aa1044f | 1817 | } |
8aa1044f | 1818 | |
d7314671 | 1819 | return true; |
ccb4cabe SH |
1820 | } |
1821 | ||
11c23867 | 1822 | /* Only root needs to escape to the cgroup of its init. */ |
ff9edd2d CB |
1823 | __cgfsng_ops static bool cgfsng_criu_escape(const struct cgroup_ops *ops, |
1824 | struct lxc_conf *conf) | |
ccb4cabe | 1825 | { |
52d08ab0 CB |
1826 | if (!ops) |
1827 | return ret_set_errno(false, ENOENT); | |
1828 | ||
1829 | if (!ops->hierarchies) | |
1830 | return true; | |
1831 | ||
1832 | if (!conf) | |
1833 | return ret_set_errno(false, EINVAL); | |
1834 | ||
1835 | if (conf->cgroup_meta.relative || geteuid()) | |
ccb4cabe SH |
1836 | return true; |
1837 | ||
779b3d82 | 1838 | for (int i = 0; ops->hierarchies[i]; i++) { |
88396101 | 1839 | __do_free char *fullpath = NULL; |
52d08ab0 | 1840 | int ret; |
11c23867 | 1841 | |
35ec1a38 | 1842 | fullpath = make_cgroup_path(ops->hierarchies[i], |
44585f1a | 1843 | ops->hierarchies[i]->at_base, |
35ec1a38 | 1844 | "cgroup.procs", NULL); |
7cea5905 | 1845 | ret = lxc_write_to_file(fullpath, "0", 2, false, 0666); |
52d08ab0 | 1846 | if (ret != 0) |
77c3e9a2 | 1847 | return log_error_errno(false, errno, "Failed to escape to cgroup \"%s\"", fullpath); |
ccb4cabe SH |
1848 | } |
1849 | ||
6df334d1 | 1850 | return true; |
ccb4cabe SH |
1851 | } |
1852 | ||
ff9edd2d | 1853 | __cgfsng_ops static int cgfsng_criu_num_hierarchies(struct cgroup_ops *ops) |
36662416 | 1854 | { |
69b4a4bb CB |
1855 | int i = 0; |
1856 | ||
e3ffb28b CB |
1857 | if (!ops) |
1858 | return ret_set_errno(-1, ENOENT); | |
1859 | ||
69b4a4bb CB |
1860 | if (!ops->hierarchies) |
1861 | return 0; | |
36662416 | 1862 | |
69b4a4bb | 1863 | for (; ops->hierarchies[i]; i++) |
36662416 TA |
1864 | ; |
1865 | ||
1866 | return i; | |
1867 | } | |
1868 | ||
ff9edd2d CB |
1869 | __cgfsng_ops static bool cgfsng_criu_get_hierarchies(struct cgroup_ops *ops, |
1870 | int n, char ***out) | |
36662416 TA |
1871 | { |
1872 | int i; | |
1873 | ||
aa48a34f CB |
1874 | if (!ops) |
1875 | return ret_set_errno(false, ENOENT); | |
1876 | ||
69b4a4bb | 1877 | if (!ops->hierarchies) |
77c3e9a2 | 1878 | return ret_set_errno(false, ENOENT); |
69b4a4bb | 1879 | |
36662416 | 1880 | /* sanity check n */ |
6b38e644 | 1881 | for (i = 0; i < n; i++) |
2202afc9 | 1882 | if (!ops->hierarchies[i]) |
aa48a34f | 1883 | return ret_set_errno(false, ENOENT); |
36662416 | 1884 | |
2202afc9 | 1885 | *out = ops->hierarchies[i]->controllers; |
36662416 TA |
1886 | |
1887 | return true; | |
1888 | } | |
1889 | ||
b8a4fe12 | 1890 | static int cg_legacy_freeze(struct cgroup_ops *ops) |
ccb4cabe | 1891 | { |
d6337a5f | 1892 | struct hierarchy *h; |
ccb4cabe | 1893 | |
ee3a7775 CB |
1894 | h = get_hierarchy(ops, "freezer"); |
1895 | if (!h) | |
d2203230 | 1896 | return ret_set_errno(-1, ENOENT); |
81468ea7 | 1897 | |
67ed60ce | 1898 | return lxc_write_openat(h->path_con, "freezer.state", |
c04a6d4e | 1899 | "FROZEN", STRLITERALLEN("FROZEN")); |
ee3a7775 | 1900 | } |
942e193e | 1901 | |
018051e3 CB |
1902 | static int freezer_cgroup_events_cb(int fd, uint32_t events, void *cbdata, |
1903 | struct lxc_epoll_descr *descr) | |
ee3a7775 | 1904 | { |
018051e3 | 1905 | __do_free char *line = NULL; |
ee3a7775 | 1906 | __do_fclose FILE *f = NULL; |
018051e3 CB |
1907 | int state = PTR_TO_INT(cbdata); |
1908 | size_t len; | |
1909 | const char *state_string; | |
1910 | ||
c8af3332 | 1911 | f = fdopen_at(fd, "", "re", PROTECT_OPEN, PROTECT_LOOKUP_BENEATH); |
018051e3 CB |
1912 | if (!f) |
1913 | return LXC_MAINLOOP_ERROR; | |
018051e3 CB |
1914 | |
1915 | if (state == 1) | |
1916 | state_string = "frozen 1"; | |
1917 | else | |
1918 | state_string = "frozen 0"; | |
1919 | ||
1920 | while (getline(&line, &len, f) != -1) | |
aa72fbe7 | 1921 | if (strnequal(line, state_string, STRLITERALLEN("frozen") + 2)) |
018051e3 CB |
1922 | return LXC_MAINLOOP_CLOSE; |
1923 | ||
281c3645 CB |
1924 | rewind(f); |
1925 | ||
018051e3 CB |
1926 | return LXC_MAINLOOP_CONTINUE; |
1927 | } | |
1928 | ||
443be565 WB |
1929 | static int cg_unified_freeze_do(struct cgroup_ops *ops, int timeout, |
1930 | const char *state_string, | |
1931 | int state_num, | |
1932 | const char *epoll_error, | |
1933 | const char *wait_error) | |
018051e3 | 1934 | { |
f62cf1d4 | 1935 | __do_close int fd = -EBADF; |
eafc1bb6 | 1936 | call_cleaner(lxc_mainloop_close) struct lxc_epoll_descr *descr_ptr = NULL; |
018051e3 CB |
1937 | int ret; |
1938 | struct lxc_epoll_descr descr; | |
ee3a7775 | 1939 | struct hierarchy *h; |
942e193e CB |
1940 | |
1941 | h = ops->unified; | |
457ca9aa | 1942 | if (!h) |
d2203230 | 1943 | return ret_set_errno(-1, ENOENT); |
d6337a5f | 1944 | |
67ed60ce | 1945 | if (!h->path_con) |
d2203230 | 1946 | return ret_set_errno(-1, EEXIST); |
d6337a5f | 1947 | |
018051e3 CB |
1948 | if (timeout != 0) { |
1949 | __do_free char *events_file = NULL; | |
942e193e | 1950 | |
67ed60ce | 1951 | events_file = must_make_path(h->path_con, "cgroup.events", NULL); |
018051e3 CB |
1952 | fd = open(events_file, O_RDONLY | O_CLOEXEC); |
1953 | if (fd < 0) | |
d2203230 | 1954 | return log_error_errno(-1, errno, "Failed to open cgroup.events file"); |
942e193e | 1955 | |
018051e3 CB |
1956 | ret = lxc_mainloop_open(&descr); |
1957 | if (ret) | |
443be565 | 1958 | return log_error_errno(-1, errno, "%s", epoll_error); |
942e193e | 1959 | |
018051e3 CB |
1960 | /* automatically cleaned up now */ |
1961 | descr_ptr = &descr; | |
942e193e | 1962 | |
385e58e8 | 1963 | ret = lxc_mainloop_add_handler_events(&descr, fd, EPOLLPRI, freezer_cgroup_events_cb, INT_TO_PTR(state_num)); |
018051e3 | 1964 | if (ret < 0) |
d2203230 | 1965 | return log_error_errno(-1, errno, "Failed to add cgroup.events fd handler to mainloop"); |
018051e3 | 1966 | } |
942e193e | 1967 | |
67ed60ce | 1968 | ret = lxc_write_openat(h->path_con, "cgroup.freeze", state_string, 1); |
018051e3 | 1969 | if (ret < 0) |
d2203230 | 1970 | return log_error_errno(-1, errno, "Failed to open cgroup.freeze file"); |
018051e3 CB |
1971 | |
1972 | if (timeout != 0 && lxc_mainloop(&descr, timeout)) | |
443be565 | 1973 | return log_error_errno(-1, errno, "%s", wait_error); |
018051e3 CB |
1974 | |
1975 | return 0; | |
942e193e CB |
1976 | } |
1977 | ||
/* Freeze on the unified hierarchy: request state "1" and wait per @timeout. */
static int cg_unified_freeze(struct cgroup_ops *ops, int timeout)
{
	static const char epoll_error[] =
		"Failed to create epoll instance to wait for container freeze";
	static const char wait_error[] =
		"Failed to wait for container to be frozen";

	return cg_unified_freeze_do(ops, timeout, "1", 1, epoll_error, wait_error);
}
1984 | ||
018051e3 | 1985 | __cgfsng_ops static int cgfsng_freeze(struct cgroup_ops *ops, int timeout) |
942e193e | 1986 | { |
81468ea7 | 1987 | if (!ops->hierarchies) |
d2203230 | 1988 | return ret_set_errno(-1, ENOENT); |
81468ea7 | 1989 | |
ee3a7775 CB |
1990 | if (ops->cgroup_layout != CGROUP_LAYOUT_UNIFIED) |
1991 | return cg_legacy_freeze(ops); | |
942e193e | 1992 | |
018051e3 | 1993 | return cg_unified_freeze(ops, timeout); |
ee3a7775 CB |
1994 | } |
1995 | ||
018051e3 | 1996 | static int cg_legacy_unfreeze(struct cgroup_ops *ops) |
ee3a7775 | 1997 | { |
ee3a7775 CB |
1998 | struct hierarchy *h; |
1999 | ||
2000 | h = get_hierarchy(ops, "freezer"); | |
2001 | if (!h) | |
d2203230 | 2002 | return ret_set_errno(-1, ENOENT); |
ee3a7775 | 2003 | |
67ed60ce | 2004 | return lxc_write_openat(h->path_con, "freezer.state", |
c04a6d4e | 2005 | "THAWED", STRLITERALLEN("THAWED")); |
ee3a7775 CB |
2006 | } |
2007 | ||
/* Unfreeze on the unified hierarchy: request state "0" and wait per @timeout. */
static int cg_unified_unfreeze(struct cgroup_ops *ops, int timeout)
{
	static const char epoll_error[] =
		"Failed to create epoll instance to wait for container unfreeze";
	static const char wait_error[] =
		"Failed to wait for container to be unfrozen";

	return cg_unified_freeze_do(ops, timeout, "0", 0, epoll_error, wait_error);
}
2014 | ||
018051e3 | 2015 | __cgfsng_ops static int cgfsng_unfreeze(struct cgroup_ops *ops, int timeout) |
ee3a7775 CB |
2016 | { |
2017 | if (!ops->hierarchies) | |
d2203230 | 2018 | return ret_set_errno(-1, ENOENT); |
ee3a7775 CB |
2019 | |
2020 | if (ops->cgroup_layout != CGROUP_LAYOUT_UNIFIED) | |
2021 | return cg_legacy_unfreeze(ops); | |
2022 | ||
018051e3 | 2023 | return cg_unified_unfreeze(ops, timeout); |
ccb4cabe SH |
2024 | } |
2025 | ||
a900cbaf WB |
2026 | static const char *cgfsng_get_cgroup_do(struct cgroup_ops *ops, |
2027 | const char *controller, bool limiting) | |
ccb4cabe | 2028 | { |
d6337a5f | 2029 | struct hierarchy *h; |
35ec1a38 CB |
2030 | size_t len; |
2031 | const char *path; | |
d6337a5f | 2032 | |
2202afc9 | 2033 | h = get_hierarchy(ops, controller); |
6bdf9691 | 2034 | if (!h) |
35ec1a38 CB |
2035 | return log_warn_errno(NULL, ENOENT, |
2036 | "Failed to find hierarchy for controller \"%s\"", maybe_empty(controller)); | |
ccb4cabe | 2037 | |
a900cbaf | 2038 | if (limiting) |
b1b1a60f | 2039 | path = h->path_lim; |
35ec1a38 | 2040 | else |
67ed60ce | 2041 | path = h->path_con; |
35ec1a38 CB |
2042 | if (!path) |
2043 | return NULL; | |
a900cbaf | 2044 | |
a58be2ad CB |
2045 | len = strlen(h->at_mnt); |
2046 | if (!strnequal(h->at_mnt, DEFAULT_CGROUP_MOUNTPOINT, | |
35ec1a38 CB |
2047 | STRLITERALLEN(DEFAULT_CGROUP_MOUNTPOINT))) { |
2048 | path += STRLITERALLEN(DEFAULT_CGROUP_MOUNTPOINT); | |
2049 | path += strspn(path, "/"); | |
2050 | } | |
2051 | return path += len; | |
371f834d SH |
2052 | } |
2053 | ||
a900cbaf WB |
2054 | __cgfsng_ops static const char *cgfsng_get_cgroup(struct cgroup_ops *ops, |
2055 | const char *controller) | |
2056 | { | |
2057 | return cgfsng_get_cgroup_do(ops, controller, false); | |
2058 | } | |
2059 | ||
2060 | __cgfsng_ops static const char *cgfsng_get_limiting_cgroup(struct cgroup_ops *ops, | |
2061 | const char *controller) | |
2062 | { | |
2063 | return cgfsng_get_cgroup_do(ops, controller, true); | |
2064 | } | |
2065 | ||
c40c8209 CB |
2066 | /* Given a cgroup path returned from lxc_cmd_get_cgroup_path, build a full path, |
2067 | * which must be freed by the caller. | |
371f834d | 2068 | */ |
c40c8209 CB |
2069 | static inline char *build_full_cgpath_from_monitorpath(struct hierarchy *h, |
2070 | const char *inpath, | |
2071 | const char *filename) | |
371f834d | 2072 | { |
35ec1a38 | 2073 | return make_cgroup_path(h, inpath, filename, NULL); |
ccb4cabe SH |
2074 | } |
2075 | ||
4b86fefd | 2076 | static int cgroup_attach_leaf(const struct lxc_conf *conf, int unified_fd, pid_t pid) |
c2aed66d | 2077 | { |
ad275c16 | 2078 | int idx = 1; |
c2aed66d | 2079 | int ret; |
900b6606 | 2080 | char pidstr[INTTYPE_TO_STRLEN(int64_t) + 1]; |
6e2078de | 2081 | ssize_t pidstr_len; |
c2aed66d | 2082 | |
ad275c16 | 2083 | /* Create leaf cgroup. */ |
275e8ef8 | 2084 | ret = mkdirat(unified_fd, ".lxc", 0755); |
ad275c16 | 2085 | if (ret < 0 && errno != EEXIST) |
6e2078de CB |
2086 | return log_error_errno(-errno, errno, "Failed to create leaf cgroup \".lxc\""); |
2087 | ||
0bba27c1 CB |
2088 | pidstr_len = strnprintf(pidstr, sizeof(pidstr), INT64_FMT, (int64_t)pid); |
2089 | if (pidstr_len < 0) | |
2090 | return pidstr_len; | |
ad275c16 | 2091 | |
275e8ef8 | 2092 | ret = lxc_writeat(unified_fd, ".lxc/cgroup.procs", pidstr, pidstr_len); |
ad275c16 CB |
2093 | if (ret < 0) |
2094 | ret = lxc_writeat(unified_fd, "cgroup.procs", pidstr, pidstr_len); | |
c2aed66d | 2095 | if (ret == 0) |
6e2078de | 2096 | return log_trace(0, "Moved process %s into cgroup %d(.lxc)", pidstr, unified_fd); |
ad275c16 | 2097 | |
bad788b0 CB |
2098 | /* this is a non-leaf node */ |
2099 | if (errno != EBUSY) | |
6e2078de | 2100 | return log_error_errno(-errno, errno, "Failed to attach to unified cgroup"); |
c2aed66d | 2101 | |
c2aed66d | 2102 | do { |
7581a82f | 2103 | bool rm = false; |
c80c9a70 | 2104 | char attach_cgroup[STRLITERALLEN(".lxc-/cgroup.procs") + INTTYPE_TO_STRLEN(int) + 1]; |
9fd047d1 | 2105 | char *slash = attach_cgroup; |
c2aed66d | 2106 | |
0bba27c1 CB |
2107 | ret = strnprintf(attach_cgroup, sizeof(attach_cgroup), ".lxc-%d/cgroup.procs", idx); |
2108 | if (ret < 0) | |
2109 | return ret; | |
5045306b | 2110 | |
c80c9a70 CB |
2111 | /* |
2112 | * This shouldn't really happen but the compiler might complain | |
2113 | * that a short write would cause a buffer overrun. So be on | |
2114 | * the safe side. | |
2115 | */ | |
2116 | if (ret < STRLITERALLEN(".lxc-/cgroup.procs")) | |
2117 | return log_error_errno(-EINVAL, EINVAL, "Unexpected short write would cause buffer-overrun"); | |
2118 | ||
9fd047d1 | 2119 | slash += (ret - STRLITERALLEN("/cgroup.procs")); |
bad788b0 | 2120 | *slash = '\0'; |
ad275c16 | 2121 | |
bad788b0 | 2122 | ret = mkdirat(unified_fd, attach_cgroup, 0755); |
c2aed66d | 2123 | if (ret < 0 && errno != EEXIST) |
d2203230 | 2124 | return log_error_errno(-1, errno, "Failed to create cgroup %s", attach_cgroup); |
7581a82f CB |
2125 | if (ret == 0) |
2126 | rm = true; | |
c2aed66d | 2127 | |
bad788b0 | 2128 | *slash = '/'; |
ad275c16 | 2129 | |
bad788b0 | 2130 | ret = lxc_writeat(unified_fd, attach_cgroup, pidstr, pidstr_len); |
c2aed66d | 2131 | if (ret == 0) |
6e2078de | 2132 | return log_trace(0, "Moved process %s into cgroup %d(%s)", pidstr, unified_fd, attach_cgroup); |
c2aed66d | 2133 | |
7581a82f CB |
2134 | if (rm && unlinkat(unified_fd, attach_cgroup, AT_REMOVEDIR)) |
2135 | SYSERROR("Failed to remove cgroup \"%d(%s)\"", unified_fd, attach_cgroup); | |
2136 | ||
c2aed66d CB |
2137 | /* this is a non-leaf node */ |
2138 | if (errno != EBUSY) | |
d2203230 | 2139 | return log_error_errno(-1, errno, "Failed to attach to unified cgroup"); |
c2aed66d | 2140 | |
edae86e9 CB |
2141 | idx++; |
2142 | } while (idx < 1000); | |
c2aed66d | 2143 | |
ad275c16 | 2144 | return log_error_errno(-1, errno, "Failed to attach to unified cgroup"); |
c2aed66d CB |
2145 | } |
2146 | ||
d1783ef4 CB |
2147 | static int cgroup_attach_create_leaf(const struct lxc_conf *conf, |
2148 | int unified_fd, int *sk_fd) | |
2149 | { | |
7d849163 CB |
2150 | __do_close int sk = *sk_fd, target_fd0 = -EBADF, target_fd1 = -EBADF; |
2151 | int target_fds[2]; | |
d1783ef4 CB |
2152 | ssize_t ret; |
2153 | ||
2154 | /* Create leaf cgroup. */ | |
2155 | ret = mkdirat(unified_fd, ".lxc", 0755); | |
2156 | if (ret < 0 && errno != EEXIST) | |
2157 | return log_error_errno(-1, errno, "Failed to create leaf cgroup \".lxc\""); | |
2158 | ||
7043e2b4 | 2159 | target_fd0 = open_at(unified_fd, ".lxc/cgroup.procs", PROTECT_OPEN_W, PROTECT_LOOKUP_BENEATH, 0); |
7d849163 | 2160 | if (target_fd0 < 0) |
d1783ef4 | 2161 | return log_error_errno(-errno, errno, "Failed to open \".lxc/cgroup.procs\""); |
7d849163 | 2162 | target_fds[0] = target_fd0; |
d1783ef4 | 2163 | |
7043e2b4 | 2164 | target_fd1 = open_at(unified_fd, "cgroup.procs", PROTECT_OPEN_W, PROTECT_LOOKUP_BENEATH, 0); |
7d849163 | 2165 | if (target_fd1 < 0) |
49df620b | 2166 | return log_error_errno(-errno, errno, "Failed to open \".lxc/cgroup.procs\""); |
7d849163 | 2167 | target_fds[1] = target_fd1; |
49df620b CB |
2168 | |
2169 | ret = lxc_abstract_unix_send_fds(sk, target_fds, 2, NULL, 0); | |
d1783ef4 | 2170 | if (ret <= 0) |
49df620b | 2171 | return log_error_errno(-errno, errno, "Failed to send \".lxc/cgroup.procs\" fds %d and %d", |
7d849163 | 2172 | target_fd0, target_fd1); |
d1783ef4 | 2173 | |
7d849163 | 2174 | return log_debug(0, "Sent target cgroup fds %d and %d", target_fd0, target_fd1); |
d1783ef4 CB |
2175 | } |
2176 | ||
2177 | static int cgroup_attach_move_into_leaf(const struct lxc_conf *conf, | |
2178 | int *sk_fd, pid_t pid) | |
2179 | { | |
7d849163 CB |
2180 | __do_close int sk = *sk_fd, target_fd0 = -EBADF, target_fd1 = -EBADF; |
2181 | int target_fds[2]; | |
d1783ef4 CB |
2182 | char pidstr[INTTYPE_TO_STRLEN(int64_t) + 1]; |
2183 | size_t pidstr_len; | |
2184 | ssize_t ret; | |
2185 | ||
d17c815d CB |
2186 | ret = lxc_abstract_unix_recv_two_fds(sk, target_fds); |
2187 | if (ret < 0) | |
d1783ef4 | 2188 | return log_error_errno(-1, errno, "Failed to receive target cgroup fd"); |
7d849163 CB |
2189 | target_fd0 = target_fds[0]; |
2190 | target_fd1 = target_fds[1]; | |
d1783ef4 CB |
2191 | |
2192 | pidstr_len = sprintf(pidstr, INT64_FMT, (int64_t)pid); | |
2193 | ||
7d849163 CB |
2194 | ret = lxc_write_nointr(target_fd0, pidstr, pidstr_len); |
2195 | if (ret > 0 && ret == pidstr_len) | |
2196 | return log_debug(0, "Moved process into target cgroup via fd %d", target_fd0); | |
2197 | ||
49df620b | 2198 | ret = lxc_write_nointr(target_fd1, pidstr, pidstr_len); |
7d849163 CB |
2199 | if (ret > 0 && ret == pidstr_len) |
2200 | return log_debug(0, "Moved process into target cgroup via fd %d", target_fd1); | |
d1783ef4 | 2201 | |
7d849163 CB |
2202 | return log_debug_errno(-1, errno, "Failed to move process into target cgroup via fd %d and %d", |
2203 | target_fd0, target_fd1); | |
d1783ef4 CB |
2204 | } |
2205 | ||
/*
 * Arguments shared by the parent and child wrappers that are run through
 * userns_exec_minimal() when attaching to the unified cgroup.
 */
struct userns_exec_unified_attach_data {
	/* Container configuration; the wrappers reject a NULL value. */
	const struct lxc_conf *conf;

	/* Directory fd of the unified cgroup to attach to. */
	int unified_fd;

	/*
	 * Socket pair linking the two wrappers: the parent wrapper keeps
	 * sk_pair[0], the child wrapper keeps sk_pair[1].
	 */
	int sk_pair[2];

	/* Process to move into the cgroup. */
	pid_t pid;
};
2212 | ||
d1783ef4 CB |
2213 | static int cgroup_unified_attach_child_wrapper(void *data) |
2214 | { | |
2215 | struct userns_exec_unified_attach_data *args = data; | |
2216 | ||
2217 | if (!args->conf || args->unified_fd < 0 || args->pid <= 0 || | |
2218 | args->sk_pair[0] < 0 || args->sk_pair[1] < 0) | |
2219 | return ret_errno(EINVAL); | |
2220 | ||
2221 | close_prot_errno_disarm(args->sk_pair[0]); | |
2222 | return cgroup_attach_create_leaf(args->conf, args->unified_fd, | |
2223 | &args->sk_pair[1]); | |
2224 | } | |
2225 | ||
2226 | static int cgroup_unified_attach_parent_wrapper(void *data) | |
4b86fefd CB |
2227 | { |
2228 | struct userns_exec_unified_attach_data *args = data; | |
4b86fefd | 2229 | |
d1783ef4 CB |
2230 | if (!args->conf || args->unified_fd < 0 || args->pid <= 0 || |
2231 | args->sk_pair[0] < 0 || args->sk_pair[1] < 0) | |
4b86fefd CB |
2232 | return ret_errno(EINVAL); |
2233 | ||
d1783ef4 CB |
2234 | close_prot_errno_disarm(args->sk_pair[1]); |
2235 | return cgroup_attach_move_into_leaf(args->conf, &args->sk_pair[0], | |
2236 | args->pid); | |
4b86fefd CB |
2237 | } |
2238 | ||
900b6606 CB |
2239 | /* Technically, we're always at a delegation boundary here (This is especially |
2240 | * true when cgroup namespaces are available.). The reasoning is that in order | |
2241 | * for us to have been able to start a container in the first place the root | |
2242 | * cgroup must have been a leaf node. Now, either the container's init system | |
2243 | * has populated the cgroup and kept it as a leaf node or it has created | |
2244 | * subtrees. In the former case we will simply attach to the leaf node we | |
2245 | * created when we started the container in the latter case we create our own | |
2246 | * cgroup for the attaching process. | |
2247 | */ | |
7581a82f CB |
2248 | static int __cg_unified_attach(const struct hierarchy *h, |
2249 | const struct lxc_conf *conf, const char *name, | |
900b6606 CB |
2250 | const char *lxcpath, pid_t pid, |
2251 | const char *controller) | |
2252 | { | |
f62cf1d4 | 2253 | __do_close int unified_fd = -EBADF; |
32908bfd | 2254 | __do_free char *path = NULL, *cgroup = NULL; |
900b6606 CB |
2255 | int ret; |
2256 | ||
7581a82f CB |
2257 | if (!conf || !name || !lxcpath || pid <= 0) |
2258 | return ret_errno(EINVAL); | |
2259 | ||
2260 | ret = cgroup_attach(conf, name, lxcpath, pid); | |
32908bfd CB |
2261 | if (ret == 0) |
2262 | return log_trace(0, "Attached to unified cgroup via command handler"); | |
59114d80 | 2263 | if (ret != -ENOCGROUP2) |
32908bfd CB |
2264 | return log_error_errno(ret, errno, "Failed to attach to unified cgroup"); |
2265 | ||
2266 | /* Fall back to retrieving the path for the unified cgroup. */ | |
2267 | cgroup = lxc_cmd_get_cgroup_path(name, lxcpath, controller); | |
2268 | /* not running */ | |
2269 | if (!cgroup) | |
2270 | return 0; | |
900b6606 | 2271 | |
35ec1a38 | 2272 | path = make_cgroup_path(h, cgroup, NULL); |
900b6606 | 2273 | |
32908bfd | 2274 | unified_fd = open(path, O_PATH | O_DIRECTORY | O_CLOEXEC); |
900b6606 | 2275 | if (unified_fd < 0) |
7581a82f CB |
2276 | return ret_errno(EBADF); |
2277 | ||
4b86fefd CB |
2278 | if (!lxc_list_empty(&conf->id_map)) { |
2279 | struct userns_exec_unified_attach_data args = { | |
2280 | .conf = conf, | |
2281 | .unified_fd = unified_fd, | |
2282 | .pid = pid, | |
2283 | }; | |
2284 | ||
d1783ef4 CB |
2285 | ret = socketpair(PF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, args.sk_pair); |
2286 | if (ret < 0) | |
2287 | return -errno; | |
2288 | ||
2289 | ret = userns_exec_minimal(conf, | |
2290 | cgroup_unified_attach_parent_wrapper, | |
2291 | &args, | |
2292 | cgroup_unified_attach_child_wrapper, | |
2293 | &args); | |
4b86fefd CB |
2294 | } else { |
2295 | ret = cgroup_attach_leaf(conf, unified_fd, pid); | |
2296 | } | |
2297 | ||
2298 | return ret; | |
900b6606 CB |
2299 | } |
2300 | ||
7581a82f CB |
2301 | __cgfsng_ops static bool cgfsng_attach(struct cgroup_ops *ops, |
2302 | const struct lxc_conf *conf, | |
2303 | const char *name, const char *lxcpath, | |
2304 | pid_t pid) | |
ccb4cabe | 2305 | { |
81b5d48a | 2306 | int len, ret; |
a3650c0c | 2307 | char pidstr[INTTYPE_TO_STRLEN(pid_t)]; |
ccb4cabe | 2308 | |
ab9a452d CB |
2309 | if (!ops) |
2310 | return ret_set_errno(false, ENOENT); | |
2311 | ||
69b4a4bb CB |
2312 | if (!ops->hierarchies) |
2313 | return true; | |
2314 | ||
0bba27c1 CB |
2315 | len = strnprintf(pidstr, sizeof(pidstr), "%d", pid); |
2316 | if (len < 0) | |
ccb4cabe SH |
2317 | return false; |
2318 | ||
81b5d48a | 2319 | for (int i = 0; ops->hierarchies[i]; i++) { |
c05b17bd | 2320 | __do_free char *fullpath = NULL, *path = NULL; |
2202afc9 | 2321 | struct hierarchy *h = ops->hierarchies[i]; |
ccb4cabe | 2322 | |
b8572e8c | 2323 | if (h->fs_type == UNIFIED_HIERARCHY) { |
7581a82f | 2324 | ret = __cg_unified_attach(h, conf, name, lxcpath, pid, |
a3926f6a | 2325 | h->controllers[0]); |
c2aed66d CB |
2326 | if (ret < 0) |
2327 | return false; | |
2328 | ||
2329 | continue; | |
2330 | } | |
2331 | ||
ccb4cabe | 2332 | path = lxc_cmd_get_cgroup_path(name, lxcpath, h->controllers[0]); |
c2aed66d CB |
2333 | /* not running */ |
2334 | if (!path) | |
e2cb2e74 | 2335 | return false; |
ccb4cabe | 2336 | |
371f834d | 2337 | fullpath = build_full_cgpath_from_monitorpath(h, path, "cgroup.procs"); |
7cea5905 | 2338 | ret = lxc_write_to_file(fullpath, pidstr, len, false, 0666); |
ab9a452d | 2339 | if (ret < 0) |
77c3e9a2 | 2340 | return log_error_errno(false, errno, "Failed to attach %d to %s", |
ab9a452d | 2341 | (int)pid, fullpath); |
ccb4cabe SH |
2342 | } |
2343 | ||
ccb4cabe SH |
2344 | return true; |
2345 | } | |
2346 | ||
e2bd2b13 CB |
2347 | /* Called externally (i.e. from 'lxc-cgroup') to query cgroup limits. Here we |
2348 | * don't have a cgroup_data set up, so we ask the running container through the | |
2349 | * commands API for the cgroup path. | |
ccb4cabe | 2350 | */ |
b857f4be | 2351 | __cgfsng_ops static int cgfsng_get(struct cgroup_ops *ops, const char *filename, |
fb55e009 CB |
2352 | char *value, size_t len, const char *name, |
2353 | const char *lxcpath) | |
ccb4cabe | 2354 | { |
d97919ab | 2355 | __do_free char *path = NULL; |
88396101 | 2356 | __do_free char *controller = NULL; |
d97919ab | 2357 | char *p; |
0069cc61 | 2358 | struct hierarchy *h; |
861cb8c2 | 2359 | int ret = -1; |
ccb4cabe | 2360 | |
a358028a CB |
2361 | if (!ops) |
2362 | return ret_set_errno(-1, ENOENT); | |
2363 | ||
63ba9eaf CB |
2364 | controller = strdup(filename); |
2365 | if (!controller) | |
2366 | return ret_errno(ENOMEM); | |
2367 | ||
0069cc61 CB |
2368 | p = strchr(controller, '.'); |
2369 | if (p) | |
ccb4cabe SH |
2370 | *p = '\0'; |
2371 | ||
a900cbaf | 2372 | path = lxc_cmd_get_limiting_cgroup_path(name, lxcpath, controller); |
0069cc61 CB |
2373 | /* not running */ |
2374 | if (!path) | |
ccb4cabe SH |
2375 | return -1; |
2376 | ||
2202afc9 | 2377 | h = get_hierarchy(ops, controller); |
ccb4cabe | 2378 | if (h) { |
88396101 | 2379 | __do_free char *fullpath = NULL; |
0069cc61 CB |
2380 | |
2381 | fullpath = build_full_cgpath_from_monitorpath(h, path, filename); | |
ccb4cabe | 2382 | ret = lxc_read_from_file(fullpath, value, len); |
ccb4cabe | 2383 | } |
ccb4cabe SH |
2384 | |
2385 | return ret; | |
2386 | } | |
2387 | ||
cb3fc90c CB |
2388 | static int device_cgroup_parse_access(struct device_item *device, const char *val) |
2389 | { | |
2390 | for (int count = 0; count < 3; count++, val++) { | |
2391 | switch (*val) { | |
2392 | case 'r': | |
2393 | device->access[count] = *val; | |
2394 | break; | |
2395 | case 'w': | |
2396 | device->access[count] = *val; | |
2397 | break; | |
2398 | case 'm': | |
2399 | device->access[count] = *val; | |
2400 | break; | |
2401 | case '\n': | |
2402 | case '\0': | |
2403 | count = 3; | |
2404 | break; | |
2405 | default: | |
2406 | return ret_errno(EINVAL); | |
2407 | } | |
2408 | } | |
2409 | ||
2410 | return 0; | |
2411 | } | |
2412 | ||
2a63b5cb CB |
2413 | static int device_cgroup_rule_parse(struct device_item *device, const char *key, |
2414 | const char *val) | |
2415 | { | |
2416 | int count, ret; | |
2417 | char temp[50]; | |
2418 | ||
8b99a20a | 2419 | if (strequal("devices.allow", key)) |
69885a76 | 2420 | device->allow = 1; /* allow the device */ |
2a63b5cb | 2421 | else |
69885a76 | 2422 | device->allow = 0; /* deny the device */ |
2a63b5cb | 2423 | |
8b99a20a | 2424 | if (strequal(val, "a")) { |
2a63b5cb CB |
2425 | /* global rule */ |
2426 | device->type = 'a'; | |
2427 | device->major = -1; | |
2428 | device->minor = -1; | |
2a63b5cb | 2429 | return 0; |
2a63b5cb CB |
2430 | } |
2431 | ||
2432 | switch (*val) { | |
2433 | case 'a': | |
2434 | __fallthrough; | |
2435 | case 'b': | |
2436 | __fallthrough; | |
2437 | case 'c': | |
2438 | device->type = *val; | |
2439 | break; | |
2440 | default: | |
2441 | return -1; | |
2442 | } | |
2443 | ||
2444 | val++; | |
2445 | if (!isspace(*val)) | |
2446 | return -1; | |
2447 | val++; | |
2448 | if (*val == '*') { | |
2449 | device->major = -1; | |
2450 | val++; | |
2451 | } else if (isdigit(*val)) { | |
2452 | memset(temp, 0, sizeof(temp)); | |
2453 | for (count = 0; count < sizeof(temp) - 1; count++) { | |
2454 | temp[count] = *val; | |
2455 | val++; | |
2456 | if (!isdigit(*val)) | |
2457 | break; | |
2458 | } | |
2459 | ret = lxc_safe_int(temp, &device->major); | |
2460 | if (ret) | |
2461 | return -1; | |
2462 | } else { | |
2463 | return -1; | |
2464 | } | |
2465 | if (*val != ':') | |
2466 | return -1; | |
2467 | val++; | |
2468 | ||
2469 | /* read minor */ | |
2470 | if (*val == '*') { | |
2471 | device->minor = -1; | |
2472 | val++; | |
2473 | } else if (isdigit(*val)) { | |
2474 | memset(temp, 0, sizeof(temp)); | |
2475 | for (count = 0; count < sizeof(temp) - 1; count++) { | |
2476 | temp[count] = *val; | |
2477 | val++; | |
2478 | if (!isdigit(*val)) | |
2479 | break; | |
2480 | } | |
2481 | ret = lxc_safe_int(temp, &device->minor); | |
2482 | if (ret) | |
2483 | return -1; | |
2484 | } else { | |
2485 | return -1; | |
2486 | } | |
2487 | if (!isspace(*val)) | |
2488 | return -1; | |
2a63b5cb | 2489 | |
cb3fc90c | 2490 | return device_cgroup_parse_access(device, ++val); |
2a63b5cb CB |
2491 | } |
2492 | ||
eec533e3 CB |
2493 | /* Called externally (i.e. from 'lxc-cgroup') to set new cgroup limits. Here we |
2494 | * don't have a cgroup_data set up, so we ask the running container through the | |
2495 | * commands API for the cgroup path. | |
ccb4cabe | 2496 | */ |
b857f4be | 2497 | __cgfsng_ops static int cgfsng_set(struct cgroup_ops *ops, |
2a63b5cb | 2498 | const char *key, const char *value, |
fb55e009 | 2499 | const char *name, const char *lxcpath) |
ccb4cabe | 2500 | { |
d97919ab | 2501 | __do_free char *path = NULL; |
88396101 | 2502 | __do_free char *controller = NULL; |
d97919ab | 2503 | char *p; |
87777968 | 2504 | struct hierarchy *h; |
861cb8c2 | 2505 | int ret = -1; |
ccb4cabe | 2506 | |
b7aeda96 CB |
2507 | if (!ops || is_empty_string(key) || is_empty_string(value) || |
2508 | is_empty_string(name) || is_empty_string(lxcpath)) | |
2509 | return ret_errno(EINVAL); | |
a358028a | 2510 | |
63ba9eaf CB |
2511 | controller = strdup(key); |
2512 | if (!controller) | |
2513 | return ret_errno(ENOMEM); | |
2514 | ||
87777968 CB |
2515 | p = strchr(controller, '.'); |
2516 | if (p) | |
ccb4cabe SH |
2517 | *p = '\0'; |
2518 | ||
8b99a20a | 2519 | if (pure_unified_layout(ops) && strequal(controller, "devices")) { |
50329f28 | 2520 | struct device_item device = {}; |
2a63b5cb CB |
2521 | |
2522 | ret = device_cgroup_rule_parse(&device, key, value); | |
2523 | if (ret < 0) | |
d2203230 | 2524 | return log_error_errno(-1, EINVAL, "Failed to parse device string %s=%s", |
2a63b5cb CB |
2525 | key, value); |
2526 | ||
2527 | ret = lxc_cmd_add_bpf_device_cgroup(name, lxcpath, &device); | |
2528 | if (ret < 0) | |
2529 | return -1; | |
2530 | ||
2531 | return 0; | |
2532 | } | |
2533 | ||
a900cbaf | 2534 | path = lxc_cmd_get_limiting_cgroup_path(name, lxcpath, controller); |
87777968 CB |
2535 | /* not running */ |
2536 | if (!path) | |
ccb4cabe SH |
2537 | return -1; |
2538 | ||
2202afc9 | 2539 | h = get_hierarchy(ops, controller); |
ccb4cabe | 2540 | if (h) { |
88396101 | 2541 | __do_free char *fullpath = NULL; |
87777968 | 2542 | |
2a63b5cb | 2543 | fullpath = build_full_cgpath_from_monitorpath(h, path, key); |
7cea5905 | 2544 | ret = lxc_write_to_file(fullpath, value, strlen(value), false, 0666); |
ccb4cabe | 2545 | } |
ccb4cabe SH |
2546 | |
2547 | return ret; | |
2548 | } | |
2549 | ||
91d1a13a | 2550 | /* take devices cgroup line |
72add155 SH |
2551 | * /dev/foo rwx |
2552 | * and convert it to a valid | |
2553 | * type major:minor mode | |
91d1a13a CB |
2554 | * line. Return <0 on error. Dest is a preallocated buffer long enough to hold |
2555 | * the output. | |
72add155 | 2556 | */ |
cb3fc90c CB |
2557 | static int device_cgroup_rule_parse_devpath(struct device_item *device, |
2558 | const char *devpath) | |
72add155 | 2559 | { |
88396101 | 2560 | __do_free char *path = NULL; |
2a06d041 | 2561 | char *mode = NULL; |
cb3fc90c CB |
2562 | int n_parts, ret; |
2563 | char *p; | |
2564 | struct stat sb; | |
72add155 | 2565 | |
63ba9eaf CB |
2566 | path = strdup(devpath); |
2567 | if (!path) | |
2568 | return ret_errno(ENOMEM); | |
72add155 | 2569 | |
cb3fc90c CB |
2570 | /* |
2571 | * Read path followed by mode. Ignore any trailing text. | |
91d1a13a CB |
2572 | * A ' # comment' would be legal. Technically other text is not |
2573 | * legal, we could check for that if we cared to. | |
72add155 | 2574 | */ |
0dbdb99e | 2575 | for (n_parts = 1, p = path; *p; p++) { |
2c2d6c49 SH |
2576 | if (*p != ' ') |
2577 | continue; | |
2578 | *p = '\0'; | |
91d1a13a | 2579 | |
2c2d6c49 SH |
2580 | if (n_parts != 1) |
2581 | break; | |
2582 | p++; | |
2583 | n_parts++; | |
91d1a13a | 2584 | |
2c2d6c49 SH |
2585 | while (*p == ' ') |
2586 | p++; | |
91d1a13a | 2587 | |
2c2d6c49 | 2588 | mode = p; |
91d1a13a | 2589 | |
2c2d6c49 | 2590 | if (*p == '\0') |
cb3fc90c | 2591 | return ret_set_errno(-1, EINVAL); |
72add155 | 2592 | } |
2c2d6c49 | 2593 | |
83b25c4d CB |
2594 | if (!mode) |
2595 | return ret_errno(EINVAL); | |
2596 | ||
cb3fc90c CB |
2597 | if (device_cgroup_parse_access(device, mode) < 0) |
2598 | return -1; | |
2599 | ||
72add155 SH |
2600 | ret = stat(path, &sb); |
2601 | if (ret < 0) | |
cb3fc90c | 2602 | return ret_set_errno(-1, errno); |
72add155 | 2603 | |
72add155 SH |
2604 | mode_t m = sb.st_mode & S_IFMT; |
2605 | switch (m) { | |
2606 | case S_IFBLK: | |
cb3fc90c | 2607 | device->type = 'b'; |
72add155 SH |
2608 | break; |
2609 | case S_IFCHR: | |
cb3fc90c | 2610 | device->type = 'c'; |
72add155 | 2611 | break; |
2c2d6c49 | 2612 | default: |
77c3e9a2 | 2613 | return log_error_errno(-1, EINVAL, "Unsupported device type %i for \"%s\"", m, path); |
72add155 | 2614 | } |
2c2d6c49 | 2615 | |
cb3fc90c CB |
2616 | device->major = MAJOR(sb.st_rdev); |
2617 | device->minor = MINOR(sb.st_rdev); | |
2618 | device->allow = 1; | |
72add155 | 2619 | |
cb3fc90c CB |
2620 | return 0; |
2621 | } | |
2622 | ||
2623 | static int convert_devpath(const char *invalue, char *dest) | |
2624 | { | |
50329f28 | 2625 | struct device_item device = {}; |
cb3fc90c CB |
2626 | int ret; |
2627 | ||
2628 | ret = device_cgroup_rule_parse_devpath(&device, invalue); | |
2629 | if (ret < 0) | |
2630 | return -1; | |
2631 | ||
0bba27c1 CB |
2632 | ret = strnprintf(dest, 50, "%c %d:%d %s", device.type, device.major, |
2633 | device.minor, device.access); | |
2634 | if (ret < 0) | |
2635 | return log_error_errno(ret, -ret, | |
2636 | "Error on configuration value \"%c %d:%d %s\" (max 50 chars)", | |
2637 | device.type, device.major, device.minor, | |
2638 | device.access); | |
cb3fc90c CB |
2639 | |
2640 | return 0; | |
72add155 SH |
2641 | } |
2642 | ||
90e97284 CB |
2643 | /* Called from setup_limits - here we have the container's cgroup_data because |
2644 | * we created the cgroups. | |
ccb4cabe | 2645 | */ |
2202afc9 | 2646 | static int cg_legacy_set_data(struct cgroup_ops *ops, const char *filename, |
a900cbaf | 2647 | const char *value, bool is_cpuset) |
ccb4cabe | 2648 | { |
88396101 | 2649 | __do_free char *controller = NULL; |
d97919ab | 2650 | char *p; |
1a0e70ac CB |
2651 | /* "b|c <2^64-1>:<2^64-1> r|w|m" = 47 chars max */ |
2652 | char converted_value[50]; | |
b3646d7e | 2653 | struct hierarchy *h; |
64e82f8b | 2654 | |
63ba9eaf CB |
2655 | controller = strdup(filename); |
2656 | if (!controller) | |
2657 | return ret_errno(ENOMEM); | |
2658 | ||
ab1a6cac CB |
2659 | p = strchr(controller, '.'); |
2660 | if (p) | |
ccb4cabe SH |
2661 | *p = '\0'; |
2662 | ||
8b99a20a | 2663 | if (strequal("devices.allow", filename) && value[0] == '/') { |
c04a6d4e CB |
2664 | int ret; |
2665 | ||
72add155 SH |
2666 | ret = convert_devpath(value, converted_value); |
2667 | if (ret < 0) | |
c8bf519d | 2668 | return ret; |
72add155 | 2669 | value = converted_value; |
c8bf519d | 2670 | } |
2671 | ||
2202afc9 | 2672 | h = get_hierarchy(ops, controller); |
77c3e9a2 CB |
2673 | if (!h) |
2674 | return log_error_errno(-ENOENT, ENOENT, "Failed to setup limits for the \"%s\" controller. The controller seems to be unused by \"cgfsng\" cgroup driver or not enabled on the cgroup hierarchy", controller); | |
b3646d7e | 2675 | |
a900cbaf | 2676 | if (is_cpuset) { |
67ed60ce | 2677 | int ret = lxc_write_openat(h->path_con, filename, value, strlen(value)); |
a900cbaf WB |
2678 | if (ret) |
2679 | return ret; | |
2680 | } | |
b1b1a60f | 2681 | return lxc_write_openat(h->path_lim, filename, value, strlen(value)); |
ccb4cabe SH |
2682 | } |
2683 | ||
c581d2a6 CB |
2684 | __cgfsng_ops static bool cgfsng_setup_limits_legacy(struct cgroup_ops *ops, |
2685 | struct lxc_conf *conf, | |
2686 | bool do_devices) | |
ccb4cabe | 2687 | { |
d97919ab | 2688 | __do_free struct lxc_list *sorted_cgroup_settings = NULL; |
c581d2a6 | 2689 | struct lxc_list *cgroup_settings = &conf->cgroup; |
d97919ab | 2690 | struct lxc_list *iterator, *next; |
ccb4cabe | 2691 | struct lxc_cgroup *cg; |
ccb4cabe SH |
2692 | bool ret = false; |
2693 | ||
92ca7eb5 CB |
2694 | if (!ops) |
2695 | return ret_set_errno(false, ENOENT); | |
2696 | ||
2697 | if (!conf) | |
2698 | return ret_set_errno(false, EINVAL); | |
2699 | ||
2700 | cgroup_settings = &conf->cgroup; | |
ccb4cabe SH |
2701 | if (lxc_list_empty(cgroup_settings)) |
2702 | return true; | |
2703 | ||
69b4a4bb | 2704 | if (!ops->hierarchies) |
92ca7eb5 | 2705 | return ret_set_errno(false, EINVAL); |
69b4a4bb | 2706 | |
92afbe74 | 2707 | if (pure_unified_layout(ops)) |
b96aa96f CB |
2708 | return log_warn_errno(true, EINVAL, "Ignoring legacy cgroup limits on pure cgroup2 system"); |
2709 | ||
ccb4cabe | 2710 | sorted_cgroup_settings = sort_cgroup_settings(cgroup_settings); |
6b38e644 | 2711 | if (!sorted_cgroup_settings) |
ccb4cabe | 2712 | return false; |
ccb4cabe | 2713 | |
ccb4cabe SH |
2714 | lxc_list_for_each(iterator, sorted_cgroup_settings) { |
2715 | cg = iterator->elem; | |
2716 | ||
aa72fbe7 CB |
2717 | if (do_devices == strnequal("devices", cg->subsystem, 7)) { |
2718 | if (cg_legacy_set_data(ops, cg->subsystem, cg->value, strnequal("cpuset", cg->subsystem, 6))) { | |
fc3b9533 CB |
2719 | if (do_devices && (errno == EACCES || errno == EPERM)) { |
2720 | SYSWARN("Failed to set \"%s\" to \"%s\"", cg->subsystem, cg->value); | |
2721 | continue; | |
2722 | } | |
2723 | SYSERROR("Failed to set \"%s\" to \"%s\"", cg->subsystem, cg->value); | |
2724 | goto out; | |
ccb4cabe | 2725 | } |
77c3e9a2 | 2726 | DEBUG("Set controller \"%s\" set to \"%s\"", cg->subsystem, cg->value); |
ccb4cabe | 2727 | } |
ccb4cabe SH |
2728 | } |
2729 | ||
2730 | ret = true; | |
6b38e644 | 2731 | INFO("Limits for the legacy cgroup hierarchies have been setup"); |
ccb4cabe | 2732 | out: |
ccb4cabe SH |
2733 | lxc_list_for_each_safe(iterator, sorted_cgroup_settings, next) { |
2734 | lxc_list_del(iterator); | |
2735 | free(iterator); | |
2736 | } | |
d97919ab | 2737 | |
ccb4cabe SH |
2738 | return ret; |
2739 | } | |
2740 | ||
bf651989 CB |
2741 | /* |
2742 | * Some of the parsing logic comes from the original cgroup device v1 | |
2743 | * implementation in the kernel. | |
2744 | */ | |
4bfb655e CB |
2745 | static int bpf_device_cgroup_prepare(struct cgroup_ops *ops, |
2746 | struct lxc_conf *conf, const char *key, | |
bf651989 CB |
2747 | const char *val) |
2748 | { | |
50329f28 | 2749 | struct device_item device_item = {}; |
2a63b5cb | 2750 | int ret; |
bf651989 | 2751 | |
30bfbd3f | 2752 | if (strequal("devices.allow", key) && abspath(val)) |
cb3fc90c CB |
2753 | ret = device_cgroup_rule_parse_devpath(&device_item, val); |
2754 | else | |
2755 | ret = device_cgroup_rule_parse(&device_item, key, val); | |
2a63b5cb | 2756 | if (ret < 0) |
30bfbd3f | 2757 | return syserrno_set(EINVAL, "Failed to parse device rule %s=%s", key, val); |
4bfb655e | 2758 | |
60532b18 | 2759 | /* |
15970277 CB |
2760 | * Note that bpf_list_add_device() returns 1 if it altered the device |
2761 | * list and 0 if it didn't; both return values indicate success. | |
2762 | * Only a negative return value indicates an error. | |
60532b18 | 2763 | */ |
a134099d | 2764 | ret = bpf_list_add_device(&conf->bpf_devices, &device_item); |
2a63b5cb | 2765 | if (ret < 0) |
4bfb655e | 2766 | return -1; |
a134099d | 2767 | |
bf651989 CB |
2768 | return 0; |
2769 | } | |
2770 | ||
c581d2a6 CB |
2771 | __cgfsng_ops static bool cgfsng_setup_limits(struct cgroup_ops *ops, |
2772 | struct lxc_handler *handler) | |
6b38e644 | 2773 | { |
7e31931f CB |
2774 | struct lxc_list *cgroup_settings, *iterator; |
2775 | struct hierarchy *h; | |
2776 | struct lxc_conf *conf; | |
6b38e644 | 2777 | |
7e31931f CB |
2778 | if (!ops) |
2779 | return ret_set_errno(false, ENOENT); | |
2780 | ||
2781 | if (!ops->hierarchies) | |
6b38e644 CB |
2782 | return true; |
2783 | ||
7e31931f CB |
2784 | if (!ops->container_cgroup) |
2785 | return ret_set_errno(false, EINVAL); | |
2786 | ||
2787 | if (!handler || !handler->conf) | |
2788 | return ret_set_errno(false, EINVAL); | |
2789 | conf = handler->conf; | |
2790 | ||
7e31931f | 2791 | cgroup_settings = &conf->cgroup2; |
0e7a013e CB |
2792 | if (lxc_list_empty(cgroup_settings)) |
2793 | return true; | |
2794 | ||
2795 | if (!pure_unified_layout(ops)) | |
2796 | return log_warn_errno(true, EINVAL, "Ignoring cgroup2 limits on legacy cgroup system"); | |
7e31931f CB |
2797 | |
2798 | if (!ops->unified) | |
6b38e644 | 2799 | return false; |
7e31931f | 2800 | h = ops->unified; |
6b38e644 | 2801 | |
bf651989 | 2802 | lxc_list_for_each (iterator, cgroup_settings) { |
6b38e644 | 2803 | struct lxc_cgroup *cg = iterator->elem; |
c04a6d4e | 2804 | int ret; |
6b38e644 | 2805 | |
aa72fbe7 | 2806 | if (strnequal("devices", cg->subsystem, 7)) |
ee9d3ef0 CB |
2807 | ret = bpf_device_cgroup_prepare(ops, conf, cg->subsystem, cg->value); |
2808 | else | |
b1b1a60f | 2809 | ret = lxc_write_openat(h->path_lim, cg->subsystem, cg->value, strlen(cg->value)); |
ee9d3ef0 CB |
2810 | if (ret < 0) |
2811 | return log_error_errno(false, errno, "Failed to set \"%s\" to \"%s\"", cg->subsystem, cg->value); | |
2812 | ||
6b38e644 CB |
2813 | TRACE("Set \"%s\" to \"%s\"", cg->subsystem, cg->value); |
2814 | } | |
2815 | ||
7e31931f | 2816 | return log_info(true, "Limits for the unified cgroup hierarchy have been setup"); |
6b38e644 CB |
2817 | } |
2818 | ||
59eac805 | 2819 | __cgfsng_ops static bool cgfsng_devices_activate(struct cgroup_ops *ops, struct lxc_handler *handler) |
bf651989 | 2820 | { |
e552bd1a CB |
2821 | struct lxc_conf *conf; |
2822 | struct hierarchy *unified; | |
bf651989 | 2823 | |
e552bd1a CB |
2824 | if (!ops) |
2825 | return ret_set_errno(false, ENOENT); | |
2826 | ||
2827 | if (!ops->hierarchies) | |
2828 | return true; | |
2829 | ||
2830 | if (!ops->container_cgroup) | |
2831 | return ret_set_errno(false, EEXIST); | |
2832 | ||
2833 | if (!handler || !handler->conf) | |
2834 | return ret_set_errno(false, EINVAL); | |
2835 | conf = handler->conf; | |
2836 | ||
2837 | unified = ops->unified; | |
ca72ccb5 | 2838 | if (!unified || !device_utility_controller(unified) || |
67ed60ce | 2839 | !unified->path_con || |
a134099d | 2840 | lxc_list_empty(&(conf->bpf_devices).device_item)) |
bf651989 CB |
2841 | return true; |
2842 | ||
a134099d | 2843 | return bpf_cgroup_devices_attach(ops, &conf->bpf_devices); |
bf651989 CB |
2844 | } |
2845 | ||
59eac805 | 2846 | static bool __cgfsng_delegate_controllers(struct cgroup_ops *ops, const char *cgroup) |
6b38e644 | 2847 | { |
95ab26af CB |
2848 | __do_close int dfd_final = -EBADF; |
2849 | __do_free char *add_controllers = NULL, *copy = NULL; | |
c581d2a6 | 2850 | size_t full_len = 0; |
0954f6ce CB |
2851 | struct hierarchy *unified; |
2852 | int dfd_cur, ret; | |
95ab26af CB |
2853 | char *cur; |
2854 | char **it; | |
6b38e644 | 2855 | |
0954f6ce CB |
2856 | if (!ops->hierarchies || !pure_unified_layout(ops)) |
2857 | return true; | |
2858 | ||
2859 | unified = ops->unified; | |
2860 | if (!unified->controllers[0]) | |
bf651989 CB |
2861 | return true; |
2862 | ||
c581d2a6 CB |
2863 | /* For now we simply enable all controllers that we have detected by |
2864 | * creating a string like "+memory +pids +cpu +io". | |
2865 | * TODO: In the near future we might want to support "-<controller>" | |
2866 | * etc. but whether supporting semantics like this make sense will need | |
2867 | * some thinking. | |
2868 | */ | |
2869 | for (it = unified->controllers; it && *it; it++) { | |
2870 | full_len += strlen(*it) + 2; | |
2871 | add_controllers = must_realloc(add_controllers, full_len + 1); | |
2872 | ||
2873 | if (unified->controllers[0] == *it) | |
2874 | add_controllers[0] = '\0'; | |
2875 | ||
2876 | (void)strlcat(add_controllers, "+", full_len + 1); | |
2877 | (void)strlcat(add_controllers, *it, full_len + 1); | |
2878 | ||
2879 | if ((it + 1) && *(it + 1)) | |
2880 | (void)strlcat(add_controllers, " ", full_len + 1); | |
2881 | } | |
2882 | ||
95ab26af CB |
2883 | copy = strdup(cgroup); |
2884 | if (!copy) | |
f761d24d | 2885 | return false; |
c581d2a6 | 2886 | |
95ab26af CB |
2887 | /* |
2888 | * Placing the write to cgroup.subtree_control before the open() is | |
2889 | * intentional because of the cgroup2 delegation model. It enforces | |
2890 | * that leaf cgroups don't have any controllers enabled for delegation. | |
2891 | */ | |
0954f6ce | 2892 | dfd_cur = unified->dfd_base; |
95ab26af CB |
2893 | lxc_iterate_parts(cur, copy, "/") { |
2894 | /* | |
2895 | * Even though we vetted the paths when we parsed the config | |
2896 | * we're paranoid here and check that the path is neither | |
2897 | * absolute nor walks upwards. | |
2898 | */ | |
2899 | if (abspath(cur)) | |
2900 | return syserrno_set(-EINVAL, "No absolute paths allowed"); | |
ac01a9b8 | 2901 | |
95ab26af CB |
2902 | if (strnequal(cur, "..", STRLITERALLEN(".."))) |
2903 | return syserrno_set(-EINVAL, "No upward walking paths allowed"); | |
ac01a9b8 | 2904 | |
95ab26af | 2905 | ret = lxc_writeat(dfd_cur, "cgroup.subtree_control", add_controllers, full_len); |
61fbc369 | 2906 | if (ret < 0) |
95ab26af CB |
2907 | return syserrno(-errno, "Could not enable \"%s\" controllers in the unified cgroup %d", add_controllers, dfd_cur); |
2908 | ||
2909 | TRACE("Enabled \"%s\" controllers in the unified cgroup %d", add_controllers, dfd_cur); | |
ac01a9b8 | 2910 | |
95ab26af CB |
2911 | dfd_final = open_at(dfd_cur, cur, PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH, 0); |
2912 | if (dfd_final < 0) | |
2913 | return syserrno(-errno, "Fail to open directory %d(%s)", dfd_cur, cur); | |
2914 | if (dfd_cur != unified->dfd_base) | |
2915 | close(dfd_cur); | |
2916 | /* | |
2917 | * Leave dfd_final pointing to the last fd we opened so | |
2918 | * it will be automatically zapped if we return early. | |
2919 | */ | |
2920 | dfd_cur = dfd_final; | |
c581d2a6 CB |
2921 | } |
2922 | ||
f761d24d | 2923 | return true; |
c581d2a6 CB |
2924 | } |
2925 | ||
59eac805 | 2926 | __cgfsng_ops static bool cgfsng_monitor_delegate_controllers(struct cgroup_ops *ops) |
c581d2a6 | 2927 | { |
61fbc369 CB |
2928 | if (!ops) |
2929 | return ret_set_errno(false, ENOENT); | |
2930 | ||
c581d2a6 CB |
2931 | return __cgfsng_delegate_controllers(ops, ops->monitor_cgroup); |
2932 | } | |
2933 | ||
59eac805 | 2934 | __cgfsng_ops static bool cgfsng_payload_delegate_controllers(struct cgroup_ops *ops) |
c581d2a6 | 2935 | { |
61fbc369 CB |
2936 | if (!ops) |
2937 | return ret_set_errno(false, ENOENT); | |
2938 | ||
c581d2a6 | 2939 | return __cgfsng_delegate_controllers(ops, ops->container_cgroup); |
2202afc9 CB |
2940 | } |
2941 | ||
/* A cgroup info line describes the unified hierarchy if it starts with '0'. */
static inline bool unified_cgroup(const char *line)
{
	return line[0] == '0';
}
2946 | ||
/*
 * Extract the cgroup path from a unified hierarchy line ("0::<path>").
 *
 * The path must be absolute. Unless @relative is set it is passed through
 * prune_init_scope(), and it is always made relative with deabs(). Returns a
 * newly allocated copy of the result, or ERR_PTR(-EINVAL) / ERR_PTR(-ENOMEM)
 * on failure.
 */
static inline char *current_unified_cgroup(bool relative, char *line)
{
	char *path = line + STRLITERALLEN("0::");
	char *dup;

	if (!abspath(path))
		return ERR_PTR(-EINVAL);

	/* remove init.scope */
	if (!relative)
		path = prune_init_scope(path);

	/* create a relative path */
	dup = strdup(deabs(path));
	if (!dup)
		return ERR_PTR(-ENOMEM);

	return dup;
}
2969 | ||
/* Skip a leading "name=" in @controllers, if there is one. */
static inline const char *unprefix(const char *controllers)
{
	const size_t prefix_len = STRLITERALLEN("name=");

	return strnequal(controllers, "name=", prefix_len) ? controllers + prefix_len
							    : controllers;
}
2976 | ||
2977 | static int __list_cgroup_delegate(char ***delegate) | |
a6ca2ed8 | 2978 | { |
63ba9eaf | 2979 | __do_free char **list = NULL; |
d606c4e9 | 2980 | __do_free char *buf = NULL; |
35ec1a38 CB |
2981 | char *standard[] = { |
2982 | "cgroup.procs", | |
2983 | "cgroup.threads", | |
2984 | "cgroup.subtree_control", | |
2985 | "memory.oom.group", | |
2986 | NULL, | |
2987 | }; | |
d606c4e9 | 2988 | char *token; |
63ba9eaf | 2989 | int ret; |
a6ca2ed8 | 2990 | |
46bf13b7 | 2991 | buf = read_file_at(-EBADF, "/sys/kernel/cgroup/delegate", PROTECT_OPEN, 0); |
d606c4e9 | 2992 | if (!buf) { |
a6ca2ed8 | 2993 | for (char **p = standard; p && *p; p++) { |
63ba9eaf CB |
2994 | ret = list_add_string(&list, *p); |
2995 | if (ret < 0) | |
2996 | return ret; | |
a6ca2ed8 | 2997 | } |
35ec1a38 | 2998 | |
63ba9eaf | 2999 | *delegate = move_ptr(list); |
35ec1a38 | 3000 | return syswarn(0, "Failed to read /sys/kernel/cgroup/delegate"); |
d606c4e9 | 3001 | } |
a6ca2ed8 | 3002 | |
257f04ec | 3003 | lxc_iterate_parts(token, buf, " \t\n") { |
d606c4e9 CB |
3004 | /* |
3005 | * We always need to chown this for both cgroup and | |
3006 | * cgroup2. | |
3007 | */ | |
8b99a20a | 3008 | if (strequal(token, "cgroup.procs")) |
d606c4e9 CB |
3009 | continue; |
3010 | ||
63ba9eaf CB |
3011 | ret = list_add_string(&list, token); |
3012 | if (ret < 0) | |
3013 | return ret; | |
a6ca2ed8 | 3014 | } |
2202afc9 | 3015 | |
63ba9eaf | 3016 | *delegate = move_ptr(list); |
341e6516 | 3017 | return 0; |
2202afc9 CB |
3018 | } |
3019 | ||
0da35ac7 | 3020 | static bool unified_hierarchy_delegated(int dfd_base, char ***ret_files) |
0e3af26b | 3021 | { |
0da35ac7 CB |
3022 | __do_free_string_list char **list = NULL; |
3023 | int ret; | |
0e3af26b | 3024 | |
0da35ac7 CB |
3025 | ret = __list_cgroup_delegate(&list); |
3026 | if (ret < 0) | |
3027 | return syserrno(ret, "Failed to determine unified cgroup delegation requirements"); | |
0e3af26b | 3028 | |
0da35ac7 CB |
3029 | for (char *const *s = list; s && *s; s++) { |
3030 | if (!faccessat(dfd_base, *s, W_OK, 0) || errno == ENOENT) | |
3031 | continue; | |
0e3af26b | 3032 | |
0da35ac7 CB |
3033 | return sysinfo(false, "The %s file is not writable, skipping unified hierarchy", *s); |
3034 | } | |
0e3af26b | 3035 | |
0da35ac7 CB |
3036 | *ret_files = move_ptr(list); |
3037 | return true; | |
0e3af26b CB |
3038 | } |
3039 | ||
/*
 * A legacy hierarchy counts as delegated when "cgroup.procs" below @dfd_base
 * is writable or does not exist.
 */
static bool legacy_hierarchy_delegated(int dfd_base)
{
	if (faccessat(dfd_base, "cgroup.procs", W_OK, 0) == 0)
		return true;

	if (errno == ENOENT)
		return true;

	return sysinfo(false, "The cgroup.procs file is not writable, skipping legacy hierarchy");
}
3047 | ||
3048 | static int __initialize_cgroups(struct cgroup_ops *ops, bool relative, | |
3049 | bool unprivileged) | |
2202afc9 | 3050 | { |
8033666c CB |
3051 | __do_free char *cgroup_info = NULL; |
3052 | char *it; | |
2202afc9 | 3053 | |
35ec1a38 CB |
3054 | /* |
3055 | * Root spawned containers escape the current cgroup, so use init's | |
3056 | * cgroups as our base in that case. | |
3057 | */ | |
9caee129 | 3058 | if (!relative && (geteuid() == 0)) |
8033666c | 3059 | cgroup_info = read_file_at(-EBADF, "/proc/1/cgroup", PROTECT_OPEN, 0); |
2202afc9 | 3060 | else |
8033666c CB |
3061 | cgroup_info = read_file_at(-EBADF, "/proc/self/cgroup", PROTECT_OPEN, 0); |
3062 | if (!cgroup_info) | |
35ec1a38 | 3063 | return ret_errno(ENOMEM); |
2202afc9 | 3064 | |
8033666c | 3065 | lxc_iterate_parts(it, cgroup_info, "\n") { |
35ec1a38 CB |
3066 | __do_close int dfd_base = -EBADF, dfd_mnt = -EBADF; |
3067 | __do_free char *controllers = NULL, *current_cgroup = NULL; | |
3068 | __do_free_string_list char **controller_list = NULL, | |
3069 | **delegate = NULL; | |
3070 | char *line; | |
3071 | int dfd, ret, type; | |
3072 | ||
3073 | /* Handle the unified cgroup hierarchy. */ | |
3074 | line = it; | |
3075 | if (unified_cgroup(line)) { | |
3076 | char *unified_mnt; | |
3077 | ||
b8572e8c CB |
3078 | type = UNIFIED_HIERARCHY; |
3079 | ||
35ec1a38 CB |
3080 | current_cgroup = current_unified_cgroup(relative, line); |
3081 | if (IS_ERR(current_cgroup)) | |
3082 | return PTR_ERR(current_cgroup); | |
3083 | ||
e18e9053 CB |
3084 | if (unified_cgroup_fd(ops->dfd_mnt)) { |
3085 | dfd_mnt = dup_cloexec(ops->dfd_mnt); | |
35ec1a38 CB |
3086 | unified_mnt = ""; |
3087 | } else { | |
e18e9053 | 3088 | dfd_mnt = open_at(ops->dfd_mnt, |
35ec1a38 CB |
3089 | "unified", |
3090 | PROTECT_OPATH_DIRECTORY, | |
3091 | PROTECT_LOOKUP_ABSOLUTE_XDEV, 0); | |
3092 | unified_mnt = "unified"; | |
3093 | } | |
3094 | if (dfd_mnt < 0) { | |
3095 | if (errno != ENOENT) | |
e18e9053 | 3096 | return syserrno(-errno, "Failed to open %d/unified", ops->dfd_mnt); |
2202afc9 | 3097 | |
35ec1a38 CB |
3098 | SYSTRACE("Unified cgroup not mounted"); |
3099 | continue; | |
3100 | } | |
3101 | dfd = dfd_mnt; | |
3102 | ||
3103 | if (!is_empty_string(current_cgroup)) { | |
3104 | dfd_base = open_at(dfd_mnt, current_cgroup, | |
3105 | PROTECT_OPATH_DIRECTORY, | |
3106 | PROTECT_LOOKUP_BENEATH_XDEV, 0); | |
3107 | if (dfd_base < 0) | |
3108 | return syserrno(-errno, "Failed to open %d/%s", dfd_mnt, current_cgroup); | |
3109 | dfd = dfd_base; | |
3110 | } | |
8033666c | 3111 | |
0da35ac7 CB |
3112 | if (!unified_hierarchy_delegated(dfd, &delegate)) |
3113 | continue; | |
3114 | ||
35ec1a38 CB |
3115 | controller_list = unified_controllers(dfd, "cgroup.controllers"); |
3116 | if (!controller_list) { | |
3117 | TRACE("No controllers are enabled for delegation in the unified hierarchy"); | |
63ba9eaf CB |
3118 | controller_list = list_new(); |
3119 | if (!controller_list) | |
3120 | return syserrno(-ENOMEM, "Failed to create empty controller list"); | |
35ec1a38 | 3121 | } |
8033666c | 3122 | |
35ec1a38 CB |
3123 | controllers = strdup(unified_mnt); |
3124 | if (!controllers) | |
3125 | return ret_errno(ENOMEM); | |
3126 | } else { | |
3127 | char *__controllers, *__current_cgroup; | |
2202afc9 | 3128 | |
b8572e8c CB |
3129 | type = LEGACY_HIERARCHY; |
3130 | ||
35ec1a38 CB |
3131 | __controllers = strchr(line, ':'); |
3132 | if (!__controllers) | |
3133 | return ret_errno(EINVAL); | |
3134 | __controllers++; | |
3135 | ||
3136 | __current_cgroup = strchr(__controllers, ':'); | |
3137 | if (!__current_cgroup) | |
3138 | return ret_errno(EINVAL); | |
3139 | *__current_cgroup = '\0'; | |
3140 | __current_cgroup++; | |
3141 | ||
3142 | controllers = strdup(unprefix(__controllers)); | |
3143 | if (!controllers) | |
3144 | return ret_errno(ENOMEM); | |
3145 | ||
e18e9053 | 3146 | dfd_mnt = open_at(ops->dfd_mnt, |
35ec1a38 CB |
3147 | controllers, PROTECT_OPATH_DIRECTORY, |
3148 | PROTECT_LOOKUP_ABSOLUTE_XDEV, 0); | |
3149 | if (dfd_mnt < 0) { | |
3150 | if (errno != ENOENT) | |
3151 | return syserrno(-errno, "Failed to open %d/%s", | |
e18e9053 | 3152 | ops->dfd_mnt, controllers); |
2202afc9 | 3153 | |
35ec1a38 CB |
3154 | SYSTRACE("%s not mounted", controllers); |
3155 | continue; | |
3156 | } | |
3157 | dfd = dfd_mnt; | |
3158 | ||
3159 | if (!abspath(__current_cgroup)) | |
3160 | return ret_errno(EINVAL); | |
3161 | ||
3162 | /* remove init.scope */ | |
3163 | if (!relative) | |
3164 | __current_cgroup = prune_init_scope(__current_cgroup); | |
3165 | ||
3166 | /* create a relative path */ | |
3167 | __current_cgroup = deabs(__current_cgroup); | |
6e214b74 | 3168 | |
35ec1a38 CB |
3169 | current_cgroup = strdup(__current_cgroup); |
3170 | if (!current_cgroup) | |
3171 | return ret_errno(ENOMEM); | |
2202afc9 | 3172 | |
35ec1a38 CB |
3173 | if (!is_empty_string(current_cgroup)) { |
3174 | dfd_base = open_at(dfd_mnt, current_cgroup, | |
3175 | PROTECT_OPATH_DIRECTORY, | |
3176 | PROTECT_LOOKUP_BENEATH_XDEV, 0); | |
3177 | if (dfd_base < 0) | |
3178 | return syserrno(-errno, "Failed to open %d/%s", | |
3179 | dfd_mnt, current_cgroup); | |
3180 | dfd = dfd_base; | |
3181 | } | |
2a63b5cb | 3182 | |
0da35ac7 CB |
3183 | if (!legacy_hierarchy_delegated(dfd)) |
3184 | continue; | |
35ec1a38 CB |
3185 | |
3186 | /* | |
3187 | * We intentionally pass __current_cgroup here and not | |
3188 | * controllers because we would otherwise chop the | |
3189 | * mountpoint. | |
3190 | */ | |
63ba9eaf CB |
3191 | controller_list = list_add_controllers(__controllers); |
3192 | if (!controller_list) | |
3193 | return syserrno(-ENOMEM, "Failed to create controller list from %s", __controllers); | |
35ec1a38 CB |
3194 | |
3195 | if (skip_hierarchy(ops, controller_list)) | |
3196 | continue; | |
3197 | ||
35ec1a38 CB |
3198 | ops->cgroup_layout = CGROUP_LAYOUT_LEGACY; |
3199 | } | |
3200 | ||
179754a2 CB |
3201 | ret = cgroup_hierarchy_add(ops, dfd_mnt, controllers, dfd, |
3202 | current_cgroup, controller_list, type); | |
35ec1a38 CB |
3203 | if (ret < 0) |
3204 | return syserrno(ret, "Failed to add %s hierarchy", controllers); | |
3205 | ||
3206 | /* Transfer ownership. */ | |
3207 | move_fd(dfd_mnt); | |
3208 | move_fd(dfd_base); | |
3209 | move_ptr(current_cgroup); | |
3210 | move_ptr(controllers); | |
3211 | move_ptr(controller_list); | |
b8572e8c | 3212 | if (type == UNIFIED_HIERARCHY) |
042f9e9c | 3213 | ops->unified->delegate = move_ptr(delegate); |
35ec1a38 CB |
3214 | } |
3215 | ||
3216 | /* determine cgroup layout */ | |
3217 | if (ops->unified) { | |
3218 | if (ops->cgroup_layout == CGROUP_LAYOUT_LEGACY) { | |
3219 | ops->cgroup_layout = CGROUP_LAYOUT_HYBRID; | |
3220 | } else { | |
3221 | if (bpf_devices_cgroup_supported()) | |
ca72ccb5 | 3222 | ops->unified->utilities |= DEVICES_CONTROLLER; |
35ec1a38 CB |
3223 | ops->cgroup_layout = CGROUP_LAYOUT_UNIFIED; |
3224 | } | |
3225 | } | |
3226 | ||
c7a1f72a CB |
3227 | if (!controllers_available(ops)) |
3228 | return syserrno_set(-ENOENT, "One or more requested controllers unavailable or not delegated"); | |
3229 | ||
35ec1a38 | 3230 | return 0; |
2202afc9 CB |
3231 | } |
3232 | ||
35ec1a38 | 3233 | static int initialize_cgroups(struct cgroup_ops *ops, struct lxc_conf *conf) |
2202afc9 | 3234 | { |
d4cff352 | 3235 | __do_close int dfd = -EBADF; |
2202afc9 | 3236 | int ret; |
0fbf99d6 | 3237 | const char *controllers_use; |
d4cff352 | 3238 | |
e18e9053 | 3239 | if (ops->dfd_mnt >= 0) |
a96be3c3 | 3240 | return ret_errno(EBUSY); |
d4cff352 CB |
3241 | |
3242 | /* | |
3243 | * I don't see the need for allowing symlinks here. If users want to | |
3244 | * have their hierarchy available in different locations I strongly | |
3245 | * suggest bind-mounts. | |
3246 | */ | |
3247 | dfd = open_at(-EBADF, DEFAULT_CGROUP_MOUNTPOINT, | |
3248 | PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_ABSOLUTE_XDEV, 0); | |
3249 | if (dfd < 0) | |
3250 | return syserrno(-errno, "Failed to open " DEFAULT_CGROUP_MOUNTPOINT); | |
2202afc9 | 3251 | |
0fbf99d6 CB |
3252 | controllers_use = lxc_global_config_value("lxc.cgroup.use"); |
3253 | if (controllers_use) { | |
3254 | __do_free char *dup = NULL; | |
3255 | char *it; | |
b7b18fc5 | 3256 | |
0fbf99d6 CB |
3257 | dup = strdup(controllers_use); |
3258 | if (!dup) | |
7a0c8ed3 | 3259 | return -errno; |
b7b18fc5 | 3260 | |
63ba9eaf CB |
3261 | lxc_iterate_parts(it, dup, ",") { |
3262 | ret = list_add_string(&ops->cgroup_use, it); | |
3263 | if (ret < 0) | |
3264 | return ret; | |
3265 | } | |
b7b18fc5 | 3266 | } |
2202afc9 | 3267 | |
d4cff352 CB |
3268 | /* |
3269 | * Keep dfd referenced by the cleanup function and actually move the fd | |
3270 | * once we know the initialization succeeded. So if we fail we clean up | |
3271 | * the dfd. | |
3272 | */ | |
e18e9053 | 3273 | ops->dfd_mnt = dfd; |
2202afc9 | 3274 | |
35ec1a38 | 3275 | ret = __initialize_cgroups(ops, conf->cgroup_meta.relative, !lxc_list_empty(&conf->id_map)); |
d4cff352 CB |
3276 | if (ret < 0) |
3277 | return syserrno(ret, "Failed to initialize cgroups"); | |
2202afc9 | 3278 | |
d4cff352 CB |
3279 | /* Transfer ownership to cgroup_ops. */ |
3280 | move_fd(dfd); | |
3281 | return 0; | |
2202afc9 CB |
3282 | } |
3283 | ||
341e6516 | 3284 | __cgfsng_ops static int cgfsng_data_init(struct cgroup_ops *ops) |
2202afc9 CB |
3285 | { |
3286 | const char *cgroup_pattern; | |
3287 | ||
341e6516 CB |
3288 | if (!ops) |
3289 | return ret_set_errno(-1, ENOENT); | |
3290 | ||
2202afc9 CB |
3291 | /* copy system-wide cgroup information */ |
3292 | cgroup_pattern = lxc_global_config_value("lxc.cgroup.pattern"); | |
63ba9eaf CB |
3293 | if (cgroup_pattern && !strequal(cgroup_pattern, "")) { |
3294 | ops->cgroup_pattern = strdup(cgroup_pattern); | |
3295 | if (!ops->cgroup_pattern) | |
3296 | return ret_errno(ENOMEM); | |
3297 | } | |
2202afc9 | 3298 | |
341e6516 | 3299 | return 0; |
2202afc9 CB |
3300 | } |
3301 | ||
35ec1a38 | 3302 | struct cgroup_ops *cgroup_ops_init(struct lxc_conf *conf) |
2202afc9 | 3303 | { |
a64edc1c | 3304 | __do_free struct cgroup_ops *cgfsng_ops = NULL; |
2202afc9 | 3305 | |
c5d0238a | 3306 | cgfsng_ops = zalloc(sizeof(struct cgroup_ops)); |
2202afc9 | 3307 | if (!cgfsng_ops) |
341e6516 | 3308 | return ret_set_errno(NULL, ENOMEM); |
2202afc9 | 3309 | |
2202afc9 | 3310 | cgfsng_ops->cgroup_layout = CGROUP_LAYOUT_UNKNOWN; |
e18e9053 | 3311 | cgfsng_ops->dfd_mnt = -EBADF; |
2202afc9 | 3312 | |
35ec1a38 | 3313 | if (initialize_cgroups(cgfsng_ops, conf)) |
2202afc9 | 3314 | return NULL; |
2202afc9 | 3315 | |
ca76baed CB |
3316 | cgfsng_ops->data_init = cgfsng_data_init; |
3317 | cgfsng_ops->payload_destroy = cgfsng_payload_destroy; | |
3318 | cgfsng_ops->monitor_destroy = cgfsng_monitor_destroy; | |
3319 | cgfsng_ops->monitor_create = cgfsng_monitor_create; | |
3320 | cgfsng_ops->monitor_enter = cgfsng_monitor_enter; | |
3321 | cgfsng_ops->monitor_delegate_controllers = cgfsng_monitor_delegate_controllers; | |
3322 | cgfsng_ops->payload_delegate_controllers = cgfsng_payload_delegate_controllers; | |
3323 | cgfsng_ops->payload_create = cgfsng_payload_create; | |
3324 | cgfsng_ops->payload_enter = cgfsng_payload_enter; | |
840eec19 | 3325 | cgfsng_ops->finalize = cgfsng_finalize; |
ca76baed CB |
3326 | cgfsng_ops->get_cgroup = cgfsng_get_cgroup; |
3327 | cgfsng_ops->get = cgfsng_get; | |
3328 | cgfsng_ops->set = cgfsng_set; | |
3329 | cgfsng_ops->freeze = cgfsng_freeze; | |
3330 | cgfsng_ops->unfreeze = cgfsng_unfreeze; | |
3331 | cgfsng_ops->setup_limits_legacy = cgfsng_setup_limits_legacy; | |
3332 | cgfsng_ops->setup_limits = cgfsng_setup_limits; | |
3333 | cgfsng_ops->driver = "cgfsng"; | |
3334 | cgfsng_ops->version = "1.0.0"; | |
3335 | cgfsng_ops->attach = cgfsng_attach; | |
3336 | cgfsng_ops->chown = cgfsng_chown; | |
3337 | cgfsng_ops->mount = cgfsng_mount; | |
3338 | cgfsng_ops->devices_activate = cgfsng_devices_activate; | |
3339 | cgfsng_ops->get_limiting_cgroup = cgfsng_get_limiting_cgroup; | |
2202afc9 | 3340 | |
ff9edd2d CB |
3341 | cgfsng_ops->criu_escape = cgfsng_criu_escape; |
3342 | cgfsng_ops->criu_num_hierarchies = cgfsng_criu_num_hierarchies; | |
3343 | cgfsng_ops->criu_get_hierarchies = cgfsng_criu_get_hierarchies; | |
3344 | ||
a64edc1c | 3345 | return move_ptr(cgfsng_ops); |
2202afc9 | 3346 | } |
be835470 | 3347 | |
029d8e88 CB |
3348 | int cgroup_attach(const struct lxc_conf *conf, const char *name, |
3349 | const char *lxcpath, pid_t pid) | |
3350 | { | |
3351 | __do_close int unified_fd = -EBADF; | |
3352 | int ret; | |
3353 | ||
88c27c53 | 3354 | if (!conf || is_empty_string(name) || is_empty_string(lxcpath) || pid <= 0) |
029d8e88 CB |
3355 | return ret_errno(EINVAL); |
3356 | ||
3357 | unified_fd = lxc_cmd_get_cgroup2_fd(name, lxcpath); | |
3358 | if (unified_fd < 0) | |
6b55ce0e | 3359 | return ret_errno(ENOCGROUP2); |
029d8e88 CB |
3360 | |
3361 | if (!lxc_list_empty(&conf->id_map)) { | |
3362 | struct userns_exec_unified_attach_data args = { | |
3363 | .conf = conf, | |
3364 | .unified_fd = unified_fd, | |
3365 | .pid = pid, | |
3366 | }; | |
3367 | ||
3368 | ret = socketpair(PF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, args.sk_pair); | |
3369 | if (ret < 0) | |
3370 | return -errno; | |
3371 | ||
3372 | ret = userns_exec_minimal(conf, | |
3373 | cgroup_unified_attach_parent_wrapper, | |
3374 | &args, | |
3375 | cgroup_unified_attach_child_wrapper, | |
3376 | &args); | |
3377 | } else { | |
3378 | ret = cgroup_attach_leaf(conf, unified_fd, pid); | |
3379 | } | |
3380 | ||
3381 | return ret; | |
3382 | } | |
3383 | ||
751a624f | 3384 | /* Connects to command socket therefore isn't callable from command handler. */ |
bfe2971a | 3385 | int cgroup_get(const char *name, const char *lxcpath, |
be835470 CB |
3386 | const char *filename, char *buf, size_t len) |
3387 | { | |
3388 | __do_close int unified_fd = -EBADF; | |
3389 | ssize_t ret; | |
3390 | ||
bfe2971a | 3391 | if (is_empty_string(filename) || is_empty_string(name) || |
be835470 CB |
3392 | is_empty_string(lxcpath)) |
3393 | return ret_errno(EINVAL); | |
3394 | ||
3395 | if ((buf && !len) || (len && !buf)) | |
3396 | return ret_errno(EINVAL); | |
3397 | ||
ae4fcc7b | 3398 | unified_fd = lxc_cmd_get_limiting_cgroup2_fd(name, lxcpath); |
be835470 CB |
3399 | if (unified_fd < 0) |
3400 | return ret_errno(ENOCGROUP2); | |
3401 | ||
3402 | ret = lxc_read_try_buf_at(unified_fd, filename, buf, len); | |
3403 | if (ret < 0) | |
3404 | SYSERROR("Failed to read cgroup value"); | |
3405 | ||
3406 | return ret; | |
3407 | } | |
3408 | ||
751a624f | 3409 | /* Connects to command socket therefore isn't callable from command handler. */ |
bfe2971a | 3410 | int cgroup_set(const char *name, const char *lxcpath, |
be835470 CB |
3411 | const char *filename, const char *value) |
3412 | { | |
3413 | __do_close int unified_fd = -EBADF; | |
3414 | ssize_t ret; | |
3415 | ||
bfe2971a | 3416 | if (is_empty_string(filename) || is_empty_string(value) || |
be835470 CB |
3417 | is_empty_string(name) || is_empty_string(lxcpath)) |
3418 | return ret_errno(EINVAL); | |
3419 | ||
ae4fcc7b | 3420 | unified_fd = lxc_cmd_get_limiting_cgroup2_fd(name, lxcpath); |
be835470 CB |
3421 | if (unified_fd < 0) |
3422 | return ret_errno(ENOCGROUP2); | |
3423 | ||
aa72fbe7 | 3424 | if (strnequal(filename, "devices.", STRLITERALLEN("devices."))) { |
be835470 CB |
3425 | struct device_item device = {}; |
3426 | ||
3427 | ret = device_cgroup_rule_parse(&device, filename, value); | |
3428 | if (ret < 0) | |
3429 | return log_error_errno(-1, EINVAL, "Failed to parse device string %s=%s", filename, value); | |
3430 | ||
3431 | ret = lxc_cmd_add_bpf_device_cgroup(name, lxcpath, &device); | |
3432 | } else { | |
3433 | ret = lxc_writeat(unified_fd, filename, value, strlen(value)); | |
3434 | } | |
3435 | ||
3436 | return ret; | |
3437 | } | |
c8af3332 | 3438 | |
c9c814f4 CB |
3439 | static int do_cgroup_freeze(int unified_fd, |
3440 | const char *state_string, | |
3441 | int state_num, | |
3442 | int timeout, | |
3443 | const char *epoll_error, | |
3444 | const char *wait_error) | |
c8af3332 CB |
3445 | { |
3446 | __do_close int events_fd = -EBADF; | |
3447 | call_cleaner(lxc_mainloop_close) struct lxc_epoll_descr *descr_ptr = NULL; | |
3448 | int ret; | |
3449 | struct lxc_epoll_descr descr = {}; | |
3450 | ||
3451 | if (timeout != 0) { | |
3452 | ret = lxc_mainloop_open(&descr); | |
3453 | if (ret) | |
3454 | return log_error_errno(-1, errno, "%s", epoll_error); | |
3455 | ||
3456 | /* automatically cleaned up now */ | |
3457 | descr_ptr = &descr; | |
3458 | ||
3459 | events_fd = open_at(unified_fd, "cgroup.events", PROTECT_OPEN, PROTECT_LOOKUP_BENEATH, 0); | |
3460 | if (events_fd < 0) | |
3461 | return log_error_errno(-errno, errno, "Failed to open cgroup.events file"); | |
3462 | ||
3463 | ret = lxc_mainloop_add_handler_events(&descr, events_fd, EPOLLPRI, freezer_cgroup_events_cb, INT_TO_PTR(state_num)); | |
3464 | if (ret < 0) | |
3465 | return log_error_errno(-1, errno, "Failed to add cgroup.events fd handler to mainloop"); | |
3466 | } | |
3467 | ||
3468 | ret = lxc_writeat(unified_fd, "cgroup.freeze", state_string, 1); | |
3469 | if (ret < 0) | |
3470 | return log_error_errno(-1, errno, "Failed to open cgroup.freeze file"); | |
3471 | ||
3472 | if (timeout != 0) { | |
3473 | ret = lxc_mainloop(&descr, timeout); | |
3474 | if (ret) | |
3475 | return log_error_errno(-1, errno, "%s", wait_error); | |
3476 | } | |
3477 | ||
3478 | return log_trace(0, "Container now %s", (state_num == 1) ? "frozen" : "unfrozen"); | |
3479 | } | |
3480 | ||
c9c814f4 CB |
/*
 * Freeze the cgroup behind @unified_fd: write "1" to cgroup.freeze via
 * do_cgroup_freeze(), passing @timeout through unchanged.
 */
static inline int __cgroup_freeze(int unified_fd, int timeout)
{
	const char *epoll_msg = "Failed to create epoll instance to wait for container freeze";
	const char *wait_msg = "Failed to wait for container to be frozen";

	return do_cgroup_freeze(unified_fd, "1", 1, timeout, epoll_msg, wait_msg);
}
3487 | ||
5ef7547f | 3488 | int cgroup_freeze(const char *name, const char *lxcpath, int timeout) |
c8af3332 CB |
3489 | { |
3490 | __do_close int unified_fd = -EBADF; | |
3491 | int ret; | |
3492 | ||
b57f9b13 CB |
3493 | if (is_empty_string(name) || is_empty_string(lxcpath)) |
3494 | return ret_errno(EINVAL); | |
3495 | ||
ae4fcc7b | 3496 | unified_fd = lxc_cmd_get_limiting_cgroup2_fd(name, lxcpath); |
c8af3332 CB |
3497 | if (unified_fd < 0) |
3498 | return ret_errno(ENOCGROUP2); | |
3499 | ||
3500 | lxc_cmd_notify_state_listeners(name, lxcpath, FREEZING); | |
c9c814f4 | 3501 | ret = __cgroup_freeze(unified_fd, timeout); |
c8af3332 | 3502 | lxc_cmd_notify_state_listeners(name, lxcpath, !ret ? FROZEN : RUNNING); |
5ef7547f | 3503 | return ret; |
c8af3332 CB |
3504 | } |
3505 | ||
c9c814f4 CB |
/*
 * Unfreeze the cgroup behind @unified_fd: write "0" to cgroup.freeze via
 * do_cgroup_freeze(), passing @timeout through unchanged.
 *
 * The error messages name the unfreeze operation so that log output can be
 * told apart from failures on the freeze path.
 */
int __cgroup_unfreeze(int unified_fd, int timeout)
{
	return do_cgroup_freeze(unified_fd, "0", 0, timeout,
				"Failed to create epoll instance to wait for container unfreeze",
				"Failed to wait for container to be unfrozen");
}
3512 | ||
5ef7547f | 3513 | int cgroup_unfreeze(const char *name, const char *lxcpath, int timeout) |
c8af3332 CB |
3514 | { |
3515 | __do_close int unified_fd = -EBADF; | |
3516 | int ret; | |
3517 | ||
b57f9b13 CB |
3518 | if (is_empty_string(name) || is_empty_string(lxcpath)) |
3519 | return ret_errno(EINVAL); | |
3520 | ||
ae4fcc7b | 3521 | unified_fd = lxc_cmd_get_limiting_cgroup2_fd(name, lxcpath); |
c8af3332 CB |
3522 | if (unified_fd < 0) |
3523 | return ret_errno(ENOCGROUP2); | |
3524 | ||
3525 | lxc_cmd_notify_state_listeners(name, lxcpath, THAWED); | |
c9c814f4 | 3526 | ret = __cgroup_unfreeze(unified_fd, timeout); |
c8af3332 | 3527 | lxc_cmd_notify_state_listeners(name, lxcpath, !ret ? RUNNING : FROZEN); |
5ef7547f | 3528 | return ret; |
c8af3332 | 3529 | } |