1 /* SPDX-License-Identifier: LGPL-2.1+ */
8 #include <linux/magic.h>
10 #include <netinet/in.h>
13 #include <sys/param.h>
14 #include <sys/types.h>
17 #include "attach_options.h"
22 #include "lxcseccomp.h"
23 #include "memory_utils.h"
24 #include "mount_utils.h"
25 #include "namespace.h"
29 #include "storage/storage.h"
30 #include "string_utils.h"
31 #include "syscall_wrappers.h"
34 #if HAVE_SYS_RESOURCE_H
35 #include <sys/resource.h>
38 #if HAVE_SCMP_FILTER_CTX
39 typedef void * scmp_filter_ctx
;
42 typedef signed long personality_t
;
44 /* worth moving to configure.ac? */
45 #define subuidfile "/etc/subuid"
46 #define subgidfile "/etc/subgid"
49 * Defines a generic struct to configure the control group. It is up to the
50 * programmer to specify the right subsystem.
51 * @subsystem : the targeted subsystem
52 * @value : the value to set
53 * @version : The version of the cgroup filesystem on which the controller
56 * @controllers : The controllers to use for this container.
57 * @dir : The name of the directory containing the container's cgroup.
58 * Note that this is a per-container setting.
62 /* information about a specific controller */
63 struct /* controller */ {
69 /* meta information about cgroup configuration */
74 char *monitor_pivot_dir
;
78 /* If an unpriv user in pure unified-only hierarchy
79 * starts a container, then we ask systemd to create
80 * a scope for us, and create the monitor and container
82 * This will ignore the above things like monitor_dir
88 struct list_head head
;
91 static void free_lxc_cgroup(struct lxc_cgroup
*ptr
)
99 define_cleanup_function(struct lxc_cgroup
*, free_lxc_cgroup
);
101 #if !HAVE_SYS_RESOURCE_H
102 #define RLIM_INFINITY ((unsigned long)-1)
104 unsigned long rlim_cur
;
105 unsigned long rlim_max
;
106 struct list_head head
;
111 * Defines a structure to configure resource limits to set via setrlimit().
112 * @resource : the resource name in lowercase without the RLIMIT_ prefix
113 * @limit : the limit to set
118 struct list_head head
;
121 static void free_lxc_limit(struct lxc_limit
*ptr
)
124 free_disarm(ptr
->resource
);
128 define_cleanup_function(struct lxc_limit
*, free_lxc_limit
);
136 * Defines a structure to configure kernel parameters at runtime.
137 * @key : the kernel parameters will be configured without the "lxc.sysctl" prefix
138 * @value : the value to set
143 struct list_head head
;
146 static void free_lxc_sysctl(struct lxc_sysctl
*ptr
)
154 define_cleanup_function(struct lxc_sysctl
*, free_lxc_sysctl
);
157 * Defines a structure to configure proc filesystem at runtime.
158 * @filename : the proc filesystem will be configured without the "lxc.proc" prefix
159 * @value : the value to set
164 struct list_head head
;
167 static void free_lxc_proc(struct lxc_proc
*ptr
)
175 define_cleanup_function(struct lxc_proc
*, free_lxc_proc
);
178 * id_map is an id map entry. Form in confile is:
179 * lxc.idmap = u 0 9800 100
180 * lxc.idmap = u 1000 9900 100
181 * lxc.idmap = g 0 9800 100
182 * lxc.idmap = g 1000 9900 100
183 * meaning the container can use uids and gids 0-99 and 1000-1099,
184 * with [ug]id 0 mapping to [ug]id 9800 on the host, and [ug]id 1000 to
185 * [ug]id 9900 on the host.
189 unsigned long hostid
, nsid
, range
;
190 struct list_head head
;
193 /* Defines the number of tty configured and contains the
195 * @max = number of configured ttys
197 struct lxc_tty_info
{
201 struct lxc_terminal_info
*tty
;
204 typedef enum lxc_mount_options_t
{
205 LXC_MOUNT_CREATE_DIR
= 0,
206 LXC_MOUNT_CREATE_FILE
= 1,
207 LXC_MOUNT_OPTIONAL
= 2,
208 LXC_MOUNT_RELATIVE
= 3,
211 } lxc_mount_options_t
;
213 __hidden
extern const char *lxc_mount_options_info
[LXC_MOUNT_MAX
];
215 struct lxc_mount_options
{
216 unsigned int create_dir
: 1;
217 unsigned int create_file
: 1;
218 unsigned int optional
: 1;
219 unsigned int relative
: 1;
220 unsigned int bind_recursively
: 1;
221 unsigned int propagate_recursively
: 1;
222 unsigned int bind
: 1;
223 char userns_path
[PATH_MAX
];
224 unsigned long mnt_flags
;
225 unsigned long prop_flags
;
227 struct mount_attr attr
;
231 /* Defines a structure to store the rootfs location, the
232 * optional pivot_root, rootfs mount paths
233 * @path : the rootfs source (directory or device)
234 * @mount : where it is mounted
235 * @buf : static buffer to construct paths
236 * @bdev_type : optional backing store type
237 * @managed : whether it is managed by LXC
238 * @dfd_mnt : fd for @mount
239 * @dfd_dev : fd for /dev of the container
256 struct lxc_mount_options mnt_opts
;
257 struct lxc_storage
*storage
;
261 * Automatic mounts for LXC to perform inside the container
264 /* /proc read-write */
265 LXC_AUTO_PROC_RW
= BIT(0),
266 /* /proc/sys and /proc/sysrq-trigger read-only */
267 LXC_AUTO_PROC_MIXED
= BIT(1),
268 LXC_AUTO_PROC_MASK
= LXC_AUTO_PROC_RW
|
270 /* /sys read-write */
271 LXC_AUTO_SYS_RW
= BIT(2),
273 LXC_AUTO_SYS_RO
= BIT(3),
274 /* /sys read-only and /sys/class/net read-write */
275 LXC_AUTO_SYS_MIXED
= LXC_AUTO_SYS_RW
|
277 LXC_AUTO_SYS_MASK
= LXC_AUTO_SYS_MIXED
,
279 /* /sys/fs/cgroup (partial mount, read-only) */
280 LXC_AUTO_CGROUP_RO
= BIT(4),
281 /* /sys/fs/cgroup (partial mount, read-write) */
282 LXC_AUTO_CGROUP_RW
= BIT(5),
283 /* /sys/fs/cgroup (partial mount, paths r/o, cgroup r/w) */
284 LXC_AUTO_CGROUP_MIXED
= LXC_AUTO_CGROUP_RO
|
286 /* /sys/fs/cgroup (full mount, read-only) */
287 LXC_AUTO_CGROUP_FULL_RO
= BIT(6),
288 /* /sys/fs/cgroup (full mount, read-write) */
289 LXC_AUTO_CGROUP_FULL_RW
= BIT(7),
290 /* /sys/fs/cgroup (full mount, parent r/o, own r/w) */
291 LXC_AUTO_CGROUP_FULL_MIXED
= LXC_AUTO_CGROUP_FULL_RO
|
292 LXC_AUTO_CGROUP_FULL_RW
,
295 * Mount a pure read-write cgroup2 layout in the container independent
296 * of the cgroup layout used on the host.
298 LXC_AUTO_CGROUP2_RW
= BIT(8),
300 * Mount a pure read-only cgroup2 layout in the container independent
301 * of the cgroup layout used on the host.
303 LXC_AUTO_CGROUP2_RO
= BIT(9),
306 * These are defined in such a way as to retain binary compatibility
307 * with earlier versions of this code. If the previous mask is applied,
308 * both of these will default back to the _MIXED variants, which is
311 /* /sys/fs/cgroup (partial mount, r/w or mixed, depending on caps) */
312 LXC_AUTO_CGROUP_NOSPEC
= 0x0B0,
313 /* /sys/fs/cgroup (full mount, r/w or mixed, depending on caps) */
314 LXC_AUTO_CGROUP_FULL_NOSPEC
= 0x0E0,
315 /* mount cgroups even when cgroup namespaces are supported */
316 LXC_AUTO_CGROUP_FORCE
= BIT(10),
317 /* all known cgroup options */
318 LXC_AUTO_CGROUP_MASK
= LXC_AUTO_CGROUP_MIXED
|
319 LXC_AUTO_CGROUP_FULL_MIXED
|
320 LXC_AUTO_CGROUP_NOSPEC
|
321 LXC_AUTO_CGROUP_FULL_NOSPEC
|
322 LXC_AUTO_CGROUP_FORCE
|
323 LXC_AUTO_CGROUP2_RW
|
326 /* shared mount point */
327 LXC_AUTO_SHMOUNTS
= BIT(11),
328 /* shared mount point mask */
329 LXC_AUTO_SHMOUNTS_MASK
= LXC_AUTO_SHMOUNTS
,
331 /* all known settings */
332 LXC_AUTO_ALL_MASK
= LXC_AUTO_PROC_MASK
|
334 LXC_AUTO_CGROUP_MASK
,
351 __hidden
extern char *lxchook_names
[NUM_LXC_HOOKS
];
353 struct lxc_state_client
{
355 lxc_state_t states
[MAX_STATE
];
356 struct list_head head
;
/*
 * Selects the kind of device list: allowlist (value 0) or denylist (value 1).
 * A member named list_type of this type is declared further down in this
 * header, next to a "devices" list head.
 */
359 typedef enum lxc_bpf_devices_rule_t
{
360 LXC_BPF_DEVICE_CGROUP_ALLOWLIST
= 0,
361 LXC_BPF_DEVICE_CGROUP_DENYLIST
= 1,
362 } lxc_bpf_devices_rule_t
;
370 struct list_head head
;
374 lxc_bpf_devices_rule_t list_type
;
375 struct list_head devices
;
378 struct timens_offsets
{
379 /* Currently, either s_boot or ns_boot is set, but not both. */
383 /* Currently, either s_monotonic or ns_monotonic is set, but not both. */
385 int64_t ns_monotonic
;
388 struct environment_entry
{
391 struct list_head head
;
397 struct list_head head
;
402 struct list_head list
;
405 struct string_entry
{
407 struct list_head head
;
411 /* Pointer to the name of the container. Do not free! */
415 personality_t personality
;
416 struct utsname
*utsname
;
419 struct list_head cgroup
;
420 struct list_head cgroup2
;
421 struct bpf_devices bpf_devices
;
425 struct list_head id_map
;
428 * Pointer to the idmap entry for the container's root uid in
429 * the id_map list. Do not free!
431 const struct id_map
*root_nsuid_map
;
434 * Pointer to the idmap entry for the container's root gid in
435 * the id_map list. Do not free!
437 const struct id_map
*root_nsgid_map
;
440 struct list_head netdevs
;
445 struct list_head mount_entries
;
450 /* /dev/tty<idx> devices */
451 struct lxc_tty_info ttys
;
452 /* /dev/console device */
453 struct lxc_terminal console
;
454 /* maximum pty devices allowed by devpts mount */
456 /* file descriptor for the container's /dev/pts mount */
459 /* set to true when rootfs has been setup */
461 struct lxc_rootfs rootfs
;
466 unsigned int hooks_version
;
467 struct list_head hooks
[NUM_LXC_HOOKS
];
470 char *lsm_aa_profile
;
471 char *lsm_aa_profile_computed
;
472 bool lsm_aa_profile_created
;
473 unsigned int lsm_aa_allow_nesting
;
474 unsigned int lsm_aa_allow_incomplete
;
475 struct list_head lsm_aa_raw
;
476 char *lsm_se_context
;
477 char *lsm_se_keyring_context
;
478 bool keyring_disable_session
;
479 bool transient_procfs_mnt
;
480 struct lxc_seccomp seccomp
;
482 unsigned int autodev
; /* if 1, mount and fill a /dev at start */
483 int autodevtmpfssize
; /* size of the /dev tmpfs */
484 int haltsignal
; /* signal used to halt container */
485 int rebootsignal
; /* signal used to reboot container */
486 int stopsignal
; /* signal used to hard stop container */
487 char *rcfile
; /* Copy of the top level rcfile we read */
489 /* Logfile and loglevel can be set in a container config file. Those
490 * function as defaults. The defaults can be overridden by command line.
491 * However we don't want the command line specified values to be saved
492 * on c->save_config(). So we store the config file specified values
494 char *logfile
; /* the logfile as specified in config */
495 int loglevel
; /* loglevel as specified in config (if any) */
498 unsigned int start_auto
;
499 unsigned int start_delay
;
501 struct list_head groups
;
504 /* unshare the mount namespace in the monitor */
505 unsigned int monitor_unshare
;
506 unsigned int monitor_signal_pdeath
;
508 /* list of environment variables we'll add to the container when
510 struct list_head environment
;
512 /* text representation of the config file */
513 char *unexpanded_config
;
514 size_t unexpanded_len
;
515 size_t unexpanded_alloced
;
517 /* default command for lxc-execute */
523 /* The uid to use for the container. */
525 /* The gid to use for the container. */
527 /* The groups to use for the container. */
528 lxc_groups_t init_groups
;
530 /* indicator if the container will be destroyed on shutdown */
531 unsigned int ephemeral
;
533 /* The facility to pass to syslog. Lets users establish as what type of
534 * program liblxc is supposed to write to the syslog. */
537 /* Whether PR_SET_NO_NEW_PRIVS will be set for the container. */
540 /* RLIMIT_* limits */
541 struct list_head limits
;
543 /* Contains generic info about the cgroup configuration for this
544 * container. Note that struct lxc_cgroup contains a union. It is only
545 * valid to access the members of the anonymous "meta" struct within
548 struct lxc_cgroup cgroup_meta
;
553 char *ns_share
[LXC_NS_MAX
];
556 /* init working directory */
559 /* A list of clients registered to be informed about a container state. */
560 struct list_head state_clients
;
563 struct list_head sysctls
;
566 struct list_head procs
;
569 /* Absolute path to the shared mount point on the host */
571 /* Absolute path (in the container) to the shared mount point */
575 struct timens_offsets timens
;
578 __u64 sched_core_cookie
;
581 __hidden
extern int write_id_mapping(enum idtype idtype
, pid_t pid
, const char *buf
, size_t buf_size
)
584 extern thread_local
struct lxc_conf
*current_config
;
586 __hidden
extern int run_lxc_hooks(const char *name
, char *hook
, struct lxc_conf
*conf
, char *argv
[]);
587 __hidden
extern struct lxc_conf
*lxc_conf_init(void);
588 __hidden
extern void lxc_conf_free(struct lxc_conf
*conf
);
589 __hidden
extern int lxc_storage_prepare(struct lxc_conf
*conf
);
590 __hidden
extern int lxc_rootfs_prepare(struct lxc_conf
*conf
, bool userns
);
591 __hidden
extern void lxc_storage_put(struct lxc_conf
*conf
);
592 __hidden
extern int lxc_rootfs_init(struct lxc_conf
*conf
, bool userns
);
593 __hidden
extern int lxc_rootfs_prepare_parent(struct lxc_handler
*handler
);
594 __hidden
extern int lxc_idmapped_mounts_parent(struct lxc_handler
*handler
);
595 __hidden
extern int lxc_map_ids(struct list_head
*idmap
, pid_t pid
);
596 __hidden
extern int lxc_create_tty(const char *name
, struct lxc_conf
*conf
);
597 __hidden
extern void lxc_delete_tty(struct lxc_tty_info
*ttys
);
598 __hidden
extern int lxc_clear_config_caps(struct lxc_conf
*c
);
599 __hidden
extern int lxc_clear_cgroups(struct lxc_conf
*c
, const char *key
, int version
);
600 __hidden
extern int lxc_clear_mount_entries(struct lxc_conf
*c
);
601 __hidden
extern int lxc_clear_automounts(struct lxc_conf
*c
);
602 __hidden
extern int lxc_clear_hooks(struct lxc_conf
*c
, const char *key
);
603 __hidden
extern int lxc_clear_idmaps(struct lxc_conf
*c
);
604 __hidden
extern int lxc_clear_groups(struct lxc_conf
*c
);
605 __hidden
extern int lxc_clear_environment(struct lxc_conf
*c
);
606 __hidden
extern int lxc_clear_limits(struct lxc_conf
*c
, const char *key
);
607 __hidden
extern int lxc_delete_autodev(struct lxc_handler
*handler
);
608 __hidden
extern int lxc_clear_autodev_tmpfs_size(struct lxc_conf
*c
);
609 __hidden
extern int lxc_setup_rootfs_prepare_root(struct lxc_conf
*conf
, const char *name
,
610 const char *lxcpath
);
611 __hidden
extern int lxc_setup(struct lxc_handler
*handler
);
612 __hidden
extern int lxc_setup_parent(struct lxc_handler
*handler
);
613 __hidden
extern int setup_resource_limits(struct lxc_conf
*conf
, pid_t pid
);
614 __hidden
extern int find_unmapped_nsid(const struct lxc_conf
*conf
, enum idtype idtype
);
615 __hidden
extern int mapped_hostid(unsigned id
, const struct lxc_conf
*conf
, enum idtype idtype
);
616 __hidden
extern int userns_exec_1(const struct lxc_conf
*conf
, int (*fn
)(void *), void *data
,
617 const char *fn_name
);
618 __hidden
extern int userns_exec_full(struct lxc_conf
*conf
, int (*fn
)(void *), void *data
,
619 const char *fn_name
);
620 __hidden
extern int parse_mntopts_legacy(const char *mntopts
, unsigned long *mntflags
, char **mntdata
);
621 __hidden
extern int parse_propagationopts(const char *mntopts
, unsigned long *pflags
);
622 __hidden
extern int parse_lxc_mount_attrs(struct lxc_mount_options
*opts
, char *mnt_opts
);
623 __hidden
extern int parse_mount_attrs(struct lxc_mount_options
*opts
, const char *mntopts
);
624 __hidden
extern void tmp_proc_unmount(struct lxc_conf
*lxc_conf
);
625 __hidden
extern void suggest_default_idmap(void);
626 __hidden
extern FILE *make_anonymous_mount_file(const struct list_head
*mount
,
627 bool include_nesting_helpers
);
628 __hidden
extern int run_script(const char *name
, const char *section
, const char *script
, ...);
629 __hidden
extern int run_script_argv(const char *name
, unsigned int hook_version
, const char *section
,
630 const char *script
, const char *hookname
, char **argsin
);
632 __hidden
extern bool has_cap(__u32 cap
, struct lxc_conf
*conf
);
633 static inline bool lxc_wants_cap(__u32 cap
, struct lxc_conf
*conf
)
638 ret
= lxc_caps_last_cap(&last_cap
);
645 return has_cap(cap
, conf
);
648 __hidden
extern int setup_sysctl_parameters(struct lxc_conf
*conf
);
649 __hidden
extern int lxc_clear_sysctls(struct lxc_conf
*c
, const char *key
);
650 __hidden
extern int setup_proc_filesystem(struct lxc_conf
*conf
, pid_t pid
);
651 __hidden
extern int lxc_clear_procs(struct lxc_conf
*c
, const char *key
);
652 __hidden
extern int lxc_clear_apparmor_raw(struct lxc_conf
*c
);
653 __hidden
extern int lxc_clear_namespace(struct lxc_conf
*c
);
654 __hidden
extern int userns_exec_minimal(const struct lxc_conf
*conf
, int (*fn_parent
)(void *),
655 void *fn_parent_data
, int (*fn_child
)(void *),
656 void *fn_child_data
);
657 __hidden
extern int userns_exec_mapped_root(const char *path
, int path_fd
,
658 const struct lxc_conf
*conf
);
/*
 * Convenience wrapper around userns_exec_mapped_root(): forwards @path and
 * @conf unchanged and passes -EBADF as the path_fd argument. The return value
 * is whatever userns_exec_mapped_root() returns.
 */
659 static inline int chown_mapped_root(const char *path
, const struct lxc_conf
*conf
)
661 return userns_exec_mapped_root(path
, -EBADF
, conf
);
664 __hidden
extern int lxc_sync_fds_parent(struct lxc_handler
*handler
);
665 __hidden
extern int lxc_sync_fds_child(struct lxc_handler
*handler
);
/*
 * Pick the mount path for @rootfs: returns rootfs->mount when
 * is_empty_string(rootfs->path) is false, otherwise the static string "/".
 * @rootfs is dereferenced unconditionally, so it must not be NULL.
 */
667 static inline const char *get_rootfs_mnt(const struct lxc_rootfs
*rootfs
)
669 static const char *s
= "/";
671 return !is_empty_string(rootfs
->path
) ? rootfs
->mount
: s
;
674 static inline void put_lxc_mount_options(struct lxc_mount_options
*mnt_opts
)
676 mnt_opts
->create_dir
= 0;
677 mnt_opts
->create_file
= 0;
678 mnt_opts
->optional
= 0;
679 mnt_opts
->relative
= 0;
680 mnt_opts
->userns_path
[0] = '\0';
681 mnt_opts
->mnt_flags
= 0;
682 mnt_opts
->prop_flags
= 0;
684 free_disarm(mnt_opts
->data
);
685 free_disarm(mnt_opts
->raw_options
);
688 static inline void put_lxc_rootfs(struct lxc_rootfs
*rootfs
, bool unpin
)
691 close_prot_errno_disarm(rootfs
->dfd_host
);
692 close_prot_errno_disarm(rootfs
->dfd_mnt
);
693 close_prot_errno_disarm(rootfs
->dfd_dev
);
695 close_prot_errno_disarm(rootfs
->fd_path_pin
);
696 close_prot_errno_disarm(rootfs
->dfd_idmapped
);
697 put_lxc_mount_options(&rootfs
->mnt_opts
);
698 storage_put(rootfs
->storage
);
699 rootfs
->storage
= NULL
;
703 static inline void lxc_clear_cgroup2_devices(struct bpf_devices
*bpf_devices
)
705 struct device_item
*device
, *n
;
707 list_for_each_entry_safe(device
, n
, &bpf_devices
->devices
, head
)
708 list_del(&device
->head
);
710 INIT_LIST_HEAD(&bpf_devices
->devices
);
713 static inline int lxc_personality(personality_t persona
)
716 return ret_errno(EINVAL
);
718 return personality(persona
);
721 __hidden
extern int lxc_set_environment(const struct lxc_conf
*conf
);
722 __hidden
extern int parse_cap(const char *cap_name
, __u32
*cap
);
724 #endif /* __LXC_CONF_H */