+# Inspect /proc/self/cgroup and report which cgroup controllers are visible.
+#
+# Every line of that file has the form "<id>:<controller-list>:<path>". A
+# non-empty controller list is split on commas and each name is recorded as a
+# key (with value 1) in the first result; an empty controller list sets the
+# second result to 1.
+#
+# Returns ($v1_hashref, $v2_flag) in list context, or just the hash reference
+# in scalar context.
+sub get_cgroup_subsystems {
+    my %v1_controllers;
+    my $has_v2 = 0;
+
+    my $content = PVE::Tools::file_get_contents('/proc/self/cgroup');
+
+    # In list context a /g match with a single capture group yields the
+    # captured controller field of every matching line.
+    my @controller_fields = $content =~ /^\d+:([^:\n]*):.*$/gm;
+
+    for my $field (@controller_fields) {
+        if ($field eq '') {
+            $has_v2 = 1;
+            next;
+        }
+        for my $name (split(/,/, $field)) {
+            $v1_controllers{$name} = 1;
+        }
+    }
+
+    return \%v1_controllers if !wantarray;
+    return (\%v1_controllers, $has_v2);
+}
+
+# Build the seccomp related part of a container's raw lxc configuration.
+#
+# Thanks to seccomp's trap to userspace we can optionally forward certain
+# syscalls to the "host" for handling (via our pve-lxc-syscalld daemon). In
+# some cases this requires an extra seccomp profile for the container to load,
+# which is written to "$conf_dir/rules.seccomp".
+#
+# Parameters:
+#   $conf         - container configuration, checked for a user supplied
+#                   'lxc.seccomp.profile' entry
+#   $vmid         - container id, used as the seccomp notify cookie
+#   $conf_dir     - directory the generated rule file is written to
+#   $unprivileged - true for unprivileged containers
+#   $features     - hash reference of feature flags (keyctl, mknod are used)
+#
+# Returns the configuration snippet to add to the raw lxc config (may be an
+# empty string). Dies if 'mknod' is requested on a kernel older than 5.3.
+sub make_seccomp_config {
+    my ($conf, $vmid, $conf_dir, $unprivileged, $features) = @_;
+
+    # A profile configured by the user wins. Their entry ends up 'after'
+    # anything we would emit here anyway, so emit nothing, but tell them which
+    # feature flags are thereby rendered ineffective.
+    if (PVE::LXC::Config->has_lxc_entry($conf, 'lxc.seccomp.profile')) {
+        my @overridden = grep { $features->{$_} } qw(keyctl mknod);
+        if (@overridden) {
+            my $warn = join(', ', @overridden);
+            warn "explicitly configured lxc.seccomp.profile overrides the following settings: $warn\n";
+        }
+        return '';
+    }
+
+    # Privileged containers stick with the default profile. It is already
+    # pulled in through the files referenced by lxc.include, so there is
+    # nothing to write out and it remains admin-configurable via
+    # /usr/share/lxc/config/...
+    return '' unless $unprivileged;
+
+    my $raw_conf = '';
+    my %rules_for;
+
+    # keyctl() is disabled by default in unprivileged containers as a
+    # workaround for systemd-networkd behavior, unless the feature flag
+    # explicitly enables it.
+    $rules_for{keyctl} = ['errno 38'] if !$features->{keyctl};
+
+    # Unprivileged containers cannot use `mknod` at all by default. Since
+    # lxc 3.2 seccomp's trap to userspace can be used for this, but as it is
+    # still experimental it sits behind a feature flag.
+    # Only block and char devices are handled (like lxd), everything else is
+    # left to the kernel. This may go away if seccomp ever gets a way to tell
+    # the kernel to "continue" a syscall.
+    if ($features->{mknod}) {
+        my ($ok, $kernel) = PVE::ProcFSTools::check_kernel_release(5, 3);
+        die "'mknod' feature requested, but kernel too old (found $kernel, required >= 5.3)\n"
+            if !$ok;
+
+        $raw_conf .= "lxc.seccomp.notify.proxy = unix:/run/pve/lxc-syscalld.sock\n";
+        $raw_conf .= "lxc.seccomp.notify.cookie = $vmid\n";
+
+        # First rule of each pair: (mode & S_IFMT) == S_IFCHR
+        # Second rule of each pair: (mode & S_IFMT) == S_IFBLK
+        $rules_for{mknod} = [
+            'notify [1,8192,SCMP_CMP_MASKED_EQ,61440]',
+            'notify [1,24576,SCMP_CMP_MASKED_EQ,61440]',
+        ];
+        $rules_for{mknodat} = [
+            'notify [2,8192,SCMP_CMP_MASKED_EQ,61440]',
+            'notify [2,24576,SCMP_CMP_MASKED_EQ,61440]',
+        ];
+    }
+
+    # Flatten the table into "<syscall> <action>" lines, ordered by syscall
+    # name.
+    my @rule_lines;
+    for my $syscall (sort keys %rules_for) {
+        for my $action (@{ $rules_for{$syscall} }) {
+            push @rule_lines, "$syscall $action";
+        }
+    }
+
+    # Without any custom rule there is no profile to generate.
+    return $raw_conf if !@rule_lines;
+
+    my $extra_rules = join("\n", @rule_lines) . "\n";
+
+    # The "most common" profile is still shipped ready-made; when there is no
+    # extra $raw_conf and the rules match what pve 6.1 used, point at that
+    # file instead of writing a new one.
+    if ($raw_conf eq '' && $extra_rules eq "keyctl errno 38\n") {
+        return "lxc.seccomp.profile = $LXC_CONFIG_PATH/pve-userns.seccomp\n";
+    }
+
+    # Otherwise store the common rules plus our additions in the container's
+    # config path and reference that file.
+    my $rule_file = "$conf_dir/rules.seccomp";
+    my $common_rules = file_get_contents("$LXC_CONFIG_PATH/common.seccomp");
+    file_set_contents($rule_file, $common_rules . $extra_rules);
+
+    return $raw_conf . "lxc.seccomp.profile = $rule_file\n";
+}
+
+# Build the apparmor related part of a container's raw lxc configuration.
+#
+# Since lxc-3.0.2 lxc can generate a profile for the container automatically;
+# the default should be equivalent to the old `lxc-container-default-cgns`
+# profile. That release also added `lxc.apparmor.raw`, which injects
+# additional lines into the generated profile. We use it to allow mounting
+# specific file systems.
+#
+# Parameters:
+#   $conf         - container configuration, checked for a user supplied
+#                   'lxc.apparmor.profile' entry
+#   $unprivileged - accepted for the caller's convenience, not used here
+#   $features     - hash reference of feature flags (fuse, nesting, mount)
+#
+# Returns the configuration snippet, or an empty string when the user
+# configured their own profile.
+sub make_apparmor_config {
+    my ($conf, $unprivileged, $features) = @_;
+
+    # A user-configured profile has precedence, but we still walk through our
+    # own logic first so that we know which settings to warn about.
+    my @lines = ("lxc.apparmor.profile = generated\n");
+    my @profile_uses;
+
+    # Only relevant for the informational warning:
+    push @profile_uses, 'features:fuse' if $features->{fuse};
+
+    if ($features->{nesting}) {
+        # lxc.apparmor.allow_nesting adds the necessary apparmor lines and
+        # creates an apparmor namespace for the container, but also adds proc
+        # and sysfs mounts to /dev/.lxc/{proc,sys}. Those do not have lxcfs
+        # mounted over them, because that would prevent the container from
+        # mounting new instances of them for nested containers.
+        push @profile_uses, 'features:nesting';
+        push @lines, "lxc.apparmor.allow_nesting = 1\n";
+    } else {
+        # The default profile in /etc/apparmor.d gets this patched in,
+        # because otherwise a container can for example run `chown` on /sys,
+        # breaking access to it for non-CAP_DAC_OVERRIDE tools on the host.
+        # Preferably we could use the 'remount' flag but this does not sit
+        # well with apparmor_parser currently:
+        #   mount options=(rw, nosuid, nodev, noexec, remount) -> /sys/,
+        push @lines,
+            "lxc.apparmor.raw = deny mount -> /proc/,\n",
+            "lxc.apparmor.raw = deny mount -> /sys/,\n";
+    }
+
+    my $mount = $features->{mount};
+    if ($mount) {
+        push @profile_uses, 'features:mount';
+        for my $fs (PVE::Tools::split_list($mount)) {
+            push @lines, "lxc.apparmor.raw = mount fstype=$fs,\n";
+        }
+    }
+
+    # More to come?
+
+    # Without a user supplied profile our generated snippet is used as is.
+    return join('', @lines)
+        if !PVE::LXC::Config->has_lxc_entry($conf, 'lxc.apparmor.profile');
+
+    if (@profile_uses) {
+        my $used = join(', ', @profile_uses);
+        warn "explicitly configured lxc.apparmor.profile overrides the following settings: $used\n";
+    }
+
+    return '';
+}