+# Hotplug a mount point into a running container.
+#
+# Parameters:
+#   $vmid        - container ID, used to look up the container's init process
+#   $conf        - container config, only used here to derive the root uid/gid from the id maps
+#   $opt         - name of the staging directory, also passed on to mountpoint_insert_staged()
+#                  (presumably the mount point's config key - confirm against callers)
+#   $mp          - mount point hash, $mp->{mp} is handed to mountpoint_insert_staged()
+#   $storage_cfg - storage configuration, passed on to mountpoint_stage()
+#
+# All work happens in a sub handed to PVE::Tools::run_fork(). Failures are raised with die
+# inside that sub; how they reach the caller is up to run_fork().
+sub mountpoint_hotplug($$$$$) {
+    my ($vmid, $conf, $opt, $mp, $storage_cfg) = @_;
+
+    my (undef, $rootuid, $rootgid) = PVE::LXC::parse_id_maps($conf);
+
+    # We do the rest in a fork with an unshared mount namespace, because:
+    #  -) we change our apparmor profile to that of /usr/bin/lxc-start
+    #  -) we're now going to 'stage' the mountpoint, then grab it, then move into the
+    #     container's namespace, then mount it.
+
+    PVE::Tools::run_fork(sub {
+        # Pin the container pid longer, we also need to get its monitor/parent.
+        # The pidfds are not used directly below, they are only kept alive in this scope.
+        my ($ct_pid, $ct_pidfd) = open_lxc_pid($vmid)
+            or die "failed to open pidfd of container $vmid\'s init process\n";
+
+        my ($monitor_pid, $monitor_pidfd) = open_ppid($ct_pid)
+            or die "failed to open pidfd of container $vmid\'s monitor process\n";
+
+        my $ct_mnt_ns = $get_container_namespace->($vmid, $ct_pid, 'mnt');
+        my $monitor_mnt_ns = $get_container_namespace->($vmid, $monitor_pid, 'mnt');
+
+        # Grab a file descriptor to our apparmor label file so we can change into the 'lxc-start'
+        # profile to lower our privileges to the same level we have in the start hook:
+        sysopen(my $aa_fd, "/proc/self/attr/current", O_WRONLY)
+            or die "failed to open '/proc/self/attr/current' for writing: $!\n";
+        # But switch namespaces first, to make sure the namespace switches aren't blocked by
+        # apparmor.
+
+        # Change into the monitor's mount namespace. We "pin" the mount into the monitor's
+        # namespace for it to remain active there since the container will be able to unmount
+        # hotplugged mount points and thereby potentially free up loop devices, which is a security
+        # concern.
+        PVE::Tools::setns(fileno($monitor_mnt_ns), PVE::Tools::CLONE_NEWNS);
+        chdir('/')
+            or die "failed to change root directory within the monitor's mount namespace: $!\n";
+
+        my $dir = get_staging_mount_path($opt);
+
+        # Now switch our apparmor profile before mounting:
+        my $data = 'changeprofile /usr/bin/lxc-start';
+        # syswrite() returns undef on error, so check definedness before comparing the length.
+        my $written = syswrite($aa_fd, $data, length($data));
+        if (!defined($written) || $written != length($data)) {
+            die "failed to change apparmor profile: $!\n";
+        }
+        # Check errors on close as well:
+        close($aa_fd)
+            or die "failed to change apparmor profile (close() failed): $!\n";
+
+        my $mount_fd = mountpoint_stage($mp, $dir, $storage_cfg, undef, $rootuid, $rootgid);
+
+        # The staged mount is held via $mount_fd, now enter the container's mount namespace to
+        # insert it there.
+        PVE::Tools::setns(fileno($ct_mnt_ns), PVE::Tools::CLONE_NEWNS);
+        chdir('/')
+            or die "failed to change root directory within the container's mount namespace: $!\n";
+
+        mountpoint_insert_staged($mount_fd, undef, $mp->{mp}, $opt, $rootuid, $rootgid);
+    });
+}
+
+# Ensure the staging directory for one mount point exists and return its path.
+#
+# The directory is named after $opt and is created directly below the path returned by
+# get_staging_tempfs(). An already existing directory is reused, any other mkdir() failure dies.
+sub get_staging_mount_path($) {
+    my ($opt) = @_;
+
+    my $staging_root = get_staging_tempfs();
+    my $target = "$staging_root/$opt";
+
+    # Freshly created, nothing more to do.
+    return $target if mkdir($target);
+
+    # EEXIST only means the directory is already there, which is fine.
+    return $target if $! == EEXIST;
+
+    die "failed to create directory $target: $!\n";
+}
+
+# Make sure the staging directory for hotplugged mount points exists and return its path.
+#
+# When the directory has to be created, a small tmpfs is mounted on top of it. When it already
+# exists it is returned as is, on the assumption that an earlier call already set it up.
+# Dies if the directory cannot be created or the tmpfs cannot be mounted.
+sub get_staging_tempfs() {
+    # We choose a path in /var/lib/lxc/ here because the lxc-start apparmor profile restricts most
+    # mounts to that.
+    my $target = '/var/lib/lxc/.pve-staged-mounts';
+    if (!mkdir($target)) {
+        return $target if $! == EEXIST;
+        die "failed to create directory $target: $!\n";
+    }
+
+    if (!PVE::Tools::mount("none", $target, 'tmpfs', 0, "size=8k,mode=755")) {
+        # Save the mount error before rmdir() can overwrite $!.
+        my $err = $!;
+        # Best effort cleanup: if the directory stayed behind, the next call would hit EEXIST
+        # above and return a path without a tmpfs mounted on it.
+        rmdir($target);
+        die "failed to mount $target as tmpfs: $err\n";
+    }
+
+    return $target;
+}
+