X-Git-Url: https://git.proxmox.com/?p=pve-container.git;a=blobdiff_plain;f=src%2FPVE%2FLXC.pm;h=89f289e9113de6a4540d03e142807ed65cbfc3ab;hp=58be169edb254202ea7a64e00aacf110122def0f;hb=5a63f1c5d3b995dd682a70e7fbd1364240e09278;hpb=380962c70977fd0576538e01c082f489b80d4add diff --git a/src/PVE/LXC.pm b/src/PVE/LXC.pm index 58be169..89f289e 100644 --- a/src/PVE/LXC.pm +++ b/src/PVE/LXC.pm @@ -10,31 +10,31 @@ use Socket; use File::Path; use File::Spec; use Cwd qw(); -use Fcntl qw(O_RDONLY); +use Fcntl qw(O_RDONLY O_NOFOLLOW O_DIRECTORY); +use Errno qw(ELOOP ENOTDIR EROFS ECONNREFUSED); +use IO::Socket::UNIX; use PVE::Exception qw(raise_perm_exc); use PVE::Storage; use PVE::SafeSyslog; use PVE::INotify; -use PVE::Tools qw($IPV6RE $IPV4RE dir_glob_foreach lock_file lock_file_full); +use PVE::JSONSchema qw(get_standard_option); +use PVE::Tools qw($IPV6RE $IPV4RE dir_glob_foreach lock_file lock_file_full O_PATH); +use PVE::CpuSet; use PVE::Network; use PVE::AccessControl; use PVE::ProcFSTools; +use PVE::Syscall; use PVE::LXC::Config; + use Time::HiRes qw (gettimeofday); -use Data::Dumper; +my $LXC_CONFIG_PATH = '/usr/share/lxc/config'; my $nodename = PVE::INotify::nodename(); my $cpuinfo= PVE::ProcFSTools::read_cpuinfo(); -our $COMMON_TAR_FLAGS = [ '--sparse', '--numeric-owner', '--acls', - '--xattrs', - '--xattrs-include=user.*', - '--xattrs-include=security.capability', - '--warning=no-xattr-write' ]; - sub config_list { my $vmlist = PVE::Cluster::get_vmlist(); my $res = {}; @@ -46,7 +46,7 @@ sub config_list { my $d = $ids->{$vmid}; next if !$d->{node} || $d->{node} ne $nodename; next if !$d->{type} || $d->{type} ne 'lxc'; - $res->{$vmid}->{type} = 'lxc'; + $res->{$vmid} = { type => 'lxc', vmid => $vmid }; } return $res; } @@ -103,9 +103,9 @@ sub get_container_disk_usage { my $last_proc_vmid_stat; my $parse_cpuacct_stat = sub { - my ($vmid) = @_; + my ($vmid, $unprivileged) = @_; - my $raw = read_cgroup_value('cpuacct', $vmid, 'cpuacct.stat', 1); + my $raw = read_cgroup_value('cpuacct', $vmid, $unprivileged, 'cpuacct.stat', 1); my $stat = {}; @@ -119,10 +119,53 @@ my $parse_cpuacct_stat = sub { return $stat; }; +our $vmstatus_return_properties = { + vmid => get_standard_option('pve-vmid'), + status => { + description => "LXC Container status.", + type => 'string', + enum => ['stopped', 'running'], + }, + maxmem => { + description => "Maximum memory in bytes.", + type => 'integer', + optional => 1, + renderer => 'bytes', + }, + maxswap => { + description => "Maximum SWAP memory in bytes.", + type => 'integer', + optional => 1, + renderer => 'bytes', + }, + maxdisk => { + description => "Root disk size in bytes.", + type => 'integer', + optional => 1, + renderer => 'bytes', + }, + name => { + description => "Container name.", + type => 'string', + optional => 1, + }, + uptime => { + description => "Uptime.", + type => 'integer', + optional => 1, + renderer => 'duration', + }, + cpus => { + description => "Maximum usable CPUs.", + type => 'number', + optional => 1, + }, +}; + sub vmstatus { my ($opt_vmid) = @_; - my $list = $opt_vmid ? { $opt_vmid => { type => 'lxc' }} : config_list(); + my $list = $opt_vmid ? { $opt_vmid => { type => 'lxc', vmid => $opt_vmid }} : config_list(); my $active_hash = list_active_containers(); @@ -131,6 +174,9 @@ sub vmstatus { my $cdtime = gettimeofday; my $uptime = (PVE::ProcFSTools::read_proc_uptime(1))[0]; + my $clock_ticks = POSIX::sysconf(&POSIX::_SC_CLK_TCK); + + my $unprivileged = {}; foreach my $vmid (keys %$list) { my $d = $list->{$vmid}; @@ -143,10 +189,13 @@ sub vmstatus { my $cfspath = PVE::LXC::Config->cfs_config_path($vmid); my $conf = PVE::Cluster::cfs_read_file($cfspath) || {}; + $unprivileged->{$vmid} = $conf->{unprivileged}; + $d->{name} = $conf->{'hostname'} || "CT$vmid"; $d->{name} =~ s/[\s]//g; - $d->{cpus} = $conf->{cpulimit} || $cpucount; + $d->{cpus} = $conf->{cores} || $conf->{cpulimit}; + $d->{cpus} = $cpucount if !$d->{cpus}; $d->{lock} = $conf->{lock} || ''; @@ -159,7 +208,7 @@ sub vmstatus { # use 4GB by default ?? if (my $rootfs = $conf->{rootfs}) { my $rootinfo = PVE::LXC::Config->parse_ct_rootfs($rootfs); - $d->{maxdisk} = int(($rootinfo->{size} || 4)*1024*1024)*1024; + $d->{maxdisk} = $rootinfo->{size} || (4*1024*1024*1024); } else { $d->{maxdisk} = 4*1024*1024*1024; } @@ -188,48 +237,67 @@ sub vmstatus { next if !$pid; # skip stopped CTs - my $ctime = (stat("/proc/$pid"))[10]; # 10 = ctime - $d->{uptime} = time - $ctime; # the method lxcfs uses + my $proc_pid_stat = PVE::ProcFSTools::read_proc_pid_stat($pid); + $d->{uptime} = int(($uptime - $proc_pid_stat->{starttime}) / $clock_ticks); # the method lxcfs uses - $d->{mem} = read_cgroup_value('memory', $vmid, 'memory.usage_in_bytes'); - $d->{swap} = read_cgroup_value('memory', $vmid, 'memory.memsw.usage_in_bytes') - $d->{mem}; + my $unpriv = $unprivileged->{$vmid}; - my $blkio_bytes = read_cgroup_value('blkio', $vmid, 'blkio.throttle.io_service_bytes', 1); - my @bytes = split(/\n/, $blkio_bytes); - foreach my $byte (@bytes) { - if (my ($key, $value) = $byte =~ /(Read|Write)\s+(\d+)/) { - $d->{diskread} = $2 if $key eq 'Read'; - $d->{diskwrite} = $2 if $key eq 'Write'; + if (-d '/sys/fs/cgroup/memory') { + my $memory_stat = read_cgroup_list('memory', $vmid, $unpriv, 'memory.stat'); + my $mem_usage_in_bytes = read_cgroup_value('memory', $vmid, $unpriv, 'memory.usage_in_bytes'); + + $d->{mem} = $mem_usage_in_bytes - $memory_stat->{total_cache}; + $d->{swap} = read_cgroup_value('memory', $vmid, $unpriv, 'memory.memsw.usage_in_bytes') - $mem_usage_in_bytes; + } else { + $d->{mem} = 0; + $d->{swap} = 0; + } + + if (-d '/sys/fs/cgroup/blkio') { + my $blkio_bytes = read_cgroup_value('blkio', $vmid, $unpriv, 'blkio.throttle.io_service_bytes', 1); + my @bytes = split(/\n/, $blkio_bytes); + foreach my $byte (@bytes) { + if (my ($key, $value) = $byte =~ /(Read|Write)\s+(\d+)/) { + $d->{diskread} += $2 if $key eq 'Read'; + $d->{diskwrite} += $2 if $key eq 'Write'; + } } + } else { + $d->{diskread} = 0; + $d->{diskwrite} = 0; } - my $pstat = &$parse_cpuacct_stat($vmid); + if (-d '/sys/fs/cgroup/cpuacct') { + my $pstat = $parse_cpuacct_stat->($vmid, $unpriv); - my $used = $pstat->{utime} + $pstat->{stime}; + my $used = $pstat->{utime} + $pstat->{stime}; - my $old = $last_proc_vmid_stat->{$vmid}; - if (!$old) { - $last_proc_vmid_stat->{$vmid} = { - time => $cdtime, - used => $used, - cpu => 0, - }; - next; - } + my $old = $last_proc_vmid_stat->{$vmid}; + if (!$old) { + $last_proc_vmid_stat->{$vmid} = { + time => $cdtime, + used => $used, + cpu => 0, + }; + next; + } - my $dtime = ($cdtime - $old->{time}) * $cpucount * $cpuinfo->{user_hz}; + my $dtime = ($cdtime - $old->{time}) * $cpucount * $cpuinfo->{user_hz}; - if ($dtime > 1000) { - my $dutime = $used - $old->{used}; + if ($dtime > 1000) { + my $dutime = $used - $old->{used}; - $d->{cpu} = (($dutime/$dtime)* $cpucount) / $d->{cpus}; - $last_proc_vmid_stat->{$vmid} = { - time => $cdtime, - used => $used, - cpu => $d->{cpu}, - }; + $d->{cpu} = (($dutime/$dtime)* $cpucount) / $d->{cpus}; + $last_proc_vmid_stat->{$vmid} = { + time => $cdtime, + used => $used, + cpu => $d->{cpu}, + }; + } else { + $d->{cpu} = $old->{cpu}; + } } else { - $d->{cpu} = $old->{cpu}; + $d->{cpu} = 0; } } @@ -250,10 +318,19 @@ sub vmstatus { return $list; } -sub read_cgroup_value { - my ($group, $vmid, $name, $full) = @_; +sub read_cgroup_list($$$$) { + my ($group, $vmid, $unprivileged, $name) = @_; + + my $content = read_cgroup_value($group, $vmid, $unprivileged, $name, 1); + + return { split(/\s+/, $content) }; +} + +sub read_cgroup_value($$$$$) { + my ($group, $vmid, $unprivileged, $name, $full) = @_; - my $path = "/sys/fs/cgroup/$group/lxc/$vmid/$name"; + my $nsdir = $unprivileged ? '' : 'ns/'; + my $path = "/sys/fs/cgroup/$group/lxc/$vmid/${nsdir}$name"; return PVE::Tools::file_get_contents($path) if $full; @@ -323,9 +400,111 @@ sub parse_ipv4_cidr { die "unable to parse ipv4 address/mask\n"; } +sub get_cgroup_subsystems { + my $v1 = {}; + my $v2 = 0; + my $data = PVE::Tools::file_get_contents('/proc/self/cgroup'); + while ($data =~ /^\d+:([^:\n]*):.*$/gm) { + my $type = $1; + if (length($type)) { + $v1->{$_} = 1 foreach split(/,/, $type); + } else { + $v2 = 1; + } + } + return wantarray ? ($v1, $v2) : $v1; +} + +# Currently we do not need to create seccomp profile 'files' as the only +# choice our configuration actually allows is "with or without keyctl()", +# so we distinguish between using lxc's "default" seccomp profile and our +# added pve-userns.seccomp file. +# +# This returns a configuration line added to the raw lxc config. +sub make_seccomp_config { + my ($conf, $unprivileged, $features) = @_; + # User-configured profile has precedence, note that the user's entry would + # be written 'after' this line anyway... + if (PVE::LXC::Config->has_lxc_entry($conf, 'lxc.seccomp.profile')) { + # Warn the user if this conflicts with a feature: + if ($features->{keyctl}) { + warn "explicitly configured lxc.seccomp.profile overrides the following settings: features:keyctl\n"; + } + return ''; + } + + # Privileged containers keep using the default (which is already part of + # the files included via lxc.include, so we don't need to write it out, + # that way it stays admin-configurable via /usr/share/lxc/config/... as + # well) + return '' if !$unprivileged; + + # Unprivileged containers will get keyctl() disabled by default as a + # workaround for systemd-networkd behavior. But we have an option to + # explicitly enable it: + return '' if $features->{keyctl}; + + # Finally we're in an unprivileged container without `keyctl` set + # explicitly. We have a file prepared for this: + return "lxc.seccomp.profile = $LXC_CONFIG_PATH/pve-userns.seccomp\n"; +} + +# Since lxc-3.0.2 we can have lxc generate a profile for the container +# automatically. The default should be equivalent to the old +# `lxc-container-default-cgns` profile. +# +# Additionally this also added `lxc.apparmor.raw` which can be used to inject +# additional lines into the profile. We can use that to allow mounting specific +# file systems. +sub make_apparmor_config { + my ($conf, $unprivileged, $features) = @_; + + # user-configured profile has precedence, but first we go through our own + # code to figure out whether we should warn the user: + + my $raw = "lxc.apparmor.profile = generated\n"; + my @profile_uses; + + # There's lxc.apparmor.allow_nesting now, which will add the necessary + # apparmor lines, create an apparmor namespace for the container, but also + # adds proc and sysfs mounts to /dev/.lxc/{proc,sys}. These do not have + # lxcfs mounted over them, because that would prevent the container from + # mounting new instances of them for nested containers. + if ($features->{nesting}) { + push @profile_uses, 'features:nesting'; + $raw .= "lxc.apparmor.allow_nesting = 1\n" + } else { + # In the default profile in /etc/apparmor.d we patch this in because + # otherwise a container can for example run `chown` on /sys, breaking + # access to it for non-CAP_DAC_OVERRIDE tools on the host: + $raw .= "lxc.apparmor.raw = deny mount -> /proc/,\n"; + $raw .= "lxc.apparmor.raw = deny mount -> /sys/,\n"; + # Preferably we could use the 'remount' flag but this does not sit well + # with apparmor_parser currently: + # mount options=(rw, nosuid, nodev, noexec, remount) -> /sys/, + } + + if (my $mount = $features->{mount}) { + push @profile_uses, 'features:mount'; + foreach my $fs (PVE::Tools::split_list($mount)) { + $raw .= "lxc.apparmor.raw = mount fstype=$fs,\n"; + } + } + + # More to come? + + if (PVE::LXC::Config->has_lxc_entry($conf, 'lxc.apparmor.profile')) { + if (length(my $used = join(', ', @profile_uses))) { + warn "explicitly configured lxc.apparmor.profile overrides the following settings: $used\n"; + } + return ''; + } + + return $raw; +} sub update_lxc_config { - my ($storage_cfg, $vmid, $conf) = @_; + my ($vmid, $conf) = @_; my $dir = "/var/lib/lxc/$vmid"; @@ -341,23 +520,26 @@ sub update_lxc_config { die "missing 'arch' - internal error" if !$conf->{arch}; $raw .= "lxc.arch = $conf->{arch}\n"; - my $unprivileged = $conf->{unprivileged}; - my $custom_idmap = grep { $_->[0] eq 'lxc.id_map' } @{$conf->{lxc}}; + my $custom_idmap = PVE::LXC::Config->has_lxc_entry($conf, 'lxc.idmap'); + my $unprivileged = $conf->{unprivileged} || $custom_idmap; my $ostype = $conf->{ostype} || die "missing 'ostype' - internal error"; - if ($ostype =~ /^(?:debian | ubuntu | centos | fedora | opensuse | archlinux | alpine | unmanaged)$/x) { - my $inc ="/usr/share/lxc/config/$ostype.common.conf"; - $inc ="/usr/share/lxc/config/common.conf" if !-f $inc; + + my $cfgpath = '/usr/share/lxc/config'; + my $inc = "$cfgpath/$ostype.common.conf"; + $inc ="$cfgpath/common.conf" if !-f $inc; + $raw .= "lxc.include = $inc\n"; + if ($unprivileged) { + $inc = "$cfgpath/$ostype.userns.conf"; + $inc = "$cfgpath/userns.conf" if !-f $inc; $raw .= "lxc.include = $inc\n"; - if ($unprivileged || $custom_idmap) { - $inc = "/usr/share/lxc/config/$ostype.userns.conf"; - $inc = "/usr/share/lxc/config/userns.conf" if !-f $inc; - $raw .= "lxc.include = $inc\n" - } - } else { - die "implement me (ostype $ostype)"; } + my $features = PVE::LXC::Config->parse_features($conf->{features}); + + $raw .= make_seccomp_config($conf, $unprivileged, $features); + $raw .= make_apparmor_config($conf, $unprivileged, $features); + # WARNING: DO NOT REMOVE this without making sure that loop device nodes # cannot be exposed to the container with r/w access (cgroup perms). # When this is enabled mounts will still remain in the monitor's namespace @@ -365,71 +547,92 @@ sub update_lxc_config { # files while the container is running! $raw .= "lxc.monitor.unshare = 1\n"; + my $cgv1 = get_cgroup_subsystems(); + # Should we read them from /etc/subuid? if ($unprivileged && !$custom_idmap) { - $raw .= "lxc.id_map = u 0 100000 65536\n"; - $raw .= "lxc.id_map = g 0 100000 65536\n"; + $raw .= "lxc.idmap = u 0 100000 65536\n"; + $raw .= "lxc.idmap = g 0 100000 65536\n"; } if (!PVE::LXC::Config->has_dev_console($conf)) { - $raw .= "lxc.console = none\n"; - $raw .= "lxc.cgroup.devices.deny = c 5:1 rwm\n"; + $raw .= "lxc.console.path = none\n"; + $raw .= "lxc.cgroup.devices.deny = c 5:1 rwm\n" if $cgv1->{devices}; } my $ttycount = PVE::LXC::Config->get_tty_count($conf); - $raw .= "lxc.tty = $ttycount\n"; + $raw .= "lxc.tty.max = $ttycount\n"; # some init scripts expect a linux terminal (turnkey). $raw .= "lxc.environment = TERM=linux\n"; my $utsname = $conf->{hostname} || "CT$vmid"; - $raw .= "lxc.utsname = $utsname\n"; + $raw .= "lxc.uts.name = $utsname\n"; - my $memory = $conf->{memory} || 512; - my $swap = $conf->{swap} // 0; + if ($cgv1->{memory}) { + my $memory = $conf->{memory} || 512; + my $swap = $conf->{swap} // 0; - my $lxcmem = int($memory*1024*1024); - $raw .= "lxc.cgroup.memory.limit_in_bytes = $lxcmem\n"; + my $lxcmem = int($memory*1024*1024); + $raw .= "lxc.cgroup.memory.limit_in_bytes = $lxcmem\n"; - my $lxcswap = int(($memory + $swap)*1024*1024); - $raw .= "lxc.cgroup.memory.memsw.limit_in_bytes = $lxcswap\n"; + my $lxcswap = int(($memory + $swap)*1024*1024); + $raw .= "lxc.cgroup.memory.memsw.limit_in_bytes = $lxcswap\n"; + } + + if ($cgv1->{cpu}) { + if (my $cpulimit = $conf->{cpulimit}) { + $raw .= "lxc.cgroup.cpu.cfs_period_us = 100000\n"; + my $value = int(100000*$cpulimit); + $raw .= "lxc.cgroup.cpu.cfs_quota_us = $value\n"; + } - if (my $cpulimit = $conf->{cpulimit}) { - $raw .= "lxc.cgroup.cpu.cfs_period_us = 100000\n"; - my $value = int(100000*$cpulimit); - $raw .= "lxc.cgroup.cpu.cfs_quota_us = $value\n"; + my $shares = $conf->{cpuunits} || 1024; + $raw .= "lxc.cgroup.cpu.shares = $shares\n"; } - my $shares = $conf->{cpuunits} || 1024; - $raw .= "lxc.cgroup.cpu.shares = $shares\n"; + die "missing 'rootfs' configuration\n" + if !defined($conf->{rootfs}); my $mountpoint = PVE::LXC::Config->parse_ct_rootfs($conf->{rootfs}); - $raw .= "lxc.rootfs = $dir/rootfs\n"; + $raw .= "lxc.rootfs.path = $dir/rootfs\n"; - my $netcount = 0; - foreach my $k (keys %$conf) { + foreach my $k (sort keys %$conf) { next if $k !~ m/^net(\d+)$/; my $ind = $1; my $d = PVE::LXC::Config->parse_lxc_network($conf->{$k}); - $netcount++; - $raw .= "lxc.network.type = veth\n"; - $raw .= "lxc.network.veth.pair = veth${vmid}i${ind}\n"; - $raw .= "lxc.network.hwaddr = $d->{hwaddr}\n" if defined($d->{hwaddr}); - $raw .= "lxc.network.name = $d->{name}\n" if defined($d->{name}); - $raw .= "lxc.network.mtu = $d->{mtu}\n" if defined($d->{mtu}); + $raw .= "lxc.net.$ind.type = veth\n"; + $raw .= "lxc.net.$ind.veth.pair = veth${vmid}i${ind}\n"; + $raw .= "lxc.net.$ind.hwaddr = $d->{hwaddr}\n" if defined($d->{hwaddr}); + $raw .= "lxc.net.$ind.name = $d->{name}\n" if defined($d->{name}); + $raw .= "lxc.net.$ind.mtu = $d->{mtu}\n" if defined($d->{mtu}); } - if (my $lxcconf = $conf->{lxc}) { - foreach my $entry (@$lxcconf) { - my ($k, $v) = @$entry; - $netcount++ if $k eq 'lxc.network.type'; - $raw .= "$k = $v\n"; + if ($cgv1->{cpuset}) { + my $had_cpuset = 0; + if (my $lxcconf = $conf->{lxc}) { + foreach my $entry (@$lxcconf) { + my ($k, $v) = @$entry; + $had_cpuset = 1 if $k eq 'lxc.cgroup.cpuset.cpus'; + $raw .= "$k = $v\n"; + } + } + + my $cores = $conf->{cores}; + if (!$had_cpuset && $cores) { + my $cpuset = eval { PVE::CpuSet->new_from_cgroup('lxc', 'effective_cpus') }; + $cpuset = PVE::CpuSet->new_from_cgroup('', 'effective_cpus') if !$cpuset; + my @members = $cpuset->members(); + while (scalar(@members) > $cores) { + my $randidx = int(rand(scalar(@members))); + $cpuset->delete($members[$randidx]); + splice(@members, $randidx, 1); # keep track of the changes + } + $raw .= "lxc.cgroup.cpuset.cpus = ".$cpuset->short_string()."\n"; } } - $raw .= "lxc.network.type = empty\n" if !$netcount; - File::Path::mkpath("$dir/rootfs"); PVE::Tools::file_set_contents("$dir/config", $raw); @@ -461,19 +664,24 @@ sub verify_searchdomain_list { } sub get_console_command { - my ($vmid, $conf) = @_; + my ($vmid, $conf, $noescapechar) = @_; my $cmode = PVE::LXC::Config->get_cmode($conf); + my $cmd = []; if ($cmode eq 'console') { - return ['lxc-console', '-n', $vmid, '-t', 0]; + push @$cmd, 'lxc-console', '-n', $vmid, '-t', 0; + push @$cmd, '-e', -1 if $noescapechar; } elsif ($cmode eq 'tty') { - return ['lxc-console', '-n', $vmid]; + push @$cmd, 'lxc-console', '-n', $vmid; + push @$cmd, '-e', -1 if $noescapechar; } elsif ($cmode eq 'shell') { - return ['lxc-attach', '--clear-env', '-n', $vmid]; + push @$cmd, 'lxc-attach', '--clear-env', '-n', $vmid; } else { die "internal error"; } + + return $cmd; } sub get_primary_ips { @@ -514,7 +722,7 @@ sub delete_mountpoint_volume { } sub destroy_lxc_container { - my ($storage_cfg, $vmid, $conf) = @_; + my ($storage_cfg, $vmid, $conf, $replacement_conf) = @_; PVE::LXC::Config->foreach_mountpoint($conf, sub { my ($ms, $mountpoint) = @_; @@ -524,7 +732,11 @@ sub destroy_lxc_container { rmdir "/var/lib/lxc/$vmid/rootfs"; unlink "/var/lib/lxc/$vmid/config"; rmdir "/var/lib/lxc/$vmid"; - destroy_config($vmid); + if (defined $replacement_conf) { + PVE::LXC::Config->write_config($vmid, $replacement_conf); + } else { + destroy_config($vmid); + } #my $cmd = ['lxc-destroy', '-n', $vmid ]; #PVE::Tools::run_command($cmd); @@ -673,9 +885,10 @@ sub update_ipconfig { my $newip = $newnet->{$ip}; my $newgw = $newnet->{$gw}; my $oldip = $optdata->{$ip}; + my $oldgw = $optdata->{$gw}; my $change_ip = &$safe_string_ne($oldip, $newip); - my $change_gw = &$safe_string_ne($optdata->{$gw}, $newgw); + my $change_gw = &$safe_string_ne($oldgw, $newgw); return if !$change_ip && !$change_gw; @@ -718,6 +931,11 @@ sub update_ipconfig { # warn and continue warn $@ if $@; } + if ($oldgw && $oldip && !PVE::Network::is_ip_in_cidr($oldgw, $oldip)) { + eval { &$ipcmd($family_opt, 'route', 'del', $oldgw, 'dev', $eth); }; + # warn if the route was deleted manually + warn $@ if $@; + } } # from this point on we save the configuration @@ -832,17 +1050,26 @@ sub template_create { my $storecfg = PVE::Storage::config(); - my $rootinfo = PVE::LXC::Config->parse_ct_rootfs($conf->{rootfs}); - my $volid = $rootinfo->{volume}; + PVE::LXC::Config->foreach_mountpoint($conf, sub { + my ($ms, $mountpoint) = @_; - die "Template feature is not available for '$volid'\n" - if !PVE::Storage::volume_has_feature($storecfg, 'template', $volid); + my $volid = $mountpoint->{volume}; + + die "Template feature is not available for '$volid'\n" + if !PVE::Storage::volume_has_feature($storecfg, 'template', $volid); + }); - PVE::Storage::activate_volumes($storecfg, [$volid]); + PVE::LXC::Config->foreach_mountpoint($conf, sub { + my ($ms, $mountpoint) = @_; - my $template_volid = PVE::Storage::vdisk_create_base($storecfg, $volid); - $rootinfo->{volume} = $template_volid; - $conf->{rootfs} = PVE::LXC::Config->print_ct_mountpoint($rootinfo, 1); + my $volid = $mountpoint->{volume}; + + PVE::Storage::activate_volumes($storecfg, [$volid]); + + my $template_volid = PVE::Storage::vdisk_create_base($storecfg, $volid); + $mountpoint->{volume} = $template_volid; + $conf->{$ms} = PVE::LXC::Config->print_ct_mountpoint($mountpoint, $ms eq "rootfs"); + }); PVE::LXC::Config->write_config($vmid, $conf); } @@ -854,19 +1081,23 @@ sub check_ct_modify_config_perm { my $check = sub { my ($opt, $delete) = @_; - if ($opt eq 'cpus' || $opt eq 'cpuunits' || $opt eq 'cpulimit') { + if ($opt eq 'cores' || $opt eq 'cpuunits' || $opt eq 'cpulimit') { $rpcenv->check_vm_perm($authuser, $vmid, $pool, ['VM.Config.CPU']); } elsif ($opt eq 'rootfs' || $opt =~ /^mp\d+$/) { $rpcenv->check_vm_perm($authuser, $vmid, $pool, ['VM.Config.Disk']); return if $delete; my $data = $opt eq 'rootfs' ? PVE::LXC::Config->parse_ct_rootfs($newconf->{$opt}) : PVE::LXC::Config->parse_ct_mountpoint($newconf->{$opt}); - raise_perm_exc("mountpoint type $data->{type}") if $data->{type} ne 'volume'; + raise_perm_exc("mount point type $data->{type} is only allowed for root\@pam") + if $data->{type} ne 'volume'; } elsif ($opt eq 'memory' || $opt eq 'swap') { $rpcenv->check_vm_perm($authuser, $vmid, $pool, ['VM.Config.Memory']); } elsif ($opt =~ m/^net\d+$/ || $opt eq 'nameserver' || $opt eq 'searchdomain' || $opt eq 'hostname') { $rpcenv->check_vm_perm($authuser, $vmid, $pool, ['VM.Config.Network']); + } elsif ($opt eq 'features') { + # For now this is restricted to root@pam + raise_perm_exc("changing feature flags is only allowed for root\@pam"); } else { $rpcenv->check_vm_perm($authuser, $vmid, $pool, ['VM.Config.Options']); } @@ -915,7 +1146,7 @@ sub umount_all { } sub mount_all { - my ($vmid, $storage_cfg, $conf) = @_; + my ($vmid, $storage_cfg, $conf, $ignore_ro) = @_; my $rootdir = "/var/lib/lxc/$vmid/rootfs"; File::Path::make_path($rootdir); @@ -927,6 +1158,8 @@ sub mount_all { PVE::LXC::Config->foreach_mountpoint($conf, sub { my ($ms, $mountpoint) = @_; + $mountpoint->{ro} = 0 if $ignore_ro; + mountpoint_mount($mountpoint, $rootdir, $storage_cfg); }); }; @@ -946,15 +1179,6 @@ sub mountpoint_mount_path { return mountpoint_mount($mountpoint, undef, $storage_cfg, $snapname); } -my $check_mount_path = sub { - my ($path) = @_; - $path = File::Spec->canonpath($path); - my $real = Cwd::realpath($path); - if ($real ne $path) { - die "mount path modified by symlink: $path != $real"; - } -}; - sub query_loopdev { my ($path) = @_; my $found; @@ -998,7 +1222,105 @@ sub run_with_loopdev { return $device; } -sub bindmount { +# In scalar mode: returns a file handle to the deepest directory node. +# In list context: returns a list of: +# * the deepest directory node +# * the 2nd deepest directory (parent of the above) +# * directory name of the last directory +# So that the path $2/$3 should lead to $1 afterwards. +sub walk_tree_nofollow($$$) { + my ($start, $subdir, $mkdir) = @_; + + # splitdir() returns '' for empty components including the leading / + my @comps = grep { length($_)>0 } File::Spec->splitdir($subdir); + + sysopen(my $fd, $start, O_PATH | O_DIRECTORY) + or die "failed to open start directory $start: $!\n"; + + my $dir = $start; + my $last_component = undef; + my $second = $fd; + foreach my $component (@comps) { + $dir .= "/$component"; + my $next = PVE::Tools::openat(fileno($fd), $component, O_NOFOLLOW | O_DIRECTORY); + + if (!$next) { + # failed, check for symlinks and try to create the path + die "symlink encountered at: $dir\n" if $! == ELOOP || $! == ENOTDIR; + die "cannot open directory $dir: $!\n" if !$mkdir; + + # We don't check for errors on mkdirat() here and just try to + # openat() again, since at least one error (EEXIST) is an + # expected possibility if multiple containers start + # simultaneously. If someone else injects a symlink now then + # the subsequent openat() will fail due to O_NOFOLLOW anyway. + PVE::Tools::mkdirat(fileno($fd), $component, 0755); + + $next = PVE::Tools::openat(fileno($fd), $component, O_NOFOLLOW | O_DIRECTORY); + die "failed to create path: $dir: $!\n" if !$next; + } + + close $second if defined($last_component); + $last_component = $component; + $second = $fd; + $fd = $next; + } + + return ($fd, defined($last_component) && $second, $last_component) if wantarray; + close $second if defined($last_component); + return $fd; +} + +# To guard against symlink attack races against other currently running +# containers with shared recursive bind mount hierarchies we prepare a +# directory handle for the directory we're mounting over to verify the +# mountpoint afterwards. +sub __bindmount_prepare { + my ($hostroot, $dir) = @_; + my $srcdh = walk_tree_nofollow($hostroot, $dir, 0); + return $srcdh; +} + +# Assuming we mount to rootfs/a/b/c, verify with the directory handle to 'b' +# ($parentfd) that 'b/c' (openat($parentfd, 'c')) really leads to the directory +# we intended to bind mount. +sub __bindmount_verify { + my ($srcdh, $parentfd, $last_dir, $ro) = @_; + my $destdh; + if ($parentfd) { + # Open the mount point path coming from the parent directory since the + # filehandle we would have gotten as first result of walk_tree_nofollow + # earlier is still a handle to the underlying directory instead of the + # mounted path. + $destdh = PVE::Tools::openat(fileno($parentfd), $last_dir, PVE::Tools::O_PATH | O_NOFOLLOW | O_DIRECTORY); + die "failed to open mount point: $!\n" if !$destdh; + if ($ro) { + my $dot = '.'; + # no separate function because 99% of the time it's the wrong thing to use. + if (syscall(PVE::Syscall::faccessat, fileno($destdh), $dot, &POSIX::W_OK, 0) != -1) { + die "failed to mark bind mount read only\n"; + } + die "read-only check failed: $!\n" if $! != EROFS; + } + } else { + # For the rootfs we don't have a parentfd so we open the path directly. + # Note that this means bindmounting any prefix of the host's + # /var/lib/lxc/$vmid path into another container is considered a grave + # security error. + sysopen $destdh, $last_dir, O_PATH | O_DIRECTORY; + die "failed to open mount point: $!\n" if !$destdh; + } + + my ($srcdev, $srcinode) = stat($srcdh); + my ($dstdev, $dstinode) = stat($destdh); + close $srcdh; + close $destdh; + + return ($srcdev == $dstdev && $srcinode == $dstinode); +} + +# Perform the actual bind mounting: +sub __bindmount_do { my ($dir, $dest, $ro, @extra_opts) = @_; PVE::Tools::run_command(['mount', '-o', 'bind', @extra_opts, $dir, $dest]); if ($ro) { @@ -1012,6 +1334,31 @@ sub bindmount { } } +sub bindmount { + my ($dir, $parentfd, $last_dir, $dest, $ro, @extra_opts) = @_; + + my $srcdh = __bindmount_prepare('/', $dir); + + __bindmount_do($dir, $dest, $ro, @extra_opts); + + if (!__bindmount_verify($srcdh, $parentfd, $last_dir, $ro)) { + PVE::Tools::run_command(['umount', $dest]); + die "detected mount path change at: $dir\n"; + } +} + +# Cleanup $rootdir a bit (double and trailing slashes), build the mount path +# from $rootdir and $mount and walk the path from $rootdir to the final +# directory to check for symlinks. +sub __mount_prepare_rootdir { + my ($rootdir, $mount) = @_; + $rootdir =~ s!/+!/!g; + $rootdir =~ s!/+$!!; + my $mount_path = "$rootdir/$mount"; + my ($mpfd, $parentfd, $last_dir) = walk_tree_nofollow($rootdir, $mount, 1); + return ($rootdir, $mount_path, $mpfd, $parentfd, $last_dir); +} + # use $rootdir = undef to just return the corresponding mount path sub mountpoint_mount { my ($mountpoint, $rootdir, $storage_cfg, $snapname) = @_; @@ -1024,14 +1371,14 @@ sub mountpoint_mount { return if !$volid || !$mount; + $mount =~ s!/+!/!g; + my $mount_path; + my ($mpfd, $parentfd, $last_dir); if (defined($rootdir)) { - $rootdir =~ s!/+$!!; - $mount_path = "$rootdir/$mount"; - $mount_path =~ s!/+!/!g; - &$check_mount_path($mount_path); - File::Path::mkpath($mount_path); + ($rootdir, $mount_path, $mpfd, $parentfd, $last_dir) = + __mount_prepare_rootdir($rootdir, $mount); } my ($storage, $volname) = PVE::Storage::parse_volume_id($volid, 1); @@ -1039,16 +1386,24 @@ sub mountpoint_mount { die "unknown snapshot path for '$volid'" if !$storage && defined($snapname); my $optstring = ''; - if (defined($mountpoint->{acl})) { - $optstring .= ($mountpoint->{acl} ? 'acl' : 'noacl'); + my $acl = $mountpoint->{acl}; + if (defined($acl)) { + $optstring .= ($acl ? 'acl' : 'noacl'); } my $readonly = $mountpoint->{ro}; - my @extra_opts = ('-o', $optstring); + my @extra_opts; + @extra_opts = ('-o', $optstring) if $optstring; if ($storage) { my $scfg = PVE::Storage::storage_config($storage_cfg, $storage); + + # early sanity checks: + # we otherwise call realpath on the rbd url + die "containers on rbd storage without krbd are not supported\n" + if $scfg->{type} eq 'rbd' && !$scfg->{krbd}; + my $path = PVE::Storage::path($storage_cfg, $volid, $snapname); my ($vtype, undef, undef, undef, undef, $isBase, $format) = @@ -1067,12 +1422,25 @@ sub mountpoint_mount { die "cannot mount subvol snapshots for storage type '$scfg->{type}'\n"; } } else { - bindmount($path, $mount_path, $readonly, @extra_opts); + if (defined($acl) && $scfg->{type} eq 'zfspool') { + my $acltype = ($acl ? 'acltype=posixacl' : 'acltype=noacl'); + my (undef, $name) = PVE::Storage::parse_volname($storage_cfg, $volid); + $name .= "\@$snapname" if defined($snapname); + PVE::Tools::run_command(['zfs', 'set', $acltype, "$scfg->{pool}/$name"]); + } + bindmount($path, $parentfd, $last_dir//$rootdir, $mount_path, $readonly, @extra_opts); warn "cannot enable quota control for bind mounted subvolumes\n" if $quota; } } - return wantarray ? ($path, 0, $mounted_dev) : $path; + return wantarray ? ($path, 0, undef) : $path; } elsif ($format eq 'raw' || $format eq 'iso') { + # NOTE: 'mount' performs canonicalization without the '-c' switch, which for + # device-mapper devices is special-cased to use the /dev/mapper symlinks. + # Our autodev hook expects the /dev/dm-* device currently + # and will create the /dev/mapper symlink accordingly + $path = Cwd::realpath($path); + die "failed to get device path\n" if !$path; + ($path) = ($path =~ /^(.*)$/s); #untaint my $domount = sub { my ($path) = @_; if ($mount_path) { @@ -1105,13 +1473,15 @@ sub mountpoint_mount { die "unsupported image format '$format'\n"; } } elsif ($type eq 'device') { - push @extra_opts, '-o', 'ro' if $readonly; + push @extra_opts, '-o', 'ro' if $readonly; + push @extra_opts, '-o', 'usrjquota=aquota.user,grpjquota=aquota.group,jqfmt=vfsv0' if $quota; + # See the NOTE above about devicemapper canonicalization + my ($devpath) = (Cwd::realpath($volid) =~ /^(.*)$/s); # realpath() taints PVE::Tools::run_command(['mount', @extra_opts, $volid, $mount_path]) if $mount_path; - return wantarray ? ($volid, 0, $volid) : $volid; + return wantarray ? ($volid, 0, $devpath) : $volid; } elsif ($type eq 'bind') { die "directory '$volid' does not exist\n" if ! -d $volid; - &$check_mount_path($volid); - bindmount($volid, $mount_path, $readonly, @extra_opts) if $mount_path; + bindmount($volid, $parentfd, $last_dir//$rootdir, $mount_path, $readonly, @extra_opts) if $mount_path; warn "cannot enable quota control for bind mounts\n" if $quota; return wantarray ? ($volid, 0, undef) : $volid; } @@ -1161,6 +1531,60 @@ sub destroy_disks { } } +sub alloc_disk { + my ($storecfg, $vmid, $storage, $size_kb, $rootuid, $rootgid) = @_; + + my $needs_chown = 0; + my $volid; + + my $scfg = PVE::Storage::storage_config($storecfg, $storage); + # fixme: use better naming ct-$vmid-disk-X.raw? + + eval { + my $do_format = 0; + if ($scfg->{type} eq 'dir' || $scfg->{type} eq 'nfs' || $scfg->{type} eq 'cifs' ) { + if ($size_kb > 0) { + $volid = PVE::Storage::vdisk_alloc($storecfg, $storage, $vmid, 'raw', + undef, $size_kb); + $do_format = 1; + } else { + $volid = PVE::Storage::vdisk_alloc($storecfg, $storage, $vmid, 'subvol', + undef, 0); + $needs_chown = 1; + } + } elsif ($scfg->{type} eq 'zfspool') { + + $volid = PVE::Storage::vdisk_alloc($storecfg, $storage, $vmid, 'subvol', + undef, $size_kb); + $needs_chown = 1; + } elsif ($scfg->{type} eq 'drbd' || $scfg->{type} eq 'lvm' || $scfg->{type} eq 'lvmthin') { + + $volid = PVE::Storage::vdisk_alloc($storecfg, $storage, $vmid, 'raw', undef, $size_kb); + $do_format = 1; + + } elsif ($scfg->{type} eq 'rbd') { + + die "krbd option must be enabled on storage type '$scfg->{type}'\n" if !$scfg->{krbd}; + $volid = PVE::Storage::vdisk_alloc($storecfg, $storage, $vmid, 'raw', undef, $size_kb); + $do_format = 1; + } else { + die "unable to create containers on storage type '$scfg->{type}'\n"; + } + format_disk($storecfg, $volid, $rootuid, $rootgid) if $do_format; + }; + if (my $err = $@) { + # in case formatting got interrupted: + if (defined($volid)) { + eval { PVE::Storage::vdisk_free($storecfg, $volid); }; + warn $@ if $@; + } + die $err; + } + + return ($volid, $needs_chown); +} + +our $NEW_DISK_RE = qr/^([^:\s]+):(\d+(\.\d+)?)$/; sub create_disks { my ($storecfg, $vmid, $settings, $conf) = @_; @@ -1178,42 +1602,14 @@ sub create_disks { my ($storage, $volname) = PVE::Storage::parse_volume_id($volid, 1); - if ($storage && ($volid =~ m/^([^:\s]+):(\d+(\.\d+)?)$/)) { + if ($storage && ($volid =~ $NEW_DISK_RE)) { my ($storeid, $size_gb) = ($1, $2); my $size_kb = int(${size_gb}*1024) * 1024; - my $scfg = PVE::Storage::storage_config($storecfg, $storage); - # fixme: use better naming ct-$vmid-disk-X.raw? - - if ($scfg->{type} eq 'dir' || $scfg->{type} eq 'nfs') { - if ($size_kb > 0) { - $volid = PVE::Storage::vdisk_alloc($storecfg, $storage, $vmid, 'raw', - undef, $size_kb); - format_disk($storecfg, $volid, $rootuid, $rootgid); - } else { - $volid = PVE::Storage::vdisk_alloc($storecfg, $storage, $vmid, 'subvol', - undef, 0); - push @$chown_vollist, $volid; - } - } elsif ($scfg->{type} eq 'zfspool') { - - $volid = PVE::Storage::vdisk_alloc($storecfg, $storage, $vmid, 'subvol', - undef, $size_kb); - push @$chown_vollist, $volid; - } elsif ($scfg->{type} eq 'drbd' || $scfg->{type} eq 'lvm' || $scfg->{type} eq 'lvmthin') { - - $volid = PVE::Storage::vdisk_alloc($storecfg, $storage, $vmid, 'raw', undef, $size_kb); - format_disk($storecfg, $volid, $rootuid, $rootgid); - - } elsif ($scfg->{type} eq 'rbd') { - - die "krbd option must be enabled on storage type '$scfg->{type}'\n" if !$scfg->{krbd}; - $volid = PVE::Storage::vdisk_alloc($storecfg, $storage, $vmid, 'raw', undef, $size_kb); - format_disk($storecfg, $volid, $rootuid, $rootgid); - } else { - die "unable to create containers on storage type '$scfg->{type}'\n"; - } + my $needs_chown = 0; + ($volid, $needs_chown) = alloc_disk($storecfg, $vmid, $storage, $size_kb, $rootuid, $rootgid); + push @$chown_vollist, $volid if $needs_chown; push @$vollist, $volid; $mountpoint->{volume} = $volid; $mountpoint->{size} = $size_kb * 1024; @@ -1309,7 +1705,8 @@ sub parse_id_maps { my $lxc = $conf->{lxc}; foreach my $entry (@$lxc) { my ($key, $value) = @$entry; - next if $key ne 'lxc.id_map'; + # FIXME: remove the 'id_map' variant when lxc-3.0 arrives + next if $key ne 'lxc.idmap' && $key ne 'lxc.id_map'; if ($value =~ /^([ug])\s+(\d+)\s+(\d+)\s+(\d+)\s*$/) { my ($type, $ct, $host, $length) = ($1, $2, $3, $4); push @$id_map, [$type, $ct, $host, $length]; @@ -1318,7 +1715,7 @@ sub parse_id_maps { $rootgid = $host if $type eq 'g'; } } else { - die "failed to parse id_map: $value\n"; + die "failed to parse idmap: $value\n"; } } @@ -1340,5 +1737,163 @@ sub userns_command { return []; } +sub vm_start { + my ($vmid, $conf, $skiplock) = @_; + + update_lxc_config($vmid, $conf); + + my $skiplock_flag_fn = "/run/lxc/skiplock-$vmid"; + + if ($skiplock) { + open(my $fh, '>', $skiplock_flag_fn) || die "failed to open $skiplock_flag_fn for writing: $!\n"; + close($fh); + } + + my $cmd = ['systemctl', 'start', "pve-container\@$vmid"]; + + eval { PVE::Tools::run_command($cmd); }; + if (my $err = $@) { + unlink $skiplock_flag_fn; + die $err; + } + + return; +} + +# Helper to stop a container completely and make sure it has stopped completely. +# This is necessary because we want the post-stop hook to have completed its +# unmount-all step, but post-stop happens after lxc puts the container into the +# STOPPED state. +sub vm_stop { + my ($vmid, $kill, $shutdown_timeout, $exit_timeout) = @_; + + # Open the container's command socket. + my $path = "\0/var/lib/lxc/$vmid/command"; + my $sock = IO::Socket::UNIX->new( + Type => SOCK_STREAM(), + Peer => $path, + ); + if (!$sock) { + return if $! == ECONNREFUSED; # The container is not running + die "failed to open container ${vmid}'s command socket: $!\n"; + } + + # Stop the container: + + my $cmd = ['lxc-stop', '-n', $vmid]; + + if ($kill) { + push @$cmd, '--kill'; # doesn't allow timeouts + } elsif (defined($shutdown_timeout)) { + push @$cmd, '--timeout', $shutdown_timeout; + # Give run_command 5 extra seconds + $shutdown_timeout += 5; + } + + eval { PVE::Tools::run_command($cmd, timeout => $shutdown_timeout) }; + if (my $err = $@) { + warn $@ if $@; + } + + my $result = 1; + my $wait = sub { $result = <$sock>; }; + if (defined($exit_timeout)) { + PVE::Tools::run_with_timeout($exit_timeout, $wait); + } else { + $wait->(); + } + + return if !defined $result; # monitor is gone and the ct has stopped. + die "container did not stop\n"; +} + +sub run_unshared { + my ($code) = @_; + + return PVE::Tools::run_fork(sub { + # Unshare the mount namespace + die "failed to unshare mount namespace: $!\n" + if !PVE::Tools::unshare(PVE::Tools::CLONE_NEWNS); + PVE::Tools::run_command(['mount', '--make-rslave', '/']); + return $code->(); + }); +} + +my $copy_volume = sub { + my ($src_volid, $src, $dst_volid, $dest, $storage_cfg, $snapname) = @_; + + my $src_mp = { volume => $src_volid, mp => '/' }; + $src_mp->{type} = PVE::LXC::Config->classify_mountpoint($src_volid); + + my $dst_mp = { volume => $dst_volid, mp => '/' }; + $dst_mp->{type} = PVE::LXC::Config->classify_mountpoint($dst_volid); + + my @mounted; + eval { + # mount and copy + mkdir $src; + mountpoint_mount($src_mp, $src, $storage_cfg, $snapname); + push @mounted, $src; + mkdir $dest; + mountpoint_mount($dst_mp, $dest, $storage_cfg); + push @mounted, $dest; + + PVE::Tools::run_command(['/usr/bin/rsync', '--stats', '-X', '-A', '--numeric-ids', + '-aH', '--whole-file', '--sparse', '--one-file-system', + "$src/", $dest]); + }; + my $err = $@; + foreach my $mount (reverse @mounted) { + eval { PVE::Tools::run_command(['/bin/umount', '--lazy', $mount], errfunc => sub{})}; + warn "Can't umount $mount\n" if $@; + } + + # If this fails they're used as mount points in a concurrent operation + # (which should not happen but there's also no real need to get rid of them). + rmdir $dest; + rmdir $src; + + die $err if $err; +}; + +# Should not be called after unsharing the mount namespace! +sub copy_volume { + my ($mp, $vmid, $storage, $storage_cfg, $conf, $snapname) = @_; + + die "cannot copy volumes of type $mp->{type}\n" if $mp->{type} ne 'volume'; + File::Path::make_path("/var/lib/lxc/$vmid"); + my $dest = "/var/lib/lxc/$vmid/.copy-volume-1"; + my $src = "/var/lib/lxc/$vmid/.copy-volume-2"; + + # get id's for unprivileged container + my (undef, $rootuid, $rootgid) = parse_id_maps($conf); + + # Allocate the disk before unsharing in order to make sure zfs subvolumes + # are visible in this namespace, otherwise the host only sees the empty + # (not-mounted) directory. + my $new_volid; + eval { + # Make sure $mp contains a correct size. + $mp->{size} = PVE::Storage::volume_size_info($storage_cfg, $mp->{volume}); + my $needs_chown; + ($new_volid, $needs_chown) = alloc_disk($storage_cfg, $vmid, $storage, $mp->{size}/1024, $rootuid, $rootgid); + if ($needs_chown) { + PVE::Storage::activate_volumes($storage_cfg, [$new_volid], undef); + my $path = PVE::Storage::path($storage_cfg, $new_volid, undef); + chown($rootuid, $rootgid, $path); + } + + run_unshared(sub { + $copy_volume->($mp->{volume}, $src, $new_volid, $dest, $storage_cfg, $snapname); + }); + }; + if (my $err = $@) { + PVE::Storage::vdisk_free($storage_cfg, $new_volid) + if defined($new_volid); + die $err; + } + + return $new_volid; +} 1;