X-Git-Url: https://git.proxmox.com/?a=blobdiff_plain;f=src%2FPVE%2FLXC.pm;h=80d79e19c2bf9dd2323fefe1b50b2037f6dc773a;hb=4d4946646bcc439278bc575ad6ca76fadabd0283;hp=8de026cfac1bed4ed157365f49efe46709381050;hpb=28df2cde69af22f5ff37c885a733745b13691360;p=pve-container.git diff --git a/src/PVE/LXC.pm b/src/PVE/LXC.pm index 8de026c..80d79e1 100644 --- a/src/PVE/LXC.pm +++ b/src/PVE/LXC.pm @@ -10,31 +10,27 @@ use Socket; use File::Path; use File::Spec; use Cwd qw(); -use Fcntl qw(O_RDONLY); +use Fcntl qw(O_RDONLY O_NOFOLLOW O_DIRECTORY); +use Errno qw(ELOOP ENOTDIR EROFS ECONNREFUSED); +use IO::Socket::UNIX; use PVE::Exception qw(raise_perm_exc); use PVE::Storage; use PVE::SafeSyslog; use PVE::INotify; -use PVE::Tools qw($IPV6RE $IPV4RE dir_glob_foreach lock_file lock_file_full); +use PVE::Tools qw($IPV6RE $IPV4RE dir_glob_foreach lock_file lock_file_full O_PATH); +use PVE::CpuSet; use PVE::Network; use PVE::AccessControl; use PVE::ProcFSTools; +use PVE::Syscall; use PVE::LXC::Config; use Time::HiRes qw (gettimeofday); -use Data::Dumper; - my $nodename = PVE::INotify::nodename(); my $cpuinfo= PVE::ProcFSTools::read_cpuinfo(); -our $COMMON_TAR_FLAGS = [ '--sparse', '--numeric-owner', '--acls', - '--xattrs', - '--xattrs-include=user.*', - '--xattrs-include=security.capability', - '--warning=no-xattr-write' ]; - sub config_list { my $vmlist = PVE::Cluster::get_vmlist(); my $res = {}; @@ -103,9 +99,9 @@ sub get_container_disk_usage { my $last_proc_vmid_stat; my $parse_cpuacct_stat = sub { - my ($vmid) = @_; + my ($vmid, $unprivileged) = @_; - my $raw = read_cgroup_value('cpuacct', $vmid, 'cpuacct.stat', 1); + my $raw = read_cgroup_value('cpuacct', $vmid, $unprivileged, 'cpuacct.stat', 1); my $stat = {}; @@ -132,6 +128,8 @@ sub vmstatus { my $uptime = (PVE::ProcFSTools::read_proc_uptime(1))[0]; + my $unprivileged = {}; + foreach my $vmid (keys %$list) { my $d = $list->{$vmid}; @@ -143,10 +141,13 @@ sub vmstatus { my $cfspath = PVE::LXC::Config->cfs_config_path($vmid); my $conf = PVE::Cluster::cfs_read_file($cfspath) || {}; + $unprivileged->{$vmid} = $conf->{unprivileged}; + $d->{name} = $conf->{'hostname'} || "CT$vmid"; $d->{name} =~ s/[\s]//g; - $d->{cpus} = $conf->{cpulimit} || $cpucount; + $d->{cpus} = $conf->{cores} || $conf->{cpulimit}; + $d->{cpus} = $cpucount if !$d->{cpus}; $d->{lock} = $conf->{lock} || ''; @@ -159,7 +160,7 @@ sub vmstatus { # use 4GB by default ?? if (my $rootfs = $conf->{rootfs}) { my $rootinfo = PVE::LXC::Config->parse_ct_rootfs($rootfs); - $d->{maxdisk} = int(($rootinfo->{size} || 4)*1024*1024)*1024; + $d->{maxdisk} = $rootinfo->{size} || (4*1024*1024*1024); } else { $d->{maxdisk} = 4*1024*1024*1024; } @@ -191,19 +192,24 @@ sub vmstatus { my $ctime = (stat("/proc/$pid"))[10]; # 10 = ctime $d->{uptime} = time - $ctime; # the method lxcfs uses - $d->{mem} = read_cgroup_value('memory', $vmid, 'memory.usage_in_bytes'); - $d->{swap} = read_cgroup_value('memory', $vmid, 'memory.memsw.usage_in_bytes') - $d->{mem}; + my $unpriv = $unprivileged->{$vmid}; - my $blkio_bytes = read_cgroup_value('blkio', $vmid, 'blkio.throttle.io_service_bytes', 1); + my $memory_stat = read_cgroup_list('memory', $vmid, $unpriv, 'memory.stat'); + my $mem_usage_in_bytes = read_cgroup_value('memory', $vmid, $unpriv, 'memory.usage_in_bytes'); + + $d->{mem} = $mem_usage_in_bytes - $memory_stat->{total_cache}; + $d->{swap} = read_cgroup_value('memory', $vmid, $unpriv, 'memory.memsw.usage_in_bytes') - $mem_usage_in_bytes; + + my $blkio_bytes = read_cgroup_value('blkio', $vmid, $unpriv, 'blkio.throttle.io_service_bytes', 1); my @bytes = split(/\n/, $blkio_bytes); foreach my $byte (@bytes) { if (my ($key, $value) = $byte =~ /(Read|Write)\s+(\d+)/) { - $d->{diskread} = $2 if $key eq 'Read'; - $d->{diskwrite} = $2 if $key eq 'Write'; + $d->{diskread} += $2 if $key eq 'Read'; + $d->{diskwrite} += $2 if $key eq 'Write'; } } - my $pstat = &$parse_cpuacct_stat($vmid); + my $pstat = $parse_cpuacct_stat->($vmid, $unpriv); my $used = $pstat->{utime} + $pstat->{stime}; @@ -250,10 +256,19 @@ sub vmstatus { return $list; } -sub read_cgroup_value { - my ($group, $vmid, $name, $full) = @_; +sub read_cgroup_list($$$$) { + my ($group, $vmid, $unprivileged, $name) = @_; + + my $content = read_cgroup_value($group, $vmid, $unprivileged, $name, 1); + + return { split(/\s+/, $content) }; +} - my $path = "/sys/fs/cgroup/$group/lxc/$vmid/$name"; +sub read_cgroup_value($$$$$) { + my ($group, $vmid, $unprivileged, $name, $full) = @_; + + my $nsdir = $unprivileged ? '' : 'ns/'; + my $path = "/sys/fs/cgroup/$group/lxc/$vmid/${nsdir}$name"; return PVE::Tools::file_get_contents($path) if $full; @@ -325,7 +340,7 @@ sub parse_ipv4_cidr { sub update_lxc_config { - my ($storage_cfg, $vmid, $conf) = @_; + my ($vmid, $conf) = @_; my $dir = "/var/lib/lxc/$vmid"; @@ -342,20 +357,17 @@ sub update_lxc_config { $raw .= "lxc.arch = $conf->{arch}\n"; my $unprivileged = $conf->{unprivileged}; - my $custom_idmap = grep { $_->[0] eq 'lxc.id_map' } @{$conf->{lxc}}; + my $custom_idmap = grep { $_->[0] eq 'lxc.idmap' } @{$conf->{lxc}}; my $ostype = $conf->{ostype} || die "missing 'ostype' - internal error"; - if ($ostype =~ /^(?:debian | ubuntu | centos | fedora | opensuse | archlinux | alpine | unmanaged)$/x) { - my $inc ="/usr/share/lxc/config/$ostype.common.conf"; - $inc ="/usr/share/lxc/config/common.conf" if !-f $inc; - $raw .= "lxc.include = $inc\n"; - if ($unprivileged || $custom_idmap) { - $inc = "/usr/share/lxc/config/$ostype.userns.conf"; - $inc = "/usr/share/lxc/config/userns.conf" if !-f $inc; - $raw .= "lxc.include = $inc\n" - } - } else { - die "implement me (ostype $ostype)"; + + my $inc ="/usr/share/lxc/config/$ostype.common.conf"; + $inc ="/usr/share/lxc/config/common.conf" if !-f $inc; + $raw .= "lxc.include = $inc\n"; + if ($unprivileged || $custom_idmap) { + $inc = "/usr/share/lxc/config/$ostype.userns.conf"; + $inc = "/usr/share/lxc/config/userns.conf" if !-f $inc; + $raw .= "lxc.include = $inc\n" } # WARNING: DO NOT REMOVE this without making sure that loop device nodes @@ -367,23 +379,23 @@ sub update_lxc_config { # Should we read them from /etc/subuid? if ($unprivileged && !$custom_idmap) { - $raw .= "lxc.id_map = u 0 100000 65536\n"; - $raw .= "lxc.id_map = g 0 100000 65536\n"; + $raw .= "lxc.idmap = u 0 100000 65536\n"; + $raw .= "lxc.idmap = g 0 100000 65536\n"; } if (!PVE::LXC::Config->has_dev_console($conf)) { - $raw .= "lxc.console = none\n"; + $raw .= "lxc.console.path = none\n"; $raw .= "lxc.cgroup.devices.deny = c 5:1 rwm\n"; } my $ttycount = PVE::LXC::Config->get_tty_count($conf); - $raw .= "lxc.tty = $ttycount\n"; + $raw .= "lxc.tty.max = $ttycount\n"; # some init scripts expect a linux terminal (turnkey). $raw .= "lxc.environment = TERM=linux\n"; my $utsname = $conf->{hostname} || "CT$vmid"; - $raw .= "lxc.utsname = $utsname\n"; + $raw .= "lxc.uts.name = $utsname\n"; my $memory = $conf->{memory} || 512; my $swap = $conf->{swap} // 0; @@ -403,32 +415,45 @@ sub update_lxc_config { my $shares = $conf->{cpuunits} || 1024; $raw .= "lxc.cgroup.cpu.shares = $shares\n"; + die "missing 'rootfs' configuration\n" + if !defined($conf->{rootfs}); + my $mountpoint = PVE::LXC::Config->parse_ct_rootfs($conf->{rootfs}); - $raw .= "lxc.rootfs = $dir/rootfs\n"; + $raw .= "lxc.rootfs.path = $dir/rootfs\n"; - my $netcount = 0; - foreach my $k (keys %$conf) { + foreach my $k (sort keys %$conf) { next if $k !~ m/^net(\d+)$/; my $ind = $1; my $d = PVE::LXC::Config->parse_lxc_network($conf->{$k}); - $netcount++; - $raw .= "lxc.network.type = veth\n"; - $raw .= "lxc.network.veth.pair = veth${vmid}i${ind}\n"; - $raw .= "lxc.network.hwaddr = $d->{hwaddr}\n" if defined($d->{hwaddr}); - $raw .= "lxc.network.name = $d->{name}\n" if defined($d->{name}); - $raw .= "lxc.network.mtu = $d->{mtu}\n" if defined($d->{mtu}); + $raw .= "lxc.net.$ind.type = veth\n"; + $raw .= "lxc.net.$ind.veth.pair = veth${vmid}i${ind}\n"; + $raw .= "lxc.net.$ind.hwaddr = $d->{hwaddr}\n" if defined($d->{hwaddr}); + $raw .= "lxc.net.$ind.name = $d->{name}\n" if defined($d->{name}); + $raw .= "lxc.net.$ind.mtu = $d->{mtu}\n" if defined($d->{mtu}); } + my $had_cpuset = 0; if (my $lxcconf = $conf->{lxc}) { foreach my $entry (@$lxcconf) { my ($k, $v) = @$entry; - $netcount++ if $k eq 'lxc.network.type'; + $had_cpuset = 1 if $k eq 'lxc.cgroup.cpuset.cpus'; $raw .= "$k = $v\n"; } } - $raw .= "lxc.network.type = empty\n" if !$netcount; + my $cores = $conf->{cores}; + if (!$had_cpuset && $cores) { + my $cpuset = eval { PVE::CpuSet->new_from_cgroup('lxc', 'effective_cpus') }; + $cpuset = PVE::CpuSet->new_from_cgroup('', 'effective_cpus') if !$cpuset; + my @members = $cpuset->members(); + while (scalar(@members) > $cores) { + my $randidx = int(rand(scalar(@members))); + $cpuset->delete($members[$randidx]); + splice(@members, $randidx, 1); # keep track of the changes + } + $raw .= "lxc.cgroup.cpuset.cpus = ".$cpuset->short_string()."\n"; + } File::Path::mkpath("$dir/rootfs"); @@ -461,19 +486,24 @@ sub verify_searchdomain_list { } sub get_console_command { - my ($vmid, $conf) = @_; + my ($vmid, $conf, $noescapechar) = @_; my $cmode = PVE::LXC::Config->get_cmode($conf); + my $cmd = []; if ($cmode eq 'console') { - return ['lxc-console', '-n', $vmid, '-t', 0]; + push @$cmd, 'lxc-console', '-n', $vmid, '-t', 0; + push @$cmd, '-e', -1 if $noescapechar; } elsif ($cmode eq 'tty') { - return ['lxc-console', '-n', $vmid]; + push @$cmd, 'lxc-console', '-n', $vmid; + push @$cmd, '-e', -1 if $noescapechar; } elsif ($cmode eq 'shell') { - return ['lxc-attach', '--clear-env', '-n', $vmid]; + push @$cmd, 'lxc-attach', '--clear-env', '-n', $vmid; } else { die "internal error"; } + + return $cmd; } sub get_primary_ips { @@ -514,7 +544,7 @@ sub delete_mountpoint_volume { } sub destroy_lxc_container { - my ($storage_cfg, $vmid, $conf) = @_; + my ($storage_cfg, $vmid, $conf, $replacement_conf) = @_; PVE::LXC::Config->foreach_mountpoint($conf, sub { my ($ms, $mountpoint) = @_; @@ -524,7 +554,11 @@ sub destroy_lxc_container { rmdir "/var/lib/lxc/$vmid/rootfs"; unlink "/var/lib/lxc/$vmid/config"; rmdir "/var/lib/lxc/$vmid"; - destroy_config($vmid); + if (defined $replacement_conf) { + PVE::LXC::Config->write_config($vmid, $replacement_conf); + } else { + destroy_config($vmid); + } #my $cmd = ['lxc-destroy', '-n', $vmid ]; #PVE::Tools::run_command($cmd); @@ -586,9 +620,10 @@ sub update_net { hotplug_net($vmid, $conf, $opt, $newnet, $netid); - } elsif (&$safe_string_ne($oldnet->{bridge}, $newnet->{bridge}) || - &$safe_num_ne($oldnet->{tag}, $newnet->{tag}) || - &$safe_num_ne($oldnet->{firewall}, $newnet->{firewall})) { + } else { + if (&$safe_string_ne($oldnet->{bridge}, $newnet->{bridge}) || + &$safe_num_ne($oldnet->{tag}, $newnet->{tag}) || + &$safe_num_ne($oldnet->{firewall}, $newnet->{firewall})) { if ($oldnet->{bridge}) { PVE::Network::tap_unplug($veth); @@ -599,12 +634,19 @@ sub update_net { PVE::LXC::Config->write_config($vmid, $conf); } - PVE::Network::tap_plug($veth, $newnet->{bridge}, $newnet->{tag}, $newnet->{firewall}, $newnet->{trunks}); - foreach (qw(bridge tag firewall)) { + PVE::Network::tap_plug($veth, $newnet->{bridge}, $newnet->{tag}, $newnet->{firewall}, $newnet->{trunks}, $newnet->{rate}); + # This includes the rate: + foreach (qw(bridge tag firewall rate)) { $oldnet->{$_} = $newnet->{$_} if $newnet->{$_}; } - $conf->{$opt} = PVE::LXC::Config->print_lxc_network($oldnet); - PVE::LXC::Config->write_config($vmid, $conf); + } elsif (&$safe_string_ne($oldnet->{rate}, $newnet->{rate})) { + # Rate can be applied on its own but any change above needs to + # include the rate in tap_plug since OVS resets everything. + PVE::Network::tap_rate_limit($veth, $newnet->{rate}); + $oldnet->{rate} = $newnet->{rate} + } + $conf->{$opt} = PVE::LXC::Config->print_lxc_network($oldnet); + PVE::LXC::Config->write_config($vmid, $conf); } } else { hotplug_net($vmid, $conf, $opt, $newnet, $netid); @@ -621,7 +663,7 @@ sub hotplug_net { my $eth = $newnet->{name}; PVE::Network::veth_create($veth, $vethpeer, $newnet->{bridge}, $newnet->{hwaddr}); - PVE::Network::tap_plug($veth, $newnet->{bridge}, $newnet->{tag}, $newnet->{firewall}, $newnet->{trunks}); + PVE::Network::tap_plug($veth, $newnet->{bridge}, $newnet->{tag}, $newnet->{firewall}, $newnet->{trunks}, $newnet->{rate}); # attach peer in container my $cmd = ['lxc-device', '-n', $vmid, 'add', $vethpeer, "$eth" ]; @@ -846,14 +888,15 @@ sub check_ct_modify_config_perm { my $check = sub { my ($opt, $delete) = @_; - if ($opt eq 'cpus' || $opt eq 'cpuunits' || $opt eq 'cpulimit') { + if ($opt eq 'cores' || $opt eq 'cpuunits' || $opt eq 'cpulimit') { $rpcenv->check_vm_perm($authuser, $vmid, $pool, ['VM.Config.CPU']); } elsif ($opt eq 'rootfs' || $opt =~ /^mp\d+$/) { $rpcenv->check_vm_perm($authuser, $vmid, $pool, ['VM.Config.Disk']); return if $delete; my $data = $opt eq 'rootfs' ? PVE::LXC::Config->parse_ct_rootfs($newconf->{$opt}) : PVE::LXC::Config->parse_ct_mountpoint($newconf->{$opt}); - raise_perm_exc("mountpoint type $data->{type}") if $data->{type} ne 'volume'; + raise_perm_exc("mount point type $data->{type} is only allowed for root\@pam") + if $data->{type} ne 'volume'; } elsif ($opt eq 'memory' || $opt eq 'swap') { $rpcenv->check_vm_perm($authuser, $vmid, $pool, ['VM.Config.Memory']); } elsif ($opt =~ m/^net\d+$/ || $opt eq 'nameserver' || @@ -907,7 +950,7 @@ sub umount_all { } sub mount_all { - my ($vmid, $storage_cfg, $conf) = @_; + my ($vmid, $storage_cfg, $conf, $ignore_ro) = @_; my $rootdir = "/var/lib/lxc/$vmid/rootfs"; File::Path::make_path($rootdir); @@ -919,6 +962,8 @@ sub mount_all { PVE::LXC::Config->foreach_mountpoint($conf, sub { my ($ms, $mountpoint) = @_; + $mountpoint->{ro} = 0 if $ignore_ro; + mountpoint_mount($mountpoint, $rootdir, $storage_cfg); }); }; @@ -938,15 +983,6 @@ sub mountpoint_mount_path { return mountpoint_mount($mountpoint, undef, $storage_cfg, $snapname); } -my $check_mount_path = sub { - my ($path) = @_; - $path = File::Spec->canonpath($path); - my $real = Cwd::realpath($path); - if ($real ne $path) { - die "mount path modified by symlink: $path != $real"; - } -}; - sub query_loopdev { my ($path) = @_; my $found; @@ -990,7 +1026,105 @@ sub run_with_loopdev { return $device; } -sub bindmount { +# In scalar mode: returns a file handle to the deepest directory node. +# In list context: returns a list of: +# * the deepest directory node +# * the 2nd deepest directory (parent of the above) +# * directory name of the last directory +# So that the path $2/$3 should lead to $1 afterwards. +sub walk_tree_nofollow($$$) { + my ($start, $subdir, $mkdir) = @_; + + # splitdir() returns '' for empty components including the leading / + my @comps = grep { length($_)>0 } File::Spec->splitdir($subdir); + + sysopen(my $fd, $start, O_PATH | O_DIRECTORY) + or die "failed to open start directory $start: $!\n"; + + my $dir = $start; + my $last_component = undef; + my $second = $fd; + foreach my $component (@comps) { + $dir .= "/$component"; + my $next = PVE::Tools::openat(fileno($fd), $component, O_NOFOLLOW | O_DIRECTORY); + + if (!$next) { + # failed, check for symlinks and try to create the path + die "symlink encountered at: $dir\n" if $! == ELOOP || $! == ENOTDIR; + die "cannot open directory $dir: $!\n" if !$mkdir; + + # We don't check for errors on mkdirat() here and just try to + # openat() again, since at least one error (EEXIST) is an + # expected possibility if multiple containers start + # simultaneously. If someone else injects a symlink now then + # the subsequent openat() will fail due to O_NOFOLLOW anyway. + PVE::Tools::mkdirat(fileno($fd), $component, 0755); + + $next = PVE::Tools::openat(fileno($fd), $component, O_NOFOLLOW | O_DIRECTORY); + die "failed to create path: $dir: $!\n" if !$next; + } + + close $second if defined($last_component); + $last_component = $component; + $second = $fd; + $fd = $next; + } + + return ($fd, defined($last_component) && $second, $last_component) if wantarray; + close $second if defined($last_component); + return $fd; +} + +# To guard against symlink attack races against other currently running +# containers with shared recursive bind mount hierarchies we prepare a +# directory handle for the directory we're mounting over to verify the +# mountpoint afterwards. +sub __bindmount_prepare { + my ($hostroot, $dir) = @_; + my $srcdh = walk_tree_nofollow($hostroot, $dir, 0); + return $srcdh; +} + +# Assuming we mount to rootfs/a/b/c, verify with the directory handle to 'b' +# ($parentfd) that 'b/c' (openat($parentfd, 'c')) really leads to the directory +# we intended to bind mount. +sub __bindmount_verify { + my ($srcdh, $parentfd, $last_dir, $ro) = @_; + my $destdh; + if ($parentfd) { + # Open the mount point path coming from the parent directory since the + # filehandle we would have gotten as first result of walk_tree_nofollow + # earlier is still a handle to the underlying directory instead of the + # mounted path. + $destdh = PVE::Tools::openat(fileno($parentfd), $last_dir, PVE::Tools::O_PATH | O_NOFOLLOW | O_DIRECTORY); + die "failed to open mount point: $!\n" if !$destdh; + if ($ro) { + my $dot = '.'; + # no separate function because 99% of the time it's the wrong thing to use. + if (syscall(PVE::Syscall::faccessat, fileno($destdh), $dot, &POSIX::W_OK, 0) != -1) { + die "failed to mark bind mount read only\n"; + } + die "read-only check failed: $!\n" if $! != EROFS; + } + } else { + # For the rootfs we don't have a parentfd so we open the path directly. + # Note that this means bindmounting any prefix of the host's + # /var/lib/lxc/$vmid path into another container is considered a grave + # security error. + sysopen $destdh, $last_dir, O_PATH | O_DIRECTORY; + die "failed to open mount point: $!\n" if !$destdh; + } + + my ($srcdev, $srcinode) = stat($srcdh); + my ($dstdev, $dstinode) = stat($destdh); + close $srcdh; + close $destdh; + + return ($srcdev == $dstdev && $srcinode == $dstinode); +} + +# Perform the actual bind mounting: +sub __bindmount_do { my ($dir, $dest, $ro, @extra_opts) = @_; PVE::Tools::run_command(['mount', '-o', 'bind', @extra_opts, $dir, $dest]); if ($ro) { @@ -1004,6 +1138,31 @@ sub bindmount { } } +sub bindmount { + my ($dir, $parentfd, $last_dir, $dest, $ro, @extra_opts) = @_; + + my $srcdh = __bindmount_prepare('/', $dir); + + __bindmount_do($dir, $dest, $ro, @extra_opts); + + if (!__bindmount_verify($srcdh, $parentfd, $last_dir, $ro)) { + PVE::Tools::run_command(['umount', $dest]); + die "detected mount path change at: $dir\n"; + } +} + +# Cleanup $rootdir a bit (double and trailing slashes), build the mount path +# from $rootdir and $mount and walk the path from $rootdir to the final +# directory to check for symlinks. +sub __mount_prepare_rootdir { + my ($rootdir, $mount) = @_; + $rootdir =~ s!/+!/!g; + $rootdir =~ s!/+$!!; + my $mount_path = "$rootdir/$mount"; + my ($mpfd, $parentfd, $last_dir) = walk_tree_nofollow($rootdir, $mount, 1); + return ($rootdir, $mount_path, $mpfd, $parentfd, $last_dir); +} + # use $rootdir = undef to just return the corresponding mount path sub mountpoint_mount { my ($mountpoint, $rootdir, $storage_cfg, $snapname) = @_; @@ -1016,14 +1175,14 @@ sub mountpoint_mount { return if !$volid || !$mount; + $mount =~ s!/+!/!g; + my $mount_path; + my ($mpfd, $parentfd, $last_dir); if (defined($rootdir)) { - $rootdir =~ s!/+$!!; - $mount_path = "$rootdir/$mount"; - $mount_path =~ s!/+!/!g; - &$check_mount_path($mount_path); - File::Path::mkpath($mount_path); + ($rootdir, $mount_path, $mpfd, $parentfd, $last_dir) = + __mount_prepare_rootdir($rootdir, $mount); } my ($storage, $volname) = PVE::Storage::parse_volume_id($volid, 1); @@ -1031,16 +1190,23 @@ sub mountpoint_mount { die "unknown snapshot path for '$volid'" if !$storage && defined($snapname); my $optstring = ''; - if (defined($mountpoint->{acl})) { - $optstring .= ($mountpoint->{acl} ? 'acl' : 'noacl'); + my $acl = $mountpoint->{acl}; + if (defined($acl)) { + $optstring .= ($acl ? 'acl' : 'noacl'); } my $readonly = $mountpoint->{ro}; - my @extra_opts = ('-o', $optstring); + my @extra_opts = ('-o', $optstring) if $optstring; if ($storage) { my $scfg = PVE::Storage::storage_config($storage_cfg, $storage); + + # early sanity checks: + # we otherwise call realpath on the rbd url + die "containers on rbd storage without krbd are not supported\n" + if $scfg->{type} eq 'rbd' && !$scfg->{krbd}; + my $path = PVE::Storage::path($storage_cfg, $volid, $snapname); my ($vtype, undef, undef, undef, undef, $isBase, $format) = @@ -1059,12 +1225,25 @@ sub mountpoint_mount { die "cannot mount subvol snapshots for storage type '$scfg->{type}'\n"; } } else { - bindmount($path, $mount_path, $readonly, @extra_opts); + if (defined($acl) && $scfg->{type} eq 'zfspool') { + my $acltype = ($acl ? 'acltype=posixacl' : 'acltype=noacl'); + my (undef, $name) = PVE::Storage::parse_volname($storage_cfg, $volid); + $name .= "\@$snapname" if defined($snapname); + PVE::Tools::run_command(['zfs', 'set', $acltype, "$scfg->{pool}/$name"]); + } + bindmount($path, $parentfd, $last_dir//$rootdir, $mount_path, $readonly, @extra_opts); warn "cannot enable quota control for bind mounted subvolumes\n" if $quota; } } - return wantarray ? ($path, 0, $mounted_dev) : $path; + return wantarray ? ($path, 0, undef) : $path; } elsif ($format eq 'raw' || $format eq 'iso') { + # NOTE: 'mount' performs canonicalization without the '-c' switch, which for + # device-mapper devices is special-cased to use the /dev/mapper symlinks. + # Our autodev hook expects the /dev/dm-* device currently + # and will create the /dev/mapper symlink accordingly + $path = Cwd::realpath($path); + die "failed to get device path\n" if !$path; + ($path) = ($path =~ /^(.*)$/s); #untaint my $domount = sub { my ($path) = @_; if ($mount_path) { @@ -1097,13 +1276,15 @@ sub mountpoint_mount { die "unsupported image format '$format'\n"; } } elsif ($type eq 'device') { - push @extra_opts, '-o', 'ro' if $readonly; + push @extra_opts, '-o', 'ro' if $readonly; + push @extra_opts, '-o', 'usrjquota=aquota.user,grpjquota=aquota.group,jqfmt=vfsv0' if $quota; + # See the NOTE above about devicemapper canonicalization + my ($devpath) = (Cwd::realpath($volid) =~ /^(.*)$/s); # realpath() taints PVE::Tools::run_command(['mount', @extra_opts, $volid, $mount_path]) if $mount_path; - return wantarray ? ($volid, 0, $volid) : $volid; + return wantarray ? ($volid, 0, $devpath) : $volid; } elsif ($type eq 'bind') { die "directory '$volid' does not exist\n" if ! -d $volid; - &$check_mount_path($volid); - bindmount($volid, $mount_path, $readonly, @extra_opts) if $mount_path; + bindmount($volid, $parentfd, $last_dir//$rootdir, $mount_path, $readonly, @extra_opts) if $mount_path; warn "cannot enable quota control for bind mounts\n" if $quota; return wantarray ? ($volid, 0, undef) : $volid; } @@ -1153,6 +1334,7 @@ sub destroy_disks { } } +our $NEW_DISK_RE = qr/^([^:\s]+):(\d+(\.\d+)?)$/; sub create_disks { my ($storecfg, $vmid, $settings, $conf) = @_; @@ -1170,7 +1352,7 @@ sub create_disks { my ($storage, $volname) = PVE::Storage::parse_volume_id($volid, 1); - if ($storage && ($volid =~ m/^([^:\s]+):(\d+(\.\d+)?)$/)) { + if ($storage && ($volid =~ $NEW_DISK_RE)) { my ($storeid, $size_gb) = ($1, $2); my $size_kb = int(${size_gb}*1024) * 1024; @@ -1301,7 +1483,8 @@ sub parse_id_maps { my $lxc = $conf->{lxc}; foreach my $entry (@$lxc) { my ($key, $value) = @$entry; - next if $key ne 'lxc.id_map'; + # FIXME: remove the 'id_map' variant when lxc-3.0 arrives + next if $key ne 'lxc.idmap' && $key ne 'lxc.id_map'; if ($value =~ /^([ug])\s+(\d+)\s+(\d+)\s+(\d+)\s*$/) { my ($type, $ct, $host, $length) = ($1, $2, $3, $4); push @$id_map, [$type, $ct, $host, $length]; @@ -1310,7 +1493,7 @@ sub parse_id_maps { $rootgid = $host if $type eq 'g'; } } else { - die "failed to parse id_map: $value\n"; + die "failed to parse idmap: $value\n"; } } @@ -1332,5 +1515,51 @@ sub userns_command { return []; } +# Helper to stop a container completely and make sure it has stopped completely. +# This is necessary because we want the post-stop hook to have completed its +# unmount-all step, but post-stop happens after lxc puts the container into the +# STOPPED state. +sub vm_stop { + my ($vmid, $kill, $shutdown_timeout, $exit_timeout) = @_; + + # Open the container's command socket. + my $path = "\0/var/lib/lxc/$vmid/command"; + my $sock = IO::Socket::UNIX->new( + Type => SOCK_STREAM(), + Peer => $path, + ); + if (!$sock) { + return if $! == ECONNREFUSED; # The container is not running + die "failed to open container ${vmid}'s command socket: $!\n"; + } + + # Stop the container: + + my $cmd = ['lxc-stop', '-n', $vmid]; + + if ($kill) { + push @$cmd, '--kill'; # doesn't allow timeouts + } elsif (defined($shutdown_timeout)) { + push @$cmd, '--timeout', $shutdown_timeout; + # Give run_command 5 extra seconds + $shutdown_timeout += 5; + } + + eval { PVE::Tools::run_command($cmd, timeout => $shutdown_timeout) }; + if (my $err = $@) { + warn $@ if $@; + } + + my $result = 1; + my $wait = sub { $result = <$sock>; }; + if (defined($exit_timeout)) { + PVE::Tools::run_with_timeout($exit_timeout, $wait); + } else { + $wait->(); + } + + return if !defined $result; # monitor is gone and the ct has stopped. + die "container did not stop\n"; +} 1;