X-Git-Url: https://git.proxmox.com/?a=blobdiff_plain;f=src%2FPVE%2FLXC.pm;h=32b0318b18e40aae8f3415271d610802c25df7de;hb=0389da0d5ef1fcafc8916e7046f69def280270bf;hp=b2ffa160416f9fc78311addbee3f08b615ccb93b;hpb=1b4cf758969b726ca2b376ffd73b22d78cccc4ba;p=pve-container.git diff --git a/src/PVE/LXC.pm b/src/PVE/LXC.pm index b2ffa16..32b0318 100644 --- a/src/PVE/LXC.pm +++ b/src/PVE/LXC.pm @@ -10,16 +10,19 @@ use Socket; use File::Path; use File::Spec; use Cwd qw(); -use Fcntl qw(O_RDONLY); +use Fcntl qw(O_RDONLY O_NOFOLLOW O_DIRECTORY); +use Errno qw(ELOOP ENOTDIR EROFS); use PVE::Exception qw(raise_perm_exc); use PVE::Storage; use PVE::SafeSyslog; use PVE::INotify; -use PVE::Tools qw($IPV6RE $IPV4RE dir_glob_foreach lock_file lock_file_full); +use PVE::Tools qw($IPV6RE $IPV4RE dir_glob_foreach lock_file lock_file_full O_PATH); +use PVE::CpuSet; use PVE::Network; use PVE::AccessControl; use PVE::ProcFSTools; +use PVE::Syscall; use PVE::LXC::Config; use Time::HiRes qw (gettimeofday); @@ -70,7 +73,7 @@ sub list_active_containers { return $res if !$fh; while (defined(my $line = <$fh>)) { - if ($line =~ m/^[a-f0-9]+:\s\S+\s\S+\s\S+\s\S+\s\S+\s\d+\s(\S+)$/) { + if ($line =~ m/^[a-f0-9]+:\s+\S+\s+\S+\s+\S+\s+\S+\s+\S+\s+\d+\s+(\S+)$/) { my $path = $1; if ($path =~ m!^@/var/lib/lxc/(\d+)/command$!) { $res->{$1} = 1; @@ -146,7 +149,8 @@ sub vmstatus { $d->{name} = $conf->{'hostname'} || "CT$vmid"; $d->{name} =~ s/[\s]//g; - $d->{cpus} = $conf->{cpulimit} || $cpucount; + $d->{cpus} = $conf->{cores} || $conf->{cpulimit}; + $d->{cpus} = $cpucount if !$d->{cpus}; $d->{lock} = $conf->{lock} || ''; @@ -159,7 +163,7 @@ sub vmstatus { # use 4GB by default ?? 
if (my $rootfs = $conf->{rootfs}) { my $rootinfo = PVE::LXC::Config->parse_ct_rootfs($rootfs); - $d->{maxdisk} = int(($rootinfo->{size} || 4)*1024*1024)*1024; + $d->{maxdisk} = $rootinfo->{size} || (4*1024*1024*1024); } else { $d->{maxdisk} = 4*1024*1024*1024; } @@ -191,8 +195,11 @@ sub vmstatus { my $ctime = (stat("/proc/$pid"))[10]; # 10 = ctime $d->{uptime} = time - $ctime; # the method lxcfs uses - $d->{mem} = read_cgroup_value('memory', $vmid, 'memory.usage_in_bytes'); - $d->{swap} = read_cgroup_value('memory', $vmid, 'memory.memsw.usage_in_bytes') - $d->{mem}; + my $memory_stat = read_cgroup_list('memory', $vmid, 'memory.stat'); + my $mem_usage_in_bytes = read_cgroup_value('memory', $vmid, 'memory.usage_in_bytes'); + + $d->{mem} = $mem_usage_in_bytes - $memory_stat->{total_cache}; + $d->{swap} = read_cgroup_value('memory', $vmid, 'memory.memsw.usage_in_bytes') - $mem_usage_in_bytes; my $blkio_bytes = read_cgroup_value('blkio', $vmid, 'blkio.throttle.io_service_bytes', 1); my @bytes = split(/\n/, $blkio_bytes); @@ -250,6 +257,14 @@ sub vmstatus { return $list; } +sub read_cgroup_list { + my ($group, $vmid, $name) = @_; + + my $content = read_cgroup_value($group, $vmid, $name, 1); + + return { split(/\s+/, $content) }; +} + sub read_cgroup_value { my ($group, $vmid, $name, $full) = @_; @@ -325,7 +340,7 @@ sub parse_ipv4_cidr { sub update_lxc_config { - my ($storage_cfg, $vmid, $conf) = @_; + my ($vmid, $conf) = @_; my $dir = "/var/lib/lxc/$vmid"; @@ -345,7 +360,7 @@ sub update_lxc_config { my $custom_idmap = grep { $_->[0] eq 'lxc.id_map' } @{$conf->{lxc}}; my $ostype = $conf->{ostype} || die "missing 'ostype' - internal error"; - if ($ostype =~ /^(?:debian | ubuntu | centos | fedora | opensuse | archlinux | alpine | unmanaged)$/x) { + if ($ostype =~ /^(?:debian | ubuntu | centos | fedora | opensuse | archlinux | alpine | gentoo | unmanaged)$/x) { my $inc ="/usr/share/lxc/config/$ostype.common.conf"; $inc ="/usr/share/lxc/config/common.conf" if !-f $inc; $raw 
.= "lxc.include = $inc\n"; @@ -403,12 +418,15 @@ sub update_lxc_config { my $shares = $conf->{cpuunits} || 1024; $raw .= "lxc.cgroup.cpu.shares = $shares\n"; + die "missing 'rootfs' configuration\n" + if !defined($conf->{rootfs}); + my $mountpoint = PVE::LXC::Config->parse_ct_rootfs($conf->{rootfs}); $raw .= "lxc.rootfs = $dir/rootfs\n"; my $netcount = 0; - foreach my $k (keys %$conf) { + foreach my $k (sort keys %$conf) { next if $k !~ m/^net(\d+)$/; my $ind = $1; my $d = PVE::LXC::Config->parse_lxc_network($conf->{$k}); @@ -420,15 +438,30 @@ sub update_lxc_config { $raw .= "lxc.network.mtu = $d->{mtu}\n" if defined($d->{mtu}); } + my $had_cpuset = 0; if (my $lxcconf = $conf->{lxc}) { foreach my $entry (@$lxcconf) { my ($k, $v) = @$entry; $netcount++ if $k eq 'lxc.network.type'; + $had_cpuset = 1 if $k eq 'lxc.cgroup.cpuset.cpus'; $raw .= "$k = $v\n"; } } $raw .= "lxc.network.type = empty\n" if !$netcount; + + my $cores = $conf->{cores}; + if (!$had_cpuset && $cores) { + my $cpuset = eval { PVE::CpuSet->new_from_cgroup('lxc', 'effective_cpus') }; + $cpuset = PVE::CpuSet->new_from_cgroup('', 'effective_cpus') if !$cpuset; + my @members = $cpuset->members(); + while (scalar(@members) > $cores) { + my $randidx = int(rand(scalar(@members))); + $cpuset->delete($members[$randidx]); + splice(@members, $randidx, 1); # keep track of the changes + } + $raw .= "lxc.cgroup.cpuset.cpus = ".$cpuset->short_string()."\n"; + } File::Path::mkpath("$dir/rootfs"); @@ -514,7 +547,7 @@ sub delete_mountpoint_volume { } sub destroy_lxc_container { - my ($storage_cfg, $vmid, $conf) = @_; + my ($storage_cfg, $vmid, $conf, $replacement_conf) = @_; PVE::LXC::Config->foreach_mountpoint($conf, sub { my ($ms, $mountpoint) = @_; @@ -524,7 +557,11 @@ sub destroy_lxc_container { rmdir "/var/lib/lxc/$vmid/rootfs"; unlink "/var/lib/lxc/$vmid/config"; rmdir "/var/lib/lxc/$vmid"; - destroy_config($vmid); + if (defined $replacement_conf) { + PVE::LXC::Config->write_config($vmid, $replacement_conf); + } 
else { + destroy_config($vmid); + } #my $cmd = ['lxc-destroy', '-n', $vmid ]; #PVE::Tools::run_command($cmd); @@ -586,9 +623,10 @@ sub update_net { hotplug_net($vmid, $conf, $opt, $newnet, $netid); - } elsif (&$safe_string_ne($oldnet->{bridge}, $newnet->{bridge}) || - &$safe_num_ne($oldnet->{tag}, $newnet->{tag}) || - &$safe_num_ne($oldnet->{firewall}, $newnet->{firewall})) { + } else { + if (&$safe_string_ne($oldnet->{bridge}, $newnet->{bridge}) || + &$safe_num_ne($oldnet->{tag}, $newnet->{tag}) || + &$safe_num_ne($oldnet->{firewall}, $newnet->{firewall})) { if ($oldnet->{bridge}) { PVE::Network::tap_unplug($veth); @@ -599,12 +637,19 @@ sub update_net { PVE::LXC::Config->write_config($vmid, $conf); } - PVE::Network::tap_plug($veth, $newnet->{bridge}, $newnet->{tag}, $newnet->{firewall}, $newnet->{trunks}); - foreach (qw(bridge tag firewall)) { + PVE::Network::tap_plug($veth, $newnet->{bridge}, $newnet->{tag}, $newnet->{firewall}, $newnet->{trunks}, $newnet->{rate}); + # This includes the rate: + foreach (qw(bridge tag firewall rate)) { $oldnet->{$_} = $newnet->{$_} if $newnet->{$_}; } - $conf->{$opt} = PVE::LXC::Config->print_lxc_network($oldnet); - PVE::LXC::Config->write_config($vmid, $conf); + } elsif (&$safe_string_ne($oldnet->{rate}, $newnet->{rate})) { + # Rate can be applied on its own but any change above needs to + # include the rate in tap_plug since OVS resets everything. 
+ PVE::Network::tap_rate_limit($veth, $newnet->{rate}); + $oldnet->{rate} = $newnet->{rate} + } + $conf->{$opt} = PVE::LXC::Config->print_lxc_network($oldnet); + PVE::LXC::Config->write_config($vmid, $conf); } } else { hotplug_net($vmid, $conf, $opt, $newnet, $netid); @@ -621,7 +666,7 @@ sub hotplug_net { my $eth = $newnet->{name}; PVE::Network::veth_create($veth, $vethpeer, $newnet->{bridge}, $newnet->{hwaddr}); - PVE::Network::tap_plug($veth, $newnet->{bridge}, $newnet->{tag}, $newnet->{firewall}, $newnet->{trunks}); + PVE::Network::tap_plug($veth, $newnet->{bridge}, $newnet->{tag}, $newnet->{firewall}, $newnet->{trunks}, $newnet->{rate}); # attach peer in container my $cmd = ['lxc-device', '-n', $vmid, 'add', $vethpeer, "$eth" ]; @@ -846,14 +891,15 @@ sub check_ct_modify_config_perm { my $check = sub { my ($opt, $delete) = @_; - if ($opt eq 'cpus' || $opt eq 'cpuunits' || $opt eq 'cpulimit') { + if ($opt eq 'cores' || $opt eq 'cpuunits' || $opt eq 'cpulimit') { $rpcenv->check_vm_perm($authuser, $vmid, $pool, ['VM.Config.CPU']); } elsif ($opt eq 'rootfs' || $opt =~ /^mp\d+$/) { $rpcenv->check_vm_perm($authuser, $vmid, $pool, ['VM.Config.Disk']); return if $delete; my $data = $opt eq 'rootfs' ? 
PVE::LXC::Config->parse_ct_rootfs($newconf->{$opt}) : PVE::LXC::Config->parse_ct_mountpoint($newconf->{$opt}); - raise_perm_exc("mountpoint type $data->{type}") if $data->{type} ne 'volume'; + raise_perm_exc("mount point type $data->{type} is only allowed for root\@pam") + if $data->{type} ne 'volume'; } elsif ($opt eq 'memory' || $opt eq 'swap') { $rpcenv->check_vm_perm($authuser, $vmid, $pool, ['VM.Config.Memory']); } elsif ($opt =~ m/^net\d+$/ || $opt eq 'nameserver' || @@ -907,7 +953,7 @@ sub umount_all { } sub mount_all { - my ($vmid, $storage_cfg, $conf) = @_; + my ($vmid, $storage_cfg, $conf, $ignore_ro) = @_; my $rootdir = "/var/lib/lxc/$vmid/rootfs"; File::Path::make_path($rootdir); @@ -919,6 +965,8 @@ sub mount_all { PVE::LXC::Config->foreach_mountpoint($conf, sub { my ($ms, $mountpoint) = @_; + $mountpoint->{ro} = 0 if $ignore_ro; + mountpoint_mount($mountpoint, $rootdir, $storage_cfg); }); }; @@ -938,15 +986,6 @@ sub mountpoint_mount_path { return mountpoint_mount($mountpoint, undef, $storage_cfg, $snapname); } -my $check_mount_path = sub { - my ($path) = @_; - $path = File::Spec->canonpath($path); - my $real = Cwd::realpath($path); - if ($real ne $path) { - die "mount path modified by symlink: $path != $real"; - } -}; - sub query_loopdev { my ($path) = @_; my $found; @@ -990,7 +1029,105 @@ sub run_with_loopdev { return $device; } -sub bindmount { +# In scalar mode: returns a file handle to the deepest directory node. +# In list context: returns a list of: +# * the deepest directory node +# * the 2nd deepest directory (parent of the above) +# * directory name of the last directory +# So that the path $2/$3 should lead to $1 afterwards. 
+sub walk_tree_nofollow($$$) {
+    # $start  - directory the walk begins at; it is opened as given, i.e.
+    #           without O_NOFOLLOW, so only the components below it are
+    #           protected against symlinks.
+    # $subdir - path below $start, descended one component at a time.
+    # $mkdir  - when true, a component that cannot be opened is created
+    #           (mode 0755) and opened again instead of aborting the walk.
+    # Dies with a message naming the offending path on any failure.
+    my ($start, $subdir, $mkdir) = @_;
+
+    # splitdir() returns '' for empty components including the leading /
+    my @comps = grep { length($_)>0 } File::Spec->splitdir($subdir);
+
+    sysopen(my $fd, $start, O_PATH | O_DIRECTORY)
+        or die "failed to open start directory $start: $!\n";
+
+    # $dir is only the human readable path used in error messages; the walk
+    # itself is done purely through the directory handles.
+    my $dir = $start;
+    my $last_component = undef;
+    # $second trails one step behind $fd. Until the first component has been
+    # walked it is the very same handle as $fd, which is why it is only
+    # closed or returned once $last_component is defined.
+    my $second = $fd;
+    foreach my $component (@comps) {
+        $dir .= "/$component";
+        # Open relative to the previous handle so that no part of the path
+        # is resolved again from the top.
+        # NOTE(review): PVE::Tools::openat() is presumed to return a false
+        # value with $! set on failure - confirm in PVE::Tools.
+        my $next = PVE::Tools::openat(fileno($fd), $component, O_NOFOLLOW | O_DIRECTORY);
+
+        if (!$next) {
+            # failed, check for symlinks and try to create the path
+            # NOTE(review): ENOTDIR is reported as a symlink as well; it
+            # presumably also covers other non-directory entries - confirm
+            # against openat(2).
+            die "symlink encountered at: $dir\n" if $! == ELOOP || $! == ENOTDIR;
+            die "cannot open directory $dir: $!\n" if !$mkdir;
+
+            # We don't check for errors on mkdirat() here and just try to
+            # openat() again, since at least one error (EEXIST) is an
+            # expected possibility if multiple containers start
+            # simultaneously. If someone else injects a symlink now then
+            # the subsequent openat() will fail due to O_NOFOLLOW anyway.
+            PVE::Tools::mkdirat(fileno($fd), $component, 0755);
+
+            $next = PVE::Tools::openat(fileno($fd), $component, O_NOFOLLOW | O_DIRECTORY);
+            die "failed to create path: $dir: $!\n" if !$next;
+        }
+
+        # Rotate the handles: the old parent is no longer needed, the
+        # current directory becomes the parent and $next the current one.
+        close $second if defined($last_component);
+        $last_component = $component;
+        $second = $fd;
+        $fd = $next;
+    }
+
+    # List context: (deepest handle, parent handle, last component name).
+    # With no components walked the second element is a false value and the
+    # third is undef.
+    return ($fd, defined($last_component) && $second, $last_component) if wantarray;
+    # Scalar context: the parent handle is not handed out, so close it here
+    # (only if it is a distinct handle, see $second above).
+    close $second if defined($last_component);
+    return $fd;
+}
+
+# To guard against symlink attack races against other currently running
+# containers with shared recursive bind mount hierarchies we prepare a
+# directory handle for the directory we're mounting over to verify the
+# mountpoint afterwards.
+sub __bindmount_prepare { + my ($hostroot, $dir) = @_; + my $srcdh = walk_tree_nofollow($hostroot, $dir, 0); + return $srcdh; +} + +# Assuming we mount to rootfs/a/b/c, verify with the directory handle to 'b' +# ($parentfd) that 'b/c' (openat($parentfd, 'c')) really leads to the directory +# we intended to bind mount. +sub __bindmount_verify { + my ($srcdh, $parentfd, $last_dir, $ro) = @_; + my $destdh; + if ($parentfd) { + # Open the mount point path coming from the parent directory since the + # filehandle we would have gotten as first result of walk_tree_nofollow + # earlier is still a handle to the underlying directory instead of the + # mounted path. + $destdh = PVE::Tools::openat(fileno($parentfd), $last_dir, PVE::Tools::O_PATH | O_NOFOLLOW | O_DIRECTORY); + die "failed to open mount point: $!\n" if !$destdh; + if ($ro) { + my $dot = '.'; + # no separate function because 99% of the time it's the wrong thing to use. + if (syscall(PVE::Syscall::faccessat, fileno($destdh), $dot, &POSIX::W_OK, 0) != -1) { + die "failed to mark bind mount read only\n"; + } + die "read-only check failed: $!\n" if $! != EROFS; + } + } else { + # For the rootfs we don't have a parentfd so we open the path directly. + # Note that this means bindmounting any prefix of the host's + # /var/lib/lxc/$vmid path into another container is considered a grave + # security error. 
+ sysopen $destdh, $last_dir, O_PATH | O_DIRECTORY; + die "failed to open mount point: $!\n" if !$destdh; + } + + my ($srcdev, $srcinode) = stat($srcdh); + my ($dstdev, $dstinode) = stat($destdh); + close $srcdh; + close $destdh; + + return ($srcdev == $dstdev && $srcinode == $dstinode); +} + +# Perform the actual bind mounting: +sub __bindmount_do { my ($dir, $dest, $ro, @extra_opts) = @_; PVE::Tools::run_command(['mount', '-o', 'bind', @extra_opts, $dir, $dest]); if ($ro) { @@ -1004,6 +1141,31 @@ sub bindmount { } } +sub bindmount { + my ($dir, $parentfd, $last_dir, $dest, $ro, @extra_opts) = @_; + + my $srcdh = __bindmount_prepare('/', $dir); + + __bindmount_do($dir, $dest, $ro, @extra_opts); + + if (!__bindmount_verify($srcdh, $parentfd, $last_dir, $ro)) { + PVE::Tools::run_command(['umount', $dest]); + die "detected mount path change at: $dir\n"; + } +} + +# Cleanup $rootdir a bit (double and trailing slashes), build the mount path +# from $rootdir and $mount and walk the path from $rootdir to the final +# directory to check for symlinks. 
+sub __mount_prepare_rootdir { + my ($rootdir, $mount) = @_; + $rootdir =~ s!/+!/!g; + $rootdir =~ s!/+$!!; + my $mount_path = "$rootdir/$mount"; + my ($mpfd, $parentfd, $last_dir) = walk_tree_nofollow($rootdir, $mount, 1); + return ($rootdir, $mount_path, $mpfd, $parentfd, $last_dir); +} + # use $rootdir = undef to just return the corresponding mount path sub mountpoint_mount { my ($mountpoint, $rootdir, $storage_cfg, $snapname) = @_; @@ -1016,14 +1178,14 @@ sub mountpoint_mount { return if !$volid || !$mount; + $mount =~ s!/+!/!g; + my $mount_path; + my ($mpfd, $parentfd, $last_dir); if (defined($rootdir)) { - $rootdir =~ s!/+$!!; - $mount_path = "$rootdir/$mount"; - $mount_path =~ s!/+!/!g; - &$check_mount_path($mount_path); - File::Path::mkpath($mount_path); + ($rootdir, $mount_path, $mpfd, $parentfd, $last_dir) = + __mount_prepare_rootdir($rootdir, $mount); } my ($storage, $volname) = PVE::Storage::parse_volume_id($volid, 1); @@ -1031,16 +1193,23 @@ sub mountpoint_mount { die "unknown snapshot path for '$volid'" if !$storage && defined($snapname); my $optstring = ''; - if (defined($mountpoint->{acl})) { - $optstring .= ($mountpoint->{acl} ? 'acl' : 'noacl'); + my $acl = $mountpoint->{acl}; + if (defined($acl)) { + $optstring .= ($acl ? 
'acl' : 'noacl'); } my $readonly = $mountpoint->{ro}; - my @extra_opts = ('-o', $optstring); + my @extra_opts = ('-o', $optstring) if $optstring; if ($storage) { my $scfg = PVE::Storage::storage_config($storage_cfg, $storage); + + # early sanity checks: + # we otherwise call realpath on the rbd url + die "containers on rbd storage without krbd are not supported\n" + if $scfg->{type} eq 'rbd' && !$scfg->{krbd}; + my $path = PVE::Storage::path($storage_cfg, $volid, $snapname); my ($vtype, undef, undef, undef, undef, $isBase, $format) = @@ -1059,12 +1228,25 @@ sub mountpoint_mount { die "cannot mount subvol snapshots for storage type '$scfg->{type}'\n"; } } else { - bindmount($path, $mount_path, $readonly, @extra_opts); + if (defined($acl) && $scfg->{type} eq 'zfspool') { + my $acltype = ($acl ? 'acltype=posixacl' : 'acltype=noacl'); + my (undef, $name) = PVE::Storage::parse_volname($storage_cfg, $volid); + $name .= "\@$snapname" if defined($snapname); + PVE::Tools::run_command(['zfs', 'set', $acltype, "$scfg->{pool}/$name"]); + } + bindmount($path, $parentfd, $last_dir//$rootdir, $mount_path, $readonly, @extra_opts); warn "cannot enable quota control for bind mounted subvolumes\n" if $quota; } } - return wantarray ? ($path, 0, $mounted_dev) : $path; + return wantarray ? ($path, 0, undef) : $path; } elsif ($format eq 'raw' || $format eq 'iso') { + # NOTE: 'mount' performs canonicalization without the '-c' switch, which for + # device-mapper devices is special-cased to use the /dev/mapper symlinks. 
+ # Our autodev hook expects the /dev/dm-* device currently + # and will create the /dev/mapper symlink accordingly + $path = Cwd::realpath($path); + die "failed to get device path\n" if !$path; + ($path) = ($path =~ /^(.*)$/s); #untaint my $domount = sub { my ($path) = @_; if ($mount_path) { @@ -1097,13 +1279,15 @@ sub mountpoint_mount { die "unsupported image format '$format'\n"; } } elsif ($type eq 'device') { - push @extra_opts, '-o', 'ro' if $readonly; + push @extra_opts, '-o', 'ro' if $readonly; + push @extra_opts, '-o', 'usrjquota=aquota.user,grpjquota=aquota.group,jqfmt=vfsv0' if $quota; + # See the NOTE above about devicemapper canonicalization + my ($devpath) = (Cwd::realpath($volid) =~ /^(.*)$/s); # realpath() taints PVE::Tools::run_command(['mount', @extra_opts, $volid, $mount_path]) if $mount_path; - return wantarray ? ($volid, 0, $volid) : $volid; + return wantarray ? ($volid, 0, $devpath) : $volid; } elsif ($type eq 'bind') { die "directory '$volid' does not exist\n" if ! -d $volid; - &$check_mount_path($volid); - bindmount($volid, $mount_path, $readonly, @extra_opts) if $mount_path; + bindmount($volid, $parentfd, $last_dir//$rootdir, $mount_path, $readonly, @extra_opts) if $mount_path; warn "cannot enable quota control for bind mounts\n" if $quota; return wantarray ? ($volid, 0, undef) : $volid; }