X-Git-Url: https://git.proxmox.com/?p=pve-container.git;a=blobdiff_plain;f=src%2FPVE%2FLXC.pm;h=89f289e9113de6a4540d03e142807ed65cbfc3ab;hp=8d91b228a8ae5b3014dccf15316ee8a88b7a14a6;hb=5a63f1c5d3b995dd682a70e7fbd1364240e09278;hpb=5fa038abea8b993199f6e573069b6f6a59c15920 diff --git a/src/PVE/LXC.pm b/src/PVE/LXC.pm index 8d91b22..89f289e 100644 --- a/src/PVE/LXC.pm +++ b/src/PVE/LXC.pm @@ -11,12 +11,14 @@ use File::Path; use File::Spec; use Cwd qw(); use Fcntl qw(O_RDONLY O_NOFOLLOW O_DIRECTORY); -use Errno qw(ELOOP ENOTDIR EROFS); +use Errno qw(ELOOP ENOTDIR EROFS ECONNREFUSED); +use IO::Socket::UNIX; use PVE::Exception qw(raise_perm_exc); use PVE::Storage; use PVE::SafeSyslog; use PVE::INotify; +use PVE::JSONSchema qw(get_standard_option); use PVE::Tools qw($IPV6RE $IPV4RE dir_glob_foreach lock_file lock_file_full O_PATH); use PVE::CpuSet; use PVE::Network; @@ -24,9 +26,10 @@ use PVE::AccessControl; use PVE::ProcFSTools; use PVE::Syscall; use PVE::LXC::Config; + use Time::HiRes qw (gettimeofday); -use Data::Dumper; +my $LXC_CONFIG_PATH = '/usr/share/lxc/config'; my $nodename = PVE::INotify::nodename(); @@ -43,7 +46,7 @@ sub config_list { my $d = $ids->{$vmid}; next if !$d->{node} || $d->{node} ne $nodename; next if !$d->{type} || $d->{type} ne 'lxc'; - $res->{$vmid}->{type} = 'lxc'; + $res->{$vmid} = { type => 'lxc', vmid => $vmid }; } return $res; } @@ -100,9 +103,9 @@ sub get_container_disk_usage { my $last_proc_vmid_stat; my $parse_cpuacct_stat = sub { - my ($vmid) = @_; + my ($vmid, $unprivileged) = @_; - my $raw = read_cgroup_value('cpuacct', $vmid, 'cpuacct.stat', 1); + my $raw = read_cgroup_value('cpuacct', $vmid, $unprivileged, 'cpuacct.stat', 1); my $stat = {}; @@ -116,10 +119,53 @@ my $parse_cpuacct_stat = sub { return $stat; }; +our $vmstatus_return_properties = { + vmid => get_standard_option('pve-vmid'), + status => { + description => "LXC Container status.", + type => 'string', + enum => ['stopped', 'running'], + }, + maxmem => { + description => "Maximum memory in bytes.", + type => 'integer', + optional => 1, + renderer => 'bytes', + }, + maxswap => { + description => "Maximum SWAP memory in bytes.", + type => 'integer', + optional => 1, + renderer => 'bytes', + }, + maxdisk => { + description => "Root disk size in bytes.", + type => 'integer', + optional => 1, + renderer => 'bytes', + }, + name => { + description => "Container name.", + type => 'string', + optional => 1, + }, + uptime => { + description => "Uptime.", + type => 'integer', + optional => 1, + renderer => 'duration', + }, + cpus => { + description => "Maximum usable CPUs.", + type => 'number', + optional => 1, + }, +}; + sub vmstatus { my ($opt_vmid) = @_; - my $list = $opt_vmid ? { $opt_vmid => { type => 'lxc' }} : config_list(); + my $list = $opt_vmid ? { $opt_vmid => { type => 'lxc', vmid => $opt_vmid }} : config_list(); my $active_hash = list_active_containers(); @@ -128,6 +174,9 @@ sub vmstatus { my $cdtime = gettimeofday; my $uptime = (PVE::ProcFSTools::read_proc_uptime(1))[0]; + my $clock_ticks = POSIX::sysconf(&POSIX::_SC_CLK_TCK); + + my $unprivileged = {}; foreach my $vmid (keys %$list) { my $d = $list->{$vmid}; @@ -140,6 +189,8 @@ sub vmstatus { my $cfspath = PVE::LXC::Config->cfs_config_path($vmid); my $conf = PVE::Cluster::cfs_read_file($cfspath) || {}; + $unprivileged->{$vmid} = $conf->{unprivileged}; + $d->{name} = $conf->{'hostname'} || "CT$vmid"; $d->{name} =~ s/[\s]//g; @@ -186,51 +237,67 @@ sub vmstatus { next if !$pid; # skip stopped CTs - my $ctime = (stat("/proc/$pid"))[10]; # 10 = ctime - $d->{uptime} = time - $ctime; # the method lxcfs uses + my $proc_pid_stat = PVE::ProcFSTools::read_proc_pid_stat($pid); + $d->{uptime} = int(($uptime - $proc_pid_stat->{starttime}) / $clock_ticks); # the method lxcfs uses - my $memory_stat = read_cgroup_list('memory', $vmid, 'memory.stat'); - my $mem_usage_in_bytes = read_cgroup_value('memory', $vmid, 'memory.usage_in_bytes'); + my $unpriv = $unprivileged->{$vmid}; - $d->{mem} = $mem_usage_in_bytes - $memory_stat->{total_cache}; - $d->{swap} = read_cgroup_value('memory', $vmid, 'memory.memsw.usage_in_bytes') - $mem_usage_in_bytes; + if (-d '/sys/fs/cgroup/memory') { + my $memory_stat = read_cgroup_list('memory', $vmid, $unpriv, 'memory.stat'); + my $mem_usage_in_bytes = read_cgroup_value('memory', $vmid, $unpriv, 'memory.usage_in_bytes'); - my $blkio_bytes = read_cgroup_value('blkio', $vmid, 'blkio.throttle.io_service_bytes', 1); - my @bytes = split(/\n/, $blkio_bytes); - foreach my $byte (@bytes) { - if (my ($key, $value) = $byte =~ /(Read|Write)\s+(\d+)/) { - $d->{diskread} = $2 if $key eq 'Read'; - $d->{diskwrite} = $2 if $key eq 'Write'; + $d->{mem} = $mem_usage_in_bytes - $memory_stat->{total_cache}; + $d->{swap} = read_cgroup_value('memory', $vmid, $unpriv, 'memory.memsw.usage_in_bytes') - $mem_usage_in_bytes; + } else { + $d->{mem} = 0; + $d->{swap} = 0; + } + + if (-d '/sys/fs/cgroup/blkio') { + my $blkio_bytes = read_cgroup_value('blkio', $vmid, $unpriv, 'blkio.throttle.io_service_bytes', 1); + my @bytes = split(/\n/, $blkio_bytes); + foreach my $byte (@bytes) { + if (my ($key, $value) = $byte =~ /(Read|Write)\s+(\d+)/) { + $d->{diskread} += $2 if $key eq 'Read'; + $d->{diskwrite} += $2 if $key eq 'Write'; + } } + } else { + $d->{diskread} = 0; + $d->{diskwrite} = 0; } - my $pstat = &$parse_cpuacct_stat($vmid); + if (-d '/sys/fs/cgroup/cpuacct') { + my $pstat = $parse_cpuacct_stat->($vmid, $unpriv); - my $used = $pstat->{utime} + $pstat->{stime}; + my $used = $pstat->{utime} + $pstat->{stime}; - my $old = $last_proc_vmid_stat->{$vmid}; - if (!$old) { - $last_proc_vmid_stat->{$vmid} = { - time => $cdtime, - used => $used, - cpu => 0, - }; - next; - } + my $old = $last_proc_vmid_stat->{$vmid}; + if (!$old) { + $last_proc_vmid_stat->{$vmid} = { + time => $cdtime, + used => $used, + cpu => 0, + }; + next; + } - my $dtime = ($cdtime - $old->{time}) * $cpucount * $cpuinfo->{user_hz}; + my $dtime = ($cdtime - $old->{time}) * $cpucount * $cpuinfo->{user_hz}; - if ($dtime > 1000) { - my $dutime = $used - $old->{used}; + if ($dtime > 1000) { + my $dutime = $used - $old->{used}; - $d->{cpu} = (($dutime/$dtime)* $cpucount) / $d->{cpus}; - $last_proc_vmid_stat->{$vmid} = { - time => $cdtime, - used => $used, - cpu => $d->{cpu}, - }; + $d->{cpu} = (($dutime/$dtime)* $cpucount) / $d->{cpus}; + $last_proc_vmid_stat->{$vmid} = { + time => $cdtime, + used => $used, + cpu => $d->{cpu}, + }; + } else { + $d->{cpu} = $old->{cpu}; + } } else { - $d->{cpu} = $old->{cpu}; + $d->{cpu} = 0; } } @@ -251,18 +318,19 @@ sub vmstatus { return $list; } -sub read_cgroup_list { - my ($group, $vmid, $name) = @_; +sub read_cgroup_list($$$$) { + my ($group, $vmid, $unprivileged, $name) = @_; - my $content = read_cgroup_value($group, $vmid, $name, 1); + my $content = read_cgroup_value($group, $vmid, $unprivileged, $name, 1); return { split(/\s+/, $content) }; } -sub read_cgroup_value { - my ($group, $vmid, $name, $full) = @_; +sub read_cgroup_value($$$$$) { + my ($group, $vmid, $unprivileged, $name, $full) = @_; - my $path = "/sys/fs/cgroup/$group/lxc/$vmid/$name"; + my $nsdir = $unprivileged ? '' : 'ns/'; + my $path = "/sys/fs/cgroup/$group/lxc/$vmid/${nsdir}$name"; return PVE::Tools::file_get_contents($path) if $full; @@ -332,6 +400,108 @@ sub parse_ipv4_cidr { die "unable to parse ipv4 address/mask\n"; } +sub get_cgroup_subsystems { + my $v1 = {}; + my $v2 = 0; + my $data = PVE::Tools::file_get_contents('/proc/self/cgroup'); + while ($data =~ /^\d+:([^:\n]*):.*$/gm) { + my $type = $1; + if (length($type)) { + $v1->{$_} = 1 foreach split(/,/, $type); + } else { + $v2 = 1; + } + } + return wantarray ? ($v1, $v2) : $v1; +} + +# Currently we do not need to create seccomp profile 'files' as the only +# choice our configuration actually allows is "with or without keyctl()", +# so we distinguish between using lxc's "default" seccomp profile and our +# added pve-userns.seccomp file. +# +# This returns a configuration line added to the raw lxc config. +sub make_seccomp_config { + my ($conf, $unprivileged, $features) = @_; + # User-configured profile has precedence, note that the user's entry would + # be written 'after' this line anyway... + if (PVE::LXC::Config->has_lxc_entry($conf, 'lxc.seccomp.profile')) { + # Warn the user if this conflicts with a feature: + if ($features->{keyctl}) { + warn "explicitly configured lxc.seccomp.profile overrides the following settings: features:keyctl\n"; + } + return ''; + } + + # Privileged containers keep using the default (which is already part of + # the files included via lxc.include, so we don't need to write it out, + # that way it stays admin-configurable via /usr/share/lxc/config/... as + # well) + return '' if !$unprivileged; + + # Unprivileged containers will get keyctl() disabled by default as a + # workaround for systemd-networkd behavior. But we have an option to + # explicitly enable it: + return '' if $features->{keyctl}; + + # Finally we're in an unprivileged container without `keyctl` set + # explicitly. We have a file prepared for this: + return "lxc.seccomp.profile = $LXC_CONFIG_PATH/pve-userns.seccomp\n"; +} + +# Since lxc-3.0.2 we can have lxc generate a profile for the container +# automatically. The default should be equivalent to the old +# `lxc-container-default-cgns` profile. +# +# Additionally this also added `lxc.apparmor.raw` which can be used to inject +# additional lines into the profile. We can use that to allow mounting specific +# file systems. +sub make_apparmor_config { + my ($conf, $unprivileged, $features) = @_; + + # user-configured profile has precedence, but first we go through our own + # code to figure out whether we should warn the user: + + my $raw = "lxc.apparmor.profile = generated\n"; + my @profile_uses; + + # There's lxc.apparmor.allow_nesting now, which will add the necessary + # apparmor lines, create an apparmor namespace for the container, but also + # adds proc and sysfs mounts to /dev/.lxc/{proc,sys}. These do not have + # lxcfs mounted over them, because that would prevent the container from + # mounting new instances of them for nested containers. + if ($features->{nesting}) { + push @profile_uses, 'features:nesting'; + $raw .= "lxc.apparmor.allow_nesting = 1\n" + } else { + # In the default profile in /etc/apparmor.d we patch this in because + # otherwise a container can for example run `chown` on /sys, breaking + # access to it for non-CAP_DAC_OVERRIDE tools on the host: + $raw .= "lxc.apparmor.raw = deny mount -> /proc/,\n"; + $raw .= "lxc.apparmor.raw = deny mount -> /sys/,\n"; + # Preferably we could use the 'remount' flag but this does not sit well + # with apparmor_parser currently: + # mount options=(rw, nosuid, nodev, noexec, remount) -> /sys/, + } + + if (my $mount = $features->{mount}) { + push @profile_uses, 'features:mount'; + foreach my $fs (PVE::Tools::split_list($mount)) { + $raw .= "lxc.apparmor.raw = mount fstype=$fs,\n"; + } + } + + # More to come? + + if (PVE::LXC::Config->has_lxc_entry($conf, 'lxc.apparmor.profile')) { + if (length(my $used = join(', ', @profile_uses))) { + warn "explicitly configured lxc.apparmor.profile overrides the following settings: $used\n"; + } + return ''; + } + + return $raw; +} sub update_lxc_config { my ($vmid, $conf) = @_; @@ -350,23 +520,26 @@ sub update_lxc_config { die "missing 'arch' - internal error" if !$conf->{arch}; $raw .= "lxc.arch = $conf->{arch}\n"; - my $unprivileged = $conf->{unprivileged}; - my $custom_idmap = grep { $_->[0] eq 'lxc.id_map' } @{$conf->{lxc}}; + my $custom_idmap = PVE::LXC::Config->has_lxc_entry($conf, 'lxc.idmap'); + my $unprivileged = $conf->{unprivileged} || $custom_idmap; my $ostype = $conf->{ostype} || die "missing 'ostype' - internal error"; - if ($ostype =~ /^(?:debian | ubuntu | centos | fedora | opensuse | archlinux | alpine | gentoo | unmanaged)$/x) { - my $inc ="/usr/share/lxc/config/$ostype.common.conf"; - $inc ="/usr/share/lxc/config/common.conf" if !-f $inc; + + my $cfgpath = '/usr/share/lxc/config'; + my $inc = "$cfgpath/$ostype.common.conf"; + $inc ="$cfgpath/common.conf" if !-f $inc; + $raw .= "lxc.include = $inc\n"; + if ($unprivileged) { + $inc = "$cfgpath/$ostype.userns.conf"; + $inc = "$cfgpath/userns.conf" if !-f $inc; $raw .= "lxc.include = $inc\n"; - if ($unprivileged || $custom_idmap) { - $inc = "/usr/share/lxc/config/$ostype.userns.conf"; - $inc = "/usr/share/lxc/config/userns.conf" if !-f $inc; - $raw .= "lxc.include = $inc\n" - } - } else { - die "implement me (ostype $ostype)"; } + my $features = PVE::LXC::Config->parse_features($conf->{features}); + + $raw .= make_seccomp_config($conf, $unprivileged, $features); + $raw .= make_apparmor_config($conf, $unprivileged, $features); + # WARNING: DO NOT REMOVE this without making sure that loop device nodes # cannot be exposed to the container with r/w access (cgroup perms). # When this is enabled mounts will still remain in the monitor's namespace @@ -374,89 +547,92 @@ sub update_lxc_config { # files while the container is running! $raw .= "lxc.monitor.unshare = 1\n"; + my $cgv1 = get_cgroup_subsystems(); + # Should we read them from /etc/subuid? if ($unprivileged && !$custom_idmap) { - $raw .= "lxc.id_map = u 0 100000 65536\n"; - $raw .= "lxc.id_map = g 0 100000 65536\n"; + $raw .= "lxc.idmap = u 0 100000 65536\n"; + $raw .= "lxc.idmap = g 0 100000 65536\n"; } if (!PVE::LXC::Config->has_dev_console($conf)) { - $raw .= "lxc.console = none\n"; - $raw .= "lxc.cgroup.devices.deny = c 5:1 rwm\n"; + $raw .= "lxc.console.path = none\n"; + $raw .= "lxc.cgroup.devices.deny = c 5:1 rwm\n" if $cgv1->{devices}; } my $ttycount = PVE::LXC::Config->get_tty_count($conf); - $raw .= "lxc.tty = $ttycount\n"; + $raw .= "lxc.tty.max = $ttycount\n"; # some init scripts expect a linux terminal (turnkey). $raw .= "lxc.environment = TERM=linux\n"; my $utsname = $conf->{hostname} || "CT$vmid"; - $raw .= "lxc.utsname = $utsname\n"; + $raw .= "lxc.uts.name = $utsname\n"; - my $memory = $conf->{memory} || 512; - my $swap = $conf->{swap} // 0; + if ($cgv1->{memory}) { + my $memory = $conf->{memory} || 512; + my $swap = $conf->{swap} // 0; - my $lxcmem = int($memory*1024*1024); - $raw .= "lxc.cgroup.memory.limit_in_bytes = $lxcmem\n"; + my $lxcmem = int($memory*1024*1024); + $raw .= "lxc.cgroup.memory.limit_in_bytes = $lxcmem\n"; - my $lxcswap = int(($memory + $swap)*1024*1024); - $raw .= "lxc.cgroup.memory.memsw.limit_in_bytes = $lxcswap\n"; - - if (my $cpulimit = $conf->{cpulimit}) { - $raw .= "lxc.cgroup.cpu.cfs_period_us = 100000\n"; - my $value = int(100000*$cpulimit); - $raw .= "lxc.cgroup.cpu.cfs_quota_us = $value\n"; + my $lxcswap = int(($memory + $swap)*1024*1024); + $raw .= "lxc.cgroup.memory.memsw.limit_in_bytes = $lxcswap\n"; } - my $shares = $conf->{cpuunits} || 1024; - $raw .= "lxc.cgroup.cpu.shares = $shares\n"; + if ($cgv1->{cpu}) { + if (my $cpulimit = $conf->{cpulimit}) { + $raw .= "lxc.cgroup.cpu.cfs_period_us = 100000\n"; + my $value = int(100000*$cpulimit); + $raw .= "lxc.cgroup.cpu.cfs_quota_us = $value\n"; + } + + my $shares = $conf->{cpuunits} || 1024; + $raw .= "lxc.cgroup.cpu.shares = $shares\n"; + } die "missing 'rootfs' configuration\n" if !defined($conf->{rootfs}); my $mountpoint = PVE::LXC::Config->parse_ct_rootfs($conf->{rootfs}); - $raw .= "lxc.rootfs = $dir/rootfs\n"; + $raw .= "lxc.rootfs.path = $dir/rootfs\n"; - my $netcount = 0; foreach my $k (sort keys %$conf) { next if $k !~ m/^net(\d+)$/; my $ind = $1; my $d = PVE::LXC::Config->parse_lxc_network($conf->{$k}); - $netcount++; - $raw .= "lxc.network.type = veth\n"; - $raw .= "lxc.network.veth.pair = veth${vmid}i${ind}\n"; - $raw .= "lxc.network.hwaddr = $d->{hwaddr}\n" if defined($d->{hwaddr}); - $raw .= "lxc.network.name = $d->{name}\n" if defined($d->{name}); - $raw .= "lxc.network.mtu = $d->{mtu}\n" if defined($d->{mtu}); + $raw .= "lxc.net.$ind.type = veth\n"; + $raw .= "lxc.net.$ind.veth.pair = veth${vmid}i${ind}\n"; + $raw .= "lxc.net.$ind.hwaddr = $d->{hwaddr}\n" if defined($d->{hwaddr}); + $raw .= "lxc.net.$ind.name = $d->{name}\n" if defined($d->{name}); + $raw .= "lxc.net.$ind.mtu = $d->{mtu}\n" if defined($d->{mtu}); } - my $had_cpuset = 0; - if (my $lxcconf = $conf->{lxc}) { - foreach my $entry (@$lxcconf) { - my ($k, $v) = @$entry; - $netcount++ if $k eq 'lxc.network.type'; - $had_cpuset = 1 if $k eq 'lxc.cgroup.cpuset.cpus'; - $raw .= "$k = $v\n"; + if ($cgv1->{cpuset}) { + my $had_cpuset = 0; + if (my $lxcconf = $conf->{lxc}) { + foreach my $entry (@$lxcconf) { + my ($k, $v) = @$entry; + $had_cpuset = 1 if $k eq 'lxc.cgroup.cpuset.cpus'; + $raw .= "$k = $v\n"; + } } - } - $raw .= "lxc.network.type = empty\n" if !$netcount; - - my $cores = $conf->{cores}; - if (!$had_cpuset && $cores) { - my $cpuset = eval { PVE::CpuSet->new_from_cgroup('lxc', 'effective_cpus') }; - $cpuset = PVE::CpuSet->new_from_cgroup('', 'effective_cpus') if !$cpuset; - my @members = $cpuset->members(); - while (scalar(@members) > $cores) { - my $randidx = int(rand(scalar(@members))); - $cpuset->delete($members[$randidx]); - splice(@members, $randidx, 1); # keep track of the changes + my $cores = $conf->{cores}; + if (!$had_cpuset && $cores) { + my $cpuset = eval { PVE::CpuSet->new_from_cgroup('lxc', 'effective_cpus') }; + $cpuset = PVE::CpuSet->new_from_cgroup('', 'effective_cpus') if !$cpuset; + my @members = $cpuset->members(); + while (scalar(@members) > $cores) { + my $randidx = int(rand(scalar(@members))); + $cpuset->delete($members[$randidx]); + splice(@members, $randidx, 1); # keep track of the changes + } + $raw .= "lxc.cgroup.cpuset.cpus = ".$cpuset->short_string()."\n"; } - $raw .= "lxc.cgroup.cpuset.cpus = ".$cpuset->short_string()."\n"; } - + File::Path::mkpath("$dir/rootfs"); PVE::Tools::file_set_contents("$dir/config", $raw); @@ -488,19 +664,24 @@ sub verify_searchdomain_list { } sub get_console_command { - my ($vmid, $conf) = @_; + my ($vmid, $conf, $noescapechar) = @_; my $cmode = PVE::LXC::Config->get_cmode($conf); + my $cmd = []; if ($cmode eq 'console') { - return ['lxc-console', '-n', $vmid, '-t', 0]; + push @$cmd, 'lxc-console', '-n', $vmid, '-t', 0; + push @$cmd, '-e', -1 if $noescapechar; } elsif ($cmode eq 'tty') { - return ['lxc-console', '-n', $vmid]; + push @$cmd, 'lxc-console', '-n', $vmid; + push @$cmd, '-e', -1 if $noescapechar; } elsif ($cmode eq 'shell') { - return ['lxc-attach', '--clear-env', '-n', $vmid]; + push @$cmd, 'lxc-attach', '--clear-env', '-n', $vmid; } else { die "internal error"; } + + return $cmd; } sub get_primary_ips { @@ -704,9 +885,10 @@ sub update_ipconfig { my $newip = $newnet->{$ip}; my $newgw = $newnet->{$gw}; my $oldip = $optdata->{$ip}; + my $oldgw = $optdata->{$gw}; my $change_ip = &$safe_string_ne($oldip, $newip); - my $change_gw = &$safe_string_ne($optdata->{$gw}, $newgw); + my $change_gw = &$safe_string_ne($oldgw, $newgw); return if !$change_ip && !$change_gw; @@ -749,6 +931,11 @@ sub update_ipconfig { # warn and continue warn $@ if $@; } + if ($oldgw && $oldip && !PVE::Network::is_ip_in_cidr($oldgw, $oldip)) { + eval { &$ipcmd($family_opt, 'route', 'del', $oldgw, 'dev', $eth); }; + # warn if the route was deleted manually + warn $@ if $@; + } } # from this point on we save the configuration @@ -863,17 +1050,26 @@ sub template_create { my $storecfg = PVE::Storage::config(); - my $rootinfo = PVE::LXC::Config->parse_ct_rootfs($conf->{rootfs}); - my $volid = $rootinfo->{volume}; + PVE::LXC::Config->foreach_mountpoint($conf, sub { + my ($ms, $mountpoint) = @_; + + my $volid = $mountpoint->{volume}; + + die "Template feature is not available for '$volid'\n" + if !PVE::Storage::volume_has_feature($storecfg, 'template', $volid); + }); - die "Template feature is not available for '$volid'\n" - if !PVE::Storage::volume_has_feature($storecfg, 'template', $volid); + PVE::LXC::Config->foreach_mountpoint($conf, sub { + my ($ms, $mountpoint) = @_; - PVE::Storage::activate_volumes($storecfg, [$volid]); + my $volid = $mountpoint->{volume}; - my $template_volid = PVE::Storage::vdisk_create_base($storecfg, $volid); - $rootinfo->{volume} = $template_volid; - $conf->{rootfs} = PVE::LXC::Config->print_ct_mountpoint($rootinfo, 1); + PVE::Storage::activate_volumes($storecfg, [$volid]); + + my $template_volid = PVE::Storage::vdisk_create_base($storecfg, $volid); + $mountpoint->{volume} = $template_volid; + $conf->{$ms} = PVE::LXC::Config->print_ct_mountpoint($mountpoint, $ms eq "rootfs"); + }); PVE::LXC::Config->write_config($vmid, $conf); } @@ -899,6 +1095,9 @@ sub check_ct_modify_config_perm { } elsif ($opt =~ m/^net\d+$/ || $opt eq 'nameserver' || $opt eq 'searchdomain' || $opt eq 'hostname') { $rpcenv->check_vm_perm($authuser, $vmid, $pool, ['VM.Config.Network']); + } elsif ($opt eq 'features') { + # For now this is restricted to root@pam + raise_perm_exc("changing feature flags is only allowed for root\@pam"); } else { $rpcenv->check_vm_perm($authuser, $vmid, $pool, ['VM.Config.Options']); } @@ -1193,7 +1392,8 @@ sub mountpoint_mount { } my $readonly = $mountpoint->{ro}; - my @extra_opts = ('-o', $optstring) if $optstring; + my @extra_opts; + @extra_opts = ('-o', $optstring) if $optstring; if ($storage) { @@ -1331,6 +1531,60 @@ sub destroy_disks { } } +sub alloc_disk { + my ($storecfg, $vmid, $storage, $size_kb, $rootuid, $rootgid) = @_; + + my $needs_chown = 0; + my $volid; + + my $scfg = PVE::Storage::storage_config($storecfg, $storage); + # fixme: use better naming ct-$vmid-disk-X.raw? + + eval { + my $do_format = 0; + if ($scfg->{type} eq 'dir' || $scfg->{type} eq 'nfs' || $scfg->{type} eq 'cifs' ) { + if ($size_kb > 0) { + $volid = PVE::Storage::vdisk_alloc($storecfg, $storage, $vmid, 'raw', + undef, $size_kb); + $do_format = 1; + } else { + $volid = PVE::Storage::vdisk_alloc($storecfg, $storage, $vmid, 'subvol', + undef, 0); + $needs_chown = 1; + } + } elsif ($scfg->{type} eq 'zfspool') { + + $volid = PVE::Storage::vdisk_alloc($storecfg, $storage, $vmid, 'subvol', + undef, $size_kb); + $needs_chown = 1; + } elsif ($scfg->{type} eq 'drbd' || $scfg->{type} eq 'lvm' || $scfg->{type} eq 'lvmthin') { + + $volid = PVE::Storage::vdisk_alloc($storecfg, $storage, $vmid, 'raw', undef, $size_kb); + $do_format = 1; + + } elsif ($scfg->{type} eq 'rbd') { + + die "krbd option must be enabled on storage type '$scfg->{type}'\n" if !$scfg->{krbd}; + $volid = PVE::Storage::vdisk_alloc($storecfg, $storage, $vmid, 'raw', undef, $size_kb); + $do_format = 1; + } else { + die "unable to create containers on storage type '$scfg->{type}'\n"; + } + format_disk($storecfg, $volid, $rootuid, $rootgid) if $do_format; + }; + if (my $err = $@) { + # in case formatting got interrupted: + if (defined($volid)) { + eval { PVE::Storage::vdisk_free($storecfg, $volid); }; + warn $@ if $@; + } + die $err; + } + + return ($volid, $needs_chown); +} + +our $NEW_DISK_RE = qr/^([^:\s]+):(\d+(\.\d+)?)$/; sub create_disks { my ($storecfg, $vmid, $settings, $conf) = @_; @@ -1348,42 +1602,14 @@ sub create_disks { my ($storage, $volname) = PVE::Storage::parse_volume_id($volid, 1); - if ($storage && ($volid =~ m/^([^:\s]+):(\d+(\.\d+)?)$/)) { + if ($storage && ($volid =~ $NEW_DISK_RE)) { my ($storeid, $size_gb) = ($1, $2); my $size_kb = int(${size_gb}*1024) * 1024; - my $scfg = PVE::Storage::storage_config($storecfg, $storage); - # fixme: use better naming ct-$vmid-disk-X.raw? - - if ($scfg->{type} eq 'dir' || $scfg->{type} eq 'nfs') { - if ($size_kb > 0) { - $volid = PVE::Storage::vdisk_alloc($storecfg, $storage, $vmid, 'raw', - undef, $size_kb); - format_disk($storecfg, $volid, $rootuid, $rootgid); - } else { - $volid = PVE::Storage::vdisk_alloc($storecfg, $storage, $vmid, 'subvol', - undef, 0); - push @$chown_vollist, $volid; - } - } elsif ($scfg->{type} eq 'zfspool') { - - $volid = PVE::Storage::vdisk_alloc($storecfg, $storage, $vmid, 'subvol', - undef, $size_kb); - push @$chown_vollist, $volid; - } elsif ($scfg->{type} eq 'drbd' || $scfg->{type} eq 'lvm' || $scfg->{type} eq 'lvmthin') { - - $volid = PVE::Storage::vdisk_alloc($storecfg, $storage, $vmid, 'raw', undef, $size_kb); - format_disk($storecfg, $volid, $rootuid, $rootgid); - - } elsif ($scfg->{type} eq 'rbd') { - - die "krbd option must be enabled on storage type '$scfg->{type}'\n" if !$scfg->{krbd}; - $volid = PVE::Storage::vdisk_alloc($storecfg, $storage, $vmid, 'raw', undef, $size_kb); - format_disk($storecfg, $volid, $rootuid, $rootgid); - } else { - die "unable to create containers on storage type '$scfg->{type}'\n"; - } + my $needs_chown = 0; + ($volid, $needs_chown) = alloc_disk($storecfg, $vmid, $storage, $size_kb, $rootuid, $rootgid); + push @$chown_vollist, $volid if $needs_chown; push @$vollist, $volid; $mountpoint->{volume} = $volid; $mountpoint->{size} = $size_kb * 1024; @@ -1479,7 +1705,8 @@ sub parse_id_maps { my $lxc = $conf->{lxc}; foreach my $entry (@$lxc) { my ($key, $value) = @$entry; - next if $key ne 'lxc.id_map'; + # FIXME: remove the 'id_map' variant when lxc-3.0 arrives + next if $key ne 'lxc.idmap' && $key ne 'lxc.id_map'; if ($value =~ /^([ug])\s+(\d+)\s+(\d+)\s+(\d+)\s*$/) { my ($type, $ct, $host, $length) = ($1, $2, $3, $4); push @$id_map, [$type, $ct, $host, $length]; @@ -1488,7 +1715,7 @@ sub parse_id_maps { $rootgid = $host if $type eq 'g'; } } else { - die "failed to parse id_map: $value\n"; + die "failed to parse idmap: $value\n"; } } @@ -1510,5 +1737,163 @@ sub userns_command { return []; } +sub vm_start { + my ($vmid, $conf, $skiplock) = @_; + + update_lxc_config($vmid, $conf); + + my $skiplock_flag_fn = "/run/lxc/skiplock-$vmid"; + + if ($skiplock) { + open(my $fh, '>', $skiplock_flag_fn) || die "failed to open $skiplock_flag_fn for writing: $!\n"; + close($fh); + } + + my $cmd = ['systemctl', 'start', "pve-container\@$vmid"]; + + eval { PVE::Tools::run_command($cmd); }; + if (my $err = $@) { + unlink $skiplock_flag_fn; + die $err; + } + + return; +} + +# Helper to stop a container completely and make sure it has stopped completely. +# This is necessary because we want the post-stop hook to have completed its +# unmount-all step, but post-stop happens after lxc puts the container into the +# STOPPED state. +sub vm_stop { + my ($vmid, $kill, $shutdown_timeout, $exit_timeout) = @_; + + # Open the container's command socket. + my $path = "\0/var/lib/lxc/$vmid/command"; + my $sock = IO::Socket::UNIX->new( + Type => SOCK_STREAM(), + Peer => $path, + ); + if (!$sock) { + return if $! == ECONNREFUSED; # The container is not running + die "failed to open container ${vmid}'s command socket: $!\n"; + } + + # Stop the container: + + my $cmd = ['lxc-stop', '-n', $vmid]; + + if ($kill) { + push @$cmd, '--kill'; # doesn't allow timeouts + } elsif (defined($shutdown_timeout)) { + push @$cmd, '--timeout', $shutdown_timeout; + # Give run_command 5 extra seconds + $shutdown_timeout += 5; + } + + eval { PVE::Tools::run_command($cmd, timeout => $shutdown_timeout) }; + if (my $err = $@) { + warn $@ if $@; + } + + my $result = 1; + my $wait = sub { $result = <$sock>; }; + if (defined($exit_timeout)) { + PVE::Tools::run_with_timeout($exit_timeout, $wait); + } else { + $wait->(); + } + + return if !defined $result; # monitor is gone and the ct has stopped. + die "container did not stop\n"; +} + +sub run_unshared { + my ($code) = @_; + + return PVE::Tools::run_fork(sub { + # Unshare the mount namespace + die "failed to unshare mount namespace: $!\n" + if !PVE::Tools::unshare(PVE::Tools::CLONE_NEWNS); + PVE::Tools::run_command(['mount', '--make-rslave', '/']); + return $code->(); + }); +} + +my $copy_volume = sub { + my ($src_volid, $src, $dst_volid, $dest, $storage_cfg, $snapname) = @_; + + my $src_mp = { volume => $src_volid, mp => '/' }; + $src_mp->{type} = PVE::LXC::Config->classify_mountpoint($src_volid); + + my $dst_mp = { volume => $dst_volid, mp => '/' }; + $dst_mp->{type} = PVE::LXC::Config->classify_mountpoint($dst_volid); + + my @mounted; + eval { + # mount and copy + mkdir $src; + mountpoint_mount($src_mp, $src, $storage_cfg, $snapname); + push @mounted, $src; + mkdir $dest; + mountpoint_mount($dst_mp, $dest, $storage_cfg); + push @mounted, $dest; + + PVE::Tools::run_command(['/usr/bin/rsync', '--stats', '-X', '-A', '--numeric-ids', + '-aH', '--whole-file', '--sparse', '--one-file-system', + "$src/", $dest]); + }; + my $err = $@; + foreach my $mount (reverse @mounted) { + eval { PVE::Tools::run_command(['/bin/umount', '--lazy', $mount], errfunc => sub{})}; + warn "Can't umount $mount\n" if $@; + } + + # If this fails they're used as mount points in a concurrent operation + # (which should not happen but there's also no real need to get rid of them). + rmdir $dest; + rmdir $src; + + die $err if $err; +}; + +# Should not be called after unsharing the mount namespace! +sub copy_volume { + my ($mp, $vmid, $storage, $storage_cfg, $conf, $snapname) = @_; + + die "cannot copy volumes of type $mp->{type}\n" if $mp->{type} ne 'volume'; + File::Path::make_path("/var/lib/lxc/$vmid"); + my $dest = "/var/lib/lxc/$vmid/.copy-volume-1"; + my $src = "/var/lib/lxc/$vmid/.copy-volume-2"; + + # get id's for unprivileged container + my (undef, $rootuid, $rootgid) = parse_id_maps($conf); + + # Allocate the disk before unsharing in order to make sure zfs subvolumes + # are visible in this namespace, otherwise the host only sees the empty + # (not-mounted) directory. + my $new_volid; + eval { + # Make sure $mp contains a correct size. + $mp->{size} = PVE::Storage::volume_size_info($storage_cfg, $mp->{volume}); + my $needs_chown; + ($new_volid, $needs_chown) = alloc_disk($storage_cfg, $vmid, $storage, $mp->{size}/1024, $rootuid, $rootgid); + if ($needs_chown) { + PVE::Storage::activate_volumes($storage_cfg, [$new_volid], undef); + my $path = PVE::Storage::path($storage_cfg, $new_volid, undef); + chown($rootuid, $rootgid, $path); + } + + run_unshared(sub { + $copy_volume->($mp->{volume}, $src, $new_volid, $dest, $storage_cfg, $snapname); + }); + }; + if (my $err = $@) { + PVE::Storage::vdisk_free($storage_cfg, $new_volid) + if defined($new_volid); + die $err; + } + + return $new_volid; +} 1;