X-Git-Url: https://git.proxmox.com/?p=pve-container.git;a=blobdiff_plain;f=src%2FPVE%2FLXC.pm;h=89f289e9113de6a4540d03e142807ed65cbfc3ab;hp=a107ec204b00c2b761810921c6550adad06466cb;hb=5a63f1c5d3b995dd682a70e7fbd1364240e09278;hpb=1322f50d6ae4eb2b145e3492137a305b68d95f0c diff --git a/src/PVE/LXC.pm b/src/PVE/LXC.pm index a107ec2..89f289e 100644 --- a/src/PVE/LXC.pm +++ b/src/PVE/LXC.pm @@ -18,6 +18,7 @@ use PVE::Exception qw(raise_perm_exc); use PVE::Storage; use PVE::SafeSyslog; use PVE::INotify; +use PVE::JSONSchema qw(get_standard_option); use PVE::Tools qw($IPV6RE $IPV4RE dir_glob_foreach lock_file lock_file_full O_PATH); use PVE::CpuSet; use PVE::Network; @@ -25,8 +26,11 @@ use PVE::AccessControl; use PVE::ProcFSTools; use PVE::Syscall; use PVE::LXC::Config; + use Time::HiRes qw (gettimeofday); +my $LXC_CONFIG_PATH = '/usr/share/lxc/config'; + my $nodename = PVE::INotify::nodename(); my $cpuinfo= PVE::ProcFSTools::read_cpuinfo(); @@ -42,7 +46,7 @@ sub config_list { my $d = $ids->{$vmid}; next if !$d->{node} || $d->{node} ne $nodename; next if !$d->{type} || $d->{type} ne 'lxc'; - $res->{$vmid}->{type} = 'lxc'; + $res->{$vmid} = { type => 'lxc', vmid => $vmid }; } return $res; } @@ -115,10 +119,53 @@ my $parse_cpuacct_stat = sub { return $stat; }; +our $vmstatus_return_properties = { + vmid => get_standard_option('pve-vmid'), + status => { + description => "LXC Container status.", + type => 'string', + enum => ['stopped', 'running'], + }, + maxmem => { + description => "Maximum memory in bytes.", + type => 'integer', + optional => 1, + renderer => 'bytes', + }, + maxswap => { + description => "Maximum SWAP memory in bytes.", + type => 'integer', + optional => 1, + renderer => 'bytes', + }, + maxdisk => { + description => "Root disk size in bytes.", + type => 'integer', + optional => 1, + renderer => 'bytes', + }, + name => { + description => "Container name.", + type => 'string', + optional => 1, + }, + uptime => { + description => "Uptime.", + type => 'integer', + optional => 1, + renderer => 'duration', + }, + cpus => { + description => "Maximum usable CPUs.", + type => 'number', + optional => 1, + }, +}; + sub vmstatus { my ($opt_vmid) = @_; - my $list = $opt_vmid ? { $opt_vmid => { type => 'lxc' }} : config_list(); + my $list = $opt_vmid ? { $opt_vmid => { type => 'lxc', vmid => $opt_vmid }} : config_list(); my $active_hash = list_active_containers(); @@ -127,6 +174,7 @@ sub vmstatus { my $cdtime = gettimeofday; my $uptime = (PVE::ProcFSTools::read_proc_uptime(1))[0]; + my $clock_ticks = POSIX::sysconf(&POSIX::_SC_CLK_TCK); my $unprivileged = {}; @@ -189,53 +237,67 @@ sub vmstatus { next if !$pid; # skip stopped CTs - my $ctime = (stat("/proc/$pid"))[10]; # 10 = ctime - $d->{uptime} = time - $ctime; # the method lxcfs uses + my $proc_pid_stat = PVE::ProcFSTools::read_proc_pid_stat($pid); + $d->{uptime} = int(($uptime - $proc_pid_stat->{starttime}) / $clock_ticks); # the method lxcfs uses my $unpriv = $unprivileged->{$vmid}; - my $memory_stat = read_cgroup_list('memory', $vmid, $unpriv, 'memory.stat'); - my $mem_usage_in_bytes = read_cgroup_value('memory', $vmid, $unpriv, 'memory.usage_in_bytes'); + if (-d '/sys/fs/cgroup/memory') { + my $memory_stat = read_cgroup_list('memory', $vmid, $unpriv, 'memory.stat'); + my $mem_usage_in_bytes = read_cgroup_value('memory', $vmid, $unpriv, 'memory.usage_in_bytes'); - $d->{mem} = $mem_usage_in_bytes - $memory_stat->{total_cache}; - $d->{swap} = read_cgroup_value('memory', $vmid, $unpriv, 'memory.memsw.usage_in_bytes') - $mem_usage_in_bytes; + $d->{mem} = $mem_usage_in_bytes - $memory_stat->{total_cache}; + $d->{swap} = read_cgroup_value('memory', $vmid, $unpriv, 'memory.memsw.usage_in_bytes') - $mem_usage_in_bytes; + } else { + $d->{mem} = 0; + $d->{swap} = 0; + } - my $blkio_bytes = read_cgroup_value('blkio', $vmid, $unpriv, 'blkio.throttle.io_service_bytes', 1); - my @bytes = split(/\n/, $blkio_bytes); - foreach my $byte (@bytes) { - if (my ($key, $value) = $byte =~ /(Read|Write)\s+(\d+)/) { - $d->{diskread} += $2 if $key eq 'Read'; - $d->{diskwrite} += $2 if $key eq 'Write'; + if (-d '/sys/fs/cgroup/blkio') { + my $blkio_bytes = read_cgroup_value('blkio', $vmid, $unpriv, 'blkio.throttle.io_service_bytes', 1); + my @bytes = split(/\n/, $blkio_bytes); + foreach my $byte (@bytes) { + if (my ($key, $value) = $byte =~ /(Read|Write)\s+(\d+)/) { + $d->{diskread} += $2 if $key eq 'Read'; + $d->{diskwrite} += $2 if $key eq 'Write'; + } } + } else { + $d->{diskread} = 0; + $d->{diskwrite} = 0; } - my $pstat = $parse_cpuacct_stat->($vmid, $unpriv); + if (-d '/sys/fs/cgroup/cpuacct') { + my $pstat = $parse_cpuacct_stat->($vmid, $unpriv); - my $used = $pstat->{utime} + $pstat->{stime}; + my $used = $pstat->{utime} + $pstat->{stime}; - my $old = $last_proc_vmid_stat->{$vmid}; - if (!$old) { - $last_proc_vmid_stat->{$vmid} = { - time => $cdtime, - used => $used, - cpu => 0, - }; - next; - } + my $old = $last_proc_vmid_stat->{$vmid}; + if (!$old) { + $last_proc_vmid_stat->{$vmid} = { + time => $cdtime, + used => $used, + cpu => 0, + }; + next; + } - my $dtime = ($cdtime - $old->{time}) * $cpucount * $cpuinfo->{user_hz}; + my $dtime = ($cdtime - $old->{time}) * $cpucount * $cpuinfo->{user_hz}; - if ($dtime > 1000) { - my $dutime = $used - $old->{used}; + if ($dtime > 1000) { + my $dutime = $used - $old->{used}; - $d->{cpu} = (($dutime/$dtime)* $cpucount) / $d->{cpus}; - $last_proc_vmid_stat->{$vmid} = { - time => $cdtime, - used => $used, - cpu => $d->{cpu}, - }; + $d->{cpu} = (($dutime/$dtime)* $cpucount) / $d->{cpus}; + $last_proc_vmid_stat->{$vmid} = { + time => $cdtime, + used => $used, + cpu => $d->{cpu}, + }; + } else { + $d->{cpu} = $old->{cpu}; + } } else { - $d->{cpu} = $old->{cpu}; + $d->{cpu} = 0; } } @@ -338,6 +400,108 @@ sub parse_ipv4_cidr { die "unable to parse ipv4 address/mask\n"; } +sub get_cgroup_subsystems { + my $v1 = {}; + my $v2 = 0; + my $data = PVE::Tools::file_get_contents('/proc/self/cgroup'); + while ($data =~ /^\d+:([^:\n]*):.*$/gm) { + my $type = $1; + if (length($type)) { + $v1->{$_} = 1 foreach split(/,/, $type); + } else { + $v2 = 1; + } + } + return wantarray ? ($v1, $v2) : $v1; +} + +# Currently we do not need to create seccomp profile 'files' as the only +# choice our configuration actually allows is "with or without keyctl()", +# so we distinguish between using lxc's "default" seccomp profile and our +# added pve-userns.seccomp file. +# +# This returns a configuration line added to the raw lxc config. +sub make_seccomp_config { + my ($conf, $unprivileged, $features) = @_; + # User-configured profile has precedence, note that the user's entry would + # be written 'after' this line anyway... + if (PVE::LXC::Config->has_lxc_entry($conf, 'lxc.seccomp.profile')) { + # Warn the user if this conflicts with a feature: + if ($features->{keyctl}) { + warn "explicitly configured lxc.seccomp.profile overrides the following settings: features:keyctl\n"; + } + return ''; + } + + # Privileged containers keep using the default (which is already part of + # the files included via lxc.include, so we don't need to write it out, + # that way it stays admin-configurable via /usr/share/lxc/config/... as + # well) + return '' if !$unprivileged; + + # Unprivileged containers will get keyctl() disabled by default as a + # workaround for systemd-networkd behavior. But we have an option to + # explicitly enable it: + return '' if $features->{keyctl}; + + # Finally we're in an unprivileged container without `keyctl` set + # explicitly. We have a file prepared for this: + return "lxc.seccomp.profile = $LXC_CONFIG_PATH/pve-userns.seccomp\n"; +} + +# Since lxc-3.0.2 we can have lxc generate a profile for the container +# automatically. The default should be equivalent to the old +# `lxc-container-default-cgns` profile. +# +# Additionally this also added `lxc.apparmor.raw` which can be used to inject +# additional lines into the profile. We can use that to allow mounting specific +# file systems. +sub make_apparmor_config { + my ($conf, $unprivileged, $features) = @_; + + # user-configured profile has precedence, but first we go through our own + # code to figure out whether we should warn the user: + + my $raw = "lxc.apparmor.profile = generated\n"; + my @profile_uses; + + # There's lxc.apparmor.allow_nesting now, which will add the necessary + # apparmor lines, create an apparmor namespace for the container, but also + # adds proc and sysfs mounts to /dev/.lxc/{proc,sys}. These do not have + # lxcfs mounted over them, because that would prevent the container from + # mounting new instances of them for nested containers. + if ($features->{nesting}) { + push @profile_uses, 'features:nesting'; + $raw .= "lxc.apparmor.allow_nesting = 1\n" + } else { + # In the default profile in /etc/apparmor.d we patch this in because + # otherwise a container can for example run `chown` on /sys, breaking + # access to it for non-CAP_DAC_OVERRIDE tools on the host: + $raw .= "lxc.apparmor.raw = deny mount -> /proc/,\n"; + $raw .= "lxc.apparmor.raw = deny mount -> /sys/,\n"; + # Preferably we could use the 'remount' flag but this does not sit well + # with apparmor_parser currently: + # mount options=(rw, nosuid, nodev, noexec, remount) -> /sys/, + } + + if (my $mount = $features->{mount}) { + push @profile_uses, 'features:mount'; + foreach my $fs (PVE::Tools::split_list($mount)) { + $raw .= "lxc.apparmor.raw = mount fstype=$fs,\n"; + } + } + + # More to come? + + if (PVE::LXC::Config->has_lxc_entry($conf, 'lxc.apparmor.profile')) { + if (length(my $used = join(', ', @profile_uses))) { + warn "explicitly configured lxc.apparmor.profile overrides the following settings: $used\n"; + } + return ''; + } + + return $raw; +} sub update_lxc_config { my ($vmid, $conf) = @_; @@ -356,20 +520,26 @@ sub update_lxc_config { die "missing 'arch' - internal error" if !$conf->{arch}; $raw .= "lxc.arch = $conf->{arch}\n"; - my $unprivileged = $conf->{unprivileged}; - my $custom_idmap = grep { $_->[0] eq 'lxc.idmap' } @{$conf->{lxc}}; + my $custom_idmap = PVE::LXC::Config->has_lxc_entry($conf, 'lxc.idmap'); + my $unprivileged = $conf->{unprivileged} || $custom_idmap; my $ostype = $conf->{ostype} || die "missing 'ostype' - internal error"; - my $inc ="/usr/share/lxc/config/$ostype.common.conf"; - $inc ="/usr/share/lxc/config/common.conf" if !-f $inc; + my $cfgpath = '/usr/share/lxc/config'; + my $inc = "$cfgpath/$ostype.common.conf"; + $inc ="$cfgpath/common.conf" if !-f $inc; $raw .= "lxc.include = $inc\n"; - if ($unprivileged || $custom_idmap) { - $inc = "/usr/share/lxc/config/$ostype.userns.conf"; - $inc = "/usr/share/lxc/config/userns.conf" if !-f $inc; - $raw .= "lxc.include = $inc\n" + if ($unprivileged) { + $inc = "$cfgpath/$ostype.userns.conf"; + $inc = "$cfgpath/userns.conf" if !-f $inc; + $raw .= "lxc.include = $inc\n"; } + my $features = PVE::LXC::Config->parse_features($conf->{features}); + + $raw .= make_seccomp_config($conf, $unprivileged, $features); + $raw .= make_apparmor_config($conf, $unprivileged, $features); + # WARNING: DO NOT REMOVE this without making sure that loop device nodes # cannot be exposed to the container with r/w access (cgroup perms). # When this is enabled mounts will still remain in the monitor's namespace @@ -377,6 +547,8 @@ sub update_lxc_config { # files while the container is running! $raw .= "lxc.monitor.unshare = 1\n"; + my $cgv1 = get_cgroup_subsystems(); + # Should we read them from /etc/subuid? if ($unprivileged && !$custom_idmap) { $raw .= "lxc.idmap = u 0 100000 65536\n"; @@ -385,7 +557,7 @@ sub update_lxc_config { if (!PVE::LXC::Config->has_dev_console($conf)) { $raw .= "lxc.console.path = none\n"; - $raw .= "lxc.cgroup.devices.deny = c 5:1 rwm\n"; + $raw .= "lxc.cgroup.devices.deny = c 5:1 rwm\n" if $cgv1->{devices}; } my $ttycount = PVE::LXC::Config->get_tty_count($conf); @@ -397,23 +569,27 @@ sub update_lxc_config { my $utsname = $conf->{hostname} || "CT$vmid"; $raw .= "lxc.uts.name = $utsname\n"; - my $memory = $conf->{memory} || 512; - my $swap = $conf->{swap} // 0; + if ($cgv1->{memory}) { + my $memory = $conf->{memory} || 512; + my $swap = $conf->{swap} // 0; - my $lxcmem = int($memory*1024*1024); - $raw .= "lxc.cgroup.memory.limit_in_bytes = $lxcmem\n"; + my $lxcmem = int($memory*1024*1024); + $raw .= "lxc.cgroup.memory.limit_in_bytes = $lxcmem\n"; - my $lxcswap = int(($memory + $swap)*1024*1024); - $raw .= "lxc.cgroup.memory.memsw.limit_in_bytes = $lxcswap\n"; - - if (my $cpulimit = $conf->{cpulimit}) { - $raw .= "lxc.cgroup.cpu.cfs_period_us = 100000\n"; - my $value = int(100000*$cpulimit); - $raw .= "lxc.cgroup.cpu.cfs_quota_us = $value\n"; + my $lxcswap = int(($memory + $swap)*1024*1024); + $raw .= "lxc.cgroup.memory.memsw.limit_in_bytes = $lxcswap\n"; } - my $shares = $conf->{cpuunits} || 1024; - $raw .= "lxc.cgroup.cpu.shares = $shares\n"; + if ($cgv1->{cpu}) { + if (my $cpulimit = $conf->{cpulimit}) { + $raw .= "lxc.cgroup.cpu.cfs_period_us = 100000\n"; + my $value = int(100000*$cpulimit); + $raw .= "lxc.cgroup.cpu.cfs_quota_us = $value\n"; + } + + my $shares = $conf->{cpuunits} || 1024; + $raw .= "lxc.cgroup.cpu.shares = $shares\n"; + } die "missing 'rootfs' configuration\n" if !defined($conf->{rootfs}); @@ -433,28 +609,30 @@ sub update_lxc_config { $raw .= "lxc.net.$ind.mtu = $d->{mtu}\n" if defined($d->{mtu}); } - my $had_cpuset = 0; - if (my $lxcconf = $conf->{lxc}) { - foreach my $entry (@$lxcconf) { - my ($k, $v) = @$entry; - $had_cpuset = 1 if $k eq 'lxc.cgroup.cpuset.cpus'; - $raw .= "$k = $v\n"; + if ($cgv1->{cpuset}) { + my $had_cpuset = 0; + if (my $lxcconf = $conf->{lxc}) { + foreach my $entry (@$lxcconf) { + my ($k, $v) = @$entry; + $had_cpuset = 1 if $k eq 'lxc.cgroup.cpuset.cpus'; + $raw .= "$k = $v\n"; + } } - } - my $cores = $conf->{cores}; - if (!$had_cpuset && $cores) { - my $cpuset = eval { PVE::CpuSet->new_from_cgroup('lxc', 'effective_cpus') }; - $cpuset = PVE::CpuSet->new_from_cgroup('', 'effective_cpus') if !$cpuset; - my @members = $cpuset->members(); - while (scalar(@members) > $cores) { - my $randidx = int(rand(scalar(@members))); - $cpuset->delete($members[$randidx]); - splice(@members, $randidx, 1); # keep track of the changes + my $cores = $conf->{cores}; + if (!$had_cpuset && $cores) { + my $cpuset = eval { PVE::CpuSet->new_from_cgroup('lxc', 'effective_cpus') }; + $cpuset = PVE::CpuSet->new_from_cgroup('', 'effective_cpus') if !$cpuset; + my @members = $cpuset->members(); + while (scalar(@members) > $cores) { + my $randidx = int(rand(scalar(@members))); + $cpuset->delete($members[$randidx]); + splice(@members, $randidx, 1); # keep track of the changes + } + $raw .= "lxc.cgroup.cpuset.cpus = ".$cpuset->short_string()."\n"; } - $raw .= "lxc.cgroup.cpuset.cpus = ".$cpuset->short_string()."\n"; } - + File::Path::mkpath("$dir/rootfs"); PVE::Tools::file_set_contents("$dir/config", $raw); @@ -707,9 +885,10 @@ sub update_ipconfig { my $newip = $newnet->{$ip}; my $newgw = $newnet->{$gw}; my $oldip = $optdata->{$ip}; + my $oldgw = $optdata->{$gw}; my $change_ip = &$safe_string_ne($oldip, $newip); - my $change_gw = &$safe_string_ne($optdata->{$gw}, $newgw); + my $change_gw = &$safe_string_ne($oldgw, $newgw); return if !$change_ip && !$change_gw; @@ -752,6 +931,11 @@ sub update_ipconfig { # warn and continue warn $@ if $@; } + if ($oldgw && $oldip && !PVE::Network::is_ip_in_cidr($oldgw, $oldip)) { + eval { &$ipcmd($family_opt, 'route', 'del', $oldgw, 'dev', $eth); }; + # warn if the route was deleted manually + warn $@ if $@; + } } # from this point on we save the configuration @@ -866,17 +1050,26 @@ sub template_create { my $storecfg = PVE::Storage::config(); - my $rootinfo = PVE::LXC::Config->parse_ct_rootfs($conf->{rootfs}); - my $volid = $rootinfo->{volume}; + PVE::LXC::Config->foreach_mountpoint($conf, sub { + my ($ms, $mountpoint) = @_; + + my $volid = $mountpoint->{volume}; + + die "Template feature is not available for '$volid'\n" + if !PVE::Storage::volume_has_feature($storecfg, 'template', $volid); + }); + + PVE::LXC::Config->foreach_mountpoint($conf, sub { + my ($ms, $mountpoint) = @_; - die "Template feature is not available for '$volid'\n" - if !PVE::Storage::volume_has_feature($storecfg, 'template', $volid); + my $volid = $mountpoint->{volume}; - PVE::Storage::activate_volumes($storecfg, [$volid]); + PVE::Storage::activate_volumes($storecfg, [$volid]); - my $template_volid = PVE::Storage::vdisk_create_base($storecfg, $volid); - $rootinfo->{volume} = $template_volid; - $conf->{rootfs} = PVE::LXC::Config->print_ct_mountpoint($rootinfo, 1); + my $template_volid = PVE::Storage::vdisk_create_base($storecfg, $volid); + $mountpoint->{volume} = $template_volid; + $conf->{$ms} = PVE::LXC::Config->print_ct_mountpoint($mountpoint, $ms eq "rootfs"); + }); PVE::LXC::Config->write_config($vmid, $conf); } @@ -902,6 +1095,9 @@ sub check_ct_modify_config_perm { } elsif ($opt =~ m/^net\d+$/ || $opt eq 'nameserver' || $opt eq 'searchdomain' || $opt eq 'hostname') { $rpcenv->check_vm_perm($authuser, $vmid, $pool, ['VM.Config.Network']); + } elsif ($opt eq 'features') { + # For now this is restricted to root@pam + raise_perm_exc("changing feature flags is only allowed for root\@pam"); } else { $rpcenv->check_vm_perm($authuser, $vmid, $pool, ['VM.Config.Options']); } @@ -1196,7 +1392,8 @@ sub mountpoint_mount { } my $readonly = $mountpoint->{ro}; - my @extra_opts = ('-o', $optstring) if $optstring; + my @extra_opts; + @extra_opts = ('-o', $optstring) if $optstring; if ($storage) { @@ -1345,7 +1542,7 @@ sub alloc_disk { eval { my $do_format = 0; - if ($scfg->{type} eq 'dir' || $scfg->{type} eq 'nfs') { + if ($scfg->{type} eq 'dir' || $scfg->{type} eq 'nfs' || $scfg->{type} eq 'cifs' ) { if ($size_kb > 0) { $volid = PVE::Storage::vdisk_alloc($storecfg, $storage, $vmid, 'raw', undef, $size_kb); @@ -1676,6 +1873,8 @@ sub copy_volume { # (not-mounted) directory. my $new_volid; eval { + # Make sure $mp contains a correct size. + $mp->{size} = PVE::Storage::volume_size_info($storage_cfg, $mp->{volume}); my $needs_chown; ($new_volid, $needs_chown) = alloc_disk($storage_cfg, $vmid, $storage, $mp->{size}/1024, $rootuid, $rootgid); if ($needs_chown) {