X-Git-Url: https://git.proxmox.com/?p=pve-container.git;a=blobdiff_plain;f=src%2FPVE%2FLXC.pm;h=bb1cbdbd18fd507669d7b4786b9b5e7ce4f99983;hp=0742a53acfa07f60ed451930cbc250504e3889c7;hb=HEAD;hpb=a1ff8c3748674382326b9287af4607995873bc98 diff --git a/src/PVE/LXC.pm b/src/PVE/LXC.pm index 0742a53..65d0fa8 100644 --- a/src/PVE/LXC.pm +++ b/src/PVE/LXC.pm @@ -3,39 +3,51 @@ package PVE::LXC; use strict; use warnings; -use POSIX qw(EINTR); - -use Socket; - +use Cwd qw(); +use Errno qw(ELOOP ENOTDIR EROFS ECONNREFUSED EEXIST); +use Fcntl qw(O_RDONLY O_WRONLY O_NOFOLLOW O_DIRECTORY :mode); use File::Path; use File::Spec; -use Cwd qw(); -use Fcntl qw(O_RDONLY O_WRONLY O_NOFOLLOW O_DIRECTORY); -use Errno qw(ELOOP ENOTDIR EROFS ECONNREFUSED ENOSYS EEXIST); +use IO::Poll qw(POLLIN POLLHUP); use IO::Socket::UNIX; +use POSIX qw(EINTR); +use Socket; +use Time::HiRes qw (gettimeofday); +use PVE::AccessControl; +use PVE::CGroup; +use PVE::CpuSet; use PVE::Exception qw(raise_perm_exc); -use PVE::Storage; -use PVE::SafeSyslog; +use PVE::Firewall; +use PVE::GuestHelpers qw(check_vnet_access safe_string_ne safe_num_ne safe_boolean_ne); use PVE::INotify; use PVE::JSONSchema qw(get_standard_option); -use PVE::Tools qw( - dir_glob_foreach file_get_contents file_set_contents lock_file - lock_file_full AT_FDCWD O_PATH $IPV4RE $IPV6RE -); -use PVE::CpuSet; use PVE::Network; -use PVE::AccessControl; use PVE::ProcFSTools; +use PVE::RESTEnvironment; +use PVE::SafeSyslog; +use PVE::Storage; +use PVE::Tools qw( + run_command + dir_glob_foreach + file_get_contents + file_set_contents + AT_FDCWD + O_PATH + $IPV4RE + $IPV6RE +); use PVE::Syscall qw(:fsmount); + +use PVE::LXC::CGroup; use PVE::LXC::Config; -use PVE::GuestHelpers qw(safe_string_ne safe_num_ne safe_boolean_ne); +use PVE::LXC::Monitor; use PVE::LXC::Tools; -use Time::HiRes qw (gettimeofday); my $have_sdn; eval { require PVE::Network::SDN::Zones; + require PVE::Network::SDN::Vnets; $have_sdn = 1; }; @@ -45,6 +57,8 @@ my $nodename = PVE::INotify::nodename(); my $cpuinfo= PVE::ProcFSTools::read_cpuinfo(); +our $NEW_DISK_RE = qr/^([^:\s]+):(\d+(\.\d+)?)$/; + sub config_list { my $vmlist = PVE::Cluster::get_vmlist(); my $res = {}; @@ -56,7 +70,7 @@ sub config_list { my $d = $ids->{$vmid}; next if !$d->{node} || $d->{node} ne $nodename; next if !$d->{type} || $d->{type} ne 'lxc'; - $res->{$vmid} = { type => 'lxc', vmid => $vmid }; + $res->{$vmid} = { type => 'lxc', vmid => int($vmid) }; } return $res; } @@ -106,23 +120,6 @@ sub get_container_disk_usage { my $last_proc_vmid_stat; -my $parse_cpuacct_stat = sub { - my ($vmid, $unprivileged) = @_; - - my $raw = read_cgroup_value('cpuacct', $vmid, $unprivileged, 'cpuacct.stat', 1); - - my $stat = {}; - - if ($raw =~ m/^user (\d+)\nsystem (\d+)\n/) { - - $stat->{utime} = $1; - $stat->{stime} = $2; - - } - - return $stat; -}; - our $vmstatus_return_properties = { vmid => get_standard_option('pve-vmid'), status => { @@ -179,7 +176,7 @@ our $vmstatus_return_properties = { sub vmstatus { my ($opt_vmid) = @_; - my $list = $opt_vmid ? { $opt_vmid => { type => 'lxc', vmid => $opt_vmid }} : config_list(); + my $list = $opt_vmid ? { $opt_vmid => { type => 'lxc', vmid => int($opt_vmid) }} : config_list(); my $active_hash = list_active_containers(); @@ -195,7 +192,7 @@ sub vmstatus { foreach my $vmid (keys %$list) { my $d = $list->{$vmid}; - eval { $d->{pid} = find_lxc_pid($vmid) if defined($active_hash->{$vmid}); }; + eval { $d->{pid} = int(find_lxc_pid($vmid)) if defined($active_hash->{$vmid}); }; warn $@ if $@; # ignore errors (consider them stopped) $d->{status} = $active_hash->{$vmid} ? 'running' : 'stopped'; @@ -211,18 +208,17 @@ sub vmstatus { $d->{cpus} = $conf->{cores} || $conf->{cpulimit}; $d->{cpus} = $cpucount if !$d->{cpus}; - $d->{lock} = $conf->{lock} || ''; $d->{tags} = $conf->{tags} if defined($conf->{tags}); if ($d->{pid}) { my $res = get_container_disk_usage($vmid, $d->{pid}); - $d->{disk} = $res->{used}; - $d->{maxdisk} = $res->{total}; + $d->{disk} = int($res->{used}); + $d->{maxdisk} = int($res->{total}); } else { $d->{disk} = 0; # use 4GB by default ?? if (my $rootfs = $conf->{rootfs}) { - my $rootinfo = PVE::LXC::Config->parse_ct_rootfs($rootfs); + my $rootinfo = PVE::LXC::Config->parse_volume('rootfs', $rootfs); $d->{maxdisk} = $rootinfo->{size} || (4*1024*1024*1024); } else { $d->{maxdisk} = 4*1024*1024*1024; @@ -243,7 +239,7 @@ sub vmstatus { $d->{diskread} = 0; $d->{diskwrite} = 0; - $d->{template} = PVE::LXC::Config->is_template($conf); + $d->{template} = 1 if PVE::LXC::Config->is_template($conf); $d->{lock} = $conf->{lock} if $conf->{lock}; } @@ -258,55 +254,45 @@ sub vmstatus { my $unpriv = $unprivileged->{$vmid}; - if (-d '/sys/fs/cgroup/memory') { - my $memory_stat = read_cgroup_list('memory', $vmid, $unpriv, 'memory.stat'); - my $mem_usage_in_bytes = read_cgroup_value('memory', $vmid, $unpriv, 'memory.usage_in_bytes'); + my $cgroups = PVE::LXC::CGroup->new($vmid); - $d->{mem} = $mem_usage_in_bytes - $memory_stat->{total_cache}; - $d->{swap} = read_cgroup_value('memory', $vmid, $unpriv, 'memory.memsw.usage_in_bytes') - $mem_usage_in_bytes; + if (defined(my $mem = $cgroups->get_memory_stat())) { + $d->{mem} = int($mem->{mem}); + $d->{swap} = int($mem->{swap}); } else { $d->{mem} = 0; $d->{swap} = 0; } - if (-d '/sys/fs/cgroup/blkio') { - my $blkio_bytes = read_cgroup_value('blkio', $vmid, 0, 'blkio.throttle.io_service_bytes', 1); # don't check if unpriv - my @bytes = split(/\n/, $blkio_bytes); - foreach my $byte (@bytes) { - if (my ($key, $value) = $byte =~ /(Read|Write)\s+(\d+)/) { - $d->{diskread} += $2 if $key eq 'Read'; - $d->{diskwrite} += $2 if $key eq 'Write'; - } - } + if (defined(my $blkio = $cgroups->get_io_stats())) { + $d->{diskread} = int($blkio->{diskread}); + $d->{diskwrite} = int($blkio->{diskwrite}); } else { $d->{diskread} = 0; $d->{diskwrite} = 0; } - if (-d '/sys/fs/cgroup/cpuacct') { - my $pstat = $parse_cpuacct_stat->($vmid, $unpriv); - - my $used = $pstat->{utime} + $pstat->{stime}; + if (defined(my $cpu = $cgroups->get_cpu_stat())) { + # Total time (in milliseconds) used up by the cpu. + my $used_ms = $cpu->{utime} + $cpu->{stime}; my $old = $last_proc_vmid_stat->{$vmid}; if (!$old) { $last_proc_vmid_stat->{$vmid} = { time => $cdtime, - used => $used, + used => $used_ms, cpu => 0, }; next; } - my $dtime = ($cdtime - $old->{time}) * $cpucount * $cpuinfo->{user_hz}; - - if ($dtime > 1000) { - my $dutime = $used - $old->{used}; - - $d->{cpu} = (($dutime/$dtime)* $cpucount) / $d->{cpus}; + my $delta_ms = ($cdtime - $old->{time}) * $cpucount * 1000.0; + if ($delta_ms > 1000.0) { + my $delta_used_ms = $used_ms - $old->{used}; + $d->{cpu} = (($delta_used_ms / $delta_ms) * $cpucount) / $d->{cpus}; $last_proc_vmid_stat->{$vmid} = { time => $cdtime, - used => $used, + used => $used_ms, cpu => $d->{cpu}, }; } else { @@ -334,33 +320,6 @@ sub vmstatus { return $list; } -sub read_cgroup_list($$$$) { - my ($group, $vmid, $unprivileged, $name) = @_; - - my $content = read_cgroup_value($group, $vmid, $unprivileged, $name, 1); - - return { split(/\s+/, $content) }; -} - -sub read_cgroup_value($$$$$) { - my ($group, $vmid, $unprivileged, $name, $full) = @_; - - my $nsdir = $unprivileged ? '' : 'ns/'; - my $path = "/sys/fs/cgroup/$group/lxc/$vmid/${nsdir}$name"; - - return PVE::Tools::file_get_contents($path) if $full; - - return PVE::Tools::file_read_firstline($path); -} - -sub write_cgroup_value { - my ($group, $vmid, $name, $value) = @_; - - my $path = "/sys/fs/cgroup/$group/lxc/$vmid/$name"; - PVE::ProcFSTools::write_proc_entry($path, $value) if -e $path; - -} - sub find_lxc_console_pids { my $res = {}; @@ -454,21 +413,6 @@ sub parse_ipv4_cidr { die "unable to parse ipv4 address/mask\n"; } -sub get_cgroup_subsystems { - my $v1 = {}; - my $v2 = 0; - my $data = PVE::Tools::file_get_contents('/proc/self/cgroup'); - while ($data =~ /^\d+:([^:\n]*):.*$/gm) { - my $type = $1; - if (length($type)) { - $v1->{$_} = 1 foreach split(/,/, $type); - } else { - $v2 = 1; - } - } - return wantarray ? ($v1, $v2) : $v1; -} - # With seccomp trap to userspace we now have the ability to optionally forward # certain syscalls to the "host" to handle (via our pve-lxc-syscalld daemon). # @@ -496,6 +440,15 @@ sub make_seccomp_config { my $rules = { keyctl => ['errno 38'], + + # Disable btrfs ioctrls since they don't work particularly well in user namespaces. + # Particularly, without the mount option to enable rmdir removing snapshots, user + # namespaces can create snapshots but neither `show` or `delete` them, which is quite + # horrible, so for now, just disable this entirely: + # + # BTRFS_IOCTL_MAGIC 0x94, _IOC type shift is 8, + # so `(req & 0xFF00) == 0x9400` is a btrfs ioctl and gets an EPERM + ioctl => ['errno 1 [1,0x9400,SCMP_CMP_MASKED_EQ,0xff00]'], }; my $raw_conf = ''; @@ -633,8 +586,26 @@ sub update_lxc_config { return; } + my ($lxc_major, $lxc_minor) = get_lxc_version(); + my $raw = ''; + if ($lxc_major >= 4) { + # Explicitly don't use relative directories, which is the default, but + # note that we do this mostly because they are only applied for *some* + # cgroups. Our pve-container@.service now starts lxc-start with `-F`, + # so we also don't need to worry about the new monitor cgroup to + # confuse systemd. + $raw .= "lxc.cgroup.relative = 0\n"; + + # To make things easier, let's keep our previous cgroup layout and + # simply move the monitor outside: + $raw .= "lxc.cgroup.dir.monitor = lxc.monitor/$vmid\n"; + # cgroup namespace separation for stronger limits: + $raw .= "lxc.cgroup.dir.container = lxc/$vmid\n"; + $raw .= "lxc.cgroup.dir.container.inner = ns\n"; + } + die "missing 'arch' - internal error" if !$conf->{arch}; $raw .= "lxc.arch = $conf->{arch}\n"; @@ -643,6 +614,8 @@ sub update_lxc_config { my $ostype = $conf->{ostype} || die "missing 'ostype' - internal error"; + File::Path::mkpath($dir); + my $cfgpath = '/usr/share/lxc/config'; my $inc = "$cfgpath/$ostype.common.conf"; $inc ="$cfgpath/common.conf" if !-f $inc; @@ -662,6 +635,25 @@ sub update_lxc_config { $raw .= "lxc.mount.entry = /dev/fuse dev/fuse none bind,create=file 0 0\n"; } + if ($unprivileged && !$features->{force_rw_sys}) { + # unpriv. CT default to sys:rw, but that doesn't always plays well with + # systemd, e.g., systemd-networkd https://systemd.io/CONTAINER_INTERFACE/ + $raw .= "lxc.mount.auto = sys:mixed\n"; + } + + PVE::LXC::Config->foreach_passthrough_device($conf, sub { + my ($key, $device) = @_; + + die "Path is not defined for passthrough device $key\n" + if !defined($device->{path}); + + my ($mode, $rdev) = PVE::LXC::Tools::get_device_mode_and_rdev($device->{path}); + my $major = PVE::Tools::dev_t_major($rdev); + my $minor = PVE::Tools::dev_t_minor($rdev); + my $device_type_char = S_ISBLK($mode) ? 'b' : 'c'; + $raw .= "lxc.cgroup2.devices.allow = $device_type_char $major:$minor rw\n"; + }); + # WARNING: DO NOT REMOVE this without making sure that loop device nodes # cannot be exposed to the container with r/w access (cgroup perms). # When this is enabled mounts will still remain in the monitor's namespace @@ -669,7 +661,7 @@ sub update_lxc_config { # files while the container is running! $raw .= "lxc.monitor.unshare = 1\n"; - my $cgv1 = get_cgroup_subsystems(); + my ($cgv1, $cgv2) = PVE::CGroup::get_cgroup_controllers(); # Should we read them from /etc/subuid? if ($unprivileged && !$custom_idmap) { @@ -679,7 +671,11 @@ sub update_lxc_config { if (!PVE::LXC::Config->has_dev_console($conf)) { $raw .= "lxc.console.path = none\n"; - $raw .= "lxc.cgroup.devices.deny = c 5:1 rwm\n" if $cgv1->{devices}; + if ($cgv1->{devices}) { + $raw .= "lxc.cgroup.devices.deny = c 5:1 rwm\n"; + } elsif (defined($cgv2)) { + $raw .= "lxc.cgroup2.devices.deny = c 5:1 rwm\n"; + } } my $ttycount = PVE::LXC::Config->get_tty_count($conf); @@ -687,7 +683,7 @@ sub update_lxc_config { # some init scripts expect a linux terminal (turnkey). $raw .= "lxc.environment = TERM=linux\n"; - + my $utsname = $conf->{hostname} || "CT$vmid"; $raw .= "lxc.uts.name = $utsname\n"; @@ -700,6 +696,18 @@ sub update_lxc_config { my $lxcswap = int(($memory + $swap)*1024*1024); $raw .= "lxc.cgroup.memory.memsw.limit_in_bytes = $lxcswap\n"; + } elsif ($cgv2->{memory}) { + my $memory = $conf->{memory} || 512; + my $swap = $conf->{swap} // 0; + + # cgroup memory usage is limited by the hard 'max' limit (OOM-killer enforced) and the soft + # 'high' limit (cgroup processes get throttled and put under heavy reclaim pressure). + my ($lxc_mem_max, $lxc_mem_high) = PVE::LXC::Config::calculate_memory_constraints($memory); + $raw .= "lxc.cgroup2.memory.max = $lxc_mem_max\n"; + $raw .= "lxc.cgroup2.memory.high = $lxc_mem_high\n"; + + my $lxcswap = int($swap*1024*1024); + $raw .= "lxc.cgroup2.memory.swap.max = $lxcswap\n"; } if ($cgv1->{cpu}) { @@ -709,14 +717,25 @@ sub update_lxc_config { $raw .= "lxc.cgroup.cpu.cfs_quota_us = $value\n"; } - my $shares = $conf->{cpuunits} || 1024; + my $shares = PVE::CGroup::clamp_cpu_shares($conf->{cpuunits}); $raw .= "lxc.cgroup.cpu.shares = $shares\n"; + } elsif ($cgv2->{cpu}) { + # See PVE::CGroup + if (my $cpulimit = $conf->{cpulimit}) { + my $value = int(100000*$cpulimit); + $raw .= "lxc.cgroup2.cpu.max = $value 100000\n"; + } + + if (defined(my $shares = $conf->{cpuunits})) { + $shares = PVE::CGroup::clamp_cpu_shares($shares); + $raw .= "lxc.cgroup2.cpu.weight = $shares\n"; + } } die "missing 'rootfs' configuration\n" if !defined($conf->{rootfs}); - my $mountpoint = PVE::LXC::Config->parse_ct_rootfs($conf->{rootfs}); + my $mountpoint = PVE::LXC::Config->parse_volume('rootfs', $conf->{rootfs}); $raw .= "lxc.rootfs.path = $dir/rootfs\n"; @@ -728,31 +747,47 @@ sub update_lxc_config { $raw .= "lxc.net.$ind.veth.pair = veth${vmid}i${ind}\n"; $raw .= "lxc.net.$ind.hwaddr = $d->{hwaddr}\n" if defined($d->{hwaddr}); $raw .= "lxc.net.$ind.name = $d->{name}\n" if defined($d->{name}); - $raw .= "lxc.net.$ind.mtu = $d->{mtu}\n" if defined($d->{mtu}); + + my $bridge_mtu = PVE::Network::read_bridge_mtu($d->{bridge}); + my $mtu = $d->{mtu} || $bridge_mtu; + + # Keep container from starting with invalid mtu configuration + die "$k: MTU size '$mtu' is bigger than bridge MTU '$bridge_mtu'\n" + if ($mtu > $bridge_mtu); + + $raw .= "lxc.net.$ind.mtu = $mtu\n"; + + # Starting with lxc 4.0, we do not patch lxc to execute our up-scripts. + if ($lxc_major >= 4) { + $raw .= "lxc.net.$ind.script.up = /usr/share/lxc/lxcnetaddbr\n"; + } } - if ($cgv1->{cpuset}) { - my $had_cpuset = 0; - if (my $lxcconf = $conf->{lxc}) { - foreach my $entry (@$lxcconf) { - my ($k, $v) = @$entry; - $had_cpuset = 1 if $k eq 'lxc.cgroup.cpuset.cpus'; - $raw .= "$k = $v\n"; - } + my $had_cpuset = 0; + if (my $lxcconf = $conf->{lxc}) { + foreach my $entry (@$lxcconf) { + my ($k, $v) = @$entry; + $had_cpuset = 1 if $k eq 'lxc.cgroup.cpuset.cpus' || $k eq 'lxc.cgroup2.cpuset.cpus'; + $raw .= "$k = $v\n"; } + } - my $cores = $conf->{cores}; - if (!$had_cpuset && $cores) { - my $cpuset = eval { PVE::CpuSet->new_from_cgroup('lxc', 'effective_cpus') }; - $cpuset = PVE::CpuSet->new_from_cgroup('', 'effective_cpus') if !$cpuset; - my @members = $cpuset->members(); - while (scalar(@members) > $cores) { - my $randidx = int(rand(scalar(@members))); - $cpuset->delete($members[$randidx]); - splice(@members, $randidx, 1); # keep track of the changes - } - $raw .= "lxc.cgroup.cpuset.cpus = ".$cpuset->short_string()."\n"; + my $cpuset; + my ($cpuset_cgroup, $cpuset_version) = eval { PVE::CGroup::cpuset_controller_path() }; + if (defined($cpuset_cgroup)) { + $cpuset = eval { PVE::CpuSet->new_from_path("$cpuset_cgroup/lxc", 1) } + || PVE::CpuSet->new_from_path($cpuset_cgroup, 1); + } + my $cores = $conf->{cores}; + if (!$had_cpuset && $cores && $cpuset) { + my @members = $cpuset->members(); + while (scalar(@members) > $cores) { + my $randidx = int(rand(scalar(@members))); + $cpuset->delete($members[$randidx]); + splice(@members, $randidx, 1); # keep track of the changes } + my $ver = $cpuset_version == 1 ? '' : '2'; + $raw .= "lxc.cgroup$ver.cpuset.cpus = ".$cpuset->short_string()."\n"; } File::Path::mkpath("$dir/rootfs"); @@ -766,7 +801,7 @@ sub verify_nameserver_list { my @list = (); foreach my $server (PVE::Tools::split_list($nameserver_list)) { - PVE::JSONSchema::pve_verify_ip($server); + PVE::LXC::Config::verify_ip_with_ll_iface($server); push @list, $server; } @@ -852,12 +887,33 @@ sub delete_mountpoint_volume { } sub destroy_lxc_container { - my ($storage_cfg, $vmid, $conf, $replacement_conf) = @_; + my ($storage_cfg, $vmid, $conf, $replacement_conf, $purge_unreferenced) = @_; - PVE::LXC::Config->foreach_mountpoint($conf, sub { + my $volids = {}; + my $remove_volume = sub { my ($ms, $mountpoint) = @_; - delete_mountpoint_volume($storage_cfg, $vmid, $mountpoint->{volume}); - }); + + my $volume = $mountpoint->{volume}; + + return if $volids->{$volume}; + $volids->{$volume} = 1; + + delete_mountpoint_volume($storage_cfg, $vmid, $volume); + }; + PVE::LXC::Config->foreach_volume_full($conf, {include_unused => 1}, $remove_volume); + + PVE::LXC::Config->foreach_volume_full($conf->{pending}, {include_unused => 1}, $remove_volume); + + if ($purge_unreferenced) { # also remove unreferenced disk + my $vmdisks = PVE::Storage::vdisk_list($storage_cfg, undef, $vmid, undef, 'rootdir'); + PVE::Storage::foreach_volid($vmdisks, sub { + my ($volid, $sid, $volname, $d) = @_; + eval { PVE::Storage::vdisk_free($storage_cfg, $volid) }; + warn $@ if $@; + }); + } + + delete_ifaces_ipams_ips($conf, $vmid); rmdir "/var/lib/lxc/$vmid/rootfs"; unlink "/var/lib/lxc/$vmid/config"; @@ -881,6 +937,32 @@ sub vm_stop_cleanup { warn $@ if $@; # avoid errors - just warn } +sub net_tap_plug : prototype($$) { + my ($iface, $net) = @_; + + if (defined($net->{link_down})) { + PVE::Tools::run_command(['/sbin/ip', 'link', 'set', 'dev', $iface, 'down']); + # Don't add disconnected interfaces to the bridge, otherwise e.g. applying any network + # change (e.g. `ifreload -a`) could (re-)activate it unintentionally. + return; + } + + my ($bridge, $tag, $trunks, $rate, $hwaddr) = + $net->@{'bridge', 'tag', 'trunks', 'rate', 'hwaddr'}; + + # The nftable-based implementation from the newer proxmox-firewall does not requires FW bridges + my $create_firewall_bridges = $net->{firewall} && !PVE::Firewall::is_nftables(); + + if ($have_sdn) { + PVE::Network::SDN::Zones::tap_plug($iface, $bridge, $tag, $create_firewall_bridges, $trunks, $rate); + PVE::Network::SDN::Zones::add_bridge_fdb($iface, $hwaddr, $bridge); + } else { + PVE::Network::tap_plug($iface, $bridge, $tag, $create_firewall_bridges, $trunks, $rate, { mac => $hwaddr }); + } + + PVE::Tools::run_command(['/sbin/ip', 'link', 'set', 'dev', $iface, 'up']); +} + sub update_net { my ($vmid, $conf, $opt, $newnet, $netid, $rootdir) = @_; @@ -899,33 +981,51 @@ sub update_net { safe_string_ne($oldnet->{name}, $newnet->{name})) { PVE::Network::veth_delete($veth); + + if ($have_sdn && safe_string_ne($oldnet->{hwaddr}, $newnet->{hwaddr})) { + eval { PVE::Network::SDN::Vnets::del_ips_from_mac($oldnet->{bridge}, $oldnet->{hwaddr}, $conf->{hostname}) }; + warn $@ if $@; + + PVE::Network::SDN::Vnets::add_next_free_cidr($newnet->{bridge}, $conf->{hostname}, $newnet->{hwaddr}, $vmid, undef, 1); + PVE::Network::SDN::Vnets::add_dhcp_mapping($newnet->{bridge}, $newnet->{hwaddr}, $vmid, $conf->{hostname}); + } + delete $conf->{$opt}; PVE::LXC::Config->write_config($vmid, $conf); hotplug_net($vmid, $conf, $opt, $newnet, $netid); } else { - if (safe_string_ne($oldnet->{bridge}, $newnet->{bridge}) || - safe_num_ne($oldnet->{tag}, $newnet->{tag}) || - safe_num_ne($oldnet->{firewall}, $newnet->{firewall})) { + my $bridge_changed = safe_string_ne($oldnet->{bridge}, $newnet->{bridge}); + if ($bridge_changed || + safe_num_ne($oldnet->{tag}, $newnet->{tag}) || + safe_num_ne($oldnet->{firewall}, $newnet->{firewall}) || + safe_boolean_ne($oldnet->{link_down}, $newnet->{link_down}) + ) { if ($oldnet->{bridge}) { + my $oldbridge = $oldnet->{bridge}; + PVE::Network::tap_unplug($veth); foreach (qw(bridge tag firewall)) { delete $oldnet->{$_}; } $conf->{$opt} = PVE::LXC::Config->print_lxc_network($oldnet); PVE::LXC::Config->write_config($vmid, $conf); + + if ($have_sdn && $bridge_changed) { + eval { PVE::Network::SDN::Vnets::del_ips_from_mac($oldbridge, $oldnet->{hwaddr}, $conf->{hostname}) }; + warn $@ if $@; + } } - if ($have_sdn) { - PVE::Network::SDN::Zones::tap_plug($veth, $newnet->{bridge}, $newnet->{tag}, $newnet->{firewall}, $newnet->{trunks}, $newnet->{rate}); - } else { - PVE::Network::tap_plug($veth, $newnet->{bridge}, $newnet->{tag}, $newnet->{firewall}, $newnet->{trunks}, $newnet->{rate}); + if ($have_sdn && $bridge_changed) { + PVE::Network::SDN::Vnets::add_next_free_cidr($newnet->{bridge}, $conf->{hostname}, $newnet->{hwaddr}, $vmid, undef, 1); } + PVE::LXC::net_tap_plug($veth, $newnet); # This includes the rate: - foreach (qw(bridge tag firewall rate)) { + foreach (qw(bridge tag firewall rate link_down)) { $oldnet->{$_} = $newnet->{$_} if $newnet->{$_}; } } elsif (safe_string_ne($oldnet->{rate}, $newnet->{rate})) { @@ -938,6 +1038,11 @@ sub update_net { PVE::LXC::Config->write_config($vmid, $conf); } } else { + if ($have_sdn) { + PVE::Network::SDN::Vnets::add_next_free_cidr($newnet->{bridge}, $conf->{hostname}, $newnet->{hwaddr}, $vmid, undef, 1); + PVE::Network::SDN::Vnets::add_dhcp_mapping($newnet->{bridge}, $newnet->{hwaddr}, $vmid, $conf->{hostname}); + } + hotplug_net($vmid, $conf, $opt, $newnet, $netid); } @@ -953,12 +1058,12 @@ sub hotplug_net { if ($have_sdn) { PVE::Network::SDN::Zones::veth_create($veth, $vethpeer, $newnet->{bridge}, $newnet->{hwaddr}); - PVE::Network::SDN::Zones::tap_plug($veth, $newnet->{bridge}, $newnet->{tag}, $newnet->{firewall}, $newnet->{trunks}, $newnet->{rate}); } else { PVE::Network::veth_create($veth, $vethpeer, $newnet->{bridge}, $newnet->{hwaddr}); - PVE::Network::tap_plug($veth, $newnet->{bridge}, $newnet->{tag}, $newnet->{firewall}, $newnet->{trunks}, $newnet->{rate}); } + PVE::LXC::net_tap_plug($veth, $newnet); + # attach peer in container my $cmd = ['lxc-device', '-n', $vmid, 'add', $vethpeer, "$eth" ]; PVE::Tools::run_command($cmd); @@ -968,7 +1073,7 @@ sub hotplug_net { PVE::Tools::run_command($cmd); my $done = { type => 'veth' }; - foreach (qw(bridge tag firewall hwaddr name)) { + foreach (qw(bridge tag firewall hwaddr name link_down)) { $done->{$_} = $newnet->{$_} if $newnet->{$_}; } $conf->{$opt} = PVE::LXC::Config->print_lxc_network($done); @@ -976,6 +1081,32 @@ sub hotplug_net { PVE::LXC::Config->write_config($vmid, $conf); } +sub get_interfaces { + my ($vmid) = @_; + + my $pid = eval { find_lxc_pid($vmid); }; + return if $@; + + my $output; + # enters the network namespace of the container and executes 'ip a' + run_command(['nsenter', '-t', $pid, '--net', '--', 'ip', '--json', 'a'], + outfunc => sub { $output .= shift; }); + + my $config = JSON::decode_json($output); + + my $res; + for my $interface ($config->@*) { + my $obj = { name => $interface->{ifname} }; + for my $ip ($interface->{addr_info}->@*) { + $obj->{$ip->{family}} = $ip->{local} . "/" . $ip->{prefixlen}; + } + $obj->{hwaddr} = $interface->{address}; + push @$res, $obj + } + + return $res; +} + sub update_ipconfig { my ($vmid, $conf, $opt, $eth, $newnet, $rootdir) = @_; @@ -1133,11 +1264,24 @@ my $do_syncfs = sub { my $mountdata = do { local $/ = undef; <$socket> }; close $socket; + my %nosyncfs = ( + cgroup => 1, + cgroup2 => 1, + devtmpfs => 1, + devpts => 1, + 'fuse.lxcfs' => 1, + fusectl => 1, + mqueue => 1, + proc => 1, + sysfs => 1, + tmpfs => 1, + ); + # Now sync all mountpoints... my $mounts = PVE::ProcFSTools::parse_mounts($mountdata); foreach my $mp (@$mounts) { my ($what, $dir, $fs) = @$mp; - next if $fs eq 'fuse.lxcfs'; + next if $nosyncfs{$fs}; eval { PVE::Tools::sync_mountpoint($dir); }; warn $@ if $@; } @@ -1185,7 +1329,7 @@ sub template_create { my $storecfg = PVE::Storage::config(); - PVE::LXC::Config->foreach_mountpoint($conf, sub { + PVE::LXC::Config->foreach_volume($conf, sub { my ($ms, $mountpoint) = @_; my $volid = $mountpoint->{volume}; @@ -1194,7 +1338,7 @@ sub template_create { if !PVE::Storage::volume_has_feature($storecfg, 'template', $volid); }); - PVE::LXC::Config->foreach_mountpoint($conf, sub { + PVE::LXC::Config->foreach_volume($conf, sub { my ($ms, $mountpoint) = @_; my $volid = $mountpoint->{volume}; @@ -1210,9 +1354,10 @@ sub template_create { } sub check_ct_modify_config_perm { - my ($rpcenv, $authuser, $vmid, $pool, $newconf, $delete) = @_; + my ($rpcenv, $authuser, $vmid, $pool, $oldconf, $newconf, $delete, $unprivileged) = @_; return 1 if $authuser eq 'root@pam'; + my $storage_cfg = PVE::Storage::config(); my $check = sub { my ($opt, $delete) = @_; @@ -1221,21 +1366,84 @@ sub check_ct_modify_config_perm { } elsif ($opt eq 'rootfs' || $opt =~ /^mp\d+$/) { $rpcenv->check_vm_perm($authuser, $vmid, $pool, ['VM.Config.Disk']); return if $delete; - my $data = $opt eq 'rootfs' ? PVE::LXC::Config->parse_ct_rootfs($newconf->{$opt}) - : PVE::LXC::Config->parse_ct_mountpoint($newconf->{$opt}); + my $data = PVE::LXC::Config->parse_volume($opt, $newconf->{$opt}); raise_perm_exc("mount point type $data->{type} is only allowed for root\@pam") if $data->{type} ne 'volume'; + my $volid = $data->{volume}; + if ($volid =~ $NEW_DISK_RE) { + my $sid = $1; + $rpcenv->check($authuser, "/storage/$sid", ['Datastore.AllocateSpace']); + } else { + PVE::Storage::check_volume_access( + $rpcenv, + $authuser, + $storage_cfg, + $vmid, + $volid, + 'rootdir', + ); + } } elsif ($opt eq 'memory' || $opt eq 'swap') { $rpcenv->check_vm_perm($authuser, $vmid, $pool, ['VM.Config.Memory']); - } elsif ($opt =~ m/^net\d+$/ || $opt eq 'nameserver' || - $opt eq 'searchdomain' || $opt eq 'hostname') { + } elsif ($opt =~ m/^net\d+$/) { + $rpcenv->check_vm_perm($authuser, $vmid, $pool, ['VM.Config.Network']); + check_bridge_access($rpcenv, $authuser, $oldconf->{$opt}) if $oldconf->{$opt}; + check_bridge_access($rpcenv, $authuser, $newconf->{$opt}) if $newconf->{$opt}; + } elsif ($opt =~ m/^dev\d+$/) { + raise_perm_exc("configuring device passthrough is only allowed for root\@pam"); + } elsif ($opt eq 'nameserver' || $opt eq 'searchdomain' || $opt eq 'hostname') { $rpcenv->check_vm_perm($authuser, $vmid, $pool, ['VM.Config.Network']); } elsif ($opt eq 'features') { - # For now this is restricted to root@pam - raise_perm_exc("changing feature flags is only allowed for root\@pam"); + raise_perm_exc("changing feature flags for privileged container is only allowed for root\@pam") + if !$unprivileged; + + my $nesting_changed = 0; + my $other_changed = 0; + if (!$delete) { + my $features = PVE::LXC::Config->parse_features($newconf->{$opt}); + if (defined($oldconf) && $oldconf->{$opt}) { + # existing container with features + my $old_features = PVE::LXC::Config->parse_features($oldconf->{$opt}); + for my $feature ((keys %$old_features, keys %$features)) { + my $old = $old_features->{$feature} // ''; + my $new = $features->{$feature} // ''; + if ($old ne $new) { + if ($feature eq 'nesting') { + $nesting_changed = 1; + next; + } else { + $other_changed = 1; + last; + } + } + } + } else { + # new container or no features defined + if (scalar(keys %$features) == 1 && $features->{nesting}) { + $nesting_changed = 1; + } elsif (scalar(keys %$features) > 0) { + $other_changed = 1; + } + } + } else { + my $features = PVE::LXC::Config->parse_features($oldconf->{$opt}); + if (scalar(keys %$features) == 1 && $features->{nesting}) { + $nesting_changed = 1; + } elsif (scalar(keys %$features) > 0) { + $other_changed = 1; + } + } + raise_perm_exc("changing feature flags (except nesting) is only allowed for root\@pam") + if $other_changed; + $rpcenv->check_vm_perm($authuser, $vmid, $pool, ['VM.Allocate']) + if $nesting_changed; } elsif ($opt eq 'hookscript') { # For now this is restricted to root@pam raise_perm_exc("changing the hookscript is only allowed for root\@pam"); + } elsif ($opt eq 'tags') { + my $old = $oldconf->{$opt}; + my $new = $delete ? '' : $newconf->{$opt}; + PVE::GuestHelpers::assert_tag_permissions($vmid, $old, $new, $rpcenv, $authuser); } else { $rpcenv->check_vm_perm($authuser, $vmid, $pool, ['VM.Config.Options']); } @@ -1251,6 +1459,18 @@ sub check_ct_modify_config_perm { return 1; } +sub check_bridge_access { + my ($rpcenv, $authuser, $raw) = @_; + + return 1 if $authuser eq 'root@pam'; + + my $net = PVE::LXC::Config->parse_lxc_network($raw); + my ($bridge, $tag, $trunks) = $net->@{'bridge', 'tag', 'trunks'}; + check_vnet_access($rpcenv, $authuser, $bridge, $tag, $trunks); + + return 1; +}; + sub umount_all { my ($vmid, $storage_cfg, $conf, $noerr) = @_; @@ -1259,7 +1479,7 @@ sub umount_all { my $res = 1; - PVE::LXC::Config->foreach_mountpoint_reverse($conf, sub { + PVE::LXC::Config->foreach_volume_full($conf, {'reverse' => 1}, sub { my ($ms, $mountpoint) = @_; my $volid = $mountpoint->{volume}; @@ -1297,15 +1517,15 @@ sub mount_all { my $volid_list = PVE::LXC::Config->get_vm_volumes($conf); PVE::Storage::activate_volumes($storage_cfg, $volid_list); - my (undef, $rootuid, $rootgid) = parse_id_maps($conf); + my (undef, $root_uid, $root_gid) = parse_id_maps($conf); eval { - PVE::LXC::Config->foreach_mountpoint($conf, sub { + PVE::LXC::Config->foreach_volume($conf, sub { my ($ms, $mountpoint) = @_; $mountpoint->{ro} = 0 if $ignore_ro; - mountpoint_mount($mountpoint, $rootdir, $storage_cfg, undef, $rootuid, $rootgid); + mountpoint_mount($mountpoint, $rootdir, $storage_cfg, undef, $root_uid, $root_gid); }); }; if (my $err = $@) { @@ -1381,17 +1601,17 @@ sub run_with_loopdev { # * directory name of the last directory # So that the path $2/$3 should lead to $1 afterwards. sub walk_tree_nofollow($$$;$$) { - my ($start, $subdir, $mkdir, $rootuid, $rootgid) = @_; + my ($start, $subdir, $mkdir, $root_uid, $root_gid) = @_; sysopen(my $fd, $start, O_PATH | O_DIRECTORY) or die "failed to open start directory $start: $!\n"; - return walk_tree_nofollow_fd($start, $fd, $subdir, $mkdir, $rootuid, $rootgid); + return walk_tree_nofollow_fd($start, $fd, $subdir, $mkdir, $root_uid, $root_gid); } sub walk_tree_nofollow_fd($$$$;$$) { - my ($start_dirname, $start_fd, $subdir, $mkdir, $rootuid, $rootgid) = @_; + my ($start_dirname, $start_fd, $subdir, $mkdir, $root_uid, $root_gid) = @_; # splitdir() returns '' for empty components including the leading / my @comps = grep { length($_)>0 } File::Spec->splitdir($subdir); @@ -1419,8 +1639,8 @@ sub walk_tree_nofollow_fd($$$$;$$) { $next = PVE::Tools::openat(fileno($fd), $component, O_NOFOLLOW | O_DIRECTORY); die "failed to create path: $dir: $!\n" if !$next; - PVE::Tools::fchownat(fileno($next), '', $rootuid, $rootgid, PVE::Tools::AT_EMPTY_PATH) - if defined($rootuid) && defined($rootgid); + PVE::Tools::fchownat(fileno($next), '', $root_uid, $root_gid, PVE::Tools::AT_EMPTY_PATH) + if defined($root_uid) && defined($root_gid); } close $second if defined($last_component) && $second != $start_fd; @@ -1514,27 +1734,26 @@ sub bindmount { # from $rootdir and $mount and walk the path from $rootdir to the final # directory to check for symlinks. sub __mount_prepare_rootdir { - my ($rootdir, $mount, $rootuid, $rootgid) = @_; + my ($rootdir, $mount, $root_uid, $root_gid) = @_; $rootdir =~ s!/+!/!g; $rootdir =~ s!/+$!!; my $mount_path = "$rootdir/$mount"; - my ($mpfd, $parentfd, $last_dir) = walk_tree_nofollow($rootdir, $mount, 1, $rootuid, $rootgid); + my ($mpfd, $parentfd, $last_dir) = walk_tree_nofollow($rootdir, $mount, 1, $root_uid, $root_gid); return ($rootdir, $mount_path, $mpfd, $parentfd, $last_dir); } # use $rootdir = undef to just return the corresponding mount path sub mountpoint_mount { - my ($mountpoint, $rootdir, $storage_cfg, $snapname, $rootuid, $rootgid) = @_; - return __mountpoint_mount($mountpoint, $rootdir, $storage_cfg, $snapname, $rootuid, $rootgid, undef); + my ($mountpoint, $rootdir, $storage_cfg, $snapname, $root_uid, $root_gid) = @_; + return __mountpoint_mount($mountpoint, $rootdir, $storage_cfg, $snapname, $root_uid, $root_gid, undef); } sub mountpoint_stage { - my ($mountpoint, $stage_dir, $storage_cfg, $snapname, $rootuid, $rootgid) = @_; + my ($mountpoint, $stage_dir, $storage_cfg, $snapname, $root_uid, $root_gid) = @_; my ($path, $loop, $dev) = - __mountpoint_mount($mountpoint, $stage_dir, $storage_cfg, $snapname, $rootuid, $rootgid, 1); + __mountpoint_mount($mountpoint, $stage_dir, $storage_cfg, $snapname, $root_uid, $root_gid, 1); if (!defined($path)) { - return undef if $! == ENOSYS; die "failed to mount subvolume: $!\n"; } @@ -1549,14 +1768,14 @@ sub mountpoint_stage { } sub mountpoint_insert_staged { - my ($mount_fd, $rootdir_fd, $mp_dir, $opt, $rootuid, $rootgid) = @_; + my ($mount_fd, $rootdir_fd, $mp_dir, $opt, $root_uid, $root_gid) = @_; if (!defined($rootdir_fd)) { sysopen($rootdir_fd, '.', O_PATH | O_DIRECTORY) or die "failed to open '.': $!\n"; } - my $dest_fd = walk_tree_nofollow_fd('/', $rootdir_fd, $mp_dir, 1, $rootuid, $rootgid); + my $dest_fd = walk_tree_nofollow_fd('/', $rootdir_fd, $mp_dir, 1, $root_uid, $root_gid); PVE::Tools::move_mount( fileno($mount_fd), @@ -1569,15 +1788,8 @@ sub mountpoint_insert_staged { # Use $stage_mount, $rootdir is treated as a temporary path to "stage" the file system. The user # can then open a file descriptor to it which can be used with the `move_mount` syscall. -# Note that if the kernel does not support the new mount API, this will not perform any action -# and return `undef` with $! = ENOSYS. sub __mountpoint_mount { - my ($mountpoint, $rootdir, $storage_cfg, $snapname, $rootuid, $rootgid, $stage_mount) = @_; - - if (defined($stage_mount) && !PVE::LXC::Tools::can_use_new_mount_api()) { - $! = ENOSYS; - return undef; - } + my ($mountpoint, $rootdir, $storage_cfg, $snapname, $root_uid, $root_gid, $stage_mount) = @_; # When staging mount points we always mount to $rootdir directly (iow. as if `mp=/`). # This is required since __mount_prepare_rootdir() will return handles to the parent directory @@ -1588,23 +1800,23 @@ sub __mountpoint_mount { my $type = $mountpoint->{type}; my $quota = !$snapname && !$mountpoint->{ro} && $mountpoint->{quota}; my $mounted_dev; - + return if !$volid || !$mount; $mount =~ s!/+!/!g; my $mount_path; my ($mpfd, $parentfd, $last_dir); - + if (defined($rootdir)) { ($rootdir, $mount_path, $mpfd, $parentfd, $last_dir) = - __mount_prepare_rootdir($rootdir, $mount, $rootuid, $rootgid); + __mount_prepare_rootdir($rootdir, $mount, $root_uid, $root_gid); } if (defined($stage_mount)) { $mount_path = $rootdir; } - + my ($storage, $volname) = PVE::Storage::parse_volume_id($volid, 1); die "unknown snapshot path for '$volid'" if !$storage && defined($snapname); @@ -1617,8 +1829,10 @@ sub __mountpoint_mount { } my $acl = $mountpoint->{acl}; - if (defined($acl)) { - push @$optlist, ($acl ? 'acl' : 'noacl'); + + if ($acl) { + push @$optlist, 'acl'; + # NOTE: the else branch is handled below } my $optstring = join(',', @$optlist); @@ -1631,6 +1845,7 @@ sub __mountpoint_mount { my $scfg = PVE::Storage::storage_config($storage_cfg, $storage); + PVE::Storage::activate_volumes($storage_cfg, [$volid], $snapname); my $path = PVE::Storage::map_volume($storage_cfg, $volid, $snapname); $path = PVE::Storage::path($storage_cfg, $volid, $snapname) if !defined($path); @@ -1638,6 +1853,12 @@ sub __mountpoint_mount { my ($vtype, undef, undef, undef, undef, $isBase, $format) = PVE::Storage::parse_volname($storage_cfg, $volid); + if (defined($acl) && !$acl) { + # Does having this really makes sense or should we drop it with a future major release? + # Kernel 6.1 removed the noacl mount option for ext4, which is used for all raw volumes. + push @$optlist, 'noacl' if $format ne 'raw'; + } + $format = 'iso' if $vtype eq 'iso'; # allow to handle iso files if ($format eq 'subvol') { @@ -1685,15 +1906,16 @@ sub __mountpoint_mount { } }; my $use_loopdev = 0; - if ($scfg->{path}) { - $mounted_dev = run_with_loopdev($domount, $path, $readonly); - $use_loopdev = 1; - } elsif ($scfg->{type} eq 'drbd' || $scfg->{type} eq 'lvm' || - $scfg->{type} eq 'rbd' || $scfg->{type} eq 'lvmthin') { - $mounted_dev = $path; - &$domount($path); + if ($scfg->{content}->{rootdir}) { + if ($scfg->{path}) { + $mounted_dev = run_with_loopdev($domount, $path, $readonly); + $use_loopdev = 1; + } else { + $mounted_dev = $path; + &$domount($path); + } } else { - die "unsupported storage type '$scfg->{type}'\n"; + die "storage '$storage' does not support containers\n"; } return wantarray ? ($path, $use_loopdev, $mounted_dev) : $path; } else { @@ -1712,14 +1934,14 @@ sub __mountpoint_mount { warn "cannot enable quota control for bind mounts\n" if $quota; return wantarray ? ($volid, 0, undef) : $volid; } - + die "unsupported storage"; } -sub mountpoint_hotplug($$$) { +sub mountpoint_hotplug :prototype($$$$$) { my ($vmid, $conf, $opt, $mp, $storage_cfg) = @_; - my (undef, $rootuid, $rootgid) = PVE::LXC::parse_id_maps($conf); + my (undef, $root_uid, $root_gid) = PVE::LXC::parse_id_maps($conf); # We do the rest in a fork with an unshared mount namespace, because: # -) change our papparmor profile to that of /usr/bin/lxc-start @@ -1755,21 +1977,22 @@ sub mountpoint_hotplug($$$) { my $dir = get_staging_mount_path($opt); # Now switch our apparmor profile before mounting: - my $data = 'changeprofile /usr/bin/lxc-start'; - if (syswrite($aa_fd, $data, length($data)) != length($data)) { + my $data = 'changeprofile pve-container-mounthotplug'; + my $data_written = syswrite($aa_fd, $data, length($data)); + if (!defined($data_written) || $data_written != length($data)) { die "failed to change apparmor profile: $!\n"; } # Check errors on close as well: close($aa_fd) or die "failed to change apparmor profile (close() failed): $!\n"; - my $mount_fd = mountpoint_stage($mp, $dir, $storage_cfg, undef, $rootuid, $rootgid); + my $mount_fd = mountpoint_stage($mp, $dir, $storage_cfg, undef, $root_uid, $root_gid); PVE::Tools::setns(fileno($ct_mnt_ns), PVE::Tools::CLONE_NEWNS); chdir('/') or die "failed to change root directory within the container's mount namespace: $!\n"; - mountpoint_insert_staged($mount_fd, undef, $mp->{mp}, $opt, $rootuid, $rootgid); + mountpoint_insert_staged($mount_fd, undef, $mp->{mp}, $opt, $root_uid, $root_gid); }); } @@ -1785,7 +2008,7 @@ sub get_staging_mount_path($) { return $target; } -# Mount /run/pve/mountpoints as tmpfs +# Mount tmpfs for mount point staging and return the path. sub get_staging_tempfs() { # We choose a path in /var/lib/lxc/ here because the lxc-start apparmor profile restricts most # mounts to that. @@ -1802,18 +2025,38 @@ sub get_staging_tempfs() { } sub mkfs { - my ($dev, $rootuid, $rootgid) = @_; - - PVE::Tools::run_command(['mkfs.ext4', '-O', 'mmp', - '-E', "root_owner=$rootuid:$rootgid", - $dev]); + my ($dev, $root_uid, $root_gid) = @_; + + run_command( + [ + 'mkfs.ext4', + '-O', + 'mmp', + '-E', + "root_owner=$root_uid:$root_gid", + $dev, + ], + outfunc => sub { + my $line = shift; + # a hack to print only the relevant stuff, i.e., the one which could help on repair + if ($line =~ /^(Creating filesystem|Filesystem UUID|Superblock backups|\s+\d+, \d)/) { + print "$line\n"; + } + }, + errfunc => sub { + my $line = shift; + print STDERR "$line\n" if $line && $line !~ /^mke2fs \d\.\d/; + } + ); } sub format_disk { - my ($storage_cfg, $volid, $rootuid, $rootgid) = @_; + my ($storage_cfg, $volid, $root_uid, $root_gid) = @_; if ($volid =~ m!^/dev/.+!) { - mkfs($volid); + # FIXME: remove in Proxmox VE 9 – this code path cannot really be reached currently, using + # block devices needs manual preparations by the user + mkfs($volid, $root_uid, $root_gid); return; } @@ -1833,7 +2076,7 @@ sub format_disk { die "cannot format volume '$volid' (format == $format)\n" if $format ne 'raw'; - mkfs($path, $rootuid, $rootgid); + mkfs($path, $root_uid, $root_gid); } sub destroy_disks { @@ -1846,7 +2089,7 @@ sub destroy_disks { } sub alloc_disk { - my ($storecfg, $vmid, $storage, $size_kb, $rootuid, $rootgid) = @_; + my ($storecfg, $vmid, $storage, $size_kb, $root_uid, $root_gid) = @_; my $needs_chown = 0; my $volid; @@ -1856,34 +2099,24 @@ sub alloc_disk { eval { my $do_format = 0; - if ($scfg->{type} eq 'dir' || $scfg->{type} eq 'nfs' || $scfg->{type} eq 'cifs' ) { - if ($size_kb > 0) { - $volid = PVE::Storage::vdisk_alloc($storecfg, $storage, $vmid, 'raw', - undef, $size_kb); + if ($scfg->{content}->{rootdir} && $scfg->{path}) { + if ($size_kb > 0 && !($scfg->{type} eq 'btrfs' && $scfg->{quotas})) { + $volid = PVE::Storage::vdisk_alloc($storecfg, $storage, $vmid, 'raw', undef, $size_kb); $do_format = 1; } else { - $volid = PVE::Storage::vdisk_alloc($storecfg, $storage, $vmid, 'subvol', - undef, 0); + $volid = PVE::Storage::vdisk_alloc($storecfg, $storage, $vmid, 'subvol', undef, $size_kb); $needs_chown = 1; } } elsif ($scfg->{type} eq 'zfspool') { - - $volid = PVE::Storage::vdisk_alloc($storecfg, $storage, $vmid, 'subvol', - undef, $size_kb); + $volid = PVE::Storage::vdisk_alloc($storecfg, $storage, $vmid, 'subvol', undef, $size_kb); $needs_chown = 1; - } elsif ($scfg->{type} eq 'drbd' || $scfg->{type} eq 'lvm' || $scfg->{type} eq 'lvmthin') { - - $volid = PVE::Storage::vdisk_alloc($storecfg, $storage, $vmid, 'raw', undef, $size_kb); - $do_format = 1; - - } elsif ($scfg->{type} eq 'rbd') { - + } elsif ($scfg->{content}->{rootdir}) { $volid = PVE::Storage::vdisk_alloc($storecfg, $storage, $vmid, 'raw', undef, $size_kb); $do_format = 1; } else { - die "unable to create containers on storage type '$scfg->{type}'\n"; + die "content type 'rootdir' is not available or configured on storage '$storage'\n"; } - format_disk($storecfg, $volid, $rootuid, $rootgid) if $do_format; + format_disk($storecfg, $volid, $root_uid, $root_gid) if $do_format; }; if (my $err = $@) { # in case formatting got interrupted: @@ -1897,17 +2130,16 @@ sub alloc_disk { return ($volid, $needs_chown); } -our $NEW_DISK_RE = qr/^([^:\s]+):(\d+(\.\d+)?)$/; sub create_disks { my ($storecfg, $vmid, $settings, $conf, $pending) = @_; my $vollist = []; eval { - my (undef, $rootuid, $rootgid) = PVE::LXC::parse_id_maps($conf); + my (undef, $root_uid, $root_gid) = PVE::LXC::parse_id_maps($conf); my $chown_vollist = []; - PVE::LXC::Config->foreach_mountpoint($settings, sub { + PVE::LXC::Config->foreach_volume($settings, sub { my ($ms, $mountpoint) = @_; my $volid = $mountpoint->{volume}; @@ -1921,7 +2153,7 @@ sub create_disks { my $size_kb = int(${size_gb}*1024) * 1024; my $needs_chown = 0; - ($volid, $needs_chown) = alloc_disk($storecfg, $vmid, $storage, $size_kb, $rootuid, $rootgid); + ($volid, $needs_chown) = alloc_disk($storecfg, $vmid, $storage, $size_kb, $root_uid, $root_gid); push @$chown_vollist, $volid if $needs_chown; push @$vollist, $volid; $mountpoint->{volume} = $volid; @@ -1940,7 +2172,7 @@ sub create_disks { PVE::Storage::activate_volumes($storecfg, $chown_vollist, undef); foreach my $volid (@$chown_vollist) { my $path = PVE::Storage::path($storecfg, $volid, undef); - chown($rootuid, $rootgid, $path); + chown($root_uid, $root_gid, $path); } PVE::Storage::deactivate_volumes($storecfg, $chown_vollist, undef); }; @@ -1966,12 +2198,12 @@ sub update_disksize { $changes = 1; print "$prefix updated volume size of '$mp->{volume}' in config.\n"; $mp->{size} = $size; - my $nomp = 1 if ($key eq 'rootfs'); - $conf->{$key} = PVE::LXC::Config->print_ct_mountpoint($mp, $nomp); + my $no_mp = $key eq 'rootfs'; # rootfs is handled different from other mount points + $conf->{$key} = PVE::LXC::Config->print_ct_mountpoint($mp, $no_mp); } }; - PVE::LXC::Config->foreach_mountpoint($conf, $update_mp); + PVE::LXC::Config->foreach_volume($conf, $update_mp); return $changes; } @@ -2027,7 +2259,7 @@ sub update_unused { sub scan_volids { my ($cfg, $vmid) = @_; - my $info = PVE::Storage::vdisk_list($cfg, undef, $vmid); + my $info = PVE::Storage::vdisk_list($cfg, undef, $vmid, undef, 'rootdir'); my $all_volumes = {}; foreach my $storeid (keys %$info) { @@ -2047,12 +2279,6 @@ sub rescan { my $cfg = PVE::Storage::config(); - # FIXME: Remove once our RBD plugin can handle CT and VM on a single storage - # see: https://pve.proxmox.com/pipermail/pve-devel/2018-July/032900.html - foreach my $stor (keys %{$cfg->{ids}}) { - delete($cfg->{ids}->{$stor}) if !$cfg->{ids}->{$stor}->{content}->{rootdir}; - } - print "rescan volumes...\n"; my $all_volumes = scan_volids($cfg, $vmid); @@ -2160,20 +2386,21 @@ sub parse_id_maps { my ($conf) = @_; my $id_map = []; - my $rootuid = 0; - my $rootgid = 0; + my $root_uid = 0; + my $root_gid = 0; my $lxc = $conf->{lxc}; foreach my $entry (@$lxc) { my ($key, $value) = @$entry; - # FIXME: remove the 'id_map' variant when lxc-3.0 arrives - next if $key ne 'lxc.idmap' && $key ne 'lxc.id_map'; + + next if $key ne 'lxc.idmap'; + if ($value =~ /^([ug])\s+(\d+)\s+(\d+)\s+(\d+)\s*$/) { my ($type, $ct, $host, $length) = ($1, $2, $3, $4); push @$id_map, [$type, $ct, $host, $length]; if ($ct == 0) { - $rootuid = $host if $type eq 'u'; - $rootgid = $host if $type eq 'g'; + $root_uid = $host if $type eq 'u'; + $root_gid = $host if $type eq 'g'; } } else { die "failed to parse idmap: $value\n"; @@ -2184,10 +2411,73 @@ sub parse_id_maps { # Should we read them from /etc/subuid? $id_map = [ ['u', '0', '100000', '65536'], ['g', '0', '100000', '65536'] ]; - $rootuid = $rootgid = 100000; + $root_uid = $root_gid = 100000; + } + + return ($id_map, $root_uid, $root_gid); +} + +sub validate_id_maps { + my ($id_map) = @_; + + # $mappings->{$type}->{$side} = [ { line => $line, start => $start, count => $count }, ... ] + # $type: either "u" or "g" + # $side: either "container" or "host" + # $line: index of this mapping in @$id_map + # $start, $count: interval of this mapping + my $mappings = { u => {}, g => {} }; + for (my $i = 0; $i < scalar(@$id_map); $i++) { + my ($type, $ct_start, $host_start, $count) = $id_map->[$i]->@*; + my $sides = $mappings->{$type}; + push $sides->{host}->@*, { line => $i, start => $host_start, count => $count }; + push $sides->{container}->@*, { line => $i, start => $ct_start, count => $count }; + } + + # find the first conflict between two consecutive mappings when sorted by their start id + for my $type (qw(u g)) { + for my $side (qw(container host)) { + my @entries = sort { $a->{start} <=> $b->{start} } $mappings->{$type}->{$side}->@*; + for my $idx (1..scalar(@entries) - 1) { + my $previous = $entries[$idx - 1]; + my $current = $entries[$idx]; + if ($previous->{start} + $previous->{count} > $current->{start}) { + my $conflict = $current->{start}; + my @previous_line = $id_map->[$previous->{line}]->@*; + my @current_line = $id_map->[$current->{line}]->@*; + die "invalid map entry '@current_line': $side ${type}id $conflict " + ."is also mapped by entry '@previous_line'\n"; + } + } + } } +} + +sub map_ct_id_to_host { + my ($id, $id_map, $id_type) = @_; - return ($id_map, $rootuid, $rootgid); + for my $mapping (@$id_map) { + my ($type, $ct, $host, $length) = @$mapping; + + next if ($type ne $id_type); + + if ($id >= $ct && $id < ($ct + $length)) { + return $host - $ct + $id; + } + } + + return $id; +} + +sub map_ct_uid_to_host { + my ($uid, $id_map) = @_; + + return map_ct_id_to_host($uid, $id_map, 'u'); +} + +sub map_ct_gid_to_host { + my ($gid, $id_map) = @_; + + return map_ct_id_to_host($gid, $id_map, 'g'); } sub userns_command { @@ -2198,18 +2488,86 @@ sub userns_command { return []; } +my sub print_ct_stderr_log { + my ($vmid) = @_; + my $log = eval { file_get_contents("/run/pve/ct-$vmid.stderr") }; + return if !$log; + + while ($log =~ /^\h*(lxc-start:?\s+$vmid:?\s*\S+\s*)?(.*?)\h*$/gm) { + my $line = $2; + print STDERR "$line\n"; + } +} +my sub print_ct_warn_log { + my ($vmid) = @_; + my $log_fn = "/run/pve/ct-$vmid.warnings"; + my $log = eval { file_get_contents($log_fn) }; + return if !$log; + + while ($log =~ /^\h*\s*(.*?)\h*$/gm) { + PVE::RESTEnvironment::log_warn($1); + } + unlink $log_fn or warn "could not unlink '$log_fn' - $!\n"; +} + +my sub monitor_state_change($$) { + my ($monitor_socket, $vmid) = @_; + die "no monitor socket\n" if !defined($monitor_socket); + + while (1) { + my ($type, $name, $value) = PVE::LXC::Monitor::read_lxc_message($monitor_socket); + + die "monitor socket: got EOF\n" if !defined($type); + + next if $name ne "$vmid" || $type ne 'STATE'; + + if ($value eq PVE::LXC::Monitor::STATE_STARTING) { + alarm(0); # don't timeout after seeing the starting state + } elsif ($value eq PVE::LXC::Monitor::STATE_ABORTING || + $value eq PVE::LXC::Monitor::STATE_STOPPING || + $value eq PVE::LXC::Monitor::STATE_STOPPED) { + return 0; + } elsif ($value eq PVE::LXC::Monitor::STATE_RUNNING) { + return 1; + } else { + warn "unexpected message from monitor socket - " . + "type: '$type' - value: '$value'\n"; + } + } +} +my sub monitor_start($$) { + my ($monitor_socket, $vmid) = @_; + + my $success = eval { + PVE::Tools::run_with_timeout(10, \&monitor_state_change, $monitor_socket, $vmid) + }; + if (my $err = $@) { + warn "problem with monitor socket, but continuing anyway: $err\n"; + } elsif (!$success) { + print_ct_stderr_log($vmid); + die "startup for container '$vmid' failed\n"; + } +} + sub vm_start { - my ($vmid, $conf, $skiplock) = @_; + my ($vmid, $conf, $skiplock, $debug) = @_; # apply pending changes while starting if (scalar(keys %{$conf->{pending}})) { my $storecfg = PVE::Storage::config(); PVE::LXC::Config->vmconfig_apply_pending($vmid, $conf, $storecfg); + PVE::LXC::Config->write_config($vmid, $conf); $conf = PVE::LXC::Config->load_config($vmid); # update/reload } update_lxc_config($vmid, $conf); + eval { + my ($id_map, undef, undef) = PVE::LXC::parse_id_maps($conf); + PVE::LXC::validate_id_maps($id_map); + }; + warn "lxc.idmap: $@" if $@; + my $skiplock_flag_fn = "/run/lxc/skiplock-$vmid"; if ($skiplock) { @@ -2217,10 +2575,32 @@ sub vm_start { close($fh); } - my $cmd = ['systemctl', 'start', "pve-container\@$vmid"]; + my $storage_cfg = PVE::Storage::config(); + my $vollist = PVE::LXC::Config->get_vm_volumes($conf); + + PVE::Storage::activate_volumes($storage_cfg, $vollist); + + my $monitor_socket = eval { PVE::LXC::Monitor::get_monitor_socket() }; + warn $@ if $@; + + unlink "/run/pve/ct-$vmid.stderr"; # systemd does not truncate log files + + my $is_debug = $debug || (!defined($debug) && $conf->{debug}); + my $base_unit = $is_debug ? 'pve-container-debug' : 'pve-container'; + + my $cmd = ['systemctl', 'start', "$base_unit\@$vmid"]; PVE::GuestHelpers::exec_hookscript($conf, $vmid, 'pre-start', 1); - eval { PVE::Tools::run_command($cmd); }; + eval { + run_command($cmd); + + monitor_start($monitor_socket, $vmid) if defined($monitor_socket); + + # if debug is requested, print the log it also when the start succeeded + print_ct_stderr_log($vmid) if $is_debug; + + print_ct_warn_log($vmid); # always print warn log, if any + }; if (my $err = $@) { unlink $skiplock_flag_fn; die $err; @@ -2271,14 +2651,23 @@ sub vm_stop { } } - eval { PVE::Tools::run_command($cmd, timeout => $shutdown_timeout) }; + eval { run_command($cmd, timeout => $shutdown_timeout) }; + + # Wait until the command socket is closed. + # In case the lxc-stop call failed, reading from the command socket may block forever, + # so poll with another timeout to avoid freezing the shutdown task. if (my $err = $@) { - warn $@ if $@; - } + warn $err if $err; - my $result = <$sock>; + my $poll = IO::Poll->new(); + $poll->mask($sock => POLLIN | POLLHUP); # watch for input and EOF events + $poll->poll($shutdown_timeout); # IO::Poll timeout is in seconds + return if ($poll->events($sock) & POLLHUP); + } else { + my $result = <$sock>; + return if !defined $result; # monitor is gone and the ct has stopped. + } - return if !defined $result; # monitor is gone and the ct has stopped. die "container did not stop\n"; } @@ -2302,13 +2691,13 @@ sub run_unshared { # Unshare the mount namespace die "failed to unshare mount namespace: $!\n" if !PVE::Tools::unshare(PVE::Tools::CLONE_NEWNS); - PVE::Tools::run_command(['mount', '--make-rslave', '/']); + run_command(['mount', '--make-rslave', '/']); return $code->(); }); } my $copy_volume = sub { - my ($src_volid, $src, $dst_volid, $dest, $storage_cfg, $snapname, $bwlimit, $rootuid, $rootgid) = @_; + my ($src_volid, $src, $dst_volid, $dest, $storage_cfg, $snapname, $bwlimit, $root_uid, $root_gid) = @_; my $src_mp = { volume => $src_volid, mp => '/', ro => 1 }; $src_mp->{type} = PVE::LXC::Config->classify_mountpoint($src_volid); @@ -2320,17 +2709,28 @@ my $copy_volume = sub { eval { # mount and copy mkdir $src; - mountpoint_mount($src_mp, $src, $storage_cfg, $snapname, $rootuid, $rootgid); + mountpoint_mount($src_mp, $src, $storage_cfg, $snapname, $root_uid, $root_gid); push @mounted, $src; mkdir $dest; - mountpoint_mount($dst_mp, $dest, $storage_cfg, undef, $rootuid, $rootgid); + mountpoint_mount($dst_mp, $dest, $storage_cfg, undef, $root_uid, $root_gid); push @mounted, $dest; $bwlimit //= 0; - PVE::Tools::run_command(['/usr/bin/rsync', '--stats', '-X', '-A', '--numeric-ids', - '-aH', '--whole-file', '--sparse', '--one-file-system', - "--bwlimit=$bwlimit", "$src/", $dest]); + run_command([ + 'rsync', + '--stats', + '-X', + '-A', + '--numeric-ids', + '-aH', + '--whole-file', + '--sparse', + '--one-file-system', + "--bwlimit=$bwlimit", + "$src/", + $dest + ]); }; my $err = $@; @@ -2339,7 +2739,7 @@ my $copy_volume = sub { while ((system {"fuser"} "fuser", "-s", $dest) == 0) {sleep 1}; foreach my $mount (reverse @mounted) { - eval { PVE::Tools::run_command(['/bin/umount', $mount], errfunc => sub{})}; + eval { run_command(['/bin/umount', $mount], errfunc => sub{})}; warn "Can't umount $mount\n" if $@; } @@ -2361,7 +2761,7 @@ sub copy_volume { my $src = "/var/lib/lxc/$vmid/.copy-volume-2"; # get id's for unprivileged container - my (undef, $rootuid, $rootgid) = parse_id_maps($conf); + my (undef, $root_uid, $root_gid) = parse_id_maps($conf); # Allocate the disk before unsharing in order to make sure zfs subvolumes # are visible in this namespace, otherwise the host only sees the empty @@ -2371,15 +2771,15 @@ sub copy_volume { # Make sure $mp contains a correct size. $mp->{size} = PVE::Storage::volume_size_info($storage_cfg, $mp->{volume}); my $needs_chown; - ($new_volid, $needs_chown) = alloc_disk($storage_cfg, $vmid, $storage, $mp->{size}/1024, $rootuid, $rootgid); + ($new_volid, $needs_chown) = alloc_disk($storage_cfg, $vmid, $storage, $mp->{size}/1024, $root_uid, $root_gid); if ($needs_chown) { PVE::Storage::activate_volumes($storage_cfg, [$new_volid], undef); my $path = PVE::Storage::path($storage_cfg, $new_volid, undef); - chown($rootuid, $rootgid, $path); + chown($root_uid, $root_gid, $path); } run_unshared(sub { - $copy_volume->($mp->{volume}, $src, $new_volid, $dest, $storage_cfg, $snapname, $bwlimit, $rootuid, $rootgid); + $copy_volume->($mp->{volume}, $src, $new_volid, $dest, $storage_cfg, $snapname, $bwlimit, $root_uid, $root_gid); }); }; if (my $err = $@) { @@ -2391,4 +2791,64 @@ sub copy_volume { return $new_volid; } +sub get_lxc_version() { + my $version; + run_command([qw(lxc-start --version)], outfunc => sub { + my ($line) = @_; + # We only parse out major & minor version numbers. + if ($line =~ /^(\d+)\.(\d+)(?:\D.*)?$/) { + $version = [$1, $2]; + } + }); + + die "failed to get lxc version\n" if !defined($version); + + # return as a list: + return $version->@*; +} + +sub freeze($) { + my ($vmid) = @_; + if (PVE::CGroup::cgroup_mode() == 2) { + PVE::LXC::Command::freeze($vmid, 30); + } else { + PVE::LXC::CGroup->new($vmid)->freeze_thaw(1); + } +} + +sub thaw($) { + my ($vmid) = @_; + if (PVE::CGroup::cgroup_mode() == 2) { + PVE::LXC::Command::unfreeze($vmid, 30); + } else { + PVE::LXC::CGroup->new($vmid)->freeze_thaw(0); + } +} + +sub create_ifaces_ipams_ips { + my ($conf, $vmid) = @_; + + return if !$have_sdn; + + for my $opt (keys %$conf) { + next if $opt !~ m/^net(\d+)$/; + my $net = PVE::LXC::Config->parse_lxc_network($conf->{$opt}); + next if $net->{type} ne 'veth'; + PVE::Network::SDN::Vnets::add_next_free_cidr($net->{bridge}, $conf->{hostname}, $net->{hwaddr}, $vmid, undef, 1); + } +} + +sub delete_ifaces_ipams_ips { + my ($conf, $vmid) = @_; + + return if !$have_sdn; + + for my $opt (keys %$conf) { + next if $opt !~ m/^net(\d+)$/; + my $net = PVE::LXC::Config->parse_lxc_network($conf->{$opt}); + eval { PVE::Network::SDN::Vnets::del_ips_from_mac($net->{bridge}, $net->{hwaddr}, $conf->{hostname}) }; + warn $@ if $@; + } +} + 1;