X-Git-Url: https://git.proxmox.com/?p=pve-container.git;a=blobdiff_plain;f=src%2FPVE%2FLXC.pm;h=bb1cbdbd18fd507669d7b4786b9b5e7ce4f99983;hp=c4d53e8e0fd72f78717e2ac6298126ea76d90c7f;hb=HEAD;hpb=9e5694881f54472a2093ee93e94a3cfd96904fa1 diff --git a/src/PVE/LXC.pm b/src/PVE/LXC.pm index c4d53e8..65d0fa8 100644 --- a/src/PVE/LXC.pm +++ b/src/PVE/LXC.pm @@ -3,23 +3,30 @@ package PVE::LXC; use strict; use warnings; -use POSIX qw(EINTR); - -use Socket; - +use Cwd qw(); +use Errno qw(ELOOP ENOTDIR EROFS ECONNREFUSED EEXIST); +use Fcntl qw(O_RDONLY O_WRONLY O_NOFOLLOW O_DIRECTORY :mode); use File::Path; use File::Spec; -use Cwd qw(); -use Fcntl qw(O_RDONLY O_WRONLY O_NOFOLLOW O_DIRECTORY); -use Errno qw(ELOOP ENOTDIR EROFS ECONNREFUSED ENOSYS EEXIST); -use IO::Socket::UNIX; use IO::Poll qw(POLLIN POLLHUP); +use IO::Socket::UNIX; +use POSIX qw(EINTR); +use Socket; +use Time::HiRes qw (gettimeofday); +use PVE::AccessControl; +use PVE::CGroup; +use PVE::CpuSet; use PVE::Exception qw(raise_perm_exc); -use PVE::Storage; -use PVE::SafeSyslog; +use PVE::Firewall; +use PVE::GuestHelpers qw(check_vnet_access safe_string_ne safe_num_ne safe_boolean_ne); use PVE::INotify; use PVE::JSONSchema qw(get_standard_option); +use PVE::Network; +use PVE::ProcFSTools; +use PVE::RESTEnvironment; +use PVE::SafeSyslog; +use PVE::Storage; use PVE::Tools qw( run_command dir_glob_foreach @@ -30,23 +37,17 @@ use PVE::Tools qw( $IPV4RE $IPV6RE ); -use PVE::CpuSet; -use PVE::Network; -use PVE::AccessControl; -use PVE::ProcFSTools; -use PVE::RESTEnvironment; use PVE::Syscall qw(:fsmount); -use PVE::LXC::Config; -use PVE::GuestHelpers qw(safe_string_ne safe_num_ne safe_boolean_ne); -use PVE::LXC::Tools; + use PVE::LXC::CGroup; +use PVE::LXC::Config; use PVE::LXC::Monitor; -use PVE::CGroup; +use PVE::LXC::Tools; -use Time::HiRes qw (gettimeofday); my $have_sdn; eval { require PVE::Network::SDN::Zones; + require PVE::Network::SDN::Vnets; $have_sdn = 1; }; @@ -69,7 +70,7 @@ sub config_list { my $d = $ids->{$vmid}; next if !$d->{node} || $d->{node} ne $nodename; next if !$d->{type} || $d->{type} ne 'lxc'; - $res->{$vmid} = { type => 'lxc', vmid => $vmid }; + $res->{$vmid} = { type => 'lxc', vmid => int($vmid) }; } return $res; } @@ -640,6 +641,19 @@ sub update_lxc_config { $raw .= "lxc.mount.auto = sys:mixed\n"; } + PVE::LXC::Config->foreach_passthrough_device($conf, sub { + my ($key, $device) = @_; + + die "Path is not defined for passthrough device $key\n" + if !defined($device->{path}); + + my ($mode, $rdev) = PVE::LXC::Tools::get_device_mode_and_rdev($device->{path}); + my $major = PVE::Tools::dev_t_major($rdev); + my $minor = PVE::Tools::dev_t_minor($rdev); + my $device_type_char = S_ISBLK($mode) ? 'b' : 'c'; + $raw .= "lxc.cgroup2.devices.allow = $device_type_char $major:$minor rw\n"; + }); + # WARNING: DO NOT REMOVE this without making sure that loop device nodes # cannot be exposed to the container with r/w access (cgroup perms). # When this is enabled mounts will still remain in the monitor's namespace @@ -686,8 +700,11 @@ sub update_lxc_config { my $memory = $conf->{memory} || 512; my $swap = $conf->{swap} // 0; - my $lxcmem = int($memory*1024*1024); - $raw .= "lxc.cgroup2.memory.max = $lxcmem\n"; + # cgroup memory usage is limited by the hard 'max' limit (OOM-killer enforced) and the soft + # 'high' limit (cgroup processes get throttled and put under heavy reclaim pressure). + my ($lxc_mem_max, $lxc_mem_high) = PVE::LXC::Config::calculate_memory_constraints($memory); + $raw .= "lxc.cgroup2.memory.max = $lxc_mem_max\n"; + $raw .= "lxc.cgroup2.memory.high = $lxc_mem_high\n"; my $lxcswap = int($swap*1024*1024); $raw .= "lxc.cgroup2.memory.swap.max = $lxcswap\n"; @@ -731,14 +748,14 @@ sub update_lxc_config { $raw .= "lxc.net.$ind.hwaddr = $d->{hwaddr}\n" if defined($d->{hwaddr}); $raw .= "lxc.net.$ind.name = $d->{name}\n" if defined($d->{name}); + my $bridge_mtu = PVE::Network::read_bridge_mtu($d->{bridge}); + my $mtu = $d->{mtu} || $bridge_mtu; + # Keep container from starting with invalid mtu configuration - if (my $mtu = $d->{mtu}) { - my $bridge_mtu = PVE::Network::read_bridge_mtu($d->{bridge}); - die "$k: MTU size '$mtu' is bigger than bridge MTU '$bridge_mtu'\n" - if ($mtu > $bridge_mtu); + die "$k: MTU size '$mtu' is bigger than bridge MTU '$bridge_mtu'\n" + if ($mtu > $bridge_mtu); - $raw .= "lxc.net.$ind.mtu = $mtu\n"; - } + $raw .= "lxc.net.$ind.mtu = $mtu\n"; # Starting with lxc 4.0, we do not patch lxc to execute our up-scripts. if ($lxc_major >= 4) { @@ -896,6 +913,8 @@ sub destroy_lxc_container { }); } + delete_ifaces_ipams_ips($conf, $vmid); + rmdir "/var/lib/lxc/$vmid/rootfs"; unlink "/var/lib/lxc/$vmid/config"; rmdir "/var/lib/lxc/$vmid"; @@ -928,14 +947,17 @@ sub net_tap_plug : prototype($$) { return; } - my ($bridge, $tag, $firewall, $trunks, $rate, $hwaddr) = - $net->@{'bridge', 'tag', 'firewall', 'trunks', 'rate', 'hwaddr'}; + my ($bridge, $tag, $trunks, $rate, $hwaddr) = + $net->@{'bridge', 'tag', 'trunks', 'rate', 'hwaddr'}; + + # The nftable-based implementation from the newer proxmox-firewall does not requires FW bridges + my $create_firewall_bridges = $net->{firewall} && !PVE::Firewall::is_nftables(); if ($have_sdn) { - PVE::Network::SDN::Zones::tap_plug($iface, $bridge, $tag, $firewall, $trunks, $rate); - PVE::Network::SDN::Zones::add_bridge_fdb($iface, $hwaddr, $bridge, $firewall); + PVE::Network::SDN::Zones::tap_plug($iface, $bridge, $tag, $create_firewall_bridges, $trunks, $rate); + PVE::Network::SDN::Zones::add_bridge_fdb($iface, $hwaddr, $bridge); } else { - PVE::Network::tap_plug($iface, $bridge, $tag, $firewall, $trunks, $rate, { mac => $hwaddr }); + PVE::Network::tap_plug($iface, $bridge, $tag, $create_firewall_bridges, $trunks, $rate, { mac => $hwaddr }); } PVE::Tools::run_command(['/sbin/ip', 'link', 'set', 'dev', $iface, 'up']); @@ -959,27 +981,47 @@ sub update_net { safe_string_ne($oldnet->{name}, $newnet->{name})) { PVE::Network::veth_delete($veth); + + if ($have_sdn && safe_string_ne($oldnet->{hwaddr}, $newnet->{hwaddr})) { + eval { PVE::Network::SDN::Vnets::del_ips_from_mac($oldnet->{bridge}, $oldnet->{hwaddr}, $conf->{hostname}) }; + warn $@ if $@; + + PVE::Network::SDN::Vnets::add_next_free_cidr($newnet->{bridge}, $conf->{hostname}, $newnet->{hwaddr}, $vmid, undef, 1); + PVE::Network::SDN::Vnets::add_dhcp_mapping($newnet->{bridge}, $newnet->{hwaddr}, $vmid, $conf->{hostname}); + } + delete $conf->{$opt}; PVE::LXC::Config->write_config($vmid, $conf); hotplug_net($vmid, $conf, $opt, $newnet, $netid); } else { - if (safe_string_ne($oldnet->{bridge}, $newnet->{bridge}) || + my $bridge_changed = safe_string_ne($oldnet->{bridge}, $newnet->{bridge}); + + if ($bridge_changed || safe_num_ne($oldnet->{tag}, $newnet->{tag}) || safe_num_ne($oldnet->{firewall}, $newnet->{firewall}) || safe_boolean_ne($oldnet->{link_down}, $newnet->{link_down}) ) { - if ($oldnet->{bridge}) { + my $oldbridge = $oldnet->{bridge}; + PVE::Network::tap_unplug($veth); foreach (qw(bridge tag firewall)) { delete $oldnet->{$_}; } $conf->{$opt} = PVE::LXC::Config->print_lxc_network($oldnet); PVE::LXC::Config->write_config($vmid, $conf); + + if ($have_sdn && $bridge_changed) { + eval { PVE::Network::SDN::Vnets::del_ips_from_mac($oldbridge, $oldnet->{hwaddr}, $conf->{hostname}) }; + warn $@ if $@; + } } + if ($have_sdn && $bridge_changed) { + PVE::Network::SDN::Vnets::add_next_free_cidr($newnet->{bridge}, $conf->{hostname}, $newnet->{hwaddr}, $vmid, undef, 1); + } PVE::LXC::net_tap_plug($veth, $newnet); # This includes the rate: @@ -996,6 +1038,11 @@ sub update_net { PVE::LXC::Config->write_config($vmid, $conf); } } else { + if ($have_sdn) { + PVE::Network::SDN::Vnets::add_next_free_cidr($newnet->{bridge}, $conf->{hostname}, $newnet->{hwaddr}, $vmid, undef, 1); + PVE::Network::SDN::Vnets::add_dhcp_mapping($newnet->{bridge}, $newnet->{hwaddr}, $vmid, $conf->{hostname}); + } + hotplug_net($vmid, $conf, $opt, $newnet, $netid); } @@ -1034,6 +1081,32 @@ sub hotplug_net { PVE::LXC::Config->write_config($vmid, $conf); } +sub get_interfaces { + my ($vmid) = @_; + + my $pid = eval { find_lxc_pid($vmid); }; + return if $@; + + my $output; + # enters the network namespace of the container and executes 'ip a' + run_command(['nsenter', '-t', $pid, '--net', '--', 'ip', '--json', 'a'], + outfunc => sub { $output .= shift; }); + + my $config = JSON::decode_json($output); + + my $res; + for my $interface ($config->@*) { + my $obj = { name => $interface->{ifname} }; + for my $ip ($interface->{addr_info}->@*) { + $obj->{$ip->{family}} = $ip->{local} . "/" . $ip->{prefixlen}; + } + $obj->{hwaddr} = $interface->{address}; + push @$res, $obj + } + + return $res; +} + sub update_ipconfig { my ($vmid, $conf, $opt, $eth, $newnet, $rootdir) = @_; @@ -1312,8 +1385,13 @@ sub check_ct_modify_config_perm { } } elsif ($opt eq 'memory' || $opt eq 'swap') { $rpcenv->check_vm_perm($authuser, $vmid, $pool, ['VM.Config.Memory']); - } elsif ($opt =~ m/^net\d+$/ || $opt eq 'nameserver' || - $opt eq 'searchdomain' || $opt eq 'hostname') { + } elsif ($opt =~ m/^net\d+$/) { + $rpcenv->check_vm_perm($authuser, $vmid, $pool, ['VM.Config.Network']); + check_bridge_access($rpcenv, $authuser, $oldconf->{$opt}) if $oldconf->{$opt}; + check_bridge_access($rpcenv, $authuser, $newconf->{$opt}) if $newconf->{$opt}; + } elsif ($opt =~ m/^dev\d+$/) { + raise_perm_exc("configuring device passthrough is only allowed for root\@pam"); + } elsif ($opt eq 'nameserver' || $opt eq 'searchdomain' || $opt eq 'hostname') { $rpcenv->check_vm_perm($authuser, $vmid, $pool, ['VM.Config.Network']); } elsif ($opt eq 'features') { raise_perm_exc("changing feature flags for privileged container is only allowed for root\@pam") @@ -1381,6 +1459,18 @@ sub check_ct_modify_config_perm { return 1; } +sub check_bridge_access { + my ($rpcenv, $authuser, $raw) = @_; + + return 1 if $authuser eq 'root@pam'; + + my $net = PVE::LXC::Config->parse_lxc_network($raw); + my ($bridge, $tag, $trunks) = $net->@{'bridge', 'tag', 'trunks'}; + check_vnet_access($rpcenv, $authuser, $bridge, $tag, $trunks); + + return 1; +}; + sub umount_all { my ($vmid, $storage_cfg, $conf, $noerr) = @_; @@ -1427,7 +1517,7 @@ sub mount_all { my $volid_list = PVE::LXC::Config->get_vm_volumes($conf); PVE::Storage::activate_volumes($storage_cfg, $volid_list); - my (undef, $rootuid, $rootgid) = parse_id_maps($conf); + my (undef, $root_uid, $root_gid) = parse_id_maps($conf); eval { PVE::LXC::Config->foreach_volume($conf, sub { @@ -1435,7 +1525,7 @@ sub mount_all { $mountpoint->{ro} = 0 if $ignore_ro; - mountpoint_mount($mountpoint, $rootdir, $storage_cfg, undef, $rootuid, $rootgid); + mountpoint_mount($mountpoint, $rootdir, $storage_cfg, undef, $root_uid, $root_gid); }); }; if (my $err = $@) { @@ -1511,17 +1601,17 @@ sub run_with_loopdev { # * directory name of the last directory # So that the path $2/$3 should lead to $1 afterwards. sub walk_tree_nofollow($$$;$$) { - my ($start, $subdir, $mkdir, $rootuid, $rootgid) = @_; + my ($start, $subdir, $mkdir, $root_uid, $root_gid) = @_; sysopen(my $fd, $start, O_PATH | O_DIRECTORY) or die "failed to open start directory $start: $!\n"; - return walk_tree_nofollow_fd($start, $fd, $subdir, $mkdir, $rootuid, $rootgid); + return walk_tree_nofollow_fd($start, $fd, $subdir, $mkdir, $root_uid, $root_gid); } sub walk_tree_nofollow_fd($$$$;$$) { - my ($start_dirname, $start_fd, $subdir, $mkdir, $rootuid, $rootgid) = @_; + my ($start_dirname, $start_fd, $subdir, $mkdir, $root_uid, $root_gid) = @_; # splitdir() returns '' for empty components including the leading / my @comps = grep { length($_)>0 } File::Spec->splitdir($subdir); @@ -1549,8 +1639,8 @@ sub walk_tree_nofollow_fd($$$$;$$) { $next = PVE::Tools::openat(fileno($fd), $component, O_NOFOLLOW | O_DIRECTORY); die "failed to create path: $dir: $!\n" if !$next; - PVE::Tools::fchownat(fileno($next), '', $rootuid, $rootgid, PVE::Tools::AT_EMPTY_PATH) - if defined($rootuid) && defined($rootgid); + PVE::Tools::fchownat(fileno($next), '', $root_uid, $root_gid, PVE::Tools::AT_EMPTY_PATH) + if defined($root_uid) && defined($root_gid); } close $second if defined($last_component) && $second != $start_fd; @@ -1644,27 +1734,26 @@ sub bindmount { # from $rootdir and $mount and walk the path from $rootdir to the final # directory to check for symlinks. sub __mount_prepare_rootdir { - my ($rootdir, $mount, $rootuid, $rootgid) = @_; + my ($rootdir, $mount, $root_uid, $root_gid) = @_; $rootdir =~ s!/+!/!g; $rootdir =~ s!/+$!!; my $mount_path = "$rootdir/$mount"; - my ($mpfd, $parentfd, $last_dir) = walk_tree_nofollow($rootdir, $mount, 1, $rootuid, $rootgid); + my ($mpfd, $parentfd, $last_dir) = walk_tree_nofollow($rootdir, $mount, 1, $root_uid, $root_gid); return ($rootdir, $mount_path, $mpfd, $parentfd, $last_dir); } # use $rootdir = undef to just return the corresponding mount path sub mountpoint_mount { - my ($mountpoint, $rootdir, $storage_cfg, $snapname, $rootuid, $rootgid) = @_; - return __mountpoint_mount($mountpoint, $rootdir, $storage_cfg, $snapname, $rootuid, $rootgid, undef); + my ($mountpoint, $rootdir, $storage_cfg, $snapname, $root_uid, $root_gid) = @_; + return __mountpoint_mount($mountpoint, $rootdir, $storage_cfg, $snapname, $root_uid, $root_gid, undef); } sub mountpoint_stage { - my ($mountpoint, $stage_dir, $storage_cfg, $snapname, $rootuid, $rootgid) = @_; + my ($mountpoint, $stage_dir, $storage_cfg, $snapname, $root_uid, $root_gid) = @_; my ($path, $loop, $dev) = - __mountpoint_mount($mountpoint, $stage_dir, $storage_cfg, $snapname, $rootuid, $rootgid, 1); + __mountpoint_mount($mountpoint, $stage_dir, $storage_cfg, $snapname, $root_uid, $root_gid, 1); if (!defined($path)) { - return undef if $! == ENOSYS; die "failed to mount subvolume: $!\n"; } @@ -1679,14 +1768,14 @@ sub mountpoint_stage { } sub mountpoint_insert_staged { - my ($mount_fd, $rootdir_fd, $mp_dir, $opt, $rootuid, $rootgid) = @_; + my ($mount_fd, $rootdir_fd, $mp_dir, $opt, $root_uid, $root_gid) = @_; if (!defined($rootdir_fd)) { sysopen($rootdir_fd, '.', O_PATH | O_DIRECTORY) or die "failed to open '.': $!\n"; } - my $dest_fd = walk_tree_nofollow_fd('/', $rootdir_fd, $mp_dir, 1, $rootuid, $rootgid); + my $dest_fd = walk_tree_nofollow_fd('/', $rootdir_fd, $mp_dir, 1, $root_uid, $root_gid); PVE::Tools::move_mount( fileno($mount_fd), @@ -1699,15 +1788,8 @@ sub mountpoint_insert_staged { # Use $stage_mount, $rootdir is treated as a temporary path to "stage" the file system. The user # can then open a file descriptor to it which can be used with the `move_mount` syscall. -# Note that if the kernel does not support the new mount API, this will not perform any action -# and return `undef` with $! = ENOSYS. sub __mountpoint_mount { - my ($mountpoint, $rootdir, $storage_cfg, $snapname, $rootuid, $rootgid, $stage_mount) = @_; - - if (defined($stage_mount) && !PVE::LXC::Tools::can_use_new_mount_api()) { - $! = ENOSYS; - return undef; - } + my ($mountpoint, $rootdir, $storage_cfg, $snapname, $root_uid, $root_gid, $stage_mount) = @_; # When staging mount points we always mount to $rootdir directly (iow. as if `mp=/`). # This is required since __mount_prepare_rootdir() will return handles to the parent directory @@ -1728,7 +1810,7 @@ sub __mountpoint_mount { if (defined($rootdir)) { ($rootdir, $mount_path, $mpfd, $parentfd, $last_dir) = - __mount_prepare_rootdir($rootdir, $mount, $rootuid, $rootgid); + __mount_prepare_rootdir($rootdir, $mount, $root_uid, $root_gid); } if (defined($stage_mount)) { @@ -1747,8 +1829,10 @@ sub __mountpoint_mount { } my $acl = $mountpoint->{acl}; - if (defined($acl)) { - push @$optlist, ($acl ? 'acl' : 'noacl'); + + if ($acl) { + push @$optlist, 'acl'; + # NOTE: the else branch is handled below } my $optstring = join(',', @$optlist); @@ -1761,6 +1845,7 @@ sub __mountpoint_mount { my $scfg = PVE::Storage::storage_config($storage_cfg, $storage); + PVE::Storage::activate_volumes($storage_cfg, [$volid], $snapname); my $path = PVE::Storage::map_volume($storage_cfg, $volid, $snapname); $path = PVE::Storage::path($storage_cfg, $volid, $snapname) if !defined($path); @@ -1768,6 +1853,12 @@ sub __mountpoint_mount { my ($vtype, undef, undef, undef, undef, $isBase, $format) = PVE::Storage::parse_volname($storage_cfg, $volid); + if (defined($acl) && !$acl) { + # Does having this really makes sense or should we drop it with a future major release? + # Kernel 6.1 removed the noacl mount option for ext4, which is used for all raw volumes. + push @$optlist, 'noacl' if $format ne 'raw'; + } + $format = 'iso' if $vtype eq 'iso'; # allow to handle iso files if ($format eq 'subvol') { @@ -1850,7 +1941,7 @@ sub __mountpoint_mount { sub mountpoint_hotplug :prototype($$$$$) { my ($vmid, $conf, $opt, $mp, $storage_cfg) = @_; - my (undef, $rootuid, $rootgid) = PVE::LXC::parse_id_maps($conf); + my (undef, $root_uid, $root_gid) = PVE::LXC::parse_id_maps($conf); # We do the rest in a fork with an unshared mount namespace, because: # -) change our papparmor profile to that of /usr/bin/lxc-start @@ -1886,21 +1977,22 @@ sub mountpoint_hotplug :prototype($$$$$) { my $dir = get_staging_mount_path($opt); # Now switch our apparmor profile before mounting: - my $data = 'changeprofile /usr/bin/lxc-start'; - if (syswrite($aa_fd, $data, length($data)) != length($data)) { + my $data = 'changeprofile pve-container-mounthotplug'; + my $data_written = syswrite($aa_fd, $data, length($data)); + if (!defined($data_written) || $data_written != length($data)) { die "failed to change apparmor profile: $!\n"; } # Check errors on close as well: close($aa_fd) or die "failed to change apparmor profile (close() failed): $!\n"; - my $mount_fd = mountpoint_stage($mp, $dir, $storage_cfg, undef, $rootuid, $rootgid); + my $mount_fd = mountpoint_stage($mp, $dir, $storage_cfg, undef, $root_uid, $root_gid); PVE::Tools::setns(fileno($ct_mnt_ns), PVE::Tools::CLONE_NEWNS); chdir('/') or die "failed to change root directory within the container's mount namespace: $!\n"; - mountpoint_insert_staged($mount_fd, undef, $mp->{mp}, $opt, $rootuid, $rootgid); + mountpoint_insert_staged($mount_fd, undef, $mp->{mp}, $opt, $root_uid, $root_gid); }); } @@ -1916,7 +2008,7 @@ sub get_staging_mount_path($) { return $target; } -# Mount /run/pve/mountpoints as tmpfs +# Mount tmpfs for mount point staging and return the path. sub get_staging_tempfs() { # We choose a path in /var/lib/lxc/ here because the lxc-start apparmor profile restricts most # mounts to that. @@ -1933,7 +2025,7 @@ sub get_staging_tempfs() { } sub mkfs { - my ($dev, $rootuid, $rootgid) = @_; + my ($dev, $root_uid, $root_gid) = @_; run_command( [ @@ -1941,7 +2033,7 @@ sub mkfs { '-O', 'mmp', '-E', - "root_owner=$rootuid:$rootgid", + "root_owner=$root_uid:$root_gid", $dev, ], outfunc => sub { @@ -1959,10 +2051,12 @@ sub mkfs { } sub format_disk { - my ($storage_cfg, $volid, $rootuid, $rootgid) = @_; + my ($storage_cfg, $volid, $root_uid, $root_gid) = @_; if ($volid =~ m!^/dev/.+!) { - mkfs($volid); + # FIXME: remove in Proxmox VE 9 – this code path cannot really be reached currently, using + # block devices needs manual preparations by the user + mkfs($volid, $root_uid, $root_gid); return; } @@ -1982,7 +2076,7 @@ sub format_disk { die "cannot format volume '$volid' (format == $format)\n" if $format ne 'raw'; - mkfs($path, $rootuid, $rootgid); + mkfs($path, $root_uid, $root_gid); } sub destroy_disks { @@ -1995,7 +2089,7 @@ sub destroy_disks { } sub alloc_disk { - my ($storecfg, $vmid, $storage, $size_kb, $rootuid, $rootgid) = @_; + my ($storecfg, $vmid, $storage, $size_kb, $root_uid, $root_gid) = @_; my $needs_chown = 0; my $volid; @@ -2022,7 +2116,7 @@ sub alloc_disk { } else { die "content type 'rootdir' is not available or configured on storage '$storage'\n"; } - format_disk($storecfg, $volid, $rootuid, $rootgid) if $do_format; + format_disk($storecfg, $volid, $root_uid, $root_gid) if $do_format; }; if (my $err = $@) { # in case formatting got interrupted: @@ -2042,7 +2136,7 @@ sub create_disks { my $vollist = []; eval { - my (undef, $rootuid, $rootgid) = PVE::LXC::parse_id_maps($conf); + my (undef, $root_uid, $root_gid) = PVE::LXC::parse_id_maps($conf); my $chown_vollist = []; PVE::LXC::Config->foreach_volume($settings, sub { @@ -2059,7 +2153,7 @@ sub create_disks { my $size_kb = int(${size_gb}*1024) * 1024; my $needs_chown = 0; - ($volid, $needs_chown) = alloc_disk($storecfg, $vmid, $storage, $size_kb, $rootuid, $rootgid); + ($volid, $needs_chown) = alloc_disk($storecfg, $vmid, $storage, $size_kb, $root_uid, $root_gid); push @$chown_vollist, $volid if $needs_chown; push @$vollist, $volid; $mountpoint->{volume} = $volid; @@ -2078,7 +2172,7 @@ sub create_disks { PVE::Storage::activate_volumes($storecfg, $chown_vollist, undef); foreach my $volid (@$chown_vollist) { my $path = PVE::Storage::path($storecfg, $volid, undef); - chown($rootuid, $rootgid, $path); + chown($root_uid, $root_gid, $path); } PVE::Storage::deactivate_volumes($storecfg, $chown_vollist, undef); }; @@ -2104,8 +2198,8 @@ sub update_disksize { $changes = 1; print "$prefix updated volume size of '$mp->{volume}' in config.\n"; $mp->{size} = $size; - my $nomp = 1 if ($key eq 'rootfs'); - $conf->{$key} = PVE::LXC::Config->print_ct_mountpoint($mp, $nomp); + my $no_mp = $key eq 'rootfs'; # rootfs is handled different from other mount points + $conf->{$key} = PVE::LXC::Config->print_ct_mountpoint($mp, $no_mp); } }; @@ -2292,20 +2386,21 @@ sub parse_id_maps { my ($conf) = @_; my $id_map = []; - my $rootuid = 0; - my $rootgid = 0; + my $root_uid = 0; + my $root_gid = 0; my $lxc = $conf->{lxc}; foreach my $entry (@$lxc) { my ($key, $value) = @$entry; - # FIXME: remove the 'id_map' variant when lxc-3.0 arrives - next if $key ne 'lxc.idmap' && $key ne 'lxc.id_map'; + + next if $key ne 'lxc.idmap'; + if ($value =~ /^([ug])\s+(\d+)\s+(\d+)\s+(\d+)\s*$/) { my ($type, $ct, $host, $length) = ($1, $2, $3, $4); push @$id_map, [$type, $ct, $host, $length]; if ($ct == 0) { - $rootuid = $host if $type eq 'u'; - $rootgid = $host if $type eq 'g'; + $root_uid = $host if $type eq 'u'; + $root_gid = $host if $type eq 'g'; } } else { die "failed to parse idmap: $value\n"; @@ -2316,10 +2411,73 @@ sub parse_id_maps { # Should we read them from /etc/subuid? $id_map = [ ['u', '0', '100000', '65536'], ['g', '0', '100000', '65536'] ]; - $rootuid = $rootgid = 100000; + $root_uid = $root_gid = 100000; + } + + return ($id_map, $root_uid, $root_gid); +} + +sub validate_id_maps { + my ($id_map) = @_; + + # $mappings->{$type}->{$side} = [ { line => $line, start => $start, count => $count }, ... ] + # $type: either "u" or "g" + # $side: either "container" or "host" + # $line: index of this mapping in @$id_map + # $start, $count: interval of this mapping + my $mappings = { u => {}, g => {} }; + for (my $i = 0; $i < scalar(@$id_map); $i++) { + my ($type, $ct_start, $host_start, $count) = $id_map->[$i]->@*; + my $sides = $mappings->{$type}; + push $sides->{host}->@*, { line => $i, start => $host_start, count => $count }; + push $sides->{container}->@*, { line => $i, start => $ct_start, count => $count }; + } + + # find the first conflict between two consecutive mappings when sorted by their start id + for my $type (qw(u g)) { + for my $side (qw(container host)) { + my @entries = sort { $a->{start} <=> $b->{start} } $mappings->{$type}->{$side}->@*; + for my $idx (1..scalar(@entries) - 1) { + my $previous = $entries[$idx - 1]; + my $current = $entries[$idx]; + if ($previous->{start} + $previous->{count} > $current->{start}) { + my $conflict = $current->{start}; + my @previous_line = $id_map->[$previous->{line}]->@*; + my @current_line = $id_map->[$current->{line}]->@*; + die "invalid map entry '@current_line': $side ${type}id $conflict " + ."is also mapped by entry '@previous_line'\n"; + } + } + } } +} + +sub map_ct_id_to_host { + my ($id, $id_map, $id_type) = @_; + + for my $mapping (@$id_map) { + my ($type, $ct, $host, $length) = @$mapping; + + next if ($type ne $id_type); + + if ($id >= $ct && $id < ($ct + $length)) { + return $host - $ct + $id; + } + } + + return $id; +} + +sub map_ct_uid_to_host { + my ($uid, $id_map) = @_; + + return map_ct_id_to_host($uid, $id_map, 'u'); +} + +sub map_ct_gid_to_host { + my ($gid, $id_map) = @_; - return ($id_map, $rootuid, $rootgid); + return map_ct_id_to_host($gid, $id_map, 'g'); } sub userns_command { @@ -2404,6 +2562,12 @@ sub vm_start { update_lxc_config($vmid, $conf); + eval { + my ($id_map, undef, undef) = PVE::LXC::parse_id_maps($conf); + PVE::LXC::validate_id_maps($id_map); + }; + warn "lxc.idmap: $@" if $@; + my $skiplock_flag_fn = "/run/lxc/skiplock-$vmid"; if ($skiplock) { @@ -2533,7 +2697,7 @@ sub run_unshared { } my $copy_volume = sub { - my ($src_volid, $src, $dst_volid, $dest, $storage_cfg, $snapname, $bwlimit, $rootuid, $rootgid) = @_; + my ($src_volid, $src, $dst_volid, $dest, $storage_cfg, $snapname, $bwlimit, $root_uid, $root_gid) = @_; my $src_mp = { volume => $src_volid, mp => '/', ro => 1 }; $src_mp->{type} = PVE::LXC::Config->classify_mountpoint($src_volid); @@ -2545,10 +2709,10 @@ my $copy_volume = sub { eval { # mount and copy mkdir $src; - mountpoint_mount($src_mp, $src, $storage_cfg, $snapname, $rootuid, $rootgid); + mountpoint_mount($src_mp, $src, $storage_cfg, $snapname, $root_uid, $root_gid); push @mounted, $src; mkdir $dest; - mountpoint_mount($dst_mp, $dest, $storage_cfg, undef, $rootuid, $rootgid); + mountpoint_mount($dst_mp, $dest, $storage_cfg, undef, $root_uid, $root_gid); push @mounted, $dest; $bwlimit //= 0; @@ -2597,7 +2761,7 @@ sub copy_volume { my $src = "/var/lib/lxc/$vmid/.copy-volume-2"; # get id's for unprivileged container - my (undef, $rootuid, $rootgid) = parse_id_maps($conf); + my (undef, $root_uid, $root_gid) = parse_id_maps($conf); # Allocate the disk before unsharing in order to make sure zfs subvolumes # are visible in this namespace, otherwise the host only sees the empty @@ -2607,15 +2771,15 @@ sub copy_volume { # Make sure $mp contains a correct size. $mp->{size} = PVE::Storage::volume_size_info($storage_cfg, $mp->{volume}); my $needs_chown; - ($new_volid, $needs_chown) = alloc_disk($storage_cfg, $vmid, $storage, $mp->{size}/1024, $rootuid, $rootgid); + ($new_volid, $needs_chown) = alloc_disk($storage_cfg, $vmid, $storage, $mp->{size}/1024, $root_uid, $root_gid); if ($needs_chown) { PVE::Storage::activate_volumes($storage_cfg, [$new_volid], undef); my $path = PVE::Storage::path($storage_cfg, $new_volid, undef); - chown($rootuid, $rootgid, $path); + chown($root_uid, $root_gid, $path); } run_unshared(sub { - $copy_volume->($mp->{volume}, $src, $new_volid, $dest, $storage_cfg, $snapname, $bwlimit, $rootuid, $rootgid); + $copy_volume->($mp->{volume}, $src, $new_volid, $dest, $storage_cfg, $snapname, $bwlimit, $root_uid, $root_gid); }); }; if (my $err = $@) { @@ -2661,4 +2825,30 @@ sub thaw($) { } } +sub create_ifaces_ipams_ips { + my ($conf, $vmid) = @_; + + return if !$have_sdn; + + for my $opt (keys %$conf) { + next if $opt !~ m/^net(\d+)$/; + my $net = PVE::LXC::Config->parse_lxc_network($conf->{$opt}); + next if $net->{type} ne 'veth'; + PVE::Network::SDN::Vnets::add_next_free_cidr($net->{bridge}, $conf->{hostname}, $net->{hwaddr}, $vmid, undef, 1); + } +} + +sub delete_ifaces_ipams_ips { + my ($conf, $vmid) = @_; + + return if !$have_sdn; + + for my $opt (keys %$conf) { + next if $opt !~ m/^net(\d+)$/; + my $net = PVE::LXC::Config->parse_lxc_network($conf->{$opt}); + eval { PVE::Network::SDN::Vnets::del_ips_from_mac($net->{bridge}, $net->{hwaddr}, $conf->{hostname}) }; + warn $@ if $@; + } +} + 1;