use strict;
use warnings;
-use POSIX qw(EINTR);
-
-use Socket;
-
+use Cwd qw();
+use Errno qw(ELOOP ENOTDIR EROFS ECONNREFUSED EEXIST);
+use Fcntl qw(O_RDONLY O_WRONLY O_NOFOLLOW O_DIRECTORY :mode);
use File::Path;
use File::Spec;
-use Cwd qw();
-use Fcntl qw(O_RDONLY O_WRONLY O_NOFOLLOW O_DIRECTORY);
-use Errno qw(ELOOP ENOTDIR EROFS ECONNREFUSED ENOSYS EEXIST);
+use IO::Poll qw(POLLIN POLLHUP);
use IO::Socket::UNIX;
+use POSIX qw(EINTR);
+use Socket;
+use Time::HiRes qw (gettimeofday);
+use PVE::AccessControl;
+use PVE::CGroup;
+use PVE::CpuSet;
use PVE::Exception qw(raise_perm_exc);
-use PVE::Storage;
-use PVE::SafeSyslog;
+use PVE::Firewall;
+use PVE::GuestHelpers qw(check_vnet_access safe_string_ne safe_num_ne safe_boolean_ne);
use PVE::INotify;
use PVE::JSONSchema qw(get_standard_option);
+use PVE::Network;
+use PVE::ProcFSTools;
+use PVE::RESTEnvironment;
+use PVE::SafeSyslog;
+use PVE::Storage;
use PVE::Tools qw(
run_command
dir_glob_foreach
$IPV4RE
$IPV6RE
);
-use PVE::RPCEnvironment;
-use PVE::CpuSet;
-use PVE::Network;
-use PVE::AccessControl;
-use PVE::ProcFSTools;
use PVE::Syscall qw(:fsmount);
-use PVE::LXC::Config;
-use PVE::GuestHelpers qw(safe_string_ne safe_num_ne safe_boolean_ne);
-use PVE::LXC::Tools;
+
use PVE::LXC::CGroup;
+use PVE::LXC::Config;
use PVE::LXC::Monitor;
-use PVE::CGroup;
+use PVE::LXC::Tools;
-use Time::HiRes qw (gettimeofday);
my $have_sdn;
eval {
require PVE::Network::SDN::Zones;
+ require PVE::Network::SDN::Vnets;
$have_sdn = 1;
};
my $d = $ids->{$vmid};
next if !$d->{node} || $d->{node} ne $nodename;
next if !$d->{type} || $d->{type} ne 'lxc';
- $res->{$vmid} = { type => 'lxc', vmid => $vmid };
+ $res->{$vmid} = { type => 'lxc', vmid => int($vmid) };
}
return $res;
}
$raw .= "lxc.mount.auto = sys:mixed\n";
}
+ PVE::LXC::Config->foreach_passthrough_device($conf, sub {
+ my ($key, $device) = @_;
+
+ die "Path is not defined for passthrough device $key\n"
+ if !defined($device->{path});
+
+ my ($mode, $rdev) = PVE::LXC::Tools::get_device_mode_and_rdev($device->{path});
+ my $major = PVE::Tools::dev_t_major($rdev);
+ my $minor = PVE::Tools::dev_t_minor($rdev);
+ my $device_type_char = S_ISBLK($mode) ? 'b' : 'c';
+ $raw .= "lxc.cgroup2.devices.allow = $device_type_char $major:$minor rw\n";
+ });
+
# WARNING: DO NOT REMOVE this without making sure that loop device nodes
# cannot be exposed to the container with r/w access (cgroup perms).
# When this is enabled mounts will still remain in the monitor's namespace
# some init scripts expect a linux terminal (turnkey).
$raw .= "lxc.environment = TERM=linux\n";
-
+
my $utsname = $conf->{hostname} || "CT$vmid";
$raw .= "lxc.uts.name = $utsname\n";
my $memory = $conf->{memory} || 512;
my $swap = $conf->{swap} // 0;
- my $lxcmem = int($memory*1024*1024);
- $raw .= "lxc.cgroup2.memory.max = $lxcmem\n";
+ # cgroup memory usage is limited by the hard 'max' limit (OOM-killer enforced) and the soft
+ # 'high' limit (cgroup processes get throttled and put under heavy reclaim pressure).
+ my ($lxc_mem_max, $lxc_mem_high) = PVE::LXC::Config::calculate_memory_constraints($memory);
+ $raw .= "lxc.cgroup2.memory.max = $lxc_mem_max\n";
+ $raw .= "lxc.cgroup2.memory.high = $lxc_mem_high\n";
my $lxcswap = int($swap*1024*1024);
$raw .= "lxc.cgroup2.memory.swap.max = $lxcswap\n";
$raw .= "lxc.net.$ind.hwaddr = $d->{hwaddr}\n" if defined($d->{hwaddr});
$raw .= "lxc.net.$ind.name = $d->{name}\n" if defined($d->{name});
+ my $bridge_mtu = PVE::Network::read_bridge_mtu($d->{bridge});
+ my $mtu = $d->{mtu} || $bridge_mtu;
+
# Keep container from starting with invalid mtu configuration
- if (my $mtu = $d->{mtu}) {
- my $bridge_mtu = PVE::Network::read_bridge_mtu($d->{bridge});
- die "$k: MTU size '$mtu' is bigger than bridge MTU '$bridge_mtu'\n"
- if ($mtu > $bridge_mtu);
+ die "$k: MTU size '$mtu' is bigger than bridge MTU '$bridge_mtu'\n"
+ if ($mtu > $bridge_mtu);
- $raw .= "lxc.net.$ind.mtu = $mtu\n";
- }
+ $raw .= "lxc.net.$ind.mtu = $mtu\n";
# Starting with lxc 4.0, we do not patch lxc to execute our up-scripts.
if ($lxc_major >= 4) {
});
}
+ delete_ifaces_ipams_ips($conf, $vmid);
+
rmdir "/var/lib/lxc/$vmid/rootfs";
unlink "/var/lib/lxc/$vmid/config";
rmdir "/var/lib/lxc/$vmid";
warn $@ if $@; # avoid errors - just warn
}
+# Bring the host-side interface $iface of a container network device into the state described
+# by the parsed network configuration $net.
+#
+# With a true `link_down` flag the interface is only set administratively down and is NOT
+# attached to any bridge. Otherwise it is plugged into $net->{bridge} (through the SDN zone
+# helpers when they could be loaded, else through PVE::Network) and set up afterwards.
+#
+# Parameters:
+#   $iface - name of the host-side interface to plug
+#   $net   - hash reference; the keys bridge, tag, trunks, rate, hwaddr, firewall and
+#            link_down are read
+#
+# Errors raised by run_command or by the plug helpers are not caught here.
+sub net_tap_plug : prototype($$) {
+    my ($iface, $net) = @_;
+
+    # link_down is a boolean flag, so test its truthiness rather than definedness: an explicit
+    # `link_down=0` must not disconnect the interface. update_net() likewise compares the flag
+    # as a boolean via safe_boolean_ne().
+    if ($net->{link_down}) {
+        PVE::Tools::run_command(['/sbin/ip', 'link', 'set', 'dev', $iface, 'down']);
+        # Don't add disconnected interfaces to the bridge, otherwise e.g. applying any network
+        # change (e.g. `ifreload -a`) could (re-)activate it unintentionally.
+        return;
+    }
+
+    my ($bridge, $tag, $trunks, $rate, $hwaddr) =
+        $net->@{'bridge', 'tag', 'trunks', 'rate', 'hwaddr'};
+
+    # The nftables-based implementation from the newer proxmox-firewall does not require FW bridges
+    my $create_firewall_bridges = $net->{firewall} && !PVE::Firewall::is_nftables();
+
+    if ($have_sdn) {
+        PVE::Network::SDN::Zones::tap_plug($iface, $bridge, $tag, $create_firewall_bridges, $trunks, $rate);
+        PVE::Network::SDN::Zones::add_bridge_fdb($iface, $hwaddr, $bridge);
+    } else {
+        PVE::Network::tap_plug($iface, $bridge, $tag, $create_firewall_bridges, $trunks, $rate, { mac => $hwaddr });
+    }
+
+    PVE::Tools::run_command(['/sbin/ip', 'link', 'set', 'dev', $iface, 'up']);
+}
+
+
sub update_net {
my ($vmid, $conf, $opt, $newnet, $netid, $rootdir) = @_;
safe_string_ne($oldnet->{name}, $newnet->{name})) {
PVE::Network::veth_delete($veth);
+
+ if ($have_sdn && safe_string_ne($oldnet->{hwaddr}, $newnet->{hwaddr})) {
+ eval { PVE::Network::SDN::Vnets::del_ips_from_mac($oldnet->{bridge}, $oldnet->{hwaddr}, $conf->{hostname}) };
+ warn $@ if $@;
+
+ PVE::Network::SDN::Vnets::add_next_free_cidr($newnet->{bridge}, $conf->{hostname}, $newnet->{hwaddr}, $vmid, undef, 1);
+ PVE::Network::SDN::Vnets::add_dhcp_mapping($newnet->{bridge}, $newnet->{hwaddr}, $vmid, $conf->{hostname});
+ }
+
delete $conf->{$opt};
PVE::LXC::Config->write_config($vmid, $conf);
hotplug_net($vmid, $conf, $opt, $newnet, $netid);
} else {
- if (safe_string_ne($oldnet->{bridge}, $newnet->{bridge}) ||
- safe_num_ne($oldnet->{tag}, $newnet->{tag}) ||
- safe_num_ne($oldnet->{firewall}, $newnet->{firewall})) {
+ my $bridge_changed = safe_string_ne($oldnet->{bridge}, $newnet->{bridge});
+ if ($bridge_changed ||
+ safe_num_ne($oldnet->{tag}, $newnet->{tag}) ||
+ safe_num_ne($oldnet->{firewall}, $newnet->{firewall}) ||
+ safe_boolean_ne($oldnet->{link_down}, $newnet->{link_down})
+ ) {
if ($oldnet->{bridge}) {
+ my $oldbridge = $oldnet->{bridge};
+
PVE::Network::tap_unplug($veth);
foreach (qw(bridge tag firewall)) {
delete $oldnet->{$_};
}
$conf->{$opt} = PVE::LXC::Config->print_lxc_network($oldnet);
PVE::LXC::Config->write_config($vmid, $conf);
+
+ if ($have_sdn && $bridge_changed) {
+ eval { PVE::Network::SDN::Vnets::del_ips_from_mac($oldbridge, $oldnet->{hwaddr}, $conf->{hostname}) };
+ warn $@ if $@;
+ }
}
- if ($have_sdn) {
- PVE::Network::SDN::Zones::tap_plug($veth, $newnet->{bridge}, $newnet->{tag}, $newnet->{firewall}, $newnet->{trunks}, $newnet->{rate});
- PVE::Network::SDN::Zones::add_bridge_fdb($veth, $newnet->{hwaddr}, $newnet->{bridge}, $newnet->{firewall});
- } else {
- PVE::Network::tap_plug($veth, $newnet->{bridge}, $newnet->{tag}, $newnet->{firewall}, $newnet->{trunks}, $newnet->{rate});
- PVE::Network::add_bridge_fdb($veth, $newnet->{hwaddr}, $newnet->{firewall}); # early returns if brport has learning on
+ if ($have_sdn && $bridge_changed) {
+ PVE::Network::SDN::Vnets::add_next_free_cidr($newnet->{bridge}, $conf->{hostname}, $newnet->{hwaddr}, $vmid, undef, 1);
}
+ PVE::LXC::net_tap_plug($veth, $newnet);
# This includes the rate:
- foreach (qw(bridge tag firewall rate)) {
+ foreach (qw(bridge tag firewall rate link_down)) {
$oldnet->{$_} = $newnet->{$_} if $newnet->{$_};
}
} elsif (safe_string_ne($oldnet->{rate}, $newnet->{rate})) {
PVE::LXC::Config->write_config($vmid, $conf);
}
} else {
+ if ($have_sdn) {
+ PVE::Network::SDN::Vnets::add_next_free_cidr($newnet->{bridge}, $conf->{hostname}, $newnet->{hwaddr}, $vmid, undef, 1);
+ PVE::Network::SDN::Vnets::add_dhcp_mapping($newnet->{bridge}, $newnet->{hwaddr}, $vmid, $conf->{hostname});
+ }
+
hotplug_net($vmid, $conf, $opt, $newnet, $netid);
}
if ($have_sdn) {
PVE::Network::SDN::Zones::veth_create($veth, $vethpeer, $newnet->{bridge}, $newnet->{hwaddr});
- PVE::Network::SDN::Zones::tap_plug($veth, $newnet->{bridge}, $newnet->{tag}, $newnet->{firewall}, $newnet->{trunks}, $newnet->{rate});
- PVE::Network::SDN::Zones::add_bridge_fdb($veth, $newnet->{hwaddr}, $newnet->{bridge}, $newnet->{firewall});
} else {
PVE::Network::veth_create($veth, $vethpeer, $newnet->{bridge}, $newnet->{hwaddr});
- PVE::Network::tap_plug($veth, $newnet->{bridge}, $newnet->{tag}, $newnet->{firewall}, $newnet->{trunks}, $newnet->{rate});
- PVE::Network::add_bridge_fdb($veth, $newnet->{hwaddr}, $newnet->{firewall}); # early returns if brport has learning on
}
+ PVE::LXC::net_tap_plug($veth, $newnet);
+
# attach peer in container
my $cmd = ['lxc-device', '-n', $vmid, 'add', $vethpeer, "$eth" ];
PVE::Tools::run_command($cmd);
PVE::Tools::run_command($cmd);
my $done = { type => 'veth' };
- foreach (qw(bridge tag firewall hwaddr name)) {
+ foreach (qw(bridge tag firewall hwaddr name link_down)) {
$done->{$_} = $newnet->{$_} if $newnet->{$_};
}
$conf->{$opt} = PVE::LXC::Config->print_lxc_network($done);
PVE::LXC::Config->write_config($vmid, $conf);
}
+# Collect the network interfaces visible inside a running container.
+#
+# Looks up the container's PID with find_lxc_pid() and returns nothing if that lookup dies.
+# Otherwise `ip --json a` is executed inside the network namespace of that PID (via nsenter)
+# and the decoded result is turned into a reference to a list with one hash per interface:
+#   name     - the interface name (`ifname`)
+#   hwaddr   - the interface's `address` field
+#   <family> - "<local>/<prefixlen>", keyed by the address family reported by `ip`; when an
+#              interface has several addresses of one family, the last one listed wins
+sub get_interfaces {
+    my ($vmid) = @_;
+
+    my $pid = eval { find_lxc_pid($vmid) };
+    return if $@;
+
+    # accumulate everything the command prints, it is decoded as one JSON document below
+    my $json;
+    my $collect = sub { $json .= shift; };
+    run_command(
+        ['nsenter', '-t', $pid, '--net', '--', 'ip', '--json', 'a'],
+        outfunc => $collect,
+    );
+
+    my $links = JSON::decode_json($json);
+
+    my $res;
+    for my $link (@$links) {
+        my $entry = { name => $link->{ifname} };
+        for my $addr (@{ $link->{addr_info} }) {
+            $entry->{ $addr->{family} } = join("/", $addr->{local}, $addr->{prefixlen});
+        }
+        $entry->{hwaddr} = $link->{address};
+        push @$res, $entry;
+    }
+
+    return $res;
+}
+
+
sub update_ipconfig {
my ($vmid, $conf, $opt, $eth, $newnet, $rootdir) = @_;
}
} elsif ($opt eq 'memory' || $opt eq 'swap') {
$rpcenv->check_vm_perm($authuser, $vmid, $pool, ['VM.Config.Memory']);
- } elsif ($opt =~ m/^net\d+$/ || $opt eq 'nameserver' ||
- $opt eq 'searchdomain' || $opt eq 'hostname') {
+ } elsif ($opt =~ m/^net\d+$/) {
+ $rpcenv->check_vm_perm($authuser, $vmid, $pool, ['VM.Config.Network']);
+ check_bridge_access($rpcenv, $authuser, $oldconf->{$opt}) if $oldconf->{$opt};
+ check_bridge_access($rpcenv, $authuser, $newconf->{$opt}) if $newconf->{$opt};
+ } elsif ($opt =~ m/^dev\d+$/) {
+ raise_perm_exc("configuring device passthrough is only allowed for root\@pam");
+ } elsif ($opt eq 'nameserver' || $opt eq 'searchdomain' || $opt eq 'hostname') {
$rpcenv->check_vm_perm($authuser, $vmid, $pool, ['VM.Config.Network']);
} elsif ($opt eq 'features') {
raise_perm_exc("changing feature flags for privileged container is only allowed for root\@pam")
return 1;
}
+# Check whether $authuser may use the bridge referenced by the raw network configuration
+# string $raw.
+#
+# root@pam is always allowed and nothing is parsed for it. For every other user the string is
+# parsed with PVE::LXC::Config->parse_lxc_network() and its bridge, tag and trunks are passed
+# to check_vnet_access() - presumably that helper raises on missing permissions (TODO confirm).
+# Returns 1 when no check failed.
+sub check_bridge_access {
+    my ($rpcenv, $authuser, $raw) = @_;
+
+    if ($authuser ne 'root@pam') {
+        my $parsed = PVE::LXC::Config->parse_lxc_network($raw);
+        my ($vnet, $vlan_tag, $vlan_trunks) = @{$parsed}{'bridge', 'tag', 'trunks'};
+        check_vnet_access($rpcenv, $authuser, $vnet, $vlan_tag, $vlan_trunks);
+    }
+
+    return 1;
+}
+
+
sub umount_all {
my ($vmid, $storage_cfg, $conf, $noerr) = @_;
my $volid_list = PVE::LXC::Config->get_vm_volumes($conf);
PVE::Storage::activate_volumes($storage_cfg, $volid_list);
- my (undef, $rootuid, $rootgid) = parse_id_maps($conf);
+ my (undef, $root_uid, $root_gid) = parse_id_maps($conf);
eval {
PVE::LXC::Config->foreach_volume($conf, sub {
$mountpoint->{ro} = 0 if $ignore_ro;
- mountpoint_mount($mountpoint, $rootdir, $storage_cfg, undef, $rootuid, $rootgid);
+ mountpoint_mount($mountpoint, $rootdir, $storage_cfg, undef, $root_uid, $root_gid);
});
};
if (my $err = $@) {
# * directory name of the last directory
# So that the path $2/$3 should lead to $1 afterwards.
sub walk_tree_nofollow($$$;$$) {
- my ($start, $subdir, $mkdir, $rootuid, $rootgid) = @_;
+ my ($start, $subdir, $mkdir, $root_uid, $root_gid) = @_;
sysopen(my $fd, $start, O_PATH | O_DIRECTORY)
or die "failed to open start directory $start: $!\n";
- return walk_tree_nofollow_fd($start, $fd, $subdir, $mkdir, $rootuid, $rootgid);
+ return walk_tree_nofollow_fd($start, $fd, $subdir, $mkdir, $root_uid, $root_gid);
}
sub walk_tree_nofollow_fd($$$$;$$) {
- my ($start_dirname, $start_fd, $subdir, $mkdir, $rootuid, $rootgid) = @_;
+ my ($start_dirname, $start_fd, $subdir, $mkdir, $root_uid, $root_gid) = @_;
# splitdir() returns '' for empty components including the leading /
my @comps = grep { length($_)>0 } File::Spec->splitdir($subdir);
$next = PVE::Tools::openat(fileno($fd), $component, O_NOFOLLOW | O_DIRECTORY);
die "failed to create path: $dir: $!\n" if !$next;
- PVE::Tools::fchownat(fileno($next), '', $rootuid, $rootgid, PVE::Tools::AT_EMPTY_PATH)
- if defined($rootuid) && defined($rootgid);
+ PVE::Tools::fchownat(fileno($next), '', $root_uid, $root_gid, PVE::Tools::AT_EMPTY_PATH)
+ if defined($root_uid) && defined($root_gid);
}
close $second if defined($last_component) && $second != $start_fd;
# from $rootdir and $mount and walk the path from $rootdir to the final
# directory to check for symlinks.
sub __mount_prepare_rootdir {
- my ($rootdir, $mount, $rootuid, $rootgid) = @_;
+ my ($rootdir, $mount, $root_uid, $root_gid) = @_;
$rootdir =~ s!/+!/!g;
$rootdir =~ s!/+$!!;
my $mount_path = "$rootdir/$mount";
- my ($mpfd, $parentfd, $last_dir) = walk_tree_nofollow($rootdir, $mount, 1, $rootuid, $rootgid);
+ my ($mpfd, $parentfd, $last_dir) = walk_tree_nofollow($rootdir, $mount, 1, $root_uid, $root_gid);
return ($rootdir, $mount_path, $mpfd, $parentfd, $last_dir);
}
# use $rootdir = undef to just return the corresponding mount path
sub mountpoint_mount {
- my ($mountpoint, $rootdir, $storage_cfg, $snapname, $rootuid, $rootgid) = @_;
- return __mountpoint_mount($mountpoint, $rootdir, $storage_cfg, $snapname, $rootuid, $rootgid, undef);
+ my ($mountpoint, $rootdir, $storage_cfg, $snapname, $root_uid, $root_gid) = @_;
+ return __mountpoint_mount($mountpoint, $rootdir, $storage_cfg, $snapname, $root_uid, $root_gid, undef);
}
sub mountpoint_stage {
- my ($mountpoint, $stage_dir, $storage_cfg, $snapname, $rootuid, $rootgid) = @_;
+ my ($mountpoint, $stage_dir, $storage_cfg, $snapname, $root_uid, $root_gid) = @_;
my ($path, $loop, $dev) =
- __mountpoint_mount($mountpoint, $stage_dir, $storage_cfg, $snapname, $rootuid, $rootgid, 1);
+ __mountpoint_mount($mountpoint, $stage_dir, $storage_cfg, $snapname, $root_uid, $root_gid, 1);
if (!defined($path)) {
- return undef if $! == ENOSYS;
die "failed to mount subvolume: $!\n";
}
}
sub mountpoint_insert_staged {
- my ($mount_fd, $rootdir_fd, $mp_dir, $opt, $rootuid, $rootgid) = @_;
+ my ($mount_fd, $rootdir_fd, $mp_dir, $opt, $root_uid, $root_gid) = @_;
if (!defined($rootdir_fd)) {
sysopen($rootdir_fd, '.', O_PATH | O_DIRECTORY)
or die "failed to open '.': $!\n";
}
- my $dest_fd = walk_tree_nofollow_fd('/', $rootdir_fd, $mp_dir, 1, $rootuid, $rootgid);
+ my $dest_fd = walk_tree_nofollow_fd('/', $rootdir_fd, $mp_dir, 1, $root_uid, $root_gid);
PVE::Tools::move_mount(
fileno($mount_fd),
# Use $stage_mount, $rootdir is treated as a temporary path to "stage" the file system. The user
# can then open a file descriptor to it which can be used with the `move_mount` syscall.
-# Note that if the kernel does not support the new mount API, this will not perform any action
-# and return `undef` with $! = ENOSYS.
sub __mountpoint_mount {
- my ($mountpoint, $rootdir, $storage_cfg, $snapname, $rootuid, $rootgid, $stage_mount) = @_;
-
- if (defined($stage_mount) && !PVE::LXC::Tools::can_use_new_mount_api()) {
- $! = ENOSYS;
- return undef;
- }
+ my ($mountpoint, $rootdir, $storage_cfg, $snapname, $root_uid, $root_gid, $stage_mount) = @_;
# When staging mount points we always mount to $rootdir directly (iow. as if `mp=/`).
# This is required since __mount_prepare_rootdir() will return handles to the parent directory
my $type = $mountpoint->{type};
my $quota = !$snapname && !$mountpoint->{ro} && $mountpoint->{quota};
my $mounted_dev;
-
+
return if !$volid || !$mount;
$mount =~ s!/+!/!g;
my $mount_path;
my ($mpfd, $parentfd, $last_dir);
-
+
if (defined($rootdir)) {
($rootdir, $mount_path, $mpfd, $parentfd, $last_dir) =
- __mount_prepare_rootdir($rootdir, $mount, $rootuid, $rootgid);
+ __mount_prepare_rootdir($rootdir, $mount, $root_uid, $root_gid);
}
if (defined($stage_mount)) {
$mount_path = $rootdir;
}
-
+
my ($storage, $volname) = PVE::Storage::parse_volume_id($volid, 1);
die "unknown snapshot path for '$volid'" if !$storage && defined($snapname);
}
my $acl = $mountpoint->{acl};
- if (defined($acl)) {
- push @$optlist, ($acl ? 'acl' : 'noacl');
+
+ if ($acl) {
+ push @$optlist, 'acl';
+ # NOTE: the else branch is handled below
}
my $optstring = join(',', @$optlist);
my $scfg = PVE::Storage::storage_config($storage_cfg, $storage);
+ PVE::Storage::activate_volumes($storage_cfg, [$volid], $snapname);
my $path = PVE::Storage::map_volume($storage_cfg, $volid, $snapname);
$path = PVE::Storage::path($storage_cfg, $volid, $snapname) if !defined($path);
my ($vtype, undef, undef, undef, undef, $isBase, $format) =
PVE::Storage::parse_volname($storage_cfg, $volid);
+ if (defined($acl) && !$acl) {
+ # Does having this really make sense, or should we drop it with a future major release?
+ # Kernel 6.1 removed the noacl mount option for ext4, which is used for all raw volumes.
+ push @$optlist, 'noacl' if $format ne 'raw';
+ }
+
$format = 'iso' if $vtype eq 'iso'; # allow to handle iso files
if ($format eq 'subvol') {
warn "cannot enable quota control for bind mounts\n" if $quota;
return wantarray ? ($volid, 0, undef) : $volid;
}
-
+
die "unsupported storage";
}
sub mountpoint_hotplug :prototype($$$$$) {
my ($vmid, $conf, $opt, $mp, $storage_cfg) = @_;
- my (undef, $rootuid, $rootgid) = PVE::LXC::parse_id_maps($conf);
+ my (undef, $root_uid, $root_gid) = PVE::LXC::parse_id_maps($conf);
# We do the rest in a fork with an unshared mount namespace, because:
# -) change our papparmor profile to that of /usr/bin/lxc-start
my $dir = get_staging_mount_path($opt);
# Now switch our apparmor profile before mounting:
- my $data = 'changeprofile /usr/bin/lxc-start';
- if (syswrite($aa_fd, $data, length($data)) != length($data)) {
+ my $data = 'changeprofile pve-container-mounthotplug';
+ my $data_written = syswrite($aa_fd, $data, length($data));
+ if (!defined($data_written) || $data_written != length($data)) {
die "failed to change apparmor profile: $!\n";
}
# Check errors on close as well:
close($aa_fd)
or die "failed to change apparmor profile (close() failed): $!\n";
- my $mount_fd = mountpoint_stage($mp, $dir, $storage_cfg, undef, $rootuid, $rootgid);
+ my $mount_fd = mountpoint_stage($mp, $dir, $storage_cfg, undef, $root_uid, $root_gid);
PVE::Tools::setns(fileno($ct_mnt_ns), PVE::Tools::CLONE_NEWNS);
chdir('/')
or die "failed to change root directory within the container's mount namespace: $!\n";
- mountpoint_insert_staged($mount_fd, undef, $mp->{mp}, $opt, $rootuid, $rootgid);
+ mountpoint_insert_staged($mount_fd, undef, $mp->{mp}, $opt, $root_uid, $root_gid);
});
}
return $target;
}
-# Mount /run/pve/mountpoints as tmpfs
+# Mount tmpfs for mount point staging and return the path.
sub get_staging_tempfs() {
# We choose a path in /var/lib/lxc/ here because the lxc-start apparmor profile restricts most
# mounts to that.
}
sub mkfs {
- my ($dev, $rootuid, $rootgid) = @_;
+ my ($dev, $root_uid, $root_gid) = @_;
run_command(
[
'-O',
'mmp',
'-E',
- "root_owner=$rootuid:$rootgid",
+ "root_owner=$root_uid:$root_gid",
$dev,
],
outfunc => sub {
}
sub format_disk {
- my ($storage_cfg, $volid, $rootuid, $rootgid) = @_;
+ my ($storage_cfg, $volid, $root_uid, $root_gid) = @_;
if ($volid =~ m!^/dev/.+!) {
- mkfs($volid);
+ # FIXME: remove in Proxmox VE 9 – this code path cannot really be reached currently, using
+ # block devices needs manual preparations by the user
+ mkfs($volid, $root_uid, $root_gid);
return;
}
die "cannot format volume '$volid' (format == $format)\n"
if $format ne 'raw';
- mkfs($path, $rootuid, $rootgid);
+ mkfs($path, $root_uid, $root_gid);
}
sub destroy_disks {
}
sub alloc_disk {
- my ($storecfg, $vmid, $storage, $size_kb, $rootuid, $rootgid) = @_;
+ my ($storecfg, $vmid, $storage, $size_kb, $root_uid, $root_gid) = @_;
my $needs_chown = 0;
my $volid;
} else {
die "content type 'rootdir' is not available or configured on storage '$storage'\n";
}
- format_disk($storecfg, $volid, $rootuid, $rootgid) if $do_format;
+ format_disk($storecfg, $volid, $root_uid, $root_gid) if $do_format;
};
if (my $err = $@) {
# in case formatting got interrupted:
my $vollist = [];
eval {
- my (undef, $rootuid, $rootgid) = PVE::LXC::parse_id_maps($conf);
+ my (undef, $root_uid, $root_gid) = PVE::LXC::parse_id_maps($conf);
my $chown_vollist = [];
PVE::LXC::Config->foreach_volume($settings, sub {
my $size_kb = int(${size_gb}*1024) * 1024;
my $needs_chown = 0;
- ($volid, $needs_chown) = alloc_disk($storecfg, $vmid, $storage, $size_kb, $rootuid, $rootgid);
+ ($volid, $needs_chown) = alloc_disk($storecfg, $vmid, $storage, $size_kb, $root_uid, $root_gid);
push @$chown_vollist, $volid if $needs_chown;
push @$vollist, $volid;
$mountpoint->{volume} = $volid;
PVE::Storage::activate_volumes($storecfg, $chown_vollist, undef);
foreach my $volid (@$chown_vollist) {
my $path = PVE::Storage::path($storecfg, $volid, undef);
- chown($rootuid, $rootgid, $path);
+ chown($root_uid, $root_gid, $path);
}
PVE::Storage::deactivate_volumes($storecfg, $chown_vollist, undef);
};
$changes = 1;
print "$prefix updated volume size of '$mp->{volume}' in config.\n";
$mp->{size} = $size;
- my $nomp = 1 if ($key eq 'rootfs');
- $conf->{$key} = PVE::LXC::Config->print_ct_mountpoint($mp, $nomp);
+ my $no_mp = $key eq 'rootfs'; # rootfs is handled different from other mount points
+ $conf->{$key} = PVE::LXC::Config->print_ct_mountpoint($mp, $no_mp);
}
};
my ($conf) = @_;
my $id_map = [];
- my $rootuid = 0;
- my $rootgid = 0;
+ my $root_uid = 0;
+ my $root_gid = 0;
my $lxc = $conf->{lxc};
foreach my $entry (@$lxc) {
my ($key, $value) = @$entry;
- # FIXME: remove the 'id_map' variant when lxc-3.0 arrives
- next if $key ne 'lxc.idmap' && $key ne 'lxc.id_map';
+
+ next if $key ne 'lxc.idmap';
+
if ($value =~ /^([ug])\s+(\d+)\s+(\d+)\s+(\d+)\s*$/) {
my ($type, $ct, $host, $length) = ($1, $2, $3, $4);
push @$id_map, [$type, $ct, $host, $length];
if ($ct == 0) {
- $rootuid = $host if $type eq 'u';
- $rootgid = $host if $type eq 'g';
+ $root_uid = $host if $type eq 'u';
+ $root_gid = $host if $type eq 'g';
}
} else {
die "failed to parse idmap: $value\n";
# Should we read them from /etc/subuid?
$id_map = [ ['u', '0', '100000', '65536'],
['g', '0', '100000', '65536'] ];
- $rootuid = $rootgid = 100000;
+ $root_uid = $root_gid = 100000;
+ }
+
+ return ($id_map, $root_uid, $root_gid);
+}
+
+# Check a list of id mappings for overlapping ranges.
+#
+# $id_map is a reference to a list of [ $type, $ct_start, $host_start, $count ] entries, the
+# layout parse_id_maps() pushes. For each id type ("u", "g") the container-side and the
+# host-side ranges are checked independently. Dies with a message naming both entries on the
+# first overlap found; otherwise falls off the end without an explicit return value.
+sub validate_id_maps {
+ my ($id_map) = @_;
+
+ # $mappings->{$type}->{$side} = [ { line => $line, start => $start, count => $count }, ... ]
+ # $type: either "u" or "g"
+ # $side: either "container" or "host"
+ # $line: index of this mapping in @$id_map
+ # $start, $count: interval of this mapping
+ my $mappings = { u => {}, g => {} };
+ for (my $i = 0; $i < scalar(@$id_map); $i++) {
+ my ($type, $ct_start, $host_start, $count) = $id_map->[$i]->@*;
+ my $sides = $mappings->{$type};
+ # every entry is recorded twice: once by its host range, once by its container range
+ push $sides->{host}->@*, { line => $i, start => $host_start, count => $count };
+ push $sides->{container}->@*, { line => $i, start => $ct_start, count => $count };
+ }
+
+ # find the first conflict between two consecutive mappings when sorted by their start id
+ for my $type (qw(u g)) {
+ for my $side (qw(container host)) {
+ my @entries = sort { $a->{start} <=> $b->{start} } $mappings->{$type}->{$side}->@*;
+ for my $idx (1..scalar(@entries) - 1) {
+ my $previous = $entries[$idx - 1];
+ my $current = $entries[$idx];
+ # a range covers start .. start + count - 1, so the neighbours overlap as soon as the
+ # end of the earlier one reaches past the start of the later one
+ if ($previous->{start} + $previous->{count} > $current->{start}) {
+ # the id reported is the first id of the later entry
+ my $conflict = $current->{start};
+ # look up the original entries by index for the error message
+ my @previous_line = $id_map->[$previous->{line}]->@*;
+ my @current_line = $id_map->[$current->{line}]->@*;
+ die "invalid map entry '@current_line': $side ${type}id $conflict "
+ ."is also mapped by entry '@previous_line'\n";
+ }
+ }
+ }
}
+}
+
+# Translate an id as seen inside the container into the corresponding host id.
+#
+# $id      - the container-side id to translate
+# $id_map  - reference to a list of [ $type, $ct, $host, $length ] mappings
+# $id_type - mapping type to consider; only entries whose $type equals it are used
+#
+# Returns the translated id from the first matching mapping, or $id unchanged when no mapping
+# of that type covers it.
+sub map_ct_id_to_host {
+ my ($id, $id_map, $id_type) = @_;
+
+ for my $mapping (@$id_map) {
+ my ($type, $ct, $host, $length) = @$mapping;
- return ($id_map, $rootuid, $rootgid);
+ next if ($type ne $id_type);
+
+ # the mapping covers the container ids $ct .. $ct + $length - 1
+ if ($id >= $ct && $id < ($ct + $length)) {
+ # keep the offset within the range, but relative to the host-side start
+ return $host - $ct + $id;
+ }
+ }
+
+ return $id;
+}
+
+# Translate a container-side user id to the host side: map_ct_id_to_host() restricted to the
+# 'u' entries of $id_map.
+sub map_ct_uid_to_host {
+    my ($uid, $id_map) = @_;
+
+    my $host_uid = map_ct_id_to_host($uid, $id_map, 'u');
+    return $host_uid;
+}
+
+# Translate a container-side group id to the host side: map_ct_id_to_host() restricted to the
+# 'g' entries of $id_map.
+sub map_ct_gid_to_host {
+    my ($gid, $id_map) = @_;
+
+    my $host_gid = map_ct_id_to_host($gid, $id_map, 'g');
+    return $host_gid;
 }
sub userns_command {
my $log = eval { file_get_contents($log_fn) };
return if !$log;
- my $rpcenv = eval { PVE::RPCEnvironment::get() };
-
- my $warn_fn = $rpcenv ? sub { $rpcenv->warn($_[0]) } : sub { print STDERR "WARN: $_[0]\n" };
-
while ($log =~ /^\h*\s*(.*?)\h*$/gm) {
- my $line = $1;
- $warn_fn->($line);
+ PVE::RESTEnvironment::log_warn($1);
}
unlink $log_fn or warn "could not unlink '$log_fn' - $!\n";
}
update_lxc_config($vmid, $conf);
+ eval {
+ my ($id_map, undef, undef) = PVE::LXC::parse_id_maps($conf);
+ PVE::LXC::validate_id_maps($id_map);
+ };
+ warn "lxc.idmap: $@" if $@;
+
my $skiplock_flag_fn = "/run/lxc/skiplock-$vmid";
if ($skiplock) {
}
eval { run_command($cmd, timeout => $shutdown_timeout) };
+
+ # Wait until the command socket is closed.
+ # In case the lxc-stop call failed, reading from the command socket may block forever,
+ # so poll with another timeout to avoid freezing the shutdown task.
if (my $err = $@) {
- warn $@ if $@;
- }
+ warn $err if $err;
- my $result = <$sock>;
+ my $poll = IO::Poll->new();
+ $poll->mask($sock => POLLIN | POLLHUP); # watch for input and EOF events
+ $poll->poll($shutdown_timeout); # IO::Poll timeout is in seconds
+ return if ($poll->events($sock) & POLLHUP);
+ } else {
+ my $result = <$sock>;
+ return if !defined $result; # monitor is gone and the ct has stopped.
+ }
- return if !defined $result; # monitor is gone and the ct has stopped.
die "container did not stop\n";
}
}
my $copy_volume = sub {
- my ($src_volid, $src, $dst_volid, $dest, $storage_cfg, $snapname, $bwlimit, $rootuid, $rootgid) = @_;
+ my ($src_volid, $src, $dst_volid, $dest, $storage_cfg, $snapname, $bwlimit, $root_uid, $root_gid) = @_;
my $src_mp = { volume => $src_volid, mp => '/', ro => 1 };
$src_mp->{type} = PVE::LXC::Config->classify_mountpoint($src_volid);
eval {
# mount and copy
mkdir $src;
- mountpoint_mount($src_mp, $src, $storage_cfg, $snapname, $rootuid, $rootgid);
+ mountpoint_mount($src_mp, $src, $storage_cfg, $snapname, $root_uid, $root_gid);
push @mounted, $src;
mkdir $dest;
- mountpoint_mount($dst_mp, $dest, $storage_cfg, undef, $rootuid, $rootgid);
+ mountpoint_mount($dst_mp, $dest, $storage_cfg, undef, $root_uid, $root_gid);
push @mounted, $dest;
$bwlimit //= 0;
my $src = "/var/lib/lxc/$vmid/.copy-volume-2";
# get id's for unprivileged container
- my (undef, $rootuid, $rootgid) = parse_id_maps($conf);
+ my (undef, $root_uid, $root_gid) = parse_id_maps($conf);
# Allocate the disk before unsharing in order to make sure zfs subvolumes
# are visible in this namespace, otherwise the host only sees the empty
# Make sure $mp contains a correct size.
$mp->{size} = PVE::Storage::volume_size_info($storage_cfg, $mp->{volume});
my $needs_chown;
- ($new_volid, $needs_chown) = alloc_disk($storage_cfg, $vmid, $storage, $mp->{size}/1024, $rootuid, $rootgid);
+ ($new_volid, $needs_chown) = alloc_disk($storage_cfg, $vmid, $storage, $mp->{size}/1024, $root_uid, $root_gid);
if ($needs_chown) {
PVE::Storage::activate_volumes($storage_cfg, [$new_volid], undef);
my $path = PVE::Storage::path($storage_cfg, $new_volid, undef);
- chown($rootuid, $rootgid, $path);
+ chown($root_uid, $root_gid, $path);
}
run_unshared(sub {
- $copy_volume->($mp->{volume}, $src, $new_volid, $dest, $storage_cfg, $snapname, $bwlimit, $rootuid, $rootgid);
+ $copy_volume->($mp->{volume}, $src, $new_volid, $dest, $storage_cfg, $snapname, $bwlimit, $root_uid, $root_gid);
});
};
if (my $err = $@) {
}
}
+# Request an address through PVE::Network::SDN::Vnets::add_next_free_cidr() for every veth
+# network device (netN key) found in the container configuration $conf.
+#
+# Does nothing when the SDN modules could not be loaded. Errors from parsing a network entry
+# or from the SDN call are not caught here.
+sub create_ifaces_ipams_ips {
+    my ($conf, $vmid) = @_;
+
+    return unless $have_sdn;
+
+    my @net_keys = grep { $_ =~ m/^net(\d+)$/ } keys %$conf;
+    for my $key (@net_keys) {
+        my $net = PVE::LXC::Config->parse_lxc_network($conf->{$key});
+        if ($net->{type} eq 'veth') {
+            PVE::Network::SDN::Vnets::add_next_free_cidr(
+                $net->{bridge}, $conf->{hostname}, $net->{hwaddr}, $vmid, undef, 1);
+        }
+    }
+}
+
+
+# Call PVE::Network::SDN::Vnets::del_ips_from_mac() for every network device (netN key) found
+# in the container configuration $conf.
+#
+# Does nothing when the SDN modules could not be loaded. This is best effort: a failing
+# removal only produces a warning and the remaining devices are still processed. Errors from
+# parsing a network entry are not caught.
+sub delete_ifaces_ipams_ips {
+    my ($conf, $vmid) = @_;
+
+    return unless $have_sdn;
+
+    my @net_keys = grep { $_ =~ m/^net(\d+)$/ } keys %$conf;
+    for my $key (@net_keys) {
+        my $net = PVE::LXC::Config->parse_lxc_network($conf->{$key});
+        eval {
+            PVE::Network::SDN::Vnets::del_ips_from_mac(
+                $net->{bridge}, $net->{hwaddr}, $conf->{hostname});
+        };
+        if ($@) {
+            warn $@;
+        }
+    }
+}
+
+
1;