use strict;
use warnings;
-use POSIX qw(EINTR);
-
-use Socket;
-
-use File::Path;
-use File::Spec;
use Cwd qw();
-use Fcntl qw(O_RDONLY O_WRONLY O_NOFOLLOW O_DIRECTORY);
use Errno qw(ELOOP ENOTDIR EROFS ECONNREFUSED ENOSYS EEXIST);
+use Fcntl qw(O_RDONLY O_WRONLY O_NOFOLLOW O_DIRECTORY);
+use File::Path;
+use File::Spec;
+use IO::Poll qw(POLLIN POLLHUP);
use IO::Socket::UNIX;
+use POSIX qw(EINTR);
+use Socket;
+use Time::HiRes qw (gettimeofday);
+use PVE::AccessControl;
+use PVE::CGroup;
+use PVE::CpuSet;
use PVE::Exception qw(raise_perm_exc);
-use PVE::Storage;
-use PVE::SafeSyslog;
+use PVE::GuestHelpers qw(check_vnet_access safe_string_ne safe_num_ne safe_boolean_ne);
use PVE::INotify;
use PVE::JSONSchema qw(get_standard_option);
+use PVE::Network;
+use PVE::ProcFSTools;
+use PVE::RESTEnvironment;
+use PVE::SafeSyslog;
+use PVE::Storage;
use PVE::Tools qw(
run_command
dir_glob_foreach
$IPV4RE
$IPV6RE
);
-use PVE::RPCEnvironment;
-use PVE::CpuSet;
-use PVE::Network;
-use PVE::AccessControl;
-use PVE::ProcFSTools;
use PVE::Syscall qw(:fsmount);
-use PVE::LXC::Config;
-use PVE::GuestHelpers qw(safe_string_ne safe_num_ne safe_boolean_ne);
-use PVE::LXC::Tools;
+
use PVE::LXC::CGroup;
+use PVE::LXC::Config;
use PVE::LXC::Monitor;
-use PVE::CGroup;
+use PVE::LXC::Tools;
-use Time::HiRes qw (gettimeofday);
my $have_sdn;
eval {
require PVE::Network::SDN::Zones;
# some init scripts expect a linux terminal (turnkey).
$raw .= "lxc.environment = TERM=linux\n";
-
+
my $utsname = $conf->{hostname} || "CT$vmid";
$raw .= "lxc.uts.name = $utsname\n";
my $memory = $conf->{memory} || 512;
my $swap = $conf->{swap} // 0;
- my $lxcmem = int($memory*1024*1024);
- $raw .= "lxc.cgroup2.memory.max = $lxcmem\n";
+ # cgroup memory usage is limited by the hard 'max' limit (OOM-killer enforced) and the soft
+ # 'high' limit (cgroup processes get throttled and put under heavy reclaim pressure).
+ my ($lxc_mem_max, $lxc_mem_high) = PVE::LXC::Config::calculate_memory_constraints($memory);
+ $raw .= "lxc.cgroup2.memory.max = $lxc_mem_max\n";
+ $raw .= "lxc.cgroup2.memory.high = $lxc_mem_high\n";
my $lxcswap = int($swap*1024*1024);
$raw .= "lxc.cgroup2.memory.swap.max = $lxcswap\n";
$raw .= "lxc.cgroup.cpu.cfs_quota_us = $value\n";
}
- my $shares = $conf->{cpuunits} || 1024;
+ my $shares = PVE::CGroup::clamp_cpu_shares($conf->{cpuunits});
$raw .= "lxc.cgroup.cpu.shares = $shares\n";
} elsif ($cgv2->{cpu}) {
# See PVE::CGroup
}
if (defined(my $shares = $conf->{cpuunits})) {
- die "cpu weight (shares) must be in range [1, 10000]\n"
- if $shares < 1 || $shares > 10000;
+ $shares = PVE::CGroup::clamp_cpu_shares($shares);
$raw .= "lxc.cgroup2.cpu.weight = $shares\n";
}
}
$raw .= "lxc.net.$ind.veth.pair = veth${vmid}i${ind}\n";
$raw .= "lxc.net.$ind.hwaddr = $d->{hwaddr}\n" if defined($d->{hwaddr});
$raw .= "lxc.net.$ind.name = $d->{name}\n" if defined($d->{name});
- $raw .= "lxc.net.$ind.mtu = $d->{mtu}\n" if defined($d->{mtu});
+
+ my $bridge_mtu = PVE::Network::read_bridge_mtu($d->{bridge});
+ my $mtu = $d->{mtu} || $bridge_mtu;
+
+ # Keep container from starting with invalid mtu configuration
+ die "$k: MTU size '$mtu' is bigger than bridge MTU '$bridge_mtu'\n"
+ if ($mtu > $bridge_mtu);
+
+ $raw .= "lxc.net.$ind.mtu = $mtu\n";
# Starting with lxc 4.0, we do not patch lxc to execute our up-scripts.
if ($lxc_major >= 4) {
warn $@ if $@; # avoid errors - just warn
}
+# Connect the host-side interface of a container NIC to its bridge and bring the link up,
+# or - when the NIC is configured as disconnected - only take the link down.
+#
+# Parameters:
+#   $iface - name of the host-side network device to (un)plug
+#   $net   - network config hash ref; the keys link_down, bridge, tag, firewall, trunks,
+#            rate and hwaddr are read
+#
+# Uses the SDN zone helpers when SDN support was loaded ($have_sdn), the plain
+# PVE::Network helper otherwise.
+sub net_tap_plug : prototype($$) {
+    my ($iface, $net) = @_;
+
+    # link_down is a boolean flag, so test its truthiness rather than definedness: an explicit
+    # 'link_down=0' must plug the interface. This matches update_net, which compares the flag
+    # with safe_boolean_ne and only copies it over when it is true.
+    if ($net->{link_down}) {
+        PVE::Tools::run_command(['/sbin/ip', 'link', 'set', 'dev', $iface, 'down']);
+        # Don't add disconnected interfaces to the bridge, otherwise e.g. applying any network
+        # change (e.g. `ifreload -a`) could (re-)activate it unintentionally.
+        return;
+    }
+
+    my ($bridge, $tag, $firewall, $trunks, $rate, $hwaddr) =
+        $net->@{'bridge', 'tag', 'firewall', 'trunks', 'rate', 'hwaddr'};
+
+    if ($have_sdn) {
+        PVE::Network::SDN::Zones::tap_plug($iface, $bridge, $tag, $firewall, $trunks, $rate);
+        PVE::Network::SDN::Zones::add_bridge_fdb($iface, $hwaddr, $bridge, $firewall);
+    } else {
+        PVE::Network::tap_plug($iface, $bridge, $tag, $firewall, $trunks, $rate, { mac => $hwaddr });
+    }
+
+    PVE::Tools::run_command(['/sbin/ip', 'link', 'set', 'dev', $iface, 'up']);
+}
+
sub update_net {
my ($vmid, $conf, $opt, $newnet, $netid, $rootdir) = @_;
} else {
if (safe_string_ne($oldnet->{bridge}, $newnet->{bridge}) ||
safe_num_ne($oldnet->{tag}, $newnet->{tag}) ||
- safe_num_ne($oldnet->{firewall}, $newnet->{firewall})) {
+ safe_num_ne($oldnet->{firewall}, $newnet->{firewall}) ||
+ safe_boolean_ne($oldnet->{link_down}, $newnet->{link_down})
+ ) {
if ($oldnet->{bridge}) {
PVE::Network::tap_unplug($veth);
PVE::LXC::Config->write_config($vmid, $conf);
}
- if ($have_sdn) {
- PVE::Network::SDN::Zones::tap_plug($veth, $newnet->{bridge}, $newnet->{tag}, $newnet->{firewall}, $newnet->{trunks}, $newnet->{rate});
- } else {
- PVE::Network::tap_plug($veth, $newnet->{bridge}, $newnet->{tag}, $newnet->{firewall}, $newnet->{trunks}, $newnet->{rate});
- }
+ PVE::LXC::net_tap_plug($veth, $newnet);
# This includes the rate:
- foreach (qw(bridge tag firewall rate)) {
+ foreach (qw(bridge tag firewall rate link_down)) {
$oldnet->{$_} = $newnet->{$_} if $newnet->{$_};
}
} elsif (safe_string_ne($oldnet->{rate}, $newnet->{rate})) {
if ($have_sdn) {
PVE::Network::SDN::Zones::veth_create($veth, $vethpeer, $newnet->{bridge}, $newnet->{hwaddr});
- PVE::Network::SDN::Zones::tap_plug($veth, $newnet->{bridge}, $newnet->{tag}, $newnet->{firewall}, $newnet->{trunks}, $newnet->{rate});
} else {
PVE::Network::veth_create($veth, $vethpeer, $newnet->{bridge}, $newnet->{hwaddr});
- PVE::Network::tap_plug($veth, $newnet->{bridge}, $newnet->{tag}, $newnet->{firewall}, $newnet->{trunks}, $newnet->{rate});
}
+ PVE::LXC::net_tap_plug($veth, $newnet);
+
# attach peer in container
my $cmd = ['lxc-device', '-n', $vmid, 'add', $vethpeer, "$eth" ];
PVE::Tools::run_command($cmd);
PVE::Tools::run_command($cmd);
my $done = { type => 'veth' };
- foreach (qw(bridge tag firewall hwaddr name)) {
+ foreach (qw(bridge tag firewall hwaddr name link_down)) {
$done->{$_} = $newnet->{$_} if $newnet->{$_};
}
$conf->{$opt} = PVE::LXC::Config->print_lxc_network($done);
} elsif ($opt =~ m/^net\d+$/ || $opt eq 'nameserver' ||
$opt eq 'searchdomain' || $opt eq 'hostname') {
$rpcenv->check_vm_perm($authuser, $vmid, $pool, ['VM.Config.Network']);
+ PVE::LXC::check_bridge_access($rpcenv, $authuser, $newconf->{$opt});
} elsif ($opt eq 'features') {
raise_perm_exc("changing feature flags for privileged container is only allowed for root\@pam")
if !$unprivileged;
} elsif ($opt eq 'hookscript') {
# For now this is restricted to root@pam
raise_perm_exc("changing the hookscript is only allowed for root\@pam");
+ } elsif ($opt eq 'tags') {
+ my $old = $oldconf->{$opt};
+ my $new = $delete ? '' : $newconf->{$opt};
+ PVE::GuestHelpers::assert_tag_permissions($vmid, $old, $new, $rpcenv, $authuser);
} else {
$rpcenv->check_vm_perm($authuser, $vmid, $pool, ['VM.Config.Options']);
}
return 1;
}
+# Check whether $authuser may use the bridge, VLAN tag and trunks named in the raw
+# network property string $raw.
+#
+# root@pam skips the check entirely. For every other user the string is parsed with
+# PVE::LXC::Config->parse_lxc_network and the decision is delegated to check_vnet_access
+# (presumably it raises a permission exception on denial - confirm in PVE::GuestHelpers).
+#
+# Returns 1 whenever it returns.
+sub check_bridge_access {
+    my ($rpcenv, $authuser, $raw) = @_;
+
+    if ($authuser ne 'root@pam') {
+        my $parsed = PVE::LXC::Config->parse_lxc_network($raw);
+        my ($bridge, $tag, $trunks) = ($parsed->{bridge}, $parsed->{tag}, $parsed->{trunks});
+        check_vnet_access($rpcenv, $authuser, $bridge, $tag, $trunks);
+    }
+
+    return 1;
+}
+
sub umount_all {
my ($vmid, $storage_cfg, $conf, $noerr) = @_;
my $type = $mountpoint->{type};
my $quota = !$snapname && !$mountpoint->{ro} && $mountpoint->{quota};
my $mounted_dev;
-
+
return if !$volid || !$mount;
$mount =~ s!/+!/!g;
my $mount_path;
my ($mpfd, $parentfd, $last_dir);
-
+
if (defined($rootdir)) {
($rootdir, $mount_path, $mpfd, $parentfd, $last_dir) =
__mount_prepare_rootdir($rootdir, $mount, $rootuid, $rootgid);
if (defined($stage_mount)) {
$mount_path = $rootdir;
}
-
+
my ($storage, $volname) = PVE::Storage::parse_volume_id($volid, 1);
die "unknown snapshot path for '$volid'" if !$storage && defined($snapname);
warn "cannot enable quota control for bind mounts\n" if $quota;
return wantarray ? ($volid, 0, undef) : $volid;
}
-
+
die "unsupported storage";
}
-sub mountpoint_hotplug($$$) {
+sub mountpoint_hotplug :prototype($$$$$) {
my ($vmid, $conf, $opt, $mp, $storage_cfg) = @_;
my (undef, $rootuid, $rootgid) = PVE::LXC::parse_id_maps($conf);
$changes = 1;
print "$prefix updated volume size of '$mp->{volume}' in config.\n";
$mp->{size} = $size;
- my $nomp = 1 if ($key eq 'rootfs');
- $conf->{$key} = PVE::LXC::Config->print_ct_mountpoint($mp, $nomp);
+ my $no_mp = $key eq 'rootfs'; # rootfs is handled different from other mount points
+ $conf->{$key} = PVE::LXC::Config->print_ct_mountpoint($mp, $no_mp);
}
};
my $lxc = $conf->{lxc};
foreach my $entry (@$lxc) {
my ($key, $value) = @$entry;
- # FIXME: remove the 'id_map' variant when lxc-3.0 arrives
- next if $key ne 'lxc.idmap' && $key ne 'lxc.id_map';
+
+ next if $key ne 'lxc.idmap';
+
if ($value =~ /^([ug])\s+(\d+)\s+(\d+)\s+(\d+)\s*$/) {
my ($type, $ct, $host, $length) = ($1, $2, $3, $4);
push @$id_map, [$type, $ct, $host, $length];
return ($id_map, $rootuid, $rootgid);
}
+# Verify that no two id mappings of the same type overlap.
+#
+# $id_map is an array ref of [ $type, $ct_start, $host_start, $count ] entries, where $type
+# is "u" (uid) or "g" (gid). For each type the container-side and the host-side intervals
+# are checked independently.
+#
+# Dies on the first overlap found, naming the offending entry, the conflicting id and the
+# entry that already maps it. Returns normally when no overlap exists.
+sub validate_id_maps {
+    my ($id_map) = @_;
+
+    # $ranges->{$type}->{$side} = [ { line => $line, start => $start, count => $count }, ... ]
+    # $type: either "u" or "g"
+    # $side: either "container" or "host"
+    # $line: index of the mapping in @$id_map
+    # $start, $count: interval covered by the mapping
+    my $ranges = { u => {}, g => {} };
+    my $total = scalar(@$id_map);
+    for my $line (0 .. $total - 1) {
+        my ($type, $ct_start, $host_start, $count) = @{ $id_map->[$line] };
+        my $by_side = $ranges->{$type};
+        push @{ $by_side->{host} }, { line => $line, start => $host_start, count => $count };
+        push @{ $by_side->{container} }, { line => $line, start => $ct_start, count => $count };
+    }
+
+    # Once sorted by start id, any overlap shows up between two neighbouring intervals, so
+    # walking the list with a previous/current pair finds the first conflict.
+    for my $type (qw(u g)) {
+        for my $side (qw(container host)) {
+            my @sorted = sort { $a->{start} <=> $b->{start} } @{ $ranges->{$type}->{$side} };
+            my $prev;
+            for my $cur (@sorted) {
+                if (defined($prev) && $prev->{start} + $prev->{count} > $cur->{start}) {
+                    my $conflict = $cur->{start};
+                    my @prev_fields = @{ $id_map->[$prev->{line}] };
+                    my @cur_fields = @{ $id_map->[$cur->{line}] };
+                    die "invalid map entry '@cur_fields': $side ${type}id $conflict "
+                        ."is also mapped by entry '@prev_fields'\n";
+                }
+                $prev = $cur;
+            }
+        }
+    }
+}
+
sub userns_command {
my ($id_map) = @_;
if (@$id_map) {
my $log = eval { file_get_contents($log_fn) };
return if !$log;
- my $rpcenv = eval { PVE::RPCEnvironment::get() };
-
- my $warn_fn = $rpcenv ? sub { $rpcenv->warn($_[0]) } : sub { print STDERR "WARN: $_[0]\n" };
-
while ($log =~ /^\h*\s*(.*?)\h*$/gm) {
- my $line = $1;
- $warn_fn->($line);
+ PVE::RESTEnvironment::log_warn($1);
}
unlink $log_fn or warn "could not unlink '$log_fn' - $!\n";
}
update_lxc_config($vmid, $conf);
+ eval {
+ my ($id_map, undef, undef) = PVE::LXC::parse_id_maps($conf);
+ PVE::LXC::validate_id_maps($id_map);
+ };
+ warn "lxc.idmap: $@" if $@;
+
my $skiplock_flag_fn = "/run/lxc/skiplock-$vmid";
if ($skiplock) {
}
eval { run_command($cmd, timeout => $shutdown_timeout) };
+
+ # Wait until the command socket is closed.
+ # In case the lxc-stop call failed, reading from the command socket may block forever,
+ # so poll with another timeout to avoid freezing the shutdown task.
if (my $err = $@) {
- warn $@ if $@;
- }
+ warn $err if $err;
- my $result = <$sock>;
+ my $poll = IO::Poll->new();
+ $poll->mask($sock => POLLIN | POLLHUP); # watch for input and EOF events
+ $poll->poll($shutdown_timeout); # IO::Poll timeout is in seconds
+ return if ($poll->events($sock) & POLLHUP);
+ } else {
+ my $result = <$sock>;
+ return if !defined $result; # monitor is gone and the ct has stopped.
+ }
- return if !defined $result; # monitor is gone and the ct has stopped.
die "container did not stop\n";
}