use File::Spec;
use Cwd qw();
use Fcntl qw(O_RDONLY O_NOFOLLOW O_DIRECTORY);
-use Errno qw(ELOOP EROFS);
+use Errno qw(ELOOP ENOTDIR EROFS ECONNREFUSED);
+use IO::Socket::UNIX;
use PVE::Exception qw(raise_perm_exc);
use PVE::Storage;
use PVE::SafeSyslog;
use PVE::INotify;
+use PVE::JSONSchema qw(get_standard_option);
use PVE::Tools qw($IPV6RE $IPV4RE dir_glob_foreach lock_file lock_file_full O_PATH);
+use PVE::CpuSet;
use PVE::Network;
use PVE::AccessControl;
use PVE::ProcFSTools;
+use PVE::Syscall;
use PVE::LXC::Config;
+
use Time::HiRes qw (gettimeofday);
-use Data::Dumper;
+my $LXC_CONFIG_PATH = '/usr/share/lxc/config';
my $nodename = PVE::INotify::nodename();
my $cpuinfo= PVE::ProcFSTools::read_cpuinfo();
-our $COMMON_TAR_FLAGS = [ '--sparse', '--numeric-owner', '--acls',
- '--xattrs',
- '--xattrs-include=user.*',
- '--xattrs-include=security.capability',
- '--warning=no-xattr-write' ];
-
sub config_list {
my $vmlist = PVE::Cluster::get_vmlist();
my $res = {};
my $d = $ids->{$vmid};
next if !$d->{node} || $d->{node} ne $nodename;
next if !$d->{type} || $d->{type} ne 'lxc';
- $res->{$vmid}->{type} = 'lxc';
+ $res->{$vmid} = { type => 'lxc', vmid => $vmid };
}
return $res;
}
my $last_proc_vmid_stat;
my $parse_cpuacct_stat = sub {
- my ($vmid) = @_;
+ my ($vmid, $unprivileged) = @_;
- my $raw = read_cgroup_value('cpuacct', $vmid, 'cpuacct.stat', 1);
+ my $raw = read_cgroup_value('cpuacct', $vmid, $unprivileged, 'cpuacct.stat', 1);
my $stat = {};
return $stat;
};
+our $vmstatus_return_properties = { # property schema keyed by field name; presumably describes one vmstatus() entry -- TODO confirm with the API callers
+ vmid => get_standard_option('pve-vmid'),
+ status => { # the only mandatory field besides vmid (no 'optional' flag)
+ description => "LXC Container status.",
+ type => 'string',
+ enum => ['stopped', 'running'],
+ },
+ maxmem => {
+ description => "Maximum memory in bytes.",
+ type => 'integer',
+ optional => 1,
+ renderer => 'bytes',
+ },
+ maxswap => {
+ description => "Maximum SWAP memory in bytes.",
+ type => 'integer',
+ optional => 1,
+ renderer => 'bytes',
+ },
+ maxdisk => {
+ description => "Root disk size in bytes.",
+ type => 'integer',
+ optional => 1,
+ renderer => 'bytes',
+ },
+ name => {
+ description => "Container name.",
+ type => 'string',
+ optional => 1,
+ },
+ uptime => {
+ description => "Uptime.",
+ type => 'integer',
+ optional => 1,
+ renderer => 'duration',
+ },
+ cpus => {
+ description => "Maximum usable CPUs.",
+ type => 'number', # 'number' rather than 'integer'; NOTE(review): vmstatus() may fill cpus from cpulimit -- confirm it can be fractional
+ optional => 1,
+ },
+};
+
sub vmstatus {
my ($opt_vmid) = @_;
- my $list = $opt_vmid ? { $opt_vmid => { type => 'lxc' }} : config_list();
+ my $list = $opt_vmid ? { $opt_vmid => { type => 'lxc', vmid => $opt_vmid }} : config_list();
my $active_hash = list_active_containers();
my $cdtime = gettimeofday;
my $uptime = (PVE::ProcFSTools::read_proc_uptime(1))[0];
+ my $clock_ticks = POSIX::sysconf(&POSIX::_SC_CLK_TCK);
+
+ my $unprivileged = {};
foreach my $vmid (keys %$list) {
my $d = $list->{$vmid};
my $cfspath = PVE::LXC::Config->cfs_config_path($vmid);
my $conf = PVE::Cluster::cfs_read_file($cfspath) || {};
+ $unprivileged->{$vmid} = $conf->{unprivileged};
+
$d->{name} = $conf->{'hostname'} || "CT$vmid";
$d->{name} =~ s/[\s]//g;
- $d->{cpus} = $conf->{cpulimit} || $cpucount;
+ $d->{cpus} = $conf->{cores} || $conf->{cpulimit};
+ $d->{cpus} = $cpucount if !$d->{cpus};
$d->{lock} = $conf->{lock} || '';
next if !$pid; # skip stopped CTs
- my $ctime = (stat("/proc/$pid"))[10]; # 10 = ctime
- $d->{uptime} = time - $ctime; # the method lxcfs uses
+ my $proc_pid_stat = PVE::ProcFSTools::read_proc_pid_stat($pid);
+ $d->{uptime} = int(($uptime - $proc_pid_stat->{starttime}) / $clock_ticks); # the method lxcfs uses
+
+ my $unpriv = $unprivileged->{$vmid};
+
+ if (-d '/sys/fs/cgroup/memory') {
+ my $memory_stat = read_cgroup_list('memory', $vmid, $unpriv, 'memory.stat');
+ my $mem_usage_in_bytes = read_cgroup_value('memory', $vmid, $unpriv, 'memory.usage_in_bytes');
- $d->{mem} = read_cgroup_value('memory', $vmid, 'memory.usage_in_bytes');
- $d->{swap} = read_cgroup_value('memory', $vmid, 'memory.memsw.usage_in_bytes') - $d->{mem};
+ $d->{mem} = $mem_usage_in_bytes - $memory_stat->{total_cache};
+ $d->{swap} = read_cgroup_value('memory', $vmid, $unpriv, 'memory.memsw.usage_in_bytes') - $mem_usage_in_bytes;
+ } else {
+ $d->{mem} = 0;
+ $d->{swap} = 0;
+ }
- my $blkio_bytes = read_cgroup_value('blkio', $vmid, 'blkio.throttle.io_service_bytes', 1);
- my @bytes = split(/\n/, $blkio_bytes);
- foreach my $byte (@bytes) {
- if (my ($key, $value) = $byte =~ /(Read|Write)\s+(\d+)/) {
- $d->{diskread} = $2 if $key eq 'Read';
- $d->{diskwrite} = $2 if $key eq 'Write';
+ if (-d '/sys/fs/cgroup/blkio') {
+ my $blkio_bytes = read_cgroup_value('blkio', $vmid, $unpriv, 'blkio.throttle.io_service_bytes', 1);
+ my @bytes = split(/\n/, $blkio_bytes);
+ foreach my $byte (@bytes) {
+ if (my ($key, $value) = $byte =~ /(Read|Write)\s+(\d+)/) {
+ $d->{diskread} += $2 if $key eq 'Read';
+ $d->{diskwrite} += $2 if $key eq 'Write';
+ }
}
+ } else {
+ $d->{diskread} = 0;
+ $d->{diskwrite} = 0;
}
- my $pstat = &$parse_cpuacct_stat($vmid);
+ if (-d '/sys/fs/cgroup/cpuacct') {
+ my $pstat = $parse_cpuacct_stat->($vmid, $unpriv);
- my $used = $pstat->{utime} + $pstat->{stime};
+ my $used = $pstat->{utime} + $pstat->{stime};
- my $old = $last_proc_vmid_stat->{$vmid};
- if (!$old) {
- $last_proc_vmid_stat->{$vmid} = {
- time => $cdtime,
- used => $used,
- cpu => 0,
- };
- next;
- }
+ my $old = $last_proc_vmid_stat->{$vmid};
+ if (!$old) {
+ $last_proc_vmid_stat->{$vmid} = {
+ time => $cdtime,
+ used => $used,
+ cpu => 0,
+ };
+ next;
+ }
- my $dtime = ($cdtime - $old->{time}) * $cpucount * $cpuinfo->{user_hz};
+ my $dtime = ($cdtime - $old->{time}) * $cpucount * $cpuinfo->{user_hz};
- if ($dtime > 1000) {
- my $dutime = $used - $old->{used};
+ if ($dtime > 1000) {
+ my $dutime = $used - $old->{used};
- $d->{cpu} = (($dutime/$dtime)* $cpucount) / $d->{cpus};
- $last_proc_vmid_stat->{$vmid} = {
- time => $cdtime,
- used => $used,
- cpu => $d->{cpu},
- };
+ $d->{cpu} = (($dutime/$dtime)* $cpucount) / $d->{cpus};
+ $last_proc_vmid_stat->{$vmid} = {
+ time => $cdtime,
+ used => $used,
+ cpu => $d->{cpu},
+ };
+ } else {
+ $d->{cpu} = $old->{cpu};
+ }
} else {
- $d->{cpu} = $old->{cpu};
+ $d->{cpu} = 0;
}
}
return $list;
}
-sub read_cgroup_value {
- my ($group, $vmid, $name, $full) = @_;
+sub read_cgroup_list($$$$) {
+ my ($group, $vmid, $unprivileged, $name) = @_;
- my $path = "/sys/fs/cgroup/$group/lxc/$vmid/$name";
+ my $content = read_cgroup_value($group, $vmid, $unprivileged, $name, 1);
+
+ return { split(/\s+/, $content) };
+}
+
+sub read_cgroup_value($$$$$) {
+ my ($group, $vmid, $unprivileged, $name, $full) = @_;
+
+ my $nsdir = $unprivileged ? '' : 'ns/';
+ my $path = "/sys/fs/cgroup/$group/lxc/$vmid/${nsdir}$name";
return PVE::Tools::file_get_contents($path) if $full;
die "unable to parse ipv4 address/mask\n";
}
+# Report which cgroup controllers show up in /proc/self/cgroup.
+#
+# Every "<id>:<controllers>:<path>" line with a non-empty controller field
+# contributes its comma separated controller names as keys (value 1) of the
+# returned hash ref. A line whose controller field is empty sets a separate
+# flag to 1 (the cgroup v2 style entry).
+#
+# Returns (hash ref, flag) in list context and only the hash ref otherwise.
+sub get_cgroup_subsystems {
+    my $controllers = {};
+    my $have_v2 = 0;
+
+    my $proc_cgroup = PVE::Tools::file_get_contents('/proc/self/cgroup');
+    while ($proc_cgroup =~ /^\d+:([^:\n]*):.*$/gm) {
+        my $field = $1;
+        if ($field eq '') {
+            $have_v2 = 1;
+            next;
+        }
+        $controllers->{$_} = 1 for split(/,/, $field);
+    }
+
+    return $controllers if !wantarray;
+    return ($controllers, $have_v2);
+}
+
+
+# Currently we do not need to create seccomp profile 'files' as the only
+# choice our configuration actually allows is "with or without keyctl()",
+# so we distinguish between using lxc's "default" seccomp profile and our
+# added pve-userns.seccomp file.
+#
+# This returns a configuration line added to the raw lxc config.
+sub make_seccomp_config {
+    my ($conf, $unprivileged, $features) = @_;
+
+    # A profile the user put into the raw lxc section always wins (it is
+    # written out after our line anyway), so emit nothing here and only point
+    # out that the keyctl feature flag is being overridden by it.
+    if (PVE::LXC::Config->has_lxc_entry($conf, 'lxc.seccomp.profile')) {
+        warn "explicitly configured lxc.seccomp.profile overrides the following settings: features:keyctl\n"
+            if $features->{keyctl};
+        return '';
+    }
+
+    # Only an unprivileged container without an explicit `keyctl` feature gets
+    # our prepared profile (keyctl() is disabled there as a workaround for
+    # systemd-networkd behavior). Privileged containers, and unprivileged ones
+    # that enabled keyctl, stay on the default profile that is already pulled
+    # in via lxc.include, which keeps it admin-configurable below
+    # /usr/share/lxc/config/.
+    my $use_pve_profile = $unprivileged && !$features->{keyctl};
+
+    return $use_pve_profile
+        ? "lxc.seccomp.profile = $LXC_CONFIG_PATH/pve-userns.seccomp\n"
+        : '';
+}
+
+
+# Since lxc-3.0.2 we can have lxc generate a profile for the container
+# automatically. The default should be equivalent to the old
+# `lxc-container-default-cgns` profile.
+#
+# Additionally this also added `lxc.apparmor.raw` which can be used to inject
+# additional lines into the profile. We can use that to allow mounting specific
+# file systems.
+sub make_apparmor_config {
+    my ($conf, $unprivileged, $features) = @_;
+
+    # The lines are always assembled first, even when a user-configured
+    # profile ends up taking precedence, because that is how we learn which
+    # feature flags such a profile overrides (for the warning below).
+    my @lines = ("lxc.apparmor.profile = generated\n");
+    my @overridden;
+
+    if ($features->{nesting}) {
+        # lxc.apparmor.allow_nesting adds the required apparmor lines and an
+        # apparmor namespace for the container. It also adds proc and sysfs
+        # mounts at /dev/.lxc/{proc,sys}; those have no lxcfs mounted over
+        # them, as that would prevent mounting new instances for nested
+        # containers.
+        push @overridden, 'features:nesting';
+        push @lines, "lxc.apparmor.allow_nesting = 1\n";
+    } else {
+        # Same restriction we patch into the default profile in
+        # /etc/apparmor.d: otherwise a container could e.g. `chown` /sys and
+        # break access for non-CAP_DAC_OVERRIDE tools on the host.
+        push @lines, "lxc.apparmor.raw = deny mount -> /proc/,\n";
+        push @lines, "lxc.apparmor.raw = deny mount -> /sys/,\n";
+        # The 'remount' flag would be preferable, but apparmor_parser
+        # currently does not cope well with it:
+        #  mount options=(rw, nosuid, nodev, noexec, remount) -> /sys/,
+    }
+
+    my $mount = $features->{mount};
+    if ($mount) {
+        push @overridden, 'features:mount';
+        foreach my $fs (PVE::Tools::split_list($mount)) {
+            push @lines, "lxc.apparmor.raw = mount fstype=$fs,\n";
+        }
+    }
+
+    # More to come?
+
+    # An explicit lxc.apparmor.profile entry replaces everything built above.
+    if (PVE::LXC::Config->has_lxc_entry($conf, 'lxc.apparmor.profile')) {
+        my $used = join(', ', @overridden);
+        warn "explicitly configured lxc.apparmor.profile overrides the following settings: $used\n"
+            if length($used);
+        return '';
+    }
+
+    return join('', @lines);
+}
sub update_lxc_config {
my ($vmid, $conf) = @_;
die "missing 'arch' - internal error" if !$conf->{arch};
$raw .= "lxc.arch = $conf->{arch}\n";
- my $unprivileged = $conf->{unprivileged};
- my $custom_idmap = grep { $_->[0] eq 'lxc.id_map' } @{$conf->{lxc}};
+ my $custom_idmap = PVE::LXC::Config->has_lxc_entry($conf, 'lxc.idmap');
+ my $unprivileged = $conf->{unprivileged} || $custom_idmap;
my $ostype = $conf->{ostype} || die "missing 'ostype' - internal error";
- if ($ostype =~ /^(?:debian | ubuntu | centos | fedora | opensuse | archlinux | alpine | gentoo | unmanaged)$/x) {
- my $inc ="/usr/share/lxc/config/$ostype.common.conf";
- $inc ="/usr/share/lxc/config/common.conf" if !-f $inc;
+
+ my $cfgpath = '/usr/share/lxc/config';
+ my $inc = "$cfgpath/$ostype.common.conf";
+ $inc ="$cfgpath/common.conf" if !-f $inc;
+ $raw .= "lxc.include = $inc\n";
+ if ($unprivileged) {
+ $inc = "$cfgpath/$ostype.userns.conf";
+ $inc = "$cfgpath/userns.conf" if !-f $inc;
$raw .= "lxc.include = $inc\n";
- if ($unprivileged || $custom_idmap) {
- $inc = "/usr/share/lxc/config/$ostype.userns.conf";
- $inc = "/usr/share/lxc/config/userns.conf" if !-f $inc;
- $raw .= "lxc.include = $inc\n"
- }
- } else {
- die "implement me (ostype $ostype)";
}
+ my $features = PVE::LXC::Config->parse_features($conf->{features});
+
+ $raw .= make_seccomp_config($conf, $unprivileged, $features);
+ $raw .= make_apparmor_config($conf, $unprivileged, $features);
+
# WARNING: DO NOT REMOVE this without making sure that loop device nodes
# cannot be exposed to the container with r/w access (cgroup perms).
# When this is enabled mounts will still remain in the monitor's namespace
# files while the container is running!
$raw .= "lxc.monitor.unshare = 1\n";
+ my $cgv1 = get_cgroup_subsystems();
+
# Should we read them from /etc/subuid?
if ($unprivileged && !$custom_idmap) {
- $raw .= "lxc.id_map = u 0 100000 65536\n";
- $raw .= "lxc.id_map = g 0 100000 65536\n";
+ $raw .= "lxc.idmap = u 0 100000 65536\n";
+ $raw .= "lxc.idmap = g 0 100000 65536\n";
}
if (!PVE::LXC::Config->has_dev_console($conf)) {
- $raw .= "lxc.console = none\n";
- $raw .= "lxc.cgroup.devices.deny = c 5:1 rwm\n";
+ $raw .= "lxc.console.path = none\n";
+ $raw .= "lxc.cgroup.devices.deny = c 5:1 rwm\n" if $cgv1->{devices};
}
my $ttycount = PVE::LXC::Config->get_tty_count($conf);
- $raw .= "lxc.tty = $ttycount\n";
+ $raw .= "lxc.tty.max = $ttycount\n";
# some init scripts expect a linux terminal (turnkey).
$raw .= "lxc.environment = TERM=linux\n";
my $utsname = $conf->{hostname} || "CT$vmid";
- $raw .= "lxc.utsname = $utsname\n";
-
- my $memory = $conf->{memory} || 512;
- my $swap = $conf->{swap} // 0;
+ $raw .= "lxc.uts.name = $utsname\n";
- my $lxcmem = int($memory*1024*1024);
- $raw .= "lxc.cgroup.memory.limit_in_bytes = $lxcmem\n";
+ if ($cgv1->{memory}) {
+ my $memory = $conf->{memory} || 512;
+ my $swap = $conf->{swap} // 0;
- my $lxcswap = int(($memory + $swap)*1024*1024);
- $raw .= "lxc.cgroup.memory.memsw.limit_in_bytes = $lxcswap\n";
+ my $lxcmem = int($memory*1024*1024);
+ $raw .= "lxc.cgroup.memory.limit_in_bytes = $lxcmem\n";
- if (my $cpulimit = $conf->{cpulimit}) {
- $raw .= "lxc.cgroup.cpu.cfs_period_us = 100000\n";
- my $value = int(100000*$cpulimit);
- $raw .= "lxc.cgroup.cpu.cfs_quota_us = $value\n";
+ my $lxcswap = int(($memory + $swap)*1024*1024);
+ $raw .= "lxc.cgroup.memory.memsw.limit_in_bytes = $lxcswap\n";
}
- my $shares = $conf->{cpuunits} || 1024;
- $raw .= "lxc.cgroup.cpu.shares = $shares\n";
+ if ($cgv1->{cpu}) {
+ if (my $cpulimit = $conf->{cpulimit}) {
+ $raw .= "lxc.cgroup.cpu.cfs_period_us = 100000\n";
+ my $value = int(100000*$cpulimit);
+ $raw .= "lxc.cgroup.cpu.cfs_quota_us = $value\n";
+ }
+
+ my $shares = $conf->{cpuunits} || 1024;
+ $raw .= "lxc.cgroup.cpu.shares = $shares\n";
+ }
die "missing 'rootfs' configuration\n"
if !defined($conf->{rootfs});
my $mountpoint = PVE::LXC::Config->parse_ct_rootfs($conf->{rootfs});
- $raw .= "lxc.rootfs = $dir/rootfs\n";
+ $raw .= "lxc.rootfs.path = $dir/rootfs\n";
- my $netcount = 0;
foreach my $k (sort keys %$conf) {
next if $k !~ m/^net(\d+)$/;
my $ind = $1;
my $d = PVE::LXC::Config->parse_lxc_network($conf->{$k});
- $netcount++;
- $raw .= "lxc.network.type = veth\n";
- $raw .= "lxc.network.veth.pair = veth${vmid}i${ind}\n";
- $raw .= "lxc.network.hwaddr = $d->{hwaddr}\n" if defined($d->{hwaddr});
- $raw .= "lxc.network.name = $d->{name}\n" if defined($d->{name});
- $raw .= "lxc.network.mtu = $d->{mtu}\n" if defined($d->{mtu});
+ $raw .= "lxc.net.$ind.type = veth\n";
+ $raw .= "lxc.net.$ind.veth.pair = veth${vmid}i${ind}\n";
+ $raw .= "lxc.net.$ind.hwaddr = $d->{hwaddr}\n" if defined($d->{hwaddr});
+ $raw .= "lxc.net.$ind.name = $d->{name}\n" if defined($d->{name});
+ $raw .= "lxc.net.$ind.mtu = $d->{mtu}\n" if defined($d->{mtu});
}
- if (my $lxcconf = $conf->{lxc}) {
- foreach my $entry (@$lxcconf) {
- my ($k, $v) = @$entry;
- $netcount++ if $k eq 'lxc.network.type';
- $raw .= "$k = $v\n";
+ if ($cgv1->{cpuset}) {
+ my $had_cpuset = 0;
+ if (my $lxcconf = $conf->{lxc}) {
+ foreach my $entry (@$lxcconf) {
+ my ($k, $v) = @$entry;
+ $had_cpuset = 1 if $k eq 'lxc.cgroup.cpuset.cpus';
+ $raw .= "$k = $v\n";
+ }
+ }
+
+ my $cores = $conf->{cores};
+ if (!$had_cpuset && $cores) {
+ my $cpuset = eval { PVE::CpuSet->new_from_cgroup('lxc', 'effective_cpus') };
+ $cpuset = PVE::CpuSet->new_from_cgroup('', 'effective_cpus') if !$cpuset;
+ my @members = $cpuset->members();
+ while (scalar(@members) > $cores) {
+ my $randidx = int(rand(scalar(@members)));
+ $cpuset->delete($members[$randidx]);
+ splice(@members, $randidx, 1); # keep track of the changes
+ }
+ $raw .= "lxc.cgroup.cpuset.cpus = ".$cpuset->short_string()."\n";
}
}
- $raw .= "lxc.network.type = empty\n" if !$netcount;
-
File::Path::mkpath("$dir/rootfs");
PVE::Tools::file_set_contents("$dir/config", $raw);
}
sub get_console_command {
- my ($vmid, $conf) = @_;
+ my ($vmid, $conf, $escapechar) = @_; # $escapechar is optional; a false value adds no '-e' option below
+
+ # '-1' as $escapechar disables keyboard escape sequence
+ # any other passed char (a-z) will result in <Ctrl+$escapechar q>
my $cmode = PVE::LXC::Config->get_cmode($conf);
+ my $cmd = []; # argv list, filled per console mode and returned as an array ref
if ($cmode eq 'console') {
- return ['lxc-console', '-n', $vmid, '-t', 0];
+ push @$cmd, 'lxc-console', '-n', $vmid, '-t', 0;
+ push @$cmd, '-e', $escapechar if $escapechar; # forward the escape char to lxc-console
} elsif ($cmode eq 'tty') {
- return ['lxc-console', '-n', $vmid];
+ push @$cmd, 'lxc-console', '-n', $vmid;
+ push @$cmd, '-e', $escapechar if $escapechar; # forward the escape char to lxc-console
} elsif ($cmode eq 'shell') {
- return ['lxc-attach', '--clear-env', '-n', $vmid];
+ push @$cmd, 'lxc-attach', '--clear-env', '-n', $vmid; # $escapechar is not used in shell mode
} else {
die "internal error";
}
+
+ return $cmd;
}
sub get_primary_ips {
my $newip = $newnet->{$ip};
my $newgw = $newnet->{$gw};
my $oldip = $optdata->{$ip};
+ my $oldgw = $optdata->{$gw};
my $change_ip = &$safe_string_ne($oldip, $newip);
- my $change_gw = &$safe_string_ne($optdata->{$gw}, $newgw);
+ my $change_gw = &$safe_string_ne($oldgw, $newgw);
return if !$change_ip && !$change_gw;
# warn and continue
warn $@ if $@;
}
+ if ($oldgw && $oldip && !PVE::Network::is_ip_in_cidr($oldgw, $oldip)) {
+ eval { &$ipcmd($family_opt, 'route', 'del', $oldgw, 'dev', $eth); };
+ # warn if the route was deleted manually
+ warn $@ if $@;
+ }
}
# from this point on we save the configuration
my $storecfg = PVE::Storage::config();
- my $rootinfo = PVE::LXC::Config->parse_ct_rootfs($conf->{rootfs});
- my $volid = $rootinfo->{volume};
+ PVE::LXC::Config->foreach_mountpoint($conf, sub {
+ my ($ms, $mountpoint) = @_;
+
+ my $volid = $mountpoint->{volume};
+
+ die "Template feature is not available for '$volid'\n"
+ if !PVE::Storage::volume_has_feature($storecfg, 'template', $volid);
+ });
- die "Template feature is not available for '$volid'\n"
- if !PVE::Storage::volume_has_feature($storecfg, 'template', $volid);
+ PVE::LXC::Config->foreach_mountpoint($conf, sub {
+ my ($ms, $mountpoint) = @_;
+
+ my $volid = $mountpoint->{volume};
- PVE::Storage::activate_volumes($storecfg, [$volid]);
+ PVE::Storage::activate_volumes($storecfg, [$volid]);
- my $template_volid = PVE::Storage::vdisk_create_base($storecfg, $volid);
- $rootinfo->{volume} = $template_volid;
- $conf->{rootfs} = PVE::LXC::Config->print_ct_mountpoint($rootinfo, 1);
+ my $template_volid = PVE::Storage::vdisk_create_base($storecfg, $volid);
+ $mountpoint->{volume} = $template_volid;
+ $conf->{$ms} = PVE::LXC::Config->print_ct_mountpoint($mountpoint, $ms eq "rootfs");
+ });
PVE::LXC::Config->write_config($vmid, $conf);
}
my $check = sub {
my ($opt, $delete) = @_;
- if ($opt eq 'cpus' || $opt eq 'cpuunits' || $opt eq 'cpulimit') {
+ if ($opt eq 'cores' || $opt eq 'cpuunits' || $opt eq 'cpulimit') {
$rpcenv->check_vm_perm($authuser, $vmid, $pool, ['VM.Config.CPU']);
} elsif ($opt eq 'rootfs' || $opt =~ /^mp\d+$/) {
$rpcenv->check_vm_perm($authuser, $vmid, $pool, ['VM.Config.Disk']);
return if $delete;
my $data = $opt eq 'rootfs' ? PVE::LXC::Config->parse_ct_rootfs($newconf->{$opt})
: PVE::LXC::Config->parse_ct_mountpoint($newconf->{$opt});
- raise_perm_exc("mountpoint type $data->{type}") if $data->{type} ne 'volume';
+ raise_perm_exc("mount point type $data->{type} is only allowed for root\@pam")
+ if $data->{type} ne 'volume';
} elsif ($opt eq 'memory' || $opt eq 'swap') {
$rpcenv->check_vm_perm($authuser, $vmid, $pool, ['VM.Config.Memory']);
} elsif ($opt =~ m/^net\d+$/ || $opt eq 'nameserver' ||
$opt eq 'searchdomain' || $opt eq 'hostname') {
$rpcenv->check_vm_perm($authuser, $vmid, $pool, ['VM.Config.Network']);
+ } elsif ($opt eq 'features') {
+ # For now this is restricted to root@pam
+ raise_perm_exc("changing feature flags is only allowed for root\@pam");
} else {
$rpcenv->check_vm_perm($authuser, $vmid, $pool, ['VM.Config.Options']);
}
# The loop device is always detached afterwards (or set to autoclear).
# Returns the loop device.
sub run_with_loopdev {
- my ($func, $file) = @_;
+ my ($func, $file, $readonly) = @_;
my $device = query_loopdev($file);
# Try to reuse an existing device
if ($device) {
$device = $1;
}
};
- PVE::Tools::run_command(['losetup', '--show', '-f', $file], outfunc => $parser);
+ my $losetup_cmd = [
+ 'losetup',
+ '--show',
+ '-f',
+ $file,
+ ];
+ push @$losetup_cmd, '-r' if $readonly;
+ PVE::Tools::run_command($losetup_cmd, outfunc => $parser);
die "failed to setup loop device for $file\n" if !$device;
eval { &$func($device); };
my $err = $@;
if (!$next) {
# failed, check for symlinks and try to create the path
- die "symlink encountered at: $dir\n" if $! == ELOOP;
+ die "symlink encountered at: $dir\n" if $! == ELOOP || $! == ENOTDIR;
die "cannot open directory $dir: $!\n" if !$mkdir;
# We don't check for errors on mkdirat() here and just try to
die "failed to open mount point: $!\n" if !$destdh;
if ($ro) {
my $dot = '.';
- # 269: faccessat()
# no separate function because 99% of the time it's the wrong thing to use.
- if (syscall(269, fileno($destdh), $dot, &POSIX::W_OK, 0) != -1) {
+ if (syscall(PVE::Syscall::faccessat, fileno($destdh), $dot, &POSIX::W_OK, 0) != -1) {
die "failed to mark bind mount read only\n";
}
die "read-only check failed: $!\n" if $! != EROFS;
}
my $readonly = $mountpoint->{ro};
- my @extra_opts = ('-o', $optstring) if $optstring;
+ my @extra_opts;
+ @extra_opts = ('-o', $optstring) if $optstring;
if ($storage) {
my $scfg = PVE::Storage::storage_config($storage_cfg, $storage);
- # early sanity checks:
- # we otherwise call realpath on the rbd url
- die "containers on rbd storage without krbd are not supported\n"
- if $scfg->{type} eq 'rbd' && !$scfg->{krbd};
+ my $path = PVE::Storage::map_volume($storage_cfg, $volid, $snapname);
- my $path = PVE::Storage::path($storage_cfg, $volid, $snapname);
+ $path = PVE::Storage::path($storage_cfg, $volid, $snapname) if !defined($path);
my ($vtype, undef, undef, undef, undef, $isBase, $format) =
PVE::Storage::parse_volname($storage_cfg, $volid);
};
my $use_loopdev = 0;
if ($scfg->{path}) {
- $mounted_dev = run_with_loopdev($domount, $path);
+ $mounted_dev = run_with_loopdev($domount, $path, $readonly);
$use_loopdev = 1;
} elsif ($scfg->{type} eq 'drbd' || $scfg->{type} eq 'lvm' ||
$scfg->{type} eq 'rbd' || $scfg->{type} eq 'lvmthin') {
PVE::Storage::activate_volumes($storage_cfg, [$volid]);
- my $path = PVE::Storage::path($storage_cfg, $volid);
+ my $path = PVE::Storage::map_volume($storage_cfg, $volid);
+
+ $path = PVE::Storage::path($storage_cfg, $volid) if !defined($path);
my ($vtype, undef, undef, undef, undef, $isBase, $format) =
PVE::Storage::parse_volname($storage_cfg, $volid);
}
}
+# Allocate a new container volume for $vmid on $storage.
+#
+# The storage type decides what gets created:
+#  - dir, nfs, cifs: a formatted 'raw' image when $size_kb > 0, otherwise a
+#    'subvol' allocated with size 0
+#  - zfspool: a 'subvol' of $size_kb
+#  - drbd, lvm, lvmthin, rbd: a formatted 'raw' volume of $size_kb
+# Any other storage type is rejected.
+#
+# 'raw' volumes are passed to format_disk() together with $rootuid/$rootgid;
+# for 'subvol' volumes nothing is formatted and the caller is told to chown.
+# If anything fails after the volume got allocated, the volume is freed again
+# (best effort) before the error is re-thrown.
+#
+# Returns ($volid, $needs_chown).
+sub alloc_disk {
+    my ($storecfg, $vmid, $storage, $size_kb, $rootuid, $rootgid) = @_;
+
+    my $scfg = PVE::Storage::storage_config($storecfg, $storage);
+    # fixme: use better naming ct-$vmid-disk-X.raw?
+
+    my $volid;
+    my $needs_chown = 0;
+
+    eval {
+        my $type = $scfg->{type};
+        my ($format, $alloc_size);
+
+        if ($type eq 'dir' || $type eq 'nfs' || $type eq 'cifs') {
+            ($format, $alloc_size) = $size_kb > 0 ? ('raw', $size_kb) : ('subvol', 0);
+        } elsif ($type eq 'zfspool') {
+            ($format, $alloc_size) = ('subvol', $size_kb);
+        } elsif ($type eq 'drbd' || $type eq 'lvm' || $type eq 'lvmthin' || $type eq 'rbd') {
+            ($format, $alloc_size) = ('raw', $size_kb);
+        } else {
+            die "unable to create containers on storage type '$scfg->{type}'\n";
+        }
+
+        $volid = PVE::Storage::vdisk_alloc($storecfg, $storage, $vmid, $format, undef, $alloc_size);
+
+        if ($format eq 'subvol') {
+            $needs_chown = 1;
+        } else {
+            format_disk($storecfg, $volid, $rootuid, $rootgid);
+        }
+    };
+    my $err = $@;
+    if ($err) {
+        # in case formatting got interrupted:
+        if (defined($volid)) {
+            eval { PVE::Storage::vdisk_free($storecfg, $volid); };
+            warn $@ if $@;
+        }
+        die $err;
+    }
+
+    return ($volid, $needs_chown);
+}
+
+
+our $NEW_DISK_RE = qr/^([^:\s]+):(\d+(\.\d+)?)$/;
sub create_disks {
my ($storecfg, $vmid, $settings, $conf) = @_;
my ($storage, $volname) = PVE::Storage::parse_volume_id($volid, 1);
- if ($storage && ($volid =~ m/^([^:\s]+):(\d+(\.\d+)?)$/)) {
+ if ($storage && ($volid =~ $NEW_DISK_RE)) {
my ($storeid, $size_gb) = ($1, $2);
my $size_kb = int(${size_gb}*1024) * 1024;
- my $scfg = PVE::Storage::storage_config($storecfg, $storage);
- # fixme: use better naming ct-$vmid-disk-X.raw?
-
- if ($scfg->{type} eq 'dir' || $scfg->{type} eq 'nfs') {
- if ($size_kb > 0) {
- $volid = PVE::Storage::vdisk_alloc($storecfg, $storage, $vmid, 'raw',
- undef, $size_kb);
- format_disk($storecfg, $volid, $rootuid, $rootgid);
- } else {
- $volid = PVE::Storage::vdisk_alloc($storecfg, $storage, $vmid, 'subvol',
- undef, 0);
- push @$chown_vollist, $volid;
- }
- } elsif ($scfg->{type} eq 'zfspool') {
-
- $volid = PVE::Storage::vdisk_alloc($storecfg, $storage, $vmid, 'subvol',
- undef, $size_kb);
- push @$chown_vollist, $volid;
- } elsif ($scfg->{type} eq 'drbd' || $scfg->{type} eq 'lvm' || $scfg->{type} eq 'lvmthin') {
-
- $volid = PVE::Storage::vdisk_alloc($storecfg, $storage, $vmid, 'raw', undef, $size_kb);
- format_disk($storecfg, $volid, $rootuid, $rootgid);
-
- } elsif ($scfg->{type} eq 'rbd') {
-
- die "krbd option must be enabled on storage type '$scfg->{type}'\n" if !$scfg->{krbd};
- $volid = PVE::Storage::vdisk_alloc($storecfg, $storage, $vmid, 'raw', undef, $size_kb);
- format_disk($storecfg, $volid, $rootuid, $rootgid);
- } else {
- die "unable to create containers on storage type '$scfg->{type}'\n";
- }
+ my $needs_chown = 0;
+ ($volid, $needs_chown) = alloc_disk($storecfg, $vmid, $storage, $size_kb, $rootuid, $rootgid);
+ push @$chown_vollist, $volid if $needs_chown;
push @$vollist, $volid;
$mountpoint->{volume} = $volid;
$mountpoint->{size} = $size_kb * 1024;
my $lxc = $conf->{lxc};
foreach my $entry (@$lxc) {
my ($key, $value) = @$entry;
- next if $key ne 'lxc.id_map';
+ # FIXME: remove the 'id_map' variant when lxc-3.0 arrives
+ next if $key ne 'lxc.idmap' && $key ne 'lxc.id_map';
if ($value =~ /^([ug])\s+(\d+)\s+(\d+)\s+(\d+)\s*$/) {
my ($type, $ct, $host, $length) = ($1, $2, $3, $4);
push @$id_map, [$type, $ct, $host, $length];
$rootgid = $host if $type eq 'g';
}
} else {
- die "failed to parse id_map: $value\n";
+ die "failed to parse idmap: $value\n";
}
}
return [];
}
+# Write the container's lxc config and start it through its systemd unit
+# (pve-container@<vmid>).
+#
+# With $skiplock set, an empty flag file /run/lxc/skiplock-<vmid> is created
+# before starting; it is removed again here only when the start command
+# fails. Dies if the flag file cannot be created or the start command fails.
+sub vm_start {
+    my ($vmid, $conf, $skiplock) = @_;
+
+    update_lxc_config($vmid, $conf);
+
+    my $skiplock_flag_fn = "/run/lxc/skiplock-$vmid";
+
+    if ($skiplock) {
+        open(my $flag_fh, '>', $skiplock_flag_fn)
+            or die "failed to open $skiplock_flag_fn for writing: $!\n";
+        close($flag_fh);
+    }
+
+    eval { PVE::Tools::run_command(['systemctl', 'start', "pve-container\@$vmid"]); };
+    return if !$@;
+
+    # Starting failed: do not leave the flag file behind.
+    my $err = $@;
+    unlink $skiplock_flag_fn;
+    die $err;
+}
+
+
+# Helper to stop a container completely and make sure it has stopped completely.
+# This is necessary because we want the post-stop hook to have completed its
+# unmount-all step, but post-stop happens after lxc puts the container into the
+# STOPPED state.
+sub vm_stop {
+    my ($vmid, $kill, $shutdown_timeout, $exit_timeout) = @_;
+
+    # Connect to the container's command socket (abstract namespace, hence
+    # the leading NUL byte) before stopping, so that its EOF can be awaited
+    # afterwards.
+    my $monitor = IO::Socket::UNIX->new(
+        Type => SOCK_STREAM(),
+        Peer => "\0/var/lib/lxc/$vmid/command",
+    );
+    if (!$monitor) {
+        return if $! == ECONNREFUSED; # The container is not running
+        die "failed to open container ${vmid}'s command socket: $!\n";
+    }
+
+    # Stop the container:
+    my @stop_cmd = ('lxc-stop', '-n', $vmid);
+    if ($kill) {
+        push @stop_cmd, '--kill'; # doesn't allow timeouts
+    } elsif (defined($shutdown_timeout)) {
+        push @stop_cmd, '--timeout', $shutdown_timeout;
+        # Give run_command 5 extra seconds
+        $shutdown_timeout += 5;
+    }
+
+    # A failing lxc-stop is only reported; the socket read below decides
+    # whether the container actually went away.
+    eval { PVE::Tools::run_command(\@stop_cmd, timeout => $shutdown_timeout) };
+    warn $@ if $@;
+
+    # Read from the socket: undef (EOF) means the peer closed the connection.
+    my $line = 1;
+    my $await_eof = sub { $line = <$monitor>; };
+    if (defined($exit_timeout)) {
+        PVE::Tools::run_with_timeout($exit_timeout, $await_eof);
+    } else {
+        $await_eof->();
+    }
+
+    # Anything but EOF means the monitor is still there.
+    die "container did not stop\n" if defined($line);
+
+    return; # monitor is gone and the ct has stopped.
+}
+
+
+# Run $code via PVE::Tools::run_fork() after unsharing the mount namespace
+# and running `mount --make-rslave /` in it. Returns whatever run_fork()
+# returns.
+sub run_unshared {
+    my ($code) = @_;
+
+    my $in_new_mount_ns = sub {
+        # Unshare the mount namespace
+        PVE::Tools::unshare(PVE::Tools::CLONE_NEWNS)
+            or die "failed to unshare mount namespace: $!\n";
+        PVE::Tools::run_command(['mount', '--make-rslave', '/']);
+        return $code->();
+    };
+
+    return PVE::Tools::run_fork($in_new_mount_ns);
+}
+
+
+# Mount $src_volid read-only at $src (optionally at snapshot $snapname) and
+# $dst_volid read-write at $dest, then rsync the contents across.
+#
+# Whatever got mounted is lazily unmounted again and both directories are
+# removed, whether or not the copy succeeded; a copy error is re-thrown
+# after that cleanup.
+my $copy_volume = sub {
+    my ($src_volid, $src, $dst_volid, $dest, $storage_cfg, $snapname) = @_;
+
+    my $src_mp = { volume => $src_volid, mp => '/', ro => 1 };
+    $src_mp->{type} = PVE::LXC::Config->classify_mountpoint($src_volid);
+
+    my $dst_mp = { volume => $dst_volid, mp => '/', ro => 0 };
+    $dst_mp->{type} = PVE::LXC::Config->classify_mountpoint($dst_volid);
+
+    my @rsync = (
+        '/usr/bin/rsync', '--stats', '-X', '-A', '--numeric-ids',
+        '-aH', '--whole-file', '--sparse', '--one-file-system',
+        "$src/", $dest,
+    );
+
+    # Successfully mounted directories, in mount order.
+    my @mounted;
+    eval {
+        # mount and copy
+        mkdir $src;
+        mountpoint_mount($src_mp, $src, $storage_cfg, $snapname);
+        push @mounted, $src;
+        mkdir $dest;
+        mountpoint_mount($dst_mp, $dest, $storage_cfg);
+        push @mounted, $dest;
+
+        PVE::Tools::run_command([@rsync]);
+    };
+    my $err = $@;
+
+    # Unmount in reverse mount order; a failure is only warned about.
+    while (defined(my $mount = pop @mounted)) {
+        eval { PVE::Tools::run_command(['/bin/umount', '--lazy', $mount], errfunc => sub{})};
+        warn "Can't umount $mount\n" if $@;
+    }
+
+    # If this fails they're used as mount points in a concurrent operation
+    # (which should not happen but there's also no real need to get rid of them).
+    rmdir $dest;
+    rmdir $src;
+
+    die $err if $err;
+};
+
+
+# Should not be called after unsharing the mount namespace!
+# Copy the volume behind mount point $mp (optionally at snapshot $snapname)
+# into a newly allocated volume on $storage and return the new volume ID.
+#
+# Only mount points of type 'volume' are accepted. $mp->{size} is refreshed
+# from the storage layer first, and $conf is only used to look up the
+# container's root uid/gid through parse_id_maps(). On any failure the newly
+# allocated volume is freed again (best effort) and the error is re-thrown.
+sub copy_volume {
+    my ($mp, $vmid, $storage, $storage_cfg, $conf, $snapname) = @_;
+
+    die "cannot copy volumes of type $mp->{type}\n" if $mp->{type} ne 'volume';
+    File::Path::make_path("/var/lib/lxc/$vmid");
+    my $dest = "/var/lib/lxc/$vmid/.copy-volume-1";
+    my $src = "/var/lib/lxc/$vmid/.copy-volume-2";
+
+    # get IDs for unprivileged container
+    my (undef, $rootuid, $rootgid) = parse_id_maps($conf);
+
+    # Allocate the disk before unsharing in order to make sure zfs subvolumes
+    # are visible in this namespace, otherwise the host only sees the empty
+    # (not-mounted) directory.
+    my $new_volid;
+    eval {
+        # Make sure $mp contains a correct size.
+        $mp->{size} = PVE::Storage::volume_size_info($storage_cfg, $mp->{volume});
+        my $needs_chown;
+        ($new_volid, $needs_chown) = alloc_disk($storage_cfg, $vmid, $storage, $mp->{size}/1024, $rootuid, $rootgid);
+        if ($needs_chown) {
+            PVE::Storage::activate_volumes($storage_cfg, [$new_volid], undef);
+            my $path = PVE::Storage::path($storage_cfg, $new_volid, undef);
+            # NOTE(review): the chown result is not checked -- confirm that
+            # this is meant to be best effort.
+            chown($rootuid, $rootgid, $path);
+        }
+
+        run_unshared(sub {
+            $copy_volume->($mp->{volume}, $src, $new_volid, $dest, $storage_cfg, $snapname);
+        });
+    };
+    if (my $err = $@) {
+        if (defined($new_volid)) {
+            # Freeing the half-finished volume is best effort: a failure here
+            # is only warned about so that it cannot replace the error that
+            # actually aborted the copy (same handling as in alloc_disk()).
+            eval { PVE::Storage::vdisk_free($storage_cfg, $new_volid); };
+            warn $@ if $@;
+        }
+        die $err;
+    }
+
+    return $new_volid;
+}
1;