]> git.proxmox.com Git - pve-container.git/blame - src/PVE/LXC.pm
fixup: slight code cleanup
[pve-container.git] / src / PVE / LXC.pm
CommitLineData
f76a2828
DM
1package PVE::LXC;
2
3use strict;
4use warnings;
67afe46e 5
d14a9a1b 6use POSIX qw(EINTR);
f76a2828 7
34fdb3d7
WB
8use Socket;
9
f76a2828 10use File::Path;
2cfae16e
WB
11use File::Spec;
12use Cwd qw();
ab3722b3 13use Fcntl qw(O_RDONLY O_NOFOLLOW O_DIRECTORY);
b1bad293
WB
14use Errno qw(ELOOP ENOTDIR EROFS ECONNREFUSED);
15use IO::Socket::UNIX;
f76a2828 16
f1ba1a4b 17use PVE::Exception qw(raise_perm_exc);
c65e0a6d 18use PVE::Storage;
f76a2828
DM
19use PVE::SafeSyslog;
20use PVE::INotify;
8233d33d 21use PVE::JSONSchema qw(get_standard_option);
ab3722b3 22use PVE::Tools qw($IPV6RE $IPV4RE dir_glob_foreach lock_file lock_file_full O_PATH);
92902047 23use PVE::CpuSet;
68fba17b 24use PVE::Network;
52389a07 25use PVE::AccessControl;
228a5a1d 26use PVE::ProcFSTools;
0389da0d 27use PVE::Syscall;
67afe46e 28use PVE::LXC::Config;
8233d33d 29
688afc63 30use Time::HiRes qw (gettimeofday);
f76a2828 31
5a63f1c5
WB
# Base directory of the LXC configuration files referenced from this module.
my $LXC_CONFIG_PATH = '/usr/share/lxc/config';

# Name of the local node, as reported by PVE::INotify.
my $nodename = PVE::INotify::nodename();

# CPU information, read once when the module is loaded.
my $cpuinfo = PVE::ProcFSTools::read_cpuinfo();
37
f76a2828
DM
# List the LXC containers that the cluster VM list assigns to this node.
#
# Returns a hash reference keyed by VMID; each value has the form
# { type => 'lxc', vmid => $vmid }. The hash is empty when the VM list (or
# its 'ids' member) is not available.
sub config_list {
    my $vmlist = PVE::Cluster::get_vmlist();
    my $res = {};

    my $ids = $vmlist ? $vmlist->{ids} : undef;
    return $res if !$ids;

    for my $vmid (keys %$ids) {
        next if !$vmid; # skip CT0

        my $entry = $ids->{$vmid};
        my ($node, $type) = ($entry->{node}, $entry->{type});

        # only guests on this node ...
        next if !$node || $node ne $nodename;
        # ... which are containers
        next if !$type || $type ne 'lxc';

        $res->{$vmid} = { type => 'lxc', vmid => $vmid };
    }

    return $res;
}
53
5b4657d0
DM
# Remove the configuration file of container $vmid on this node.
# Dies if the file cannot be unlinked.
sub destroy_config {
    my ($vmid) = @_;

    my $path = PVE::LXC::Config->config_file($vmid, $nodename);
    unlink($path) || die "failed to remove config file: $!\n";
}
60
822de0c3
DM
61# container status helpers
62
# Scan /proc/net/unix for the abstract command sockets of running containers
# (@/var/lib/lxc/<vmid>/command).
#
# Returns a hash reference { <vmid> => 1, ... }; the hash is empty when the
# file cannot be opened.
sub list_active_containers {
    my $filename = "/proc/net/unix";

    # similar test is used by lxccontainers.c: list_active_containers
    my $res = {};

    my $fh = IO::File->new($filename, "r");
    return $res if !$fh;

    while (defined(my $line = <$fh>)) {
        # the socket path is the last column of each entry
        next if $line !~ m/^[a-f0-9]+:\s+\S+\s+\S+\s+\S+\s+\S+\s+\S+\s+\d+\s+(\S+)$/;
        my $path = $1;

        next if $path !~ m!^@/var/lib/lxc/(\d+)/command$!;
        $res->{$1} = 1;
    }

    close($fh);

    return $res;
}
f76a2828 86
5c752bbf
DM
# Check whether container $vmid is currently active.
# Returns 1 if it is listed by list_active_containers(), undef otherwise.
#
# warning: this is slow
sub check_running {
    my ($vmid) = @_;

    my $active = list_active_containers();

    return defined($active->{$vmid}) ? 1 : undef;
}
97
# Return the PVE::Tools::df() result for the root directory of the process
# $pid, accessed through /proc/<pid>/root/. $vmid is accepted but not used.
sub get_container_disk_usage {
    my ($vmid, $pid) = @_;

    my $rootdir = "/proc/$pid/root/";

    return PVE::Tools::df($rootdir, 1);
}
103
688afc63
WL
# Per-VMID CPU accounting sample ({ time, used, cpu }) remembered between
# vmstatus() calls.
my $last_proc_vmid_stat;

# Read 'cpuacct.stat' of container $vmid and return { utime => ..., stime => ... }.
# The returned hash is empty when the file content does not have the expected
# "user N\nsystem N\n" layout.
my $parse_cpuacct_stat = sub {
    my ($vmid, $unprivileged) = @_;

    my $raw = read_cgroup_value('cpuacct', $vmid, $unprivileged, 'cpuacct.stat', 1);

    my $stat = {};
    if ($raw =~ m/^user (\d+)\nsystem (\d+)\n/) {
        @$stat{qw(utime stime)} = ($1, $2);
    }

    return $stat;
};
122
8233d33d
DM
# Schema fragment describing properties of a vmstatus() entry. Everything
# except 'vmid' and 'status' is optional.
our $vmstatus_return_properties = {
    vmid => get_standard_option('pve-vmid'),
    status => {
        description => "LXC Container status.",
        type => 'string',
        enum => ['stopped', 'running'],
    },
    maxmem => {
        description => "Maximum memory in bytes.",
        type => 'integer',
        optional => 1,
        renderer => 'bytes',
    },
    maxswap => {
        description => "Maximum SWAP memory in bytes.",
        type => 'integer',
        optional => 1,
        renderer => 'bytes',
    },
    maxdisk => {
        description => "Root disk size in bytes.",
        type => 'integer',
        optional => 1,
        renderer => 'bytes',
    },
    name => {
        description => "Container name.",
        type => 'string',
        optional => 1,
    },
    uptime => {
        description => "Uptime.",
        type => 'integer',
        optional => 1,
        renderer => 'duration',
    },
    cpus => {
        description => "Maximum usable CPUs.",
        type => 'number',
        optional => 1,
    },
};
165
f76a2828
DM
# Collect status information for container $opt_vmid or, when no VMID is
# given, for every container returned by config_list().
#
# Returns a hash reference keyed by VMID. Each entry carries the static
# values taken from the configuration (name, cpus, lock, maxmem, maxswap,
# maxdisk, template) plus status, pid and the usage counters (disk, mem, swap,
# uptime, cpu, netin, netout, diskread, diskwrite). The counters stay 0 for
# stopped containers.
#
# The cpu value is derived from the difference to the sample remembered in
# $last_proc_vmid_stat, so the first call for a container reports 0.
sub vmstatus {
    my ($opt_vmid) = @_;

    my $list = $opt_vmid
        ? { $opt_vmid => { type => 'lxc', vmid => $opt_vmid } }
        : config_list();

    my $active_hash = list_active_containers();

    my $cpucount = $cpuinfo->{cpus} || 1;

    my $cdtime = gettimeofday;

    my $uptime = (PVE::ProcFSTools::read_proc_uptime(1))[0];
    my $clock_ticks = POSIX::sysconf(&POSIX::_SC_CLK_TCK);

    my $unprivileged = {};

    my $default_maxdisk = 4*1024*1024*1024;

    # pass 1: static data from the configuration, defaults for the counters
    for my $vmid (keys %$list) {
        my $d = $list->{$vmid};

        if (defined($active_hash->{$vmid})) {
            eval { $d->{pid} = find_lxc_pid($vmid); };
            warn $@ if $@; # ignore errors (consider them stopped)
        }

        $d->{status} = $d->{pid} ? 'running' : 'stopped';

        my $cfspath = PVE::LXC::Config->cfs_config_path($vmid);
        my $conf = PVE::Cluster::cfs_read_file($cfspath) || {};

        $unprivileged->{$vmid} = $conf->{unprivileged};

        my $name = $conf->{'hostname'} || "CT$vmid";
        $name =~ s/[\s]//g;
        $d->{name} = $name;

        $d->{cpus} = $conf->{cores} || $conf->{cpulimit} || $cpucount;

        $d->{lock} = $conf->{lock} || '';

        if ($d->{pid}) {
            my $usage = get_container_disk_usage($vmid, $d->{pid});
            $d->{disk} = $usage->{used};
            $d->{maxdisk} = $usage->{total};
        } else {
            $d->{disk} = 0;
            # use 4GB by default ??
            $d->{maxdisk} = $default_maxdisk;
            if (my $rootfs = $conf->{rootfs}) {
                my $rootinfo = PVE::LXC::Config->parse_ct_rootfs($rootfs);
                $d->{maxdisk} = $rootinfo->{size} || $default_maxdisk;
            }
        }

        $d->{maxmem} = ($conf->{memory}||512)*1024*1024;
        $d->{maxswap} = ($conf->{swap}//0)*1024*1024;

        $d->{$_} = 0 for qw(mem swap uptime cpu netout netin diskread diskwrite);

        $d->{template} = PVE::LXC::Config->is_template($conf);
    }

    # pass 2: runtime counters of the running containers
    for my $vmid (keys %$list) {
        my $d = $list->{$vmid};
        my $pid = $d->{pid};

        next if !$pid; # skip stopped CTs

        my $proc_pid_stat = PVE::ProcFSTools::read_proc_pid_stat($pid);
        $d->{uptime} = int(($uptime - $proc_pid_stat->{starttime}) / $clock_ticks); # the method lxcfs uses

        my $unpriv = $unprivileged->{$vmid};

        if (-d '/sys/fs/cgroup/memory') {
            my $memory_stat = read_cgroup_list('memory', $vmid, $unpriv, 'memory.stat');
            my $mem_usage_in_bytes = read_cgroup_value('memory', $vmid, $unpriv, 'memory.usage_in_bytes');

            $d->{mem} = $mem_usage_in_bytes - $memory_stat->{total_cache};
            $d->{swap} = read_cgroup_value('memory', $vmid, $unpriv, 'memory.memsw.usage_in_bytes') - $mem_usage_in_bytes;
        } else {
            $d->{mem} = 0;
            $d->{swap} = 0;
        }

        if (-d '/sys/fs/cgroup/blkio') {
            my $blkio_bytes = read_cgroup_value('blkio', $vmid, 0, 'blkio.throttle.io_service_bytes', 1); # don't check if unpriv
            for my $entry (split(/\n/, $blkio_bytes)) {
                next if $entry !~ /(Read|Write)\s+(\d+)/;
                my ($key, $value) = ($1, $2);
                $d->{diskread} += $value if $key eq 'Read';
                $d->{diskwrite} += $value if $key eq 'Write';
            }
        } else {
            $d->{diskread} = 0;
            $d->{diskwrite} = 0;
        }

        if (!-d '/sys/fs/cgroup/cpuacct') {
            $d->{cpu} = 0;
            next;
        }

        my $pstat = $parse_cpuacct_stat->($vmid, $unpriv);
        my $used = $pstat->{utime} + $pstat->{stime};

        my $old = $last_proc_vmid_stat->{$vmid};
        if (!$old) {
            # first sample for this container, nothing to compare against yet
            $last_proc_vmid_stat->{$vmid} = {
                time => $cdtime,
                used => $used,
                cpu => 0,
            };
            next;
        }

        my $dtime = ($cdtime - $old->{time}) * $cpucount * $cpuinfo->{user_hz};

        if ($dtime > 1000) {
            my $dutime = $used - $old->{used};

            $d->{cpu} = (($dutime/$dtime)* $cpucount) / $d->{cpus};
            $last_proc_vmid_stat->{$vmid} = {
                time => $cdtime,
                used => $used,
                cpu => $d->{cpu},
            };
        } else {
            # interval too short, report the previously computed value
            $d->{cpu} = $old->{cpu};
        }
    }

    # pass 3: network counters, taken from the veth<vmid>i<N> devices.
    # The device's receive counter is accounted as the container's netout
    # and its transmit counter as netin.
    my $netdev = PVE::ProcFSTools::read_proc_net_dev();

    for my $dev (keys %$netdev) {
        next if $dev !~ m/^veth([1-9]\d*)i/;

        my $d = $list->{$1};
        next if !$d;

        my $counters = $netdev->{$dev};
        $d->{netout} += $counters->{receive};
        $d->{netin} += $counters->{transmit};
    }

    return $list;
}
321
3b5070d4
WB
# Read the complete cgroup file $name of container $vmid and split its content
# on whitespace into a hash reference of alternating key/value fields.
sub read_cgroup_list($$$$) {
    my ($group, $vmid, $unprivileged, $name) = @_;

    my $text = read_cgroup_value($group, $vmid, $unprivileged, $name, 1);
    my @fields = split(/\s+/, $text);

    return { @fields };
}
329
3b5070d4
WB
# Read the cgroup file $name of controller $group for container $vmid.
#
# For containers not flagged as unprivileged the file is looked up in the
# 'ns/' sub-directory of the container's cgroup. With a true $full the whole
# file content is returned, otherwise only its first line.
sub read_cgroup_value($$$$$) {
    my ($group, $vmid, $unprivileged, $name, $full) = @_;

    my $nsdir = $unprivileged ? '' : 'ns/';
    my $path = "/sys/fs/cgroup/$group/lxc/$vmid/${nsdir}$name";

    if ($full) {
        return PVE::Tools::file_get_contents($path);
    }

    return PVE::Tools::file_read_firstline($path);
}
340
bf0b8c43
AD
# Write $value to the cgroup file $name of controller $group for container
# $vmid. Nothing is written when the file does not exist.
sub write_cgroup_value {
    my ($group, $vmid, $name, $value) = @_;

    my $cgroup_file = "/sys/fs/cgroup/$group/lxc/$vmid/$name";

    PVE::ProcFSTools::write_proc_entry($cgroup_file, $value) if -e $cgroup_file;
}
348
52f1d76b
DM
# Find all running 'lxc-console -n <vmid>' processes by inspecting the
# command line of every process in /proc.
#
# Returns a hash reference mapping each VMID to an array of matching PIDs.
sub find_lxc_console_pids {
    my $res = {};

    my $inspect = sub {
        my ($pid) = @_;

        my $cmdline = PVE::Tools::file_read_firstline("/proc/$pid/cmdline");
        return if !$cmdline;

        # search for lxc-console -n <vmid>
        my @args = split(/\0/, $cmdline);
        return if scalar(@args) != 3;

        my ($prog, $flag, $vmid) = @args;
        return if $flag ne '-n';
        return if $vmid !~ m/^\d+$/;
        return if $prog !~ m|^(/usr/bin/)?lxc-console$|;

        push @{$res->{$vmid}}, $pid;
    };

    PVE::Tools::dir_glob_foreach('/proc', '\d+', $inspect);

    return $res;
}
374
bedeaaf1
AD
# Determine the PID reported by 'lxc-info -n <vmid> -p'.
# Dies when no (non-zero) PID could be parsed from the output.
sub find_lxc_pid {
    my ($vmid) = @_;

    my $pid;
    my $cmd = ['lxc-info', '-n', $vmid, '-p'];

    PVE::Tools::run_command($cmd, outfunc => sub {
        my ($line) = @_;
        $pid = $1 if $line =~ m/^PID:\s+(\d+)$/;
    });

    die "unable to get PID for CT $vmid (not running?)\n" if !$pid;

    return $pid;
}
389
# Split an 'address/prefix' string into { address => ..., netmask => ... }.
# Only prefix lengths from 8 to 32 are accepted.
#
# On invalid input the sub returns undef if $noerr is set and dies otherwise.
#
# Note: we cannot use Net::IP, because that only allows strict
# CIDR networks
sub parse_ipv4_cidr {
    my ($cidr, $noerr) = @_;

    if ($cidr =~ m!^($IPV4RE)(?:/(\d+))$!) {
        my ($address, $bits) = ($1, $2);
        if (($bits > 7) && ($bits <= 32)) {
            my $netmask = $PVE::Network::ipv4_reverse_mask->[$bits];
            return { address => $address, netmask => $netmask };
        }
    }

    return undef if $noerr;

    die "unable to parse ipv4 address/mask\n";
}
93285df8 403
0b6a2f0e
WB
# Parse /proc/self/cgroup.
#
# Lines with a non-empty controller field contribute their comma separated
# controller names to the $v1 hash; a line with an empty controller field
# sets the $v2 flag.
#
# Returns ($v1, $v2) in list context and only $v1 in scalar context.
sub get_cgroup_subsystems {
    my $v1 = {};
    my $v2 = 0;

    my $data = PVE::Tools::file_get_contents('/proc/self/cgroup');

    while ($data =~ /^\d+:([^:\n]*):.*$/gm) {
        my $controllers = $1;

        if (!length($controllers)) {
            $v2 = 1;
            next;
        }

        $v1->{$_} = 1 for split(/,/, $controllers);
    }

    return wantarray ? ($v1, $v2) : $v1;
}
e22af68f 418
5a63f1c5
WB
# Currently we do not need to create seccomp profile 'files' as the only
# choice our configuration actually allows is "with or without keyctl()",
# so we distinguish between using lxc's "default" seccomp profile and our
# added pve-userns.seccomp file.
#
# This returns a configuration line added to the raw lxc config (or an empty
# string when no line needs to be added).
sub make_seccomp_config {
    my ($conf, $unprivileged, $features) = @_;

    # User-configured profile has precedence, note that the user's entry would
    # be written 'after' this line anyway...
    my $user_profile = PVE::LXC::Config->has_lxc_entry($conf, 'lxc.seccomp.profile');
    if ($user_profile) {
        # Warn the user if this conflicts with a feature:
        warn "explicitly configured lxc.seccomp.profile overrides the following settings: features:keyctl\n"
            if $features->{keyctl};
        return '';
    }

    # Privileged containers keep using the default (which is already part of
    # the files included via lxc.include, so we don't need to write it out,
    # that way it stays admin-configurable via /usr/share/lxc/config/... as
    # well)
    if (!$unprivileged) {
        return '';
    }

    # Unprivileged containers will get keyctl() disabled by default as a
    # workaround for systemd-networkd behavior. But we have an option to
    # explicitly enable it:
    if ($features->{keyctl}) {
        return '';
    }

    # Finally we're in an unprivileged container without `keyctl` set
    # explicitly. We have a file prepared for this:
    my $profile = "$LXC_CONFIG_PATH/pve-userns.seccomp";
    return "lxc.seccomp.profile = $profile\n";
}
452
# Since lxc-3.0.2 we can have lxc generate a profile for the container
# automatically. The default should be equivalent to the old
# `lxc-container-default-cgns` profile.
#
# Additionally this also added `lxc.apparmor.raw` which can be used to inject
# additional lines into the profile. We can use that to allow mounting specific
# file systems.
#
# Returns the apparmor related lines for the raw lxc config, or an empty
# string when the user configured an explicit lxc.apparmor.profile.
sub make_apparmor_config {
    my ($conf, $unprivileged, $features) = @_;

    # user-configured profile has precedence, but first we go through our own
    # code to figure out whether we should warn the user:

    my $raw = "lxc.apparmor.profile = generated\n";
    my @profile_uses;

    # For the informational warning:
    push @profile_uses, 'features:fuse' if $features->{fuse};

    # There's lxc.apparmor.allow_nesting now, which will add the necessary
    # apparmor lines, create an apparmor namespace for the container, but also
    # adds proc and sysfs mounts to /dev/.lxc/{proc,sys}. These do not have
    # lxcfs mounted over them, because that would prevent the container from
    # mounting new instances of them for nested containers.
    if ($features->{nesting}) {
        push @profile_uses, 'features:nesting';
        $raw .= "lxc.apparmor.allow_nesting = 1\n";
    } else {
        # In the default profile in /etc/apparmor.d we patch this in because
        # otherwise a container can for example run `chown` on /sys, breaking
        # access to it for non-CAP_DAC_OVERRIDE tools on the host:
        $raw .= "lxc.apparmor.raw = deny mount -> /proc/,\n";
        $raw .= "lxc.apparmor.raw = deny mount -> /sys/,\n";
        # Preferably we could use the 'remount' flag but this does not sit well
        # with apparmor_parser currently:
        # mount options=(rw, nosuid, nodev, noexec, remount) -> /sys/,
    }

    if (my $mount = $features->{mount}) {
        push @profile_uses, 'features:mount';
        $raw .= "lxc.apparmor.raw = mount fstype=$_,\n"
            for PVE::Tools::split_list($mount);
    }

    # More to come?

    return $raw if !PVE::LXC::Config->has_lxc_entry($conf, 'lxc.apparmor.profile');

    my $used = join(', ', @profile_uses);
    warn "explicitly configured lxc.apparmor.profile overrides the following settings: $used\n"
        if length($used);

    return '';
}
511
# Generate /var/lib/lxc/<vmid>/config from the container configuration $conf.
#
# For templates the existing config file is removed and nothing is written.
# Otherwise the raw lxc configuration is assembled (includes, seccomp and
# apparmor settings, id mapping, console/tty, limits for the available v1
# cgroup controllers, network devices, custom 'lxc.*' entries and the cpuset)
# and written to the config file; the rootfs directory is created as well.
#
# Custom entries from $conf->{lxc} are always written, independent of the
# cgroup controllers available on the host. Only the automatic
# 'lxc.cgroup.cpuset.cpus' line depends on the v1 cpuset controller.
#
# Dies when 'arch', 'ostype' or 'rootfs' are missing from $conf.
sub update_lxc_config {
    my ($vmid, $conf) = @_;

    my $dir = "/var/lib/lxc/$vmid";

    if ($conf->{template}) {

        unlink "$dir/config";

        return;
    }

    my $raw = '';

    die "missing 'arch' - internal error" if !$conf->{arch};
    $raw .= "lxc.arch = $conf->{arch}\n";

    # a custom id mapping implies an unprivileged container
    my $custom_idmap = PVE::LXC::Config->has_lxc_entry($conf, 'lxc.idmap');
    my $unprivileged = $conf->{unprivileged} || $custom_idmap;

    my $ostype = $conf->{ostype} || die "missing 'ostype' - internal error";

    # prefer the ostype specific include files, fall back to the generic ones
    my $cfgpath = '/usr/share/lxc/config';
    my $inc = "$cfgpath/$ostype.common.conf";
    $inc ="$cfgpath/common.conf" if !-f $inc;
    $raw .= "lxc.include = $inc\n";
    if ($unprivileged) {
        $inc = "$cfgpath/$ostype.userns.conf";
        $inc = "$cfgpath/userns.conf" if !-f $inc;
        $raw .= "lxc.include = $inc\n";
    }

    my $features = PVE::LXC::Config->parse_features($conf->{features});

    $raw .= make_seccomp_config($conf, $unprivileged, $features);
    $raw .= make_apparmor_config($conf, $unprivileged, $features);
    if ($features->{fuse}) {
        $raw .= "lxc.apparmor.raw = mount fstype=fuse,\n";
        $raw .= "lxc.mount.entry = /dev/fuse dev/fuse none bind,create=file 0 0\n";
    }

    # WARNING: DO NOT REMOVE this without making sure that loop device nodes
    # cannot be exposed to the container with r/w access (cgroup perms).
    # When this is enabled mounts will still remain in the monitor's namespace
    # after the container unmounted them and thus will not detach from their
    # files while the container is running!
    $raw .= "lxc.monitor.unshare = 1\n";

    my $cgv1 = get_cgroup_subsystems();

    # Should we read them from /etc/subuid?
    if ($unprivileged && !$custom_idmap) {
        $raw .= "lxc.idmap = u 0 100000 65536\n";
        $raw .= "lxc.idmap = g 0 100000 65536\n";
    }

    if (!PVE::LXC::Config->has_dev_console($conf)) {
        $raw .= "lxc.console.path = none\n";
        $raw .= "lxc.cgroup.devices.deny = c 5:1 rwm\n" if $cgv1->{devices};
    }

    my $ttycount = PVE::LXC::Config->get_tty_count($conf);
    $raw .= "lxc.tty.max = $ttycount\n";

    # some init scripts expect a linux terminal (turnkey).
    $raw .= "lxc.environment = TERM=linux\n";

    my $utsname = $conf->{hostname} || "CT$vmid";
    $raw .= "lxc.uts.name = $utsname\n";

    if ($cgv1->{memory}) {
        my $memory = $conf->{memory} || 512;
        my $swap = $conf->{swap} // 0;

        my $lxcmem = int($memory*1024*1024);
        $raw .= "lxc.cgroup.memory.limit_in_bytes = $lxcmem\n";

        # the memsw limit covers memory plus swap
        my $lxcswap = int(($memory + $swap)*1024*1024);
        $raw .= "lxc.cgroup.memory.memsw.limit_in_bytes = $lxcswap\n";
    }

    if ($cgv1->{cpu}) {
        if (my $cpulimit = $conf->{cpulimit}) {
            $raw .= "lxc.cgroup.cpu.cfs_period_us = 100000\n";
            my $value = int(100000*$cpulimit);
            $raw .= "lxc.cgroup.cpu.cfs_quota_us = $value\n";
        }

        my $shares = $conf->{cpuunits} || 1024;
        $raw .= "lxc.cgroup.cpu.shares = $shares\n";
    }

    die "missing 'rootfs' configuration\n"
        if !defined($conf->{rootfs});

    # The result is not needed here; the call is kept as in the original
    # code (presumably it validates the rootfs property string - TODO confirm).
    PVE::LXC::Config->parse_ct_rootfs($conf->{rootfs});

    $raw .= "lxc.rootfs.path = $dir/rootfs\n";

    foreach my $k (sort keys %$conf) {
        next if $k !~ m/^net(\d+)$/;
        my $ind = $1;
        my $d = PVE::LXC::Config->parse_lxc_network($conf->{$k});
        $raw .= "lxc.net.$ind.type = veth\n";
        $raw .= "lxc.net.$ind.veth.pair = veth${vmid}i${ind}\n";
        $raw .= "lxc.net.$ind.hwaddr = $d->{hwaddr}\n" if defined($d->{hwaddr});
        $raw .= "lxc.net.$ind.name = $d->{name}\n" if defined($d->{name});
        $raw .= "lxc.net.$ind.mtu = $d->{mtu}\n" if defined($d->{mtu});
    }

    # Custom lxc.* entries must always be written, even when the cpuset
    # controller is not available; otherwise they would be silently dropped.
    my $had_cpuset = 0;
    if (my $lxcconf = $conf->{lxc}) {
        foreach my $entry (@$lxcconf) {
            my ($k, $v) = @$entry;
            $had_cpuset = 1 if $k eq 'lxc.cgroup.cpuset.cpus';
            $raw .= "$k = $v\n";
        }
    }

    if ($cgv1->{cpuset}) {
        my $cores = $conf->{cores};
        if (!$had_cpuset && $cores) {
            my $cpuset = eval { PVE::CpuSet->new_from_cgroup('lxc', 'effective_cpus') };
            $cpuset = PVE::CpuSet->new_from_cgroup('', 'effective_cpus') if !$cpuset;
            my @members = $cpuset->members();
            # randomly drop CPUs until only $cores of them are left
            while (scalar(@members) > $cores) {
                my $randidx = int(rand(scalar(@members)));
                $cpuset->delete($members[$randidx]);
                splice(@members, $randidx, 1); # keep track of the changes
            }
            $raw .= "lxc.cgroup.cpuset.cpus = ".$cpuset->short_string()."\n";
        }
    }

    File::Path::mkpath("$dir/rootfs");

    PVE::Tools::file_set_contents("$dir/config", $raw);
}
650
117636e5
DM
# verify and cleanup nameserver list (replace \0 with ' ')
#
# Every entry is checked with PVE::JSONSchema::pve_verify_ip(); the entries
# are returned joined by a single space.
sub verify_nameserver_list {
    my ($nameserver_list) = @_;

    my @servers = PVE::Tools::split_list($nameserver_list);

    PVE::JSONSchema::pve_verify_ip($_) for @servers;

    return join(' ', @servers);
}
663
# Normalize a search domain list: split it with PVE::Tools::split_list() and
# return the entries joined by a single space. No validation is done.
sub verify_searchdomain_list {
    my ($searchdomain_list) = @_;

    # todo: should we add checks for valid dns domains?
    my @domains = PVE::Tools::split_list($searchdomain_list);

    return join(' ', @domains);
}
675
# Build the command (as array reference) used to open a console for
# container $vmid, depending on the console mode of $conf:
#   console - lxc-console on tty 0
#   tty     - lxc-console
#   shell   - lxc-attach with a cleared environment
#
# '-1' as $escapechar disables keyboard escape sequence
# any other passed char (a-z) will result in <Ctrl+$escapechar q>
# ($escapechar is only passed on to lxc-console.)
sub get_console_command {
    my ($vmid, $conf, $escapechar) = @_;

    my $cmode = PVE::LXC::Config->get_cmode($conf);

    if ($cmode eq 'shell') {
        return ['lxc-attach', '--clear-env', '-n', $vmid];
    }

    die "internal error" if $cmode ne 'console' && $cmode ne 'tty';

    my $cmd = ['lxc-console', '-n', $vmid];
    push @$cmd, '-t', 0 if $cmode eq 'console';
    push @$cmd, '-e', $escapechar if $escapechar;

    return $cmd;
}
699
c325b32f
DM
# Return ($ipv4, $ipv6) of the first network device (net0), without the
# prefix length. Non-address settings (dhcp/manual, for IPv6 also auto) are
# returned as undef. Returns undef when there is no net0 at all.
sub get_primary_ips {
    my ($conf) = @_;

    # return data from net0

    return undef if !defined($conf->{net0});
    my $net = PVE::LXC::Config->parse_lxc_network($conf->{net0});

    my ($ipv4, $ipv6) = ($net->{ip}, $net->{ip6});

    if ($ipv4 && $ipv4 =~ /^(dhcp|manual)$/) {
        $ipv4 = undef;
    } elsif ($ipv4) {
        $ipv4 =~ s!/\d+$!!;
    }

    if ($ipv6 && $ipv6 =~ /^(auto|dhcp|manual)$/) {
        $ipv6 = undef;
    } elsif ($ipv6) {
        $ipv6 =~ s!/\d+$!!;
    }

    return ($ipv4, $ipv6);
}
148d1cb4 727
b407293b
WB
# Free the storage volume behind a mount point, but only if it is classified
# as 'volume' and the volume's owner is $vmid.
sub delete_mountpoint_volume {
    my ($storage_cfg, $vmid, $volume) = @_;

    my $class = PVE::LXC::Config->classify_mountpoint($volume);
    return if $class ne 'volume';

    # parse_volname() returns ($vtype, $name, $owner, ...); only the owner is needed
    my (undef, undef, $owner) = PVE::Storage::parse_volname($storage_cfg, $volume);

    PVE::Storage::vdisk_free($storage_cfg, $volume) if $vmid == $owner;
}
ef241384 736
# Destroy container $vmid: free the volumes of all its mount points, remove
# its directory below /var/lib/lxc and finally either write $replacement_conf
# as the new configuration or, if none is given, remove the configuration.
sub destroy_lxc_container {
    my ($storage_cfg, $vmid, $conf, $replacement_conf) = @_;

    my $free_volume = sub {
        my ($ms, $mountpoint) = @_;
        delete_mountpoint_volume($storage_cfg, $vmid, $mountpoint->{volume});
    };
    PVE::LXC::Config->foreach_mountpoint($conf, $free_volume);

    # failures while removing these are ignored
    my $lxcdir = "/var/lib/lxc/$vmid";
    rmdir "$lxcdir/rootfs";
    unlink "$lxcdir/config";
    rmdir $lxcdir;

    if (defined $replacement_conf) {
        PVE::LXC::Config->write_config($vmid, $replacement_conf);
    } else {
        destroy_config($vmid);
    }

    #my $cmd = ['lxc-destroy', '-n', $vmid ];
    #PVE::Tools::run_command($cmd);
}
68fba17b 757
# Clean up after a container was stopped: deactivate all its volumes unless
# $keepActive is set. Errors are only reported as warnings.
sub vm_stop_cleanup {
    my ($storage_cfg, $vmid, $conf, $keepActive) = @_;

    eval {
        return if $keepActive; # leaves the eval block only

        my $vollist = PVE::LXC::Config->get_vm_volumes($conf);
        PVE::Storage::deactivate_volumes($storage_cfg, $vollist);
    };
    warn $@ if $@; # avoid errors - just warn
}
770
93cdbbfb
AD
# Numeric "not equal" which tolerates undefined operands: two undefined
# values are equal, a defined and an undefined value differ.
my $safe_num_ne = sub {
    my ($left, $right) = @_;

    my $left_set = defined($left);
    my $right_set = defined($right);

    return 0 if !$left_set && !$right_set;
    return 1 if !$left_set || !$right_set;

    return $left != $right;
};
780
# String "not equal" which tolerates undefined operands: two undefined
# values are equal, a defined and an undefined value differ.
my $safe_string_ne = sub {
    my ($left, $right) = @_;

    my $left_set = defined($left);
    my $right_set = defined($right);

    return 0 if !$left_set && !$right_set;
    return 1 if !$left_set || !$right_set;

    return $left ne $right;
};
790
# Apply the new settings $newnet to network device $opt (index $netid) of the
# running container $vmid and store the result in the configuration.
#
#  - no device configured yet: hotplug a new one
#  - hwaddr or name changed:   delete the veth device and hotplug a new one
#  - bridge/tag/firewall changed: unplug (if it was plugged) and re-plug
#  - only the rate changed:    adjust the rate limit
#
# Afterwards the IP configuration is updated via update_ipconfig().
# Dies for devices which are not of type 'veth'.
sub update_net {
    my ($vmid, $conf, $opt, $newnet, $netid, $rootdir) = @_;

    # for when there are physical interfaces
    die "cannot update interface of type $newnet->{type}"
        if $newnet->{type} ne 'veth';

    my $veth = "veth${vmid}i${netid}";
    my $eth = $newnet->{name};

    my $oldnetcfg = $conf->{$opt};
    if (!$oldnetcfg) {
        hotplug_net($vmid, $conf, $opt, $newnet, $netid);
    } else {
        my $oldnet = PVE::LXC::Config->parse_lxc_network($oldnetcfg);

        # compare a single property of the old and new settings
        my $str_changed = sub {
            my ($key) = @_;
            return $safe_string_ne->($oldnet->{$key}, $newnet->{$key});
        };
        my $num_changed = sub {
            my ($key) = @_;
            return $safe_num_ne->($oldnet->{$key}, $newnet->{$key});
        };

        if ($str_changed->('hwaddr') || $str_changed->('name')) {
            PVE::Network::veth_delete($veth);
            delete $conf->{$opt};
            PVE::LXC::Config->write_config($vmid, $conf);

            hotplug_net($vmid, $conf, $opt, $newnet, $netid);
        } else {
            if ($str_changed->('bridge') || $num_changed->('tag') || $num_changed->('firewall')) {

                if ($oldnet->{bridge}) {
                    PVE::Network::tap_unplug($veth);
                    delete @$oldnet{qw(bridge tag firewall)};
                    # record the unplugged state before plugging again
                    $conf->{$opt} = PVE::LXC::Config->print_lxc_network($oldnet);
                    PVE::LXC::Config->write_config($vmid, $conf);
                }

                PVE::Network::tap_plug($veth, $newnet->{bridge}, $newnet->{tag}, $newnet->{firewall}, $newnet->{trunks}, $newnet->{rate});
                # This includes the rate:
                for my $key (qw(bridge tag firewall rate)) {
                    $oldnet->{$key} = $newnet->{$key} if $newnet->{$key};
                }
            } elsif ($str_changed->('rate')) {
                # Rate can be applied on its own but any change above needs to
                # include the rate in tap_plug since OVS resets everything.
                PVE::Network::tap_rate_limit($veth, $newnet->{rate});
                $oldnet->{rate} = $newnet->{rate};
            }
            $conf->{$opt} = PVE::LXC::Config->print_lxc_network($oldnet);
            PVE::LXC::Config->write_config($vmid, $conf);
        }
    }

    update_ipconfig($vmid, $conf, $opt, $eth, $newnet, $rootdir);
}
848
# Create the veth pair for network device $opt (index $netid), plug the host
# side, move the peer into the running container $vmid as $newnet->{name} and
# bring it up. The resulting device (type, bridge, tag, firewall, hwaddr,
# name) is then written to the container configuration.
sub hotplug_net {
    my ($vmid, $conf, $opt, $newnet, $netid) = @_;

    my $veth = "veth${vmid}i${netid}";
    my $vethpeer = $veth . "p";
    my $eth = $newnet->{name};

    PVE::Network::veth_create($veth, $vethpeer, $newnet->{bridge}, $newnet->{hwaddr});
    PVE::Network::tap_plug($veth, $newnet->{bridge}, $newnet->{tag}, $newnet->{firewall}, $newnet->{trunks}, $newnet->{rate});

    # attach peer in container
    PVE::Tools::run_command(['lxc-device', '-n', $vmid, 'add', $vethpeer, "$eth"]);

    # link up peer in container
    PVE::Tools::run_command(
        ['lxc-attach', '-n', $vmid, '-s', 'NETWORK', '--', '/sbin/ip', 'link', 'set', $eth, 'up']);

    # only settings with a true value are recorded
    my $done = { type => 'veth' };
    for my $key (qw(bridge tag firewall hwaddr name)) {
        my $value = $newnet->{$key};
        $done->{$key} = $value if $value;
    }
    $conf->{$opt} = PVE::LXC::Config->print_lxc_network($done);

    PVE::LXC::Config->write_config($vmid, $conf);
}
875
# Apply changed IP address / gateway settings of network device $opt to the
# running container $vmid (interface $eth inside the container) and store the
# new values in the configuration. IPv4 and IPv6 are handled one after the
# other with the same procedure:
#
#   1. add the new address (abort on failure)
#   2. replace (or remove) the default route; on failure the newly added
#      address is removed again and the update is aborted
#   3. remove the old address, ignoring errors
#
# After that the configuration is written and the network setup inside the
# container is regenerated via PVE::LXC::Setup.
#
# The values 'auto', 'dhcp' and 'manual' are not real addresses and are
# therefore never passed to 'ip addr add/del'.
sub update_ipconfig {
    my ($vmid, $conf, $opt, $eth, $newnet, $rootdir) = @_;

    my $lxc_setup = PVE::LXC::Setup->new($conf, $rootdir);

    my $optdata = PVE::LXC::Config->parse_lxc_network($conf->{$opt});

    # run a command in the network namespace of the container; the first
    # argument is a hash of additional run_command() options
    my $nscmd = sub {
        my $cmdargs = shift;
        PVE::Tools::run_command(['lxc-attach', '-n', $vmid, '-s', 'NETWORK', '--', @_], %$cmdargs);
    };
    my $ipcmd = sub { $nscmd->({}, '/sbin/ip', @_) };

    my $change_ip_config = sub {
        my ($ipversion) = @_;

        my $family_opt = "-$ipversion";
        # property names: ip/gw for IPv4, ip6/gw6 for IPv6
        my $suffix = $ipversion == 4 ? '' : $ipversion;
        my $gw= "gw$suffix";
        my $ip= "ip$suffix";

        my $newip = $newnet->{$ip};
        my $newgw = $newnet->{$gw};
        my $oldip = $optdata->{$ip};
        my $oldgw = $optdata->{$gw};

        my $change_ip = $safe_string_ne->($oldip, $newip);
        my $change_gw = $safe_string_ne->($oldgw, $newgw);

        return if !$change_ip && !$change_gw;

        # step 1: add new IP, if this fails we cancel
        my $is_real_ip = ($newip && $newip !~ /^(?:auto|dhcp|manual)$/);
        if ($change_ip && $is_real_ip) {
            eval { $ipcmd->($family_opt, 'addr', 'add', $newip, 'dev', $eth); };
            if (my $err = $@) {
                warn $err;
                return;
            }
        }

        # step 2: replace gateway
        # If this fails we delete the added IP and cancel.
        # If it succeeds we save the config and delete the old IP, ignoring
        # errors. The config is then saved.
        # Note: 'ip route replace' can add
        if ($change_gw) {
            if ($newgw) {
                eval {
                    # a gateway outside of the new subnet needs an explicit
                    # route via the device first
                    if ($is_real_ip && !PVE::Network::is_ip_in_cidr($newgw, $newip, $ipversion)) {
                        $ipcmd->($family_opt, 'route', 'add', $newgw, 'dev', $eth);
                    }
                    $ipcmd->($family_opt, 'route', 'replace', 'default', 'via', $newgw);
                };
                if (my $err = $@) {
                    warn $err;
                    # the route was not replaced, the old IP is still available
                    # rollback (delete new IP) and cancel
                    if ($change_ip) {
                        eval { $ipcmd->($family_opt, 'addr', 'del', $newip, 'dev', $eth); };
                        warn $@ if $@; # no need to die here
                    }
                    return;
                }
            } else {
                eval { $ipcmd->($family_opt, 'route', 'del', 'default'); };
                # if the route was not deleted, the guest might have deleted it manually
                # warn and continue
                warn $@ if $@;
            }
            if ($oldgw && $oldip && !PVE::Network::is_ip_in_cidr($oldgw, $oldip)) {
                eval { $ipcmd->($family_opt, 'route', 'del', $oldgw, 'dev', $eth); };
                # warn if the route was deleted manually
                warn $@ if $@;
            }
        }

        # from this point on we save the configuration
        # step 3: delete old IP ignoring errors
        # ('manual' is, like 'auto' and 'dhcp', no address which could be deleted)
        if ($change_ip && $oldip && $oldip !~ /^(?:auto|dhcp|manual)$/) {
            # We need to enable promote_secondaries, otherwise our newly added
            # address will be removed along with the old one.
            my $promote = 0;
            eval {
                if ($ipversion == 4) {
                    # remember the current setting so it can be restored below
                    $nscmd->({ outfunc => sub { $promote = int(shift) } },
                        'cat', "/proc/sys/net/ipv4/conf/$eth/promote_secondaries");
                    $nscmd->({}, 'sysctl', "net.ipv4.conf.$eth.promote_secondaries=1");
                }
                $ipcmd->($family_opt, 'addr', 'del', $oldip, 'dev', $eth);
            };
            warn $@ if $@; # no need to die here

            if ($ipversion == 4) {
                $nscmd->({}, 'sysctl', "net.ipv4.conf.$eth.promote_secondaries=$promote");
            }
        }

        foreach my $property ($ip, $gw) {
            if ($newnet->{$property}) {
                $optdata->{$property} = $newnet->{$property};
            } else {
                delete $optdata->{$property};
            }
        }
        $conf->{$opt} = PVE::LXC::Config->print_lxc_network($optdata);
        PVE::LXC::Config->write_config($vmid, $conf);
        $lxc_setup->setup_network($conf);
    };

    $change_ip_config->(4);
    $change_ip_config->(6);

}
991
34fdb3d7
WB
# Move the calling process into one namespace of a container.
#
# $vmid  - container id, only used in error messages
# $pid   - pid of a process living inside the container
# $which - namespace file name below /proc/$pid/ns (e.g. 'mnt')
# $type  - namespace type constant handed to PVE::Tools::setns()
#
# Dies if the namespace file cannot be opened or setns() fails.
my $enter_namespace = sub {
    my ($vmid, $pid, $which, $type) = @_;

    my $ns_fh;
    if (!sysopen($ns_fh, "/proc/$pid/ns/$which", O_RDONLY)) {
        die "failed to open $which namespace of container $vmid: $!\n";
    }

    if (!PVE::Tools::setns(fileno($ns_fh), $type)) {
        die "failed to enter $which namespace of container $vmid: $!\n";
    }

    # The handle is only needed for the setns() call itself.
    close $ns_fh;
};
1000
# Child-side half of sync_container_namespace().
#
# Enters the mount namespace of $pid, signals readiness with "go\n" on
# $socket, reads the mount table sent back by the parent until EOF and
# syncs every listed mount point except lxcfs. Sync errors are only warned
# about so the remaining mount points still get flushed.
my $do_syncfs = sub {
    my ($vmid, $pid, $socket) = @_;

    $enter_namespace->($vmid, $pid, 'mnt', PVE::Tools::CLONE_NEWNS);

    # Tell the parent process to start reading our /proc/mounts
    print {$socket} "go\n";
    $socket->flush();

    # Receive /proc/self/mounts
    my $mount_table = do { local $/ = undef; <$socket> };
    close $socket;

    # Now sync all mountpoints...
    my $entries = PVE::ProcFSTools::parse_mounts($mount_table);
    for my $entry (@$entries) {
        my (undef, $target, $fstype) = @$entry;
        if ($fstype ne 'fuse.lxcfs') {
            eval { PVE::Tools::sync_mountpoint($target); };
            warn $@ if $@;
        }
    }
};
1023
# Flush all file systems mounted inside a running container.
#
# A child is forked that enters the container's mount namespace (see
# $do_syncfs). Once the child reports "go\n", the parent reads
# /proc/$child/mounts, sends its content back over the socket pair and
# waits for the child to finish syncing.
#
# Dies if the socket pair or fork fails, if the child never reaches the
# namespace, if its mount table cannot be read, or if it exits non-zero.
# On the handshake and mount-table failures the child is reaped first, so
# no zombie process is left behind.
sub sync_container_namespace {
    my ($vmid) = @_;
    my $pid = find_lxc_pid($vmid);

    # SOCK_DGRAM is nicer for barriers but cannot be slurped
    socketpair my $pfd, my $cfd, AF_UNIX, SOCK_STREAM, PF_UNSPEC
        or die "failed to create socketpair: $!\n";

    my $child = fork();
    die "fork failed: $!\n" if !defined($child);

    if (!$child) {
        eval {
            close $pfd;
            &$do_syncfs($vmid, $pid, $cfd);
        };
        if (my $err = $@) {
            warn $err;
            POSIX::_exit(1);
        }
        POSIX::_exit(0);
    }
    close $cfd;

    # Closing our end lets a child that is still waiting for the mount
    # table see EOF and terminate, so the waitpid() below cannot hang.
    my $abort = sub {
        my ($msg) = @_;
        close $pfd;
        waitpid($child, 0);
        die $msg;
    };

    # The read yields undef when the child died before the handshake.
    my $go = <$pfd>;
    $abort->("failed to enter container namespace\n")
        if !defined($go) || $go ne "go\n";

    my $mounts;
    if (!open($mounts, '<', "/proc/$child/mounts")) {
        my $reason = "$!"; # save it, close/waitpid may clobber $!
        $abort->("failed to open container's /proc/mounts: $reason\n");
    }
    my $mountdata = do { local $/ = undef; <$mounts> };
    close $mounts;
    print {$pfd} $mountdata;
    close $pfd;

    while (waitpid($child, 0) != $child) {}
    die "failed to sync container namespace\n" if $? != 0;
}
1060
bb1ac2de
DM
# Turn all volumes of a container into base (template) volumes.
#
# Works in two passes over the mount points of $conf: the first one dies
# if any volume lacks the 'template' storage feature, the second one
# activates each volume, converts it with vdisk_create_base() and stores
# the resulting volume id back into $conf. The updated configuration is
# written at the end.
sub template_create {
    my ($vmid, $conf) = @_;

    my $storecfg = PVE::Storage::config();

    # Pass 1: check everything before touching any volume.
    my $assert_template_capable = sub {
        my ($key, $mp) = @_;

        my $volid = $mp->{volume};

        die "Template feature is not available for '$volid'\n"
            if !PVE::Storage::volume_has_feature($storecfg, 'template', $volid);
    };
    PVE::LXC::Config->foreach_mountpoint($conf, $assert_template_capable);

    # Pass 2: convert and record the new volume ids.
    my $convert_to_base = sub {
        my ($key, $mp) = @_;

        my $volid = $mp->{volume};

        PVE::Storage::activate_volumes($storecfg, [$volid]);

        $mp->{volume} = PVE::Storage::vdisk_create_base($storecfg, $volid);
        $conf->{$key} = PVE::LXC::Config->print_ct_mountpoint($mp, $key eq "rootfs");
    };
    PVE::LXC::Config->foreach_mountpoint($conf, $convert_to_base);

    return PVE::LXC::Config->write_config($vmid, $conf);
}
1089
# Verify that $authuser may change or delete the given container options.
#
# $newconf is a hash of options to set, $delete a list of option names to
# remove. Each option is mapped to the privilege that guards it and checked
# through $rpcenv->check_vm_perm(). root@pam is always allowed. Non-volume
# mount points and the 'features' option raise a permission exception for
# everybody else. Returns 1 when all checks passed.
sub check_ct_modify_config_perm {
    my ($rpcenv, $authuser, $vmid, $pool, $newconf, $delete) = @_;

    return 1 if $authuser eq 'root@pam';

    # Options guarded by exactly one privilege and nothing else.
    my %priv_for = (
        cores => 'VM.Config.CPU',
        cpuunits => 'VM.Config.CPU',
        cpulimit => 'VM.Config.CPU',
        memory => 'VM.Config.Memory',
        swap => 'VM.Config.Memory',
        nameserver => 'VM.Config.Network',
        searchdomain => 'VM.Config.Network',
        hostname => 'VM.Config.Network',
    );

    my $require = sub {
        my ($priv) = @_;
        $rpcenv->check_vm_perm($authuser, $vmid, $pool, [$priv]);
    };

    my $check_option = sub {
        my ($opt, $is_delete) = @_;

        if ($opt eq 'rootfs' || $opt =~ /^mp\d+$/) {
            $require->('VM.Config.Disk');
            return if $is_delete;
            # Only storage backed volumes may be configured by non-root users.
            my $data;
            if ($opt eq 'rootfs') {
                $data = PVE::LXC::Config->parse_ct_rootfs($newconf->{$opt});
            } else {
                $data = PVE::LXC::Config->parse_ct_mountpoint($newconf->{$opt});
            }
            raise_perm_exc("mount point type $data->{type} is only allowed for root\@pam")
                if $data->{type} ne 'volume';
        } elsif (exists $priv_for{$opt}) {
            $require->($priv_for{$opt});
        } elsif ($opt =~ m/^net\d+$/) {
            $require->('VM.Config.Network');
        } elsif ($opt eq 'features') {
            # For now this is restricted to root@pam
            raise_perm_exc("changing feature flags is only allowed for root\@pam");
        } else {
            $require->('VM.Config.Options');
        }
    };

    $check_option->($_, 0) for keys %$newconf;
    $check_option->($_, 1) for @$delete;

    return 1;
}
1128
# Unmount all mount points of a container in reverse order.
#
# Paths are built below /var/lib/lxc/$vmid/rootfs. Entries without a volume
# or mount path, and paths that are not currently mounted, are skipped.
# A failing umount is fatal unless $noerr is set, in which case it is only
# reported with warn.
sub umount_all {
    my ($vmid, $storage_cfg, $conf, $noerr) = @_;

    my $rootdir = "/var/lib/lxc/$vmid/rootfs";
    # NOTE(review): the volume list is not used below; the call is kept as is.
    my $volid_list = PVE::LXC::Config->get_vm_volumes($conf);

    my $unmount_one = sub {
        my ($key, $mp) = @_;

        my ($volid, $mount) = ($mp->{volume}, $mp->{mp});
        return if !$volid || !$mount;

        # collapse duplicate slashes so the mount table lookup matches
        (my $target = "$rootdir/$mount") =~ s!/+!/!g;

        return if !PVE::ProcFSTools::is_mounted($target);

        eval {
            PVE::Tools::run_command(['umount', '-d', $target]);
        };
        my $err = $@;
        return if !$err;

        die $err if !$noerr;
        warn $err;
    };

    PVE::LXC::Config->foreach_mountpoint_reverse($conf, $unmount_one);
}
1160
# Mount every mount point of a container below /var/lib/lxc/$vmid/rootfs.
#
# The root directory is created if needed and all volumes are activated
# first. With $ignore_ro set, the 'ro' flag of each mount point is cleared
# before mounting. If any mount fails, everything is unmounted again
# (ignoring unmount errors) and the original error is re-thrown.
# Returns the root directory path.
sub mount_all {
    my ($vmid, $storage_cfg, $conf, $ignore_ro) = @_;

    my $rootdir = "/var/lib/lxc/$vmid/rootfs";
    File::Path::make_path($rootdir);

    my $volid_list = PVE::LXC::Config->get_vm_volumes($conf);
    PVE::Storage::activate_volumes($storage_cfg, $volid_list);

    my $mount_one = sub {
        my ($key, $mp) = @_;

        $mp->{ro} = 0 if $ignore_ro;

        mountpoint_mount($mp, $rootdir, $storage_cfg);
    };

    eval {
        PVE::LXC::Config->foreach_mountpoint($conf, $mount_one);
    };
    my $err = $@;
    return $rootdir if !$err;

    # roll back whatever got mounted so far
    warn "mounting container failed\n";
    umount_all($vmid, $storage_cfg, $conf, 1);
    die $err;
}
1187
1188
# Resolve the path of a mount point's volume without mounting anything.
# Delegates to mountpoint_mount() with an undefined root directory and
# passes its result through in the caller's context.
sub mountpoint_mount_path {
    my ($mountpoint, $storage_cfg, $snapname) = @_;

    my $no_rootdir = undef;
    return mountpoint_mount($mountpoint, $no_rootdir, $storage_cfg, $snapname);
}
cc6b0307 1194
21f292ff
WB
# Look up the loop device associated with a file.
#
# Runs 'losetup --associated $path' and returns the /dev/loopN name from
# the last matching output line, or undef if no line matched.
sub query_loopdev {
    my ($path) = @_;

    my $loopdev;
    PVE::Tools::run_command(
        ['losetup', '--associated', $path],
        outfunc => sub {
            my ($line) = @_;
            $loopdev = $1 if $line =~ m@^(/dev/loop\d+):@;
        },
    );

    return $loopdev;
}
1208
50df544c
WB
# Run a function with a file attached to a loop device.
# The loop device is always detached afterwards (or set to autoclear).
# Returns the loop device.
#
# $func     - code reference, called with the loop device path
# $file     - backing file
# $readonly - attach the loop device read-only when true
#
# If $file is already attached to a loop device, that device is reused and
# left attached. Otherwise a new device is set up and detached again after
# $func returned or died. An error from $func takes precedence over a
# detach error, which is then only warned about; a detach error on its own
# is still fatal.
sub run_with_loopdev {
    my ($func, $file, $readonly) = @_;

    my $device = query_loopdev($file);
    # Try to reuse an existing device
    if ($device) {
        # We assume that whoever setup the loop device is responsible for
        # detaching it.
        $func->($device);
        return $device;
    }

    my $parser = sub {
        my $line = shift;
        if ($line =~ m@^(/dev/loop\d+)$@) {
            $device = $1;
        }
    };
    my $losetup_cmd = [
        'losetup',
        '--show',
        '-f',
        $file,
    ];
    push @$losetup_cmd, '-r' if $readonly;
    PVE::Tools::run_command($losetup_cmd, outfunc => $parser);
    die "failed to setup loop device for $file\n" if !$device;

    eval { $func->($device); };
    my $err = $@;

    # Detach in its own eval so a detach failure cannot hide $err.
    eval { PVE::Tools::run_command(['losetup', '-d', $device]); };
    if (my $detach_err = $@) {
        die $detach_err if !$err;
        warn $detach_err;
    }

    die $err if $err;
    return $device;
}
1244
ab3722b3
WB
# Walk from directory $start down the relative path $subdir one component at
# a time, opening each component relative to its parent with O_NOFOLLOW so
# that a symlink anywhere on the way makes the walk die.
#
# $start  - directory to start from, opened with O_PATH | O_DIRECTORY
# $subdir - path below $start; empty components are ignored
# $mkdir  - when true, components that cannot be opened are created
#           (mode 0755) and opened again
#
1245# In scalar mode: returns a file handle to the deepest directory node.
1246# In list context: returns a list of:
1247# * the deepest directory node
1248# * the 2nd deepest directory (parent of the above)
1249# * directory name of the last directory
1250# So that the path $2/$3 should lead to $1 afterwards.
# With no path components at all, the list form returns the start handle,
# a false value and undef.
1251sub walk_tree_nofollow($$$) {
1252 my ($start, $subdir, $mkdir) = @_;
1253
1254 # splitdir() returns '' for empty components including the leading /
1255 my @comps = grep { length($_)>0 } File::Spec->splitdir($subdir);
1256
1257 sysopen(my $fd, $start, O_PATH | O_DIRECTORY)
1258 or die "failed to open start directory $start: $!\n";
1259
# $dir is only the textual path used in error messages. $fd is the handle
# of the directory reached so far, $second the handle of its parent (the
# same handle as $fd until the first component has been entered).
1260 my $dir = $start;
1261 my $last_component = undef;
1262 my $second = $fd;
1263 foreach my $component (@comps) {
1264 $dir .= "/$component";
1265 my $next = PVE::Tools::openat(fileno($fd), $component, O_NOFOLLOW | O_DIRECTORY);
1266
1267 if (!$next) {
1268 # failed, check for symlinks and try to create the path
3eb5f47b 1269 die "symlink encountered at: $dir\n" if $! == ELOOP || $! == ENOTDIR;
ab3722b3
WB
1270 die "cannot open directory $dir: $!\n" if !$mkdir;
1271
1272 # We don't check for errors on mkdirat() here and just try to
1273 # openat() again, since at least one error (EEXIST) is an
1274 # expected possibility if multiple containers start
1275 # simultaneously. If someone else injects a symlink now then
1276 # the subsequent openat() will fail due to O_NOFOLLOW anyway.
1277 PVE::Tools::mkdirat(fileno($fd), $component, 0755);
1278
1279 $next = PVE::Tools::openat(fileno($fd), $component, O_NOFOLLOW | O_DIRECTORY);
1280 die "failed to create path: $dir: $!\n" if !$next;
1281 }
1282
# Shift the window one level down: the old parent handle is no longer
# needed (on the first iteration it is still the start handle, which
# stays open because it becomes the new parent).
1283 close $second if defined($last_component);
1284 $last_component = $component;
1285 $second = $fd;
1286 $fd = $next;
1287 }
1288
1289 return ($fd, defined($last_component) && $second, $last_component) if wantarray;
# Scalar context: the caller only gets the deepest handle, so the parent
# handle is closed here.
1290 close $second if defined($last_component);
1291 return $fd;
1292}
1293
619f27b4
WB
# To guard against symlink attack races against other currently running
# containers with shared recursive bind mount hierarchies we prepare a
# directory handle for the directory we're mounting over to verify the
# mountpoint afterwards.
#
# Walks from $hostroot to $dir without following symlinks and without
# creating missing directories; returns the handle of the final directory.
sub __bindmount_prepare {
    my ($hostroot, $dir) = @_;

    return scalar walk_tree_nofollow($hostroot, $dir, 0);
}
ab3722b3 1303
619f27b4
WB
# Check that a freshly created bind mount really shows the intended source.
#
# $srcdh    - handle of the source directory
# $parentfd - handle of the mount point's parent directory, false when
#             there is none
# $last_dir - name of the mount point relative to $parentfd, or the path
#             that is opened directly when $parentfd is false
# $ro       - when true (and $parentfd is set), also require the mounted
#             directory to reject write access with EROFS
#
# Returns true if source and destination handles report the same device and
# inode. Both handles are closed before returning. Dies if the mount point
# cannot be opened or the read-only check fails.
#
1304# Assuming we mount to rootfs/a/b/c, verify with the directory handle to 'b'
1305# ($parentfd) that 'b/c' (openat($parentfd, 'c')) really leads to the directory
1306# we intended to bind mount.
1307sub __bindmount_verify {
1308 my ($srcdh, $parentfd, $last_dir, $ro) = @_;
ab3722b3
WB
1309 my $destdh;
1310 if ($parentfd) {
1311 # Open the mount point path coming from the parent directory since the
1312 # filehandle we would have gotten as first result of walk_tree_nofollow
1313 # earlier is still a handle to the underlying directory instead of the
1314 # mounted path.
619f27b4
WB
1315 $destdh = PVE::Tools::openat(fileno($parentfd), $last_dir, PVE::Tools::O_PATH | O_NOFOLLOW | O_DIRECTORY);
1316 die "failed to open mount point: $!\n" if !$destdh;
1317 if ($ro) {
# Ask for write access to '.' relative to the mounted directory: the
# call succeeding means the mount is still writable, and any failure
# other than EROFS is treated as a failed check as well.
1318 my $dot = '.';
619f27b4 1319 # no separate function because 99% of the time it's the wrong thing to use.
0389da0d 1320 if (syscall(PVE::Syscall::faccessat, fileno($destdh), $dot, &POSIX::W_OK, 0) != -1) {
619f27b4
WB
1321 die "failed to mark bind mount read only\n";
1322 }
1323 die "read-only check failed: $!\n" if $! != EROFS;
1324 }
ab3722b3
WB
1325 } else {
1326 # For the rootfs we don't have a parentfd so we open the path directly.
1327 # Note that this means bindmounting any prefix of the host's
1328 # /var/lib/lxc/$vmid path into another container is considered a grave
1329 # security error.
1330 sysopen $destdh, $last_dir, O_PATH | O_DIRECTORY;
619f27b4 1331 die "failed to open mount point: $!\n" if !$destdh;
ab3722b3 1332 }
ab3722b3
WB
1333
# Only the first two stat() fields (device and inode number) are compared.
1334 my ($srcdev, $srcinode) = stat($srcdh);
1335 my ($dstdev, $dstinode) = stat($destdh);
1336 close $srcdh;
1337 close $destdh;
1338
619f27b4
WB
1339 return ($srcdev == $dstdev && $srcinode == $dstinode);
1340}
1341
# Perform the actual bind mounting:
# bind $dir onto $dest (with any extra mount options) and, if $ro is set,
# remount the bind mount read-only. When the remount fails the mount is
# removed again before the error is re-thrown.
sub __bindmount_do {
    my ($dir, $dest, $ro, @extra_opts) = @_;

    my @bind_cmd = ('mount', '-o', 'bind', @extra_opts, $dir, $dest);
    PVE::Tools::run_command(\@bind_cmd);
    return if !$ro;

    eval { PVE::Tools::run_command(['mount', '-o', 'bind,remount,ro', $dest]); };
    my $err = $@;
    return if !$err;

    warn "bindmount error\n";
    # don't leave writable bind-mounts behind...
    PVE::Tools::run_command(['umount', $dest]);
    die $err;
}
1356
# Bind mount $dir onto $dest and verify the result.
#
# A handle to the source directory is taken before mounting. Afterwards the
# mount point is reopened through $parentfd/$last_dir and compared against
# that handle. If they differ the mount is removed again and the function
# dies. $ro and @extra_opts are handed to the mount step.
sub bindmount {
    my ($dir, $parentfd, $last_dir, $dest, $ro, @extra_opts) = @_;

    my $source_handle = __bindmount_prepare('/', $dir);

    __bindmount_do($dir, $dest, $ro, @extra_opts);

    my $verified = __bindmount_verify($source_handle, $parentfd, $last_dir, $ro);
    return if $verified;

    PVE::Tools::run_command(['umount', $dest]);
    die "detected mount path change at: $dir\n";
}
1369
619f27b4
WB
# Cleanup $rootdir a bit (double and trailing slashes), build the mount path
# from $rootdir and $mount and walk the path from $rootdir to the final
# directory to check for symlinks.
#
# Missing directories on the way are created. Returns the cleaned root
# directory, the mount path, and the three values of walk_tree_nofollow()
# in list context (mount point handle, parent handle, last directory name).
sub __mount_prepare_rootdir {
    my ($rootdir, $mount) = @_;

    for ($rootdir) {
        s!/+!/!g;
        s!/+$!!;
    }

    my ($mp_handle, $parent_handle, $leaf_name) =
        walk_tree_nofollow($rootdir, $mount, 1);

    return ($rootdir, "$rootdir/$mount", $mp_handle, $parent_handle, $leaf_name);
}
1381
# Mount a single container mount point, or just resolve its path.
#
# $mountpoint  - hash with at least 'volume' and 'mp'; 'type', 'ro',
#                'quota' and 'acl' are read as well
# $rootdir     - directory the mount path is built under; when undefined,
#                nothing is mounted and only the path is determined
# $storage_cfg - storage configuration handed to the PVE::Storage calls
# $snapname    - optional snapshot name
#
# Returns nothing if the mount point has no volume or mount path. Otherwise
# returns the volume path (or the volume id for 'device' and 'bind' types)
# in scalar context, and in list context the triple
# (path, loop-device-flag, mounted-device).
# Dies on unknown storage types or image formats and on mount failures.
b15c75fc 1382# use $rootdir = undef to just return the corresponding mount path
cc6b0307 1383sub mountpoint_mount {
da629848 1384 my ($mountpoint, $rootdir, $storage_cfg, $snapname) = @_;
cc6b0307
AD
1385
1386 my $volid = $mountpoint->{volume};
1387 my $mount = $mountpoint->{mp};
7c921c80 1388 my $type = $mountpoint->{type};
# Quota is only honoured for writable, non-snapshot mounts.
50df544c
WB
1389 my $quota = !$snapname && !$mountpoint->{ro} && $mountpoint->{quota};
1390 my $mounted_dev;
b15c75fc
cc6b0307
AD
1392 return if !$volid || !$mount;
1393
ab3722b3
WB
1394 $mount =~ s!/+!/!g;
1395
b15c75fc 1396 my $mount_path;
ab3722b3 1397 my ($mpfd, $parentfd, $last_dir);
b15c75fc
DM
1398
# $mount_path stays undefined without a $rootdir; every branch below only
# mounts when it is set.
1399 if (defined($rootdir)) {
619f27b4
WB
1400 ($rootdir, $mount_path, $mpfd, $parentfd, $last_dir) =
1401 __mount_prepare_rootdir($rootdir, $mount);
116ce06f 1402 }
b15c75fc
DM
1403
1404 my ($storage, $volname) = PVE::Storage::parse_volume_id($volid, 1);
cc6b0307
b15c75fc 1406 die "unknown snapshot path for '$volid'" if !$storage && defined($snapname);
cc6b0307
# Collect extra mount options: only the ACL setting at this point, the
# raw-image and device branches append to the list later.
471dd315 1408 my $optstring = '';
719129ea
WB
1409 my $acl = $mountpoint->{acl};
1410 if (defined($acl)) {
1411 $optstring .= ($acl ? 'acl' : 'noacl');
471dd315 1412 }
c2744c97 1413 my $readonly = $mountpoint->{ro};
471dd315
9de0505c
WL
1415 my @extra_opts;
1416 @extra_opts = ('-o', $optstring) if $optstring;
471dd315
# Case 1: the volume id names a storage.
b15c75fc
DM
1418 if ($storage) {
1419
1420 my $scfg = PVE::Storage::storage_config($storage_cfg, $storage);
7c138f58
841fba68 1422 my $path = PVE::Storage::map_volume($storage_cfg, $volid, $snapname);
7c138f58
841fba68 1424 $path = PVE::Storage::path($storage_cfg, $volid, $snapname) if !defined($path);
b15c75fc
DM
1425
1426 my ($vtype, undef, undef, undef, undef, $isBase, $format) =
1427 PVE::Storage::parse_volname($storage_cfg, $volid);
1428
c87b9dd8
DM
1429 $format = 'iso' if $vtype eq 'iso'; # allow to handle iso files
1430
# Subvolumes: snapshots are mounted read-only and only for zfspool
# storages, everything else is bind mounted.
b15c75fc 1431 if ($format eq 'subvol') {
30de33be
DM
1432 if ($mount_path) {
1433 if ($snapname) {
e84f7f5d
DM
1434 if ($scfg->{type} eq 'zfspool') {
1435 my $path_arg = $path;
1436 $path_arg =~ s!^/+!!;
471dd315 1437 PVE::Tools::run_command(['mount', '-o', 'ro', @extra_opts, '-t', 'zfs', $path_arg, $mount_path]);
e84f7f5d 1438 } else {
30de33be
DM
1439 die "cannot mount subvol snapshots for storage type '$scfg->{type}'\n";
1440 }
e84f7f5d 1441 } else {
719129ea
WB
1442 if (defined($acl) && $scfg->{type} eq 'zfspool') {
1443 my $acltype = ($acl ? 'acltype=posixacl' : 'acltype=noacl');
1444 my (undef, $name) = PVE::Storage::parse_volname($storage_cfg, $volid);
1445 $name .= "\@$snapname" if defined($snapname);
1446 PVE::Tools::run_command(['zfs', 'set', $acltype, "$scfg->{pool}/$name"]);
1447 }
ab3722b3 1448 bindmount($path, $parentfd, $last_dir//$rootdir, $mount_path, $readonly, @extra_opts);
50df544c 1449 warn "cannot enable quota control for bind mounted subvolumes\n" if $quota;
30de33be 1450 }
b15c75fc 1451 }
5f6280cf 1452 return wantarray ? ($path, 0, undef) : $path;
c87b9dd8 1453 } elsif ($format eq 'raw' || $format eq 'iso') {
ada088e6
WB
1454 # NOTE: 'mount' performs canonicalization without the '-c' switch, which for
1455 # device-mapper devices is special-cased to use the /dev/mapper symlinks.
1456 # Our autodev hook expects the /dev/dm-* device currently
1457 # and will create the /dev/mapper symlink accordingly
e439a713
WB
1458 $path = Cwd::realpath($path);
1459 die "failed to get device path\n" if !$path;
1460 ($path) = ($path =~ /^(.*)$/s); #untaint
# Mount helper shared by the loop device and block device cases. ISO
# images and base/snapshot volumes are mounted read-only. Note that it
# appends to the enclosing @extra_opts.
50df544c
WB
1461 my $domount = sub {
1462 my ($path) = @_;
1463 if ($mount_path) {
1464 if ($format eq 'iso') {
1465 PVE::Tools::run_command(['mount', '-o', 'ro', @extra_opts, $path, $mount_path]);
1466 } elsif ($isBase || defined($snapname)) {
1467 PVE::Tools::run_command(['mount', '-o', 'ro,noload', @extra_opts, $path, $mount_path]);
1468 } else {
1469 if ($quota) {
1470 push @extra_opts, '-o', 'usrjquota=aquota.user,grpjquota=aquota.group,jqfmt=vfsv0';
1471 }
c2744c97 1472 push @extra_opts, '-o', 'ro' if $readonly;
50df544c
WB
1473 PVE::Tools::run_command(['mount', @extra_opts, $path, $mount_path]);
1474 }
1475 }
1476 };
30de33be 1477 my $use_loopdev = 0;
# Storages with a 'path' setting go through a loop device; the listed
# storage types are mounted directly from $path.
b15c75fc 1478 if ($scfg->{path}) {
fd8cab92 1479 $mounted_dev = run_with_loopdev($domount, $path, $readonly);
30de33be 1480 $use_loopdev = 1;
2e879877
DM
1481 } elsif ($scfg->{type} eq 'drbd' || $scfg->{type} eq 'lvm' ||
1482 $scfg->{type} eq 'rbd' || $scfg->{type} eq 'lvmthin') {
50df544c
WB
1483 $mounted_dev = $path;
1484 &$domount($path);
b15c75fc
DM
1485 } else {
1486 die "unsupported storage type '$scfg->{type}'\n";
1487 }
50df544c 1488 return wantarray ? ($path, $use_loopdev, $mounted_dev) : $path;
b15c75fc
DM
1489 } else {
1490 die "unsupported image format '$format'\n";
1491 }
# Case 2: the volume is a device; $volid itself is handed to mount, the
# resolved device path is only returned as third list element.
7c921c80 1492 } elsif ($type eq 'device') {
c9bc95a1 1493 push @extra_opts, '-o', 'ro' if $readonly;
0d449e38
WB
1494 push @extra_opts, '-o', 'usrjquota=aquota.user,grpjquota=aquota.group,jqfmt=vfsv0' if $quota;
1495 # See the NOTE above about devicemapper canonicalization
1496 my ($devpath) = (Cwd::realpath($volid) =~ /^(.*)$/s); # realpath() taints
471dd315 1497 PVE::Tools::run_command(['mount', @extra_opts, $volid, $mount_path]) if $mount_path;
0d449e38 1498 return wantarray ? ($volid, 0, $devpath) : $volid;
# Case 3: bind mount of an existing directory.
e2007ac2
DM
1499 } elsif ($type eq 'bind') {
1500 die "directory '$volid' does not exist\n" if ! -d $volid;
ab3722b3 1501 bindmount($volid, $parentfd, $last_dir//$rootdir, $mount_path, $readonly, @extra_opts) if $mount_path;
50df544c
WB
1502 warn "cannot enable quota control for bind mounts\n" if $quota;
1503 return wantarray ? ($volid, 0, undef) : $volid;
b15c75fc
DM
1504 }
1505
1506 die "unsupported storage";
cc6b0307
AD
1507}
1508
# Create an ext4 file system (with the 'mmp' feature) on $dev.
#
# $rootuid / $rootgid become owner and group of the new file system's root
# directory. Both default to 0 when not given, so that callers passing only
# a device get a well-formed 'root_owner=0:0' option instead of
# 'root_owner=:' plus "uninitialized value" warnings.
sub mkfs {
    my ($dev, $rootuid, $rootgid) = @_;

    $rootuid //= 0;
    $rootgid //= 0;

    PVE::Tools::run_command(['mkfs.ext4', '-O', 'mmp',
                             '-E', "root_owner=$rootuid:$rootgid",
                             $dev]);
}
1516
# Format a container volume with ext4.
#
# $volid is either a device path below /dev, which is formatted directly,
# or a storage volume id, which is activated and resolved to a path first.
# Only 'raw' storage volumes can be formatted. $rootuid / $rootgid are
# passed on to mkfs() in both cases, so the root directory of the new file
# system belongs to the container's root user.
#
# Dies if the volume has no storage or is not in raw format.
sub format_disk {
    my ($storage_cfg, $volid, $rootuid, $rootgid) = @_;

    if ($volid =~ m!^/dev/.+!) {
        # the owner ids must not be dropped for plain devices
        mkfs($volid, $rootuid, $rootgid);
        return;
    }

    my ($storage, $volname) = PVE::Storage::parse_volume_id($volid, 1);

    die "cannot format volume '$volid' with no storage\n" if !$storage;

    PVE::Storage::activate_volumes($storage_cfg, [$volid]);

    # prefer the mapped path, fall back to the plain storage path
    my $path = PVE::Storage::map_volume($storage_cfg, $volid);
    $path = PVE::Storage::path($storage_cfg, $volid) if !defined($path);

    my ($vtype, undef, undef, undef, undef, $isBase, $format) =
        PVE::Storage::parse_volname($storage_cfg, $volid);

    die "cannot format volume '$volid' (format == $format)\n"
        if $format ne 'raw';

    mkfs($path, $rootuid, $rootgid);
}
1543
# Free every volume in $vollist (an array reference of volume ids).
# Errors for single volumes are reported with warn and do not stop the
# loop, so the remaining volumes still get freed.
sub destroy_disks {
    my ($storecfg, $vollist) = @_;

    for my $doomed (@$vollist) {
        eval { PVE::Storage::vdisk_free($storecfg, $doomed); };
        if ($@) {
            warn $@;
        }
    }
}
1552
c9cf8008
WB
# Allocate a new volume for container $vmid on $storage.
#
# The volume format depends on the storage type:
#   dir, nfs, cifs          - raw image if $size_kb > 0, otherwise a
#                             subvolume of size 0
#   zfspool                 - subvolume of $size_kb
#   drbd, lvm, lvmthin, rbd - raw image of $size_kb
# Raw images are formatted right away with $rootuid / $rootgid as owner.
# Subvolumes are reported as needing a chown instead.
#
# Returns ($volid, $needs_chown). On any error an already allocated volume
# is freed again before the error is re-thrown.
sub alloc_disk {
    my ($storecfg, $vmid, $storage, $size_kb, $rootuid, $rootgid) = @_;

    my $volid;
    my $needs_chown = 0;

    my $scfg = PVE::Storage::storage_config($storecfg, $storage);
    # fixme: use better naming ct-$vmid-disk-X.raw?

    my %file_backed = map { $_ => 1 } qw(dir nfs cifs);
    my %block_backed = map { $_ => 1 } qw(drbd lvm lvmthin rbd);

    eval {
        my $type = $scfg->{type};

        my ($format, $alloc_size);
        if ($file_backed{$type}) {
            ($format, $alloc_size) = $size_kb > 0 ? ('raw', $size_kb) : ('subvol', 0);
        } elsif ($type eq 'zfspool') {
            ($format, $alloc_size) = ('subvol', $size_kb);
        } elsif ($block_backed{$type}) {
            ($format, $alloc_size) = ('raw', $size_kb);
        } else {
            die "unable to create containers on storage type '$scfg->{type}'\n";
        }

        $volid = PVE::Storage::vdisk_alloc($storecfg, $storage, $vmid, $format,
                                           undef, $alloc_size);

        if ($format eq 'subvol') {
            $needs_chown = 1;
        } else {
            format_disk($storecfg, $volid, $rootuid, $rootgid);
        }
    };
    my $err = $@;
    return ($volid, $needs_chown) if !$err;

    # in case formatting got interrupted:
    if (defined($volid)) {
        eval { PVE::Storage::vdisk_free($storecfg, $volid); };
        warn $@ if $@;
    }
    die $err;
}
1604
# Matches "<storage>:<size in GB>", the notation asking for a new volume.
our $NEW_DISK_RE = qr/^([^:\s]+):(\d+(\.\d+)?)$/;

# Create the volumes requested in $settings and record them in $conf.
#
# For every mount point in $settings whose volume matches $NEW_DISK_RE a new
# volume of the requested size is allocated; all other entries are copied
# into $conf as they are. Volumes that alloc_disk() reports as needing a
# chown get the container's root uid/gid (taken from the id map in $conf)
# as owner.
#
# Returns an array reference with the ids of all newly allocated volumes.
# If anything dies, the volumes allocated so far are destroyed again and
# the error is re-thrown. A failing chown is reported with warn, it used
# to be ignored silently.
sub create_disks {
    my ($storecfg, $vmid, $settings, $conf) = @_;

    my $vollist = [];

    eval {
        my (undef, $rootuid, $rootgid) = PVE::LXC::parse_id_maps($conf);
        my $chown_vollist = [];

        PVE::LXC::Config->foreach_mountpoint($settings, sub {
            my ($ms, $mountpoint) = @_;

            my $volid = $mountpoint->{volume};
            my $mp = $mountpoint->{mp};

            my ($storage, $volname) = PVE::Storage::parse_volume_id($volid, 1);

            if ($storage && ($volid =~ $NEW_DISK_RE)) {
                my ($storeid, $size_gb) = ($1, $2);

                # GB -> KB, rounded down to whole MB
                my $size_kb = int(${size_gb}*1024) * 1024;

                my $needs_chown = 0;
                ($volid, $needs_chown) = alloc_disk($storecfg, $vmid, $storage, $size_kb, $rootuid, $rootgid);
                push @$chown_vollist, $volid if $needs_chown;
                push @$vollist, $volid;
                $mountpoint->{volume} = $volid;
                $mountpoint->{size} = $size_kb * 1024;
                $conf->{$ms} = PVE::LXC::Config->print_ct_mountpoint($mountpoint, $ms eq 'rootfs');
            } else {
                # use specified/existing volid/dir/device
                $conf->{$ms} = PVE::LXC::Config->print_ct_mountpoint($mountpoint, $ms eq 'rootfs');
            }
        });

        PVE::Storage::activate_volumes($storecfg, $chown_vollist, undef);
        foreach my $volid (@$chown_vollist) {
            my $path = PVE::Storage::path($storecfg, $volid, undef);
            # chown returns the number of files changed, 0 means failure
            chown($rootuid, $rootgid, $path)
                or warn "failed to change ownership of '$volid' ($path): $!\n";
        }
        PVE::Storage::deactivate_volumes($storecfg, $chown_vollist, undef);
    };
    # free allocated images on error
    if (my $err = $@) {
        destroy_disks($storecfg, $vollist);
        die $err;
    }
    return $vollist;
}
1655
6827e44d
AA
# Synchronise the 'size' of each mount point in $conf with the scanned
# volume sizes.
#
# $all_volumes maps volume ids to hashes with a 'size' entry (as built by
# scan_volids()). Volumes missing from it count as size 0. Whenever the
# recorded size is undefined or differs, the mount point entry in $conf is
# rewritten and a message is printed. $all_volumes itself is left untouched.
#
# Returns 1 if at least one entry was changed, undef otherwise.
#
# NOTE(review): mount points that are not storage volumes (bind mounts,
# devices) presumably never appear in $all_volumes and so get size 0 here;
# that matches the previous behaviour and is kept. Confirm it is intended.
sub update_disksize {
    my ($vmid, $conf, $all_volumes) = @_;

    my $changes;
    my $prefix = "CT $vmid:";

    my $update_mp = sub {
        my ($key, $mp, @param) = @_;

        # look the volume up without autovivifying an entry for it
        my $scanned = $all_volumes->{$mp->{volume}};
        my $size = ($scanned ? $scanned->{size} : undef) // 0;

        if (!defined($mp->{size}) || $size != $mp->{size}) {
            $changes = 1;
            print "$prefix updated volume size of '$mp->{volume}' in config.\n";
            $mp->{size} = $size;
            # 'my $x = ... if COND' has undefined behaviour, so the flag
            # is passed as a plain expression instead
            $conf->{$key} = PVE::LXC::Config->print_ct_mountpoint($mp, $key eq 'rootfs');
        }
    };

    PVE::LXC::Config->foreach_mountpoint($conf, $update_mp);

    return $changes;
}
1679
# Add scanned volumes that the configuration does not reference as
# 'unusedN' entries.
#
# $all_volumes maps volume ids to hashes with 'volid' and 'path' (as built
# by scan_volids()). A volume counts as referenced if it is in use by $conf
# or already listed as unused. Volumes are processed in sorted order, so
# the resulting 'unusedN' numbering is reproducible. Unused entries whose
# volume was not scanned are ignored.
#
# Returns 1 if at least one entry was added, undef otherwise.
sub update_unused {
    my ($vmid, $conf, $all_volumes) = @_;

    my $changes;
    my $prefix = "CT $vmid:";

    # Note: it is allowed to define multiple storage entries with the same path
    # (alias), so we need to check both 'volid' and real 'path' (two different
    # volid can point to the same path).

    # used and unused disks
    my $refpath = {};
    my $orphans = {};

    foreach my $opt (sort keys %$conf) {
        next if ($opt !~ m/^unused\d+$/);
        my $vol = $all_volumes->{$conf->{$opt}};
        # the volume may be gone or live on a storage that was not scanned
        next if !$vol;
        $refpath->{$vol->{path}} = $vol->{volid};
    }

    foreach my $key (sort keys %$all_volumes) {
        my $vol = $all_volumes->{$key};
        my $in_use = PVE::LXC::Config->is_volume_in_use($conf, $vol->{volid});
        my $path = $vol->{path};

        if ($in_use) {
            $refpath->{$path} = $key;
            delete $orphans->{$path};
        } else {
            if ((!$orphans->{$path}) && (!$refpath->{$path})) {
                $orphans->{$path} = $key;
            }
        }
    }

    for my $key (sort keys %$orphans) {
        my $disk = $orphans->{$key};
        my $unused = PVE::LXC::Config->add_unused_volume($conf, $disk);

        if ($unused) {
            $changes = 1;
            print "$prefix add unreferenced volume '$disk' as '$unused' to config.\n";
        }
    }

    return $changes;
}
1727
# Collect all volumes the storage layer lists for $vmid.
#
# Returns a hash reference mapping volume id to the item returned by
# PVE::Storage::vdisk_list(), extended by a 'path' entry. Items without a
# volume id or with a false size are left out.
sub scan_volids {
    my ($cfg, $vmid) = @_;

    my $per_storage = PVE::Storage::vdisk_list($cfg, undef, $vmid);

    my %volume_for;
    for my $items (values %$per_storage) {
        for my $item (@$items) {
            my $volid = $item->{volid};
            if ($volid && $item->{size}) {
                $item->{path} = PVE::Storage::path($cfg, $volid);
                $volume_for{$volid} = $item;
            }
        }
    }

    return \%volume_for;
}
1745
# Rescan the storages and bring the container configuration up to date.
#
# Unreferenced volumes are added as unused entries and recorded volume
# sizes are corrected. With a defined $vmid only that container is
# processed, otherwise every container returned by config_list(). Each
# update runs under the container's config lock unless $nolock is set.
# Changes are written back unless $dryrun is set.
sub rescan {
    my ($vmid, $nolock, $dryrun) = @_;

    my $cfg = PVE::Storage::config();

    # FIXME: Remove once our RBD plugin can handle CT and VM on a single storage
    # see: https://pve.proxmox.com/pipermail/pve-devel/2018-July/032900.html
    for my $storeid (keys %{$cfg->{ids}}) {
        next if $cfg->{ids}->{$storeid}->{content}->{rootdir};
        delete($cfg->{ids}->{$storeid});
    }

    print "rescan volumes...\n";
    my $all_volumes = scan_volids($cfg, $vmid);

    my $update_one = sub {
        my ($id) = @_;

        my $conf = PVE::LXC::Config->load_config($id);

        PVE::LXC::Config->check_lock($conf);

        # only look at the volumes owned by this container
        my %own_volumes;
        for my $volid (keys %$all_volumes) {
            my $info = $all_volumes->{$volid};
            next if !($info->{vmid} == $id);
            $own_volumes{$volid} = $info;
        }

        my $unused_changed = update_unused($id, $conf, \%own_volumes);
        my $size_changed = update_disksize($id, $conf, \%own_volumes);

        PVE::LXC::Config->write_config($id, $conf)
            if ($unused_changed || $size_changed) && !$dryrun;
    };

    my $process = sub {
        my ($id) = @_;
        return $update_one->($id) if $nolock;
        return PVE::LXC::Config->lock_config($id, $update_one, $id);
    };

    if (defined($vmid)) {
        $process->($vmid);
    } else {
        my $vmlist = config_list();
        $process->($_) for keys %$vmlist;
    }
}
1798
1799
68e8f3c5
DM
1800# bash completion helper
1801
# Bash completion helper: list the volume ids usable as template argument.
#
# For the 'restore' command backups are listed, for everything else
# container templates. If the value typed so far starts with "<storage>:",
# only that storage is queried. Returns an array reference of volume ids.
sub complete_os_templates {
    my ($cmdname, $pname, $cvalue) = @_;

    my $cfg = PVE::Storage::config();

    # stays undef unless a storage prefix was typed
    my ($storeid) = $cvalue =~ m/^([^:]+):/;

    my $vtype = 'vztmpl';
    $vtype = 'backup' if $cmdname eq 'restore';

    my $data = PVE::Storage::template_list($cfg, $storeid, $vtype);

    my @volids;
    for my $items (values %$data) {
        for my $item (@$items) {
            my $volid = $item->{volid};
            push @volids, $volid if defined($volid);
        }
    }

    return \@volids;
}
1825
68e8f3c5
DM
# Shared worker of the container id completion helpers.
#
# $running undefined - return every id known to vmstatus()
# $running true      - only non-template containers that are active
# $running false     - only non-template containers that are not active
# Returns an array reference of container ids.
my $complete_ctid_full = sub {
    my ($running) = @_;

    my $idlist = vmstatus();

    my $active_hash = list_active_containers();

    my @ids = grep {
        my $id = $_;
        my $status = $idlist->{$id};
        my $keep = 1;
        if (defined($running)) {
            my $is_active = $active_hash->{$id};
            $keep = 0 if $status->{template};
            $keep = 0 if $running && !$is_active;
            $keep = 0 if !$running && $is_active;
        }
        $keep;
    } keys %$idlist;

    return \@ids;
};
1847
# Completion helper: all container ids.
sub complete_ctid {
    return $complete_ctid_full->();
}

# Completion helper: ids of containers that are not running (no templates).
sub complete_ctid_stopped {
    return $complete_ctid_full->(0);
}

# Completion helper: ids of running containers (no templates).
sub complete_ctid_running {
    return $complete_ctid_full->(1);
}
1859
c6a605f9
WB
# Extract the uid/gid mappings from the raw lxc entries of a configuration.
#
# Every 'lxc.idmap' (or legacy 'lxc.id_map') entry must have the form
# "<u|g> <container id> <host id> <length>". Without any such entry,
# unprivileged containers get the default mapping 0 -> 100000 (65536 ids).
#
# Returns ($id_map, $rootuid, $rootgid): an array reference of
# [type, ct, host, length] tuples and the host ids container id 0 maps to
# (0 if unmapped). Dies on an entry that cannot be parsed.
sub parse_id_maps {
    my ($conf) = @_;

    my @maps;
    my %root_id = (u => 0, g => 0);

    for my $entry (@{ $conf->{lxc} // [] }) {
        my ($key, $value) = @$entry;
        # FIXME: remove the 'id_map' variant when lxc-3.0 arrives
        next unless $key eq 'lxc.idmap' || $key eq 'lxc.id_map';

        my @fields = $value =~ /^([ug])\s+(\d+)\s+(\d+)\s+(\d+)\s*$/
            or die "failed to parse idmap: $value\n";
        push @maps, [@fields];

        my ($type, $ct, $host) = @fields;
        $root_id{$type} = $host if $ct == 0;
    }

    if (!@maps && $conf->{unprivileged}) {
        # Should we read them from /etc/subuid?
        @maps = (['u', '0', '100000', '65536'],
                 ['g', '0', '100000', '65536']);
        @root_id{qw(u g)} = (100000, 100000);
    }

    return (\@maps, $root_id{u}, $root_id{g});
}
1893
01dce99b
WB
# Build the command prefix that runs a command inside a user namespace.
#
# $id_map is an array reference of [type, ct, host, length] tuples. Returns
# ['lxc-usernsexec', '-m', 'type:ct:host:length', ..., '--'], or an empty
# array reference when there are no mappings.
sub userns_command {
    my ($id_map) = @_;

    return [] if !@$id_map;

    my @cmd = ('lxc-usernsexec');
    for my $mapping (@$id_map) {
        push @cmd, '-m', join(':', @$mapping);
    }
    push @cmd, '--';

    return \@cmd;
}
1901
6725e93c
AA
# Start container $vmid through its systemd unit.
#
# The lxc configuration is regenerated from $conf first. When $skiplock is
# true, an empty flag file /run/lxc/skiplock-$vmid is created before the unit
# is started. If starting the unit fails, the flag file is removed again and
# the error is re-thrown.
sub vm_start {
    my ($vmid, $conf, $skiplock) = @_;

    update_lxc_config($vmid, $conf);

    my $flag_file = "/run/lxc/skiplock-$vmid";

    if ($skiplock) {
        open(my $flag_fh, '>', $flag_file)
            or die "failed to open $flag_file for writing: $!\n";
        close($flag_fh);
    }

    my @start_cmd = ('systemctl', 'start', "pve-container\@$vmid");

    eval { PVE::Tools::run_command(\@start_cmd); };
    if (my $err = $@) {
        # Do not leave a stale flag behind for a container that never started.
        unlink $flag_file;
        die $err;
    }

    return;
}
1924
b1bad293
WB
# Stop a container and wait until it is really gone.
#
# lxc reports the STOPPED state before its post-stop hook has run, but callers
# want that hook's unmount-all step to be finished. So instead of trusting the
# state, a connection to the container's command socket is opened up front and
# we wait for it to hit end-of-file after issuing the stop.
#
#   $vmid             - container ID
#   $kill             - if true, use `lxc-stop --kill`
#   $shutdown_timeout - optional timeout (seconds) passed to `lxc-stop --timeout`
#   $exit_timeout     - optional timeout (seconds) for the final wait
#
# Returns nothing when the container is not running or has stopped; dies with
# "container did not stop" if data is read from the socket instead of EOF.
sub vm_stop {
    my ($vmid, $kill, $shutdown_timeout, $exit_timeout) = @_;

    # Connect to the command socket; the leading NUL byte in the peer name
    # addresses the abstract socket namespace (see unix(7)).
    my $monitor = IO::Socket::UNIX->new(
        Type => SOCK_STREAM(),
        Peer => "\0/var/lib/lxc/$vmid/command",
    );
    if (!$monitor) {
        return if $! == ECONNREFUSED; # The container is not running
        die "failed to open container ${vmid}'s command socket: $!\n";
    }

    # Assemble and run the stop command.
    my @stop_cmd = ('lxc-stop', '-n', $vmid);
    if ($kill) {
        push @stop_cmd, '--kill'; # doesn't allow timeouts
    } elsif (defined($shutdown_timeout)) {
        push @stop_cmd, '--timeout', $shutdown_timeout;
        # Give run_command 5 extra seconds
        $shutdown_timeout += 5;
    }

    # A failing lxc-stop is only reported; the wait below decides the outcome.
    eval { PVE::Tools::run_command(\@stop_cmd, timeout => $shutdown_timeout) };
    warn $@ if $@;

    # Block until something can be read from the socket or it is closed.
    my $line = 1;
    my $read_monitor = sub { $line = <$monitor>; };
    if (defined($exit_timeout)) {
        PVE::Tools::run_with_timeout($exit_timeout, $read_monitor);
    } else {
        $read_monitor->();
    }

    # EOF (undef) means the monitor is gone and the ct has stopped.
    die "container did not stop\n" if defined($line);
    return;
}
846a66b0 1971
00501642
WB
# Run $code in a forked child that has its own mount namespace.
#
# The child unshares the mount namespace, remounts '/' with --make-rslave and
# then calls $code. Whatever PVE::Tools::run_fork returns is handed back to
# the caller.
sub run_unshared {
    my ($code) = @_;

    my $child = sub {
        # Unshare the mount namespace
        PVE::Tools::unshare(PVE::Tools::CLONE_NEWNS)
            or die "failed to unshare mount namespace: $!\n";
        PVE::Tools::run_command(['mount', '--make-rslave', '/']);
        return $code->();
    };

    return PVE::Tools::run_fork($child);
}
1983
# Mount a source and a destination volume and rsync the contents across.
#
#   $src_volid / $dst_volid - volume IDs to copy from / to
#   $src / $dest            - directories used as temporary mount points
#   $storage_cfg            - storage configuration passed to mountpoint_mount
#   $snapname               - optional snapshot name, used for the source only
#
# The source is mounted with ro => 1. Everything that got mounted is lazily
# unmounted again and both directories are removed, whether or not the copy
# succeeded; a copy error is re-thrown afterwards.
my $copy_volume = sub {
    my ($src_volid, $src, $dst_volid, $dest, $storage_cfg, $snapname) = @_;

    my $src_mp = { volume => $src_volid, mp => '/', ro => 1 };
    $src_mp->{type} = PVE::LXC::Config->classify_mountpoint($src_volid);

    my $dst_mp = { volume => $dst_volid, mp => '/', ro => 0 };
    $dst_mp->{type} = PVE::LXC::Config->classify_mountpoint($dst_volid);

    my @rsync_cmd = (
        '/usr/bin/rsync', '--stats', '-X', '-A', '--numeric-ids',
        '-aH', '--whole-file', '--sparse', '--one-file-system',
        "$src/", $dest,
    );

    # Track successful mounts so that only those get unmounted later.
    my @mounted;
    eval {
        # mount and copy
        mkdir $src;
        mountpoint_mount($src_mp, $src, $storage_cfg, $snapname);
        push @mounted, $src;

        mkdir $dest;
        mountpoint_mount($dst_mp, $dest, $storage_cfg);
        push @mounted, $dest;

        PVE::Tools::run_command(\@rsync_cmd);
    };
    my $err = $@;

    # Unmount in reverse order of mounting; failures are only warned about.
    foreach my $mount (reverse @mounted) {
        eval {
            PVE::Tools::run_command(['/bin/umount', '--lazy', $mount], errfunc => sub {});
        };
        warn "Can't umount $mount\n" if $@;
    }

    # If this fails they're used as mount points in a concurrent operation
    # (which should not happen but there's also no real need to get rid of them).
    rmdir $dest;
    rmdir $src;

    die $err if $err;
};
2020
# Copy the contents of a volume mount point onto a freshly allocated volume.
#
# Should not be called after unsharing the mount namespace!
#
#   $mp          - mount point hash; must have type 'volume'. Its {size} entry
#                  is overwritten with the value from volume_size_info().
#   $vmid        - container ID (used for the temporary mount directories and
#                  passed to alloc_disk)
#   $storage     - target storage passed to alloc_disk
#   $storage_cfg - storage configuration
#   $conf        - container configuration, used to look up the root uid/gid
#                  mapping via parse_id_maps()
#   $snapname    - optional snapshot of the source volume to copy from
#
# Returns the volume ID of the new volume. On any failure the newly allocated
# volume (if one was allocated) is freed again and the original error is
# re-thrown; a failure of that cleanup is reported as a warning only, so it
# cannot hide the error that caused it.
sub copy_volume {
    my ($mp, $vmid, $storage, $storage_cfg, $conf, $snapname) = @_;

    die "cannot copy volumes of type $mp->{type}\n" if $mp->{type} ne 'volume';
    File::Path::make_path("/var/lib/lxc/$vmid");
    my $dest = "/var/lib/lxc/$vmid/.copy-volume-1";
    my $src = "/var/lib/lxc/$vmid/.copy-volume-2";

    # get id's for unprivileged container
    my (undef, $rootuid, $rootgid) = parse_id_maps($conf);

    # Allocate the disk before unsharing in order to make sure zfs subvolumes
    # are visible in this namespace, otherwise the host only sees the empty
    # (not-mounted) directory.
    my $new_volid;
    eval {
        # Make sure $mp contains a correct size.
        $mp->{size} = PVE::Storage::volume_size_info($storage_cfg, $mp->{volume});
        my $needs_chown;
        ($new_volid, $needs_chown) = alloc_disk($storage_cfg, $vmid, $storage, $mp->{size}/1024, $rootuid, $rootgid);
        if ($needs_chown) {
            PVE::Storage::activate_volumes($storage_cfg, [$new_volid], undef);
            my $path = PVE::Storage::path($storage_cfg, $new_volid, undef);
            chown($rootuid, $rootgid, $path);
        }

        run_unshared(sub {
            $copy_volume->($mp->{volume}, $src, $new_volid, $dest, $storage_cfg, $snapname);
        });
    };
    if (my $err = $@) {
        if (defined($new_volid)) {
            # Best-effort cleanup: if freeing the volume dies as well, only
            # warn, otherwise that exception would replace the real cause.
            eval { PVE::Storage::vdisk_free($storage_cfg, $new_volid) };
            warn "failed to remove volume '$new_volid' after copy error: $@" if $@;
        }
        die $err;
    }

    return $new_volid;
}
2060
f76a2828 20611;