]> git.proxmox.com Git - pve-container.git/blame - src/PVE/LXC/CGroup.pm
cgroup: add freeze_thaw implementation
[pve-container.git] / src / PVE / LXC / CGroup.pm
CommitLineData
80c7e72f
WB
# cgroup handler
#
# This package should deal with figuring out the right cgroup path for a
# container (via the command socket), reading and writing cgroup values, and
# handling cgroup v1 & v2 differences.
#
# Note that the long term plan is to have resource management functions
# instead of dealing with cgroup files on the outside.
9
10package PVE::LXC::CGroup;
11
12use strict;
13use warnings;
14
57f6f9f5
WB
15use IO::File;
16use IO::Select;
a7d10aa3
WB
17use POSIX qw();
18
397b1203 19use PVE::ProcFSTools;
acb5fabf
WB
20use PVE::Tools qw(
21 file_get_contents
22 file_read_firstline
23);
24
80c7e72f
WB
25use PVE::LXC::Command;
26
# Resolving a container's cgroup path needs a round trip over the command
# socket, which we do not want to pay for every cgroup read/write. The paths
# are therefore cached on the object, so this package has to be instantiated.
#
# LXC keeps separate paths by controller (although they're normally all the
# same, in our case anyway), so the cache is keyed by controller as well.
#
# Parameters:
#   $class - the class to bless into
#   $vmid  - the container ID; stored as `vmid` in the object
sub new {
    my ($class, $vmid) = @_;

    return bless { vmid => $vmid }, $class;
}
40
# Get the v1 controller list.
#
# Scans `/proc/self/cgroup`: an entry whose controller field is non-empty
# contributes each of its comma separated names to the v1 set, an entry with
# an empty controller field marks the presence of a unified (cgroupv2)
# hierarchy.
#
# Returns a set (hash mapping names to `1`) of cgroupv1 controllers and, in
# list context, additionally a boolean telling whether a unified hierarchy
# exists.
#
# Deprecated: Use `get_cgroup_controllers()` instead.
sub get_v1_controllers {
    my %v1_set;
    my $has_unified = 0;

    my $content = PVE::Tools::file_get_contents('/proc/self/cgroup');
    while ($content =~ /^\d+:([^:\n]*):.*$/gm) {
        my $names = $1;
        if (!length($names)) {
            $has_unified = 1;
            next;
        }
        $v1_set{$_} = 1 for split(/,/, $names);
    }

    return (\%v1_set, $has_unified) if wantarray;
    return \%v1_set;
}
61
# Get the set of v2 controllers from the `cgroup.controllers` file.
#
# The file is looked up at the cgroup root first and below the `unified`
# mount point second; read errors are swallowed by `eval`.
#
# Returns a set (hash mapping names to `1`), or `undef` if no file could be
# read.
my sub get_v2_controllers {
    my $list;
    for my $file (
        '/sys/fs/cgroup/cgroup.controllers',
        '/sys/fs/cgroup/unified/cgroup.controllers',
    ) {
        $list = eval { file_get_contents($file) };
        last if $list;
    }
    return undef if !defined($list);

    # The file content is a simple whitespace separated list of names.
    my %set = map { $_ => 1 } split(/\s+/, $list);
    return \%set;
}
71
# Cache for `get_cgroup_controllers()`: array reference `[$v1_set, $v2_set]`.
my $CGROUP_CONTROLLERS = undef;

# Get a list of controllers enabled in each cgroup subsystem.
#
# This is a more complete version of `PVE::LXC::get_cgroup_subsystems`.
#
# The controllers are only queried on the first call, afterwards the cached
# result is returned.
#
# Returns 2 sets (hashes mapping controller names to `1`), one for each cgroup
# version. The second one is `undef` if no cgroupv2 controller list could be
# read.
sub get_cgroup_controllers() {
    $CGROUP_CONTROLLERS //= do {
        my ($v1) = get_v1_controllers();
        my $v2 = get_v2_controllers();
        [$v1, $v2];
    };

    return $CGROUP_CONTROLLERS->@*;
}
89
# Cache for `cgroup_mode()`.
my $CGROUP_MODE = undef;

# Figure out which cgroup mode we're operating under:
#
# Returns 1 if cgroupv1 controllers exist (hybrid or legacy mode), and 2 in a
# cgroupv2-only environment. Dies if neither can be determined.
#
# NOTE: To fully support a hybrid layout it is better to use functions like
# `cpuset_controller_path`.
#
# This is a function, not a method!
sub cgroup_mode() {
    return $CGROUP_MODE if defined($CGROUP_MODE);

    my ($v1, $v2) = get_cgroup_controllers();

    # Any v1 controller means hybrid or legacy mode, which takes precedence.
    my $has_v1 = scalar(keys %$v1);
    $CGROUP_MODE = 1 if $has_v1;
    $CGROUP_MODE = 2 if !$has_v1 && $v2;

    die "unknown cgroup mode\n" if !defined($CGROUP_MODE);
    return $CGROUP_MODE;
}
114
fb57c905
WB
# Cache for `cgroupv2_base_path()`.
my $CGROUPV2_PATH = undef;

# Get the mount point of the unified (cgroupv2) hierarchy.
#
# In cgroupv2-only mode this is `/sys/fs/cgroup`, otherwise the `unified`
# directory below it. The result is cached after the first call.
#
# This is a function, not a method!
sub cgroupv2_base_path() {
    $CGROUPV2_PATH //= (cgroup_mode() == 2)
        ? '/sys/fs/cgroup'
        : '/sys/fs/cgroup/unified';

    return $CGROUPV2_PATH;
}
126
1f37e0d2
WB
# Find a cgroup controller and return its path and version.
#
# LXC initializes the unified hierarchy first, so if a controller is
# available via both we favor cgroupv2 here as well.
#
# An undefined `$controller` selects the unified (cgroupv2) hierarchy.
#
# Returns the path in scalar context and `($path, $version)` in list context.
# Returns nothing if the controller is not available.
sub find_cgroup_controller($) {
    my ($controller) = @_;

    my ($v1, $v2) = get_cgroup_controllers();

    my ($path, $ver);
    if (!defined($controller) || $v2->{$controller}) {
        $path = cgroupv2_base_path();
        $ver = 2;
    } elsif ($v1->{$controller}) {
        $path = "/sys/fs/cgroup/$controller";
        $ver = 1;
    } else {
        return;
    }

    return wantarray ? ($path, $ver) : $path;
}
150
# Cached result of the cpuset controller lookup (path and cgroup version).
my $CG_PATH_CPUSET = undef;
my $CG_VER_CPUSET = undef;

# Find the cpuset cgroup controller.
#
# Returns the path in scalar context and `($path, $version)` in list context.
# Dies if there is no cpuset controller.
#
# This is a function, not a method!
sub cpuset_controller_path() {
    if (!defined($CG_PATH_CPUSET)) {
        my @found = find_cgroup_controller('cpuset');
        die "failed to find cpuset controller\n" if !@found;
        ($CG_PATH_CPUSET, $CG_VER_CPUSET) = @found;
    }

    return ($CG_PATH_CPUSET, $CG_VER_CPUSET) if wantarray;
    return $CG_PATH_CPUSET;
}
164
80c7e72f
WB
# Get a subdirectory (without the cgroup mount point) for a controller.
#
# If `$controller` is `undef`, get the unified (cgroupv2) path.
#
# Note that in cgroup v2, lxc uses the activated controller names
# (`cgroup.controllers` file) as list of controllers for the unified hierarchy,
# so this returns a result when a `controller` is provided even when using
# a pure cgroupv2 setup.
#
# Results are cached in `$self->{controllers}`, separately per controller and
# per kind (`limit` if `$limiting` is true, `ns` otherwise).
#
# Returns `undef` if lxc provides no path, dies if the path contains `..`.
my sub get_subdir {
    my ($self, $controller, $limiting) = @_;

    my $cache_key = $controller || 'unified';
    my $kind = $limiting ? 'limit' : 'ns';

    my $cache = ($self->{controllers}->{$cache_key} //= {});
    return $cache->{$kind} if defined($cache->{$kind});

    my $subdir = PVE::LXC::Command::get_cgroup_path($self->{vmid}, $controller, $limiting);
    return undef if !$subdir;

    die "lxc returned suspicious path: '$subdir'\n" if $subdir =~ /\.\./;

    # untaint:
    ($subdir) = ($subdir =~ /^(.*)$/s);

    $cache->{$kind} = $subdir;
    return $subdir;
}
200
# Get path and version for a controller.
#
# `$controller` may be `undef`, in which case the unified (cgroupv2) path is
# used. `$limiting` selects the limiting cgroup instead of the namespace one.
#
# Returns either just the path, or the path and cgroup version as a tuple.
# Returns `undef` if the controller is not available or lxc provides no path.
sub get_path {
    my ($self, $controller, $limiting) = @_;

    # Find the controller before querying the lxc monitor via a socket:
    my @found = find_cgroup_controller($controller);
    return undef if !@found;
    my ($base, $ver) = @found;

    my $subdir = get_subdir($self, $controller, $limiting);
    return undef if !$subdir;

    my $path = "$base/$subdir";
    return ($path, $ver) if wantarray;
    return $path;
}
219
# Convenience method to get the path info of the first existing controller.
#
# The `@controllers` are tried in order via `get_path`; the first one yielding
# a defined path wins.
#
# Returns the same as `get_path`, with undefined values if none matched.
sub get_any_path {
    my ($self, $limiting, @controllers) = @_;

    for my $name (@controllers) {
        my ($path, $ver) = $self->get_path($name, $limiting);
        next if !defined($path);
        return wantarray ? ($path, $ver) : $path;
    }

    return wantarray ? (undef, undef) : undef;
}
233
acb5fabf
WB
# Parse a 'Nested keyed' file:
#
# See kernel documentation `admin-guide/cgroup-v2.rst` 4.1.
#
# Each line consists of a key followed by whitespace separated `name=value`
# pairs. Returns a hash mapping each line's key to a hash of its pairs.
# Entries which are not of the form `name=value` produce a warning and are
# skipped.
my sub parse_nested_keyed_file($) {
    my ($data) = @_;

    my %parsed;
    for my $line (split(/\n/, $data)) {
        my ($name, @pairs) = split(/\s+/, $line);

        my $fields = $parsed{$name} = {};
        for my $pair (@pairs) {
            if ($pair =~ /^([^=]+)=(.*)$/) {
                $fields->{$1} = $2;
            } else {
                warn "bad key=value pair in nested keyed file\n";
            }
        }
    }

    return \%parsed;
}
255
a7d10aa3
WB
# Parse a 'Flat keyed' file:
#
# See kernel documentation `admin-guide/cgroup-v2.rst` 4.1.
#
# Each line consists of a key, whitespace and the value (the rest of the
# line). Returns a hash mapping keys to values; lines not matching that shape
# produce a warning and are skipped.
my sub parse_flat_keyed_file($) {
    my ($data) = @_;

    my %parsed;
    for my $line (split(/\n/, $data)) {
        if ($line =~ /^(\S+)\s+(.*)$/) {
            $parsed{$1} = $2;
            next;
        }
        warn "bad 'key value' pair in flat keyed file\n";
    }

    return \%parsed;
}
271
# Parse out 'diskread' and 'diskwrite' values from I/O stats for this container.
#
# With cgroupv2 the `rbytes`/`wbytes` fields of all devices listed in `io.stat`
# are summed up, with cgroupv1 the `Read`/`Write` lines of
# `blkio.throttle.io_service_bytes_recursive`.
#
# Returns a hash with the keys `diskread` and `diskwrite` (in bytes), or
# `undef` if no cgroup path could be found (container not running).
# Dies on an unexpected cgroup version.
sub get_io_stats {
    my ($self) = @_;

    my $res = {
        diskread => 0,
        diskwrite => 0,
    };

    # With cgroupv1 we have a 'blkio' controller, with cgroupv2 it's just 'io':
    my ($path, $ver) = $self->get_any_path(1, 'io', 'blkio');

    # container not running
    return undef if !defined($path);

    if ($ver == 2) {
        # cgroupv2 environment, io controller enabled
        my $data = parse_nested_keyed_file(file_get_contents("$path/io.stat"));
        for my $dev (values %$data) {
            $res->{diskread} += $dev->{rbytes} if $dev->{rbytes};
            # Written bytes belong to 'diskwrite', not 'diskread'.
            $res->{diskwrite} += $dev->{wbytes} if $dev->{wbytes};
        }
    } elsif ($ver == 1) {
        # cgroupv1 environment:
        my $io = file_get_contents("$path/blkio.throttle.io_service_bytes_recursive");
        for my $line (split(/\n/, $io)) {
            if (my ($type, $bytes) = ($line =~ /^\d+:\d+\s+(Read|Write)\s+(\d+)$/)) {
                $res->{diskread} += $bytes if $type eq 'Read';
                $res->{diskwrite} += $bytes if $type eq 'Write';
            }
        }
    } else {
        die "bad cgroup version: $ver\n";
    }

    return $res;
}
320
a7d10aa3
WB
# Read utime and stime for this container from the cpu accounting cgroup.
# Values are in milliseconds!
#
# With cgroupv2 the `user_usec`/`system_usec` values of `cpu.stat` are
# converted from microseconds, with cgroupv1 the `user`/`system` values of
# `cpuacct.stat` are converted from clock ticks.
#
# Returns a hash with the keys `utime` and `stime`, or `undef` if no cgroup
# path could be found or (cgroupv2 only) `cpu.stat` could not be read.
# Dies on an unexpected cgroup version.
sub get_cpu_stat {
    my ($self) = @_;

    my ($path, $ver) = $self->get_any_path(1, 'cpuacct', 'cpu');

    # container not running
    return undef if !defined($path);

    my ($utime, $stime);
    if ($ver == 2) {
        my $raw = eval { file_get_contents("$path/cpu.stat") };

        # or no cpu controller available:
        return undef if !defined($raw);

        my $stat = parse_flat_keyed_file($raw);
        $utime = int($stat->{user_usec} / 1000);
        $stime = int($stat->{system_usec} / 1000);
    } elsif ($ver == 1) {
        # cgroupv1 environment, values are in clock ticks:
        my $ms_per_tick = 1000 / POSIX::sysconf(POSIX::_SC_CLK_TCK());

        my $stat = parse_flat_keyed_file(file_get_contents("$path/cpuacct.stat"));
        $utime = int($stat->{user} * $ms_per_tick);
        $stime = int($stat->{system} * $ms_per_tick);
    } else {
        die "bad cgroup version: $ver\n";
    }

    return {
        utime => $utime,
        stime => $stime,
    };
}
358
8b882cd4
WB
# Read the memory and swap usage of this container.
#
# With cgroupv2 the values come straight from `memory.current` and
# `memory.swap.current`. With cgroupv1 `mem` is `memory.usage_in_bytes` minus
# the `total_cache` value of `memory.stat`, and `swap` is the difference
# between `memory.memsw.usage_in_bytes` and `memory.usage_in_bytes`.
#
# Returns a hash with the keys `mem` and `swap`, or `undef` if no cgroup path
# could be found. Dies on an unexpected cgroup version.
sub get_memory_stat {
    my ($self) = @_;

    my ($path, $ver) = $self->get_path('memory', 1);

    # container most likely isn't running
    return undef if !defined($path);

    # Read a single-value file below the container's memory cgroup.
    my $read_value = sub {
        my ($file) = @_;
        my $value = file_get_contents("$path/$file");
        chomp $value;
        return $value;
    };

    my ($used, $swapped);
    if ($ver == 2) {
        # FIXME: For the cgv1 equivalent of `total_cache` we may need to sum up
        # the values in `memory.stat`...
        $used = $read_value->('memory.current');
        $swapped = $read_value->('memory.swap.current');
    } elsif ($ver == 1) {
        # cgroupv1 environment:
        my $stat = parse_flat_keyed_file(file_get_contents("$path/memory.stat"));
        my $usage = $read_value->('memory.usage_in_bytes');
        my $usage_memsw = $read_value->('memory.memsw.usage_in_bytes');

        $used = $usage - $stat->{total_cache};
        $swapped = $usage_memsw - $usage;
    } else {
        die "bad cgroup version: $ver\n";
    }

    return {
        mem => $used,
        swap => $swapped,
    };
}
398
397b1203
WB
# Change the memory limit for this container.
#
# `$mem_bytes` and `$swap_bytes` may each be `undef` to leave the respective
# limit alone (with cgroupv1 an undefined `$swap_bytes` keeps the current
# combined memory+swap limit).
#
# Returns 1. Dies on error (including a not-running or currently-shutting-down
# guest).
sub change_memory_limit {
    my ($self, $mem_bytes, $swap_bytes) = @_;

    my ($path, $ver) = $self->get_path('memory', 1);
    die "trying to change memory cgroup values: container not running\n"
        if !defined($path);

    if ($ver == 2) {
        if (defined($swap_bytes)) {
            PVE::ProcFSTools::write_proc_entry("$path/memory.swap.max", $swap_bytes);
        }
        if (defined($mem_bytes)) {
            PVE::ProcFSTools::write_proc_entry("$path/memory.max", $mem_bytes);
        }
        return 1;
    }

    die "bad cgroup version: $ver\n" if $ver != 1;

    # With cgroupv1 we cannot control memory and swap limits separately.
    # This also means that since the two values aren't independent, we need to
    # handle growing and shrinking separately.
    my $mem_file = "$path/memory.limit_in_bytes";
    my $memsw_file = "$path/memory.memsw.limit_in_bytes";

    my $old_mem = file_get_contents($mem_file);
    my $old_memsw = file_get_contents($memsw_file);
    chomp($old_mem, $old_memsw);

    $mem_bytes //= $old_mem;
    my $memsw_bytes = defined($swap_bytes) ? ($mem_bytes + $swap_bytes) : $old_memsw;

    # Growing the limit means growing the combined limit first, then pulling
    # the memory limit up. Shrinking means we first need to shrink the mem-only
    # limit, as memsw cannot be shrunk below it.
    my @writes = ($memsw_bytes > $old_memsw)
        ? ([$memsw_file, $memsw_bytes], [$mem_file, $mem_bytes])
        : ([$mem_file, $mem_bytes], [$memsw_file, $memsw_bytes]);

    for my $write (@writes) {
        PVE::ProcFSTools::write_proc_entry($write->[0], $write->[1]);
    }

    # return a truth value
    return 1;
}
445
26b645e2
WB
# Change the cpu quota for a container.
#
# `$quota` and `$period` are passed on to the kernel as-is. An undefined
# `$quota` means "unlimited"; an undefined `$period` selects the default
# period. Passing a `$quota` without a `$period` is an error.
#
# Returns 1. Dies on error (including a not-running or currently-shutting-down
# guest).
sub change_cpu_quota {
    my ($self, $quota, $period) = @_;

    die "quota without period not allowed\n" if !defined($period) && defined($quota);

    my ($path, $ver) = $self->get_path('cpu', 1);
    if (!defined($path)) {
        die "trying to change cpu quota cgroup values: container not running\n";
    } elsif ($ver == 2) {
        # cgroupv2 environment, an undefined (unlimited) quota is defined as "max"
        # in this interface:
        $quota //= 'max'; # unlimited

        # We're allowed to only write the quota, so leave the period out if
        # there is none rather than interpolating an undefined value.
        my $value = defined($period) ? "$quota $period" : $quota;
        PVE::ProcFSTools::write_proc_entry("$path/cpu.max", $value);
    } elsif ($ver == 1) {
        $quota //= -1; # unlimited
        # There is no "unset" value for the period, -1 is rejected by the
        # kernel. Fall back to the kernel's default of 100ms instead.
        $period //= 100_000;
        PVE::ProcFSTools::write_proc_entry("$path/cpu.cfs_period_us", $period);
        PVE::ProcFSTools::write_proc_entry("$path/cpu.cfs_quota_us", $quota);
    } else {
        die "bad cgroup version: $ver\n";
    }

    # return a truth value
    return 1;
}
479
# Change the cpu "shares" for a container.
#
# In cgroupv1 we used a value in `[0..500000]` with a default of 1024.
#
# In cgroupv2 we do not have "shares", we have "weights" in the range
# of `[1..10000]` with a default of 100.
#
# Since the default values don't match when scaling linearly, we use the
# values we get as-is and simply error for values >10000 in cgroupv2.
#
# It is left to the user to figure this out for now.
#
# Parameters:
#   $shares           - the value to set, `undef` selects the default
#   $cgroupv1_default - default to use with cgroupv1 if `$shares` is
#                       undefined; falls back to 1024 if undefined as well
#
# Returns 1. Dies on error (including a not-running or currently-shutting-down
# guest).
sub change_cpu_shares {
    my ($self, $shares, $cgroupv1_default) = @_;

    my ($path, $ver) = $self->get_path('cpu', 1);
    if (!defined($path)) {
        die "trying to change cpu shares/weight cgroup values: container not running\n";
    } elsif ($ver == 2) {
        # the cgroupv2 documentation defines the default to 100
        $shares //= 100;
        die "cpu weight (shares) must be in range [1, 10000]\n" if $shares < 1 || $shares > 10000;
        PVE::ProcFSTools::write_proc_entry("$path/cpu.weight", $shares);
    } elsif ($ver == 1) {
        # Do not apply the cgroupv2 default of 100 here, otherwise the
        # caller's cgroupv1 default would never be used.
        $shares //= $cgroupv1_default // 1024;
        PVE::ProcFSTools::write_proc_entry("$path/cpu.shares", $shares);
    } else {
        die "bad cgroup version: $ver\n";
    }

    # return a truth value
    return 1;
}
514
57f6f9f5
WB
# Freeze or thaw a container via the cgroupv1 freezer controller.
#
# Writes `FROZEN` or `THAWED` (depending on `$freeze`) to the `freezer.state`
# file of the container's limiting cgroup below `$controller_path`, then waits
# until the file reports the requested state.
#
# Dies if the cgroup path cannot be determined or on read/write errors.
my sub v1_freeze_thaw {
    my ($self, $controller_path, $freeze) = @_;
    my $path = get_subdir($self, 'freezer', 1)
        or die "trying to freeze container: container not running\n";
    $path = "$controller_path/$path/freezer.state";

    my $data = $freeze ? 'FROZEN' : 'THAWED';
    PVE::ProcFSTools::write_proc_entry($path, $data);

    # Here we just poll the freezer.state once per second.
    while (1) {
        my $state = file_get_contents($path);
        chomp $state;
        last if $state eq $data;
        # Wait between polls instead of busy-looping on the file.
        sleep(1);
    }
}
531
# Freeze or thaw a container via the cgroupv2 `cgroup.freeze` file.
#
# Writes 1 or 0 (depending on `$freeze`) to `cgroup.freeze` of the container's
# limiting cgroup below `$controller_path`, then re-reads `cgroup.events`
# until its `frozen` entry matches the requested state.
#
# Dies if the cgroup path cannot be determined or on I/O errors.
my sub v2_freeze_thaw {
    my ($self, $controller_path, $freeze) = @_;

    my $subdir = get_subdir($self, undef, 1)
        or die "trying to freeze container: container not running\n";
    my $cgroup_dir = "$controller_path/$subdir";

    my $wanted = $freeze ? 1 : 0;

    # cgroupv2 supports poll events on cgroup.events which contains the frozen
    # state. The file is opened before requesting the state change.
    my $events = IO::File->new("$cgroup_dir/cgroup.events", 'r')
        or die "failed to open $cgroup_dir/cgroup.events file: $!\n";
    my $watcher = IO::Select->new($events);

    PVE::ProcFSTools::write_proc_entry("$cgroup_dir/cgroup.freeze", $wanted);

    while (1) {
        my @ready = $watcher->can_read();
        next if !@ready;

        # Read through a fresh duplicate of the handle, rewound to the start.
        open(my $reader, '<&', $events)
            or die "failed to reopen cgroup.events file: $!\n";
        seek($reader, 0, 0)
            or die "failed to rewind cgroup.events file: $!\n";
        my $raw = do { local $/ = undef; <$reader> };

        my $state = parse_flat_keyed_file($raw);
        last if $state->{frozen} == $wanted;
    }
}
563
# Freeze or unfreeze a container.
#
# This will freeze the container at its outer (limiting) cgroup path. We use
# this instead of `lxc-freeze` as `lxc-freeze` from lxc4 will not be able to
# fetch the cgroup path from containers still running on lxc3.
#
# `$freeze` selects freezing (true) or thawing (false).
sub freeze_thaw {
    my ($self, $freeze) = @_;

    my $freezer_path = find_cgroup_controller('freezer');

    return v1_freeze_thaw($self, $freezer_path, $freeze)
        if defined($freezer_path);

    # cgroupv2 always has a freezer, there can be both cgv1 and cgv2
    # freezers, but we'll prefer v1 when it's available as that's what lxc
    # does as well...
    return v2_freeze_thaw($self, cgroupv2_base_path(), $freeze);
}
582
80c7e72f 5831;