]> git.proxmox.com Git - pve-common.git/blame - src/PVE/CGroup.pm
bump version to 8.1.2
[pve-common.git] / src / PVE / CGroup.pm
CommitLineData
86dff11c
AD
1# cgroup handler
2#
3# This package should deal with figuring out the right cgroup path for a
4# container (via the command socket), reading and writing cgroup values, and
5# handling cgroup v1 & v2 differences.
6#
7# Note that the long term plan is to have resource management functions instead of
8# dealing with cgroup files on the outside.
9
10package PVE::CGroup;
11
12use strict;
13use warnings;
14
15use IO::File;
16use IO::Select;
17use POSIX qw();
18
19use PVE::ProcFSTools;
20use PVE::Tools qw(
21 file_get_contents
22 file_read_firstline
23);
24
86dff11c
AD
# Constructor: create a cgroup handle for the guest identified by `$vmid`.
#
# The cgroup functions need the container's path cached to avoid a command
# socket round trip on every read/write, which is why this package has to be
# instantiated. LXC keeps separate paths per controller (normally they are all
# the same in our case), so paths are looked up per controller as well.
#
# Returns a hash reference blessed into `$class` holding only the `vmid`.
sub new {
    my ($class, $vmid) = @_;

    return bless { vmid => $vmid }, $class;
}
38
# Collect the cgroupv1 controllers listed in `/proc/self/cgroup`.
#
# Every line of that file has the form `ID:CONTROLLERS:PATH`. A non-empty,
# comma separated CONTROLLERS field names v1 controllers, an empty one marks a
# unified (cgroupv2) hierarchy entry.
#
# Returns a set (hash mapping names to `1`) of the v1 controllers. In list
# context a boolean telling whether a unified hierarchy entry was seen is
# returned as second value.
my sub get_v1_controllers {
    my %v1_set;
    my $have_unified = 0;

    my $content = PVE::Tools::file_get_contents('/proc/self/cgroup');
    while ($content =~ /^\d+:([^:\n]*):.*$/gm) {
        my $names = $1;
        if ($names eq '') {
            # empty controller field => cgroupv2 entry
            $have_unified = 1;
            next;
        }
        $v1_set{$_} = 1 for split(/,/, $names);
    }

    return (\%v1_set, $have_unified) if wantarray;
    return \%v1_set;
}
57
# Read the set of cgroupv2 controllers from a `cgroup.controllers` file.
#
# The file directly below `/sys/fs/cgroup` is tried first. If it cannot be read
# (or yields an empty value) the one below `/sys/fs/cgroup/unified` is tried.
#
# Returns a set (hash mapping controller names to `1`), or `undef` if no
# controller list could be read.
my sub get_v2_controllers {
    my $list = eval { file_get_contents('/sys/fs/cgroup/cgroup.controllers') };
    $list ||= eval { file_get_contents('/sys/fs/cgroup/unified/cgroup.controllers') };
    return undef if !defined($list);

    # The file content is a simple whitespace separated list of names.
    my %set = map { $_ => 1 } split(/\s+/, $list);
    return \%set;
}
67
# Cache for get_cgroup_controllers(): `[ $v1_set, $v2_set ]` once filled.
my $CGROUP_CONTROLLERS = undef;
# Get the controllers enabled in each cgroup subsystem.
#
# This is a more complete version of `PVE::LXC::get_cgroup_subsystems`.
#
# The lookup is done once and cached for subsequent calls.
#
# Returns 2 sets (hashes mapping controller names to `1`), the first for
# cgroupv1 and the second for cgroupv2. The second one is whatever
# get_v2_controllers() returned and may therefore be `undef`.
sub get_cgroup_controllers() {
    return @$CGROUP_CONTROLLERS if defined($CGROUP_CONTROLLERS);

    # only the controller set is of interest here, not the "has v2" flag
    my ($v1) = get_v1_controllers();
    my $v2 = get_v2_controllers();
    $CGROUP_CONTROLLERS = [$v1, $v2];

    return @$CGROUP_CONTROLLERS;
}
85
# Cache for cgroup_mode().
my $CGROUP_MODE = undef;
# Figure out which cgroup mode we're operating under.
#
# For this we check the file system type of `/sys/fs/cgroup`, as it may well be
# possible that some additional cgroupv1 mount points have been created by
# tools such as `systemd-nspawn`, or manually. The first mount entry for that
# directory decides the result, which is then cached.
#
# Returns 1 for what we consider the hybrid layout, 2 for what we consider the
# unified layout. Dies if no mount entry for `/sys/fs/cgroup` is found.
#
# NOTE: To fully support a hybrid layout it is better to use functions like
# `cpuset_controller_path` and not rely on this value for anything involving
# paths.
#
# This is a function, not a method!
sub cgroup_mode() {
    if (!defined($CGROUP_MODE)) {
        my $mounts = PVE::ProcFSTools::parse_proc_mounts();
        for my $entry (@$mounts) {
            # entries are unpacked as (what, dir, fstype, opts) in the original
            # code; only dir and fstype matter here
            my (undef, $dir, $fstype) = @$entry;
            next if $dir ne '/sys/fs/cgroup';

            $CGROUP_MODE = $fstype eq 'cgroup2' ? 2 : 1;
            last;
        }
    }

    die "unknown cgroup mode\n" if !defined($CGROUP_MODE);
    return $CGROUP_MODE;
}
119
# Cache for cgroupv2_base_path().
my $CGROUPV2_PATH = undef;
# Get the directory used as cgroupv2 base path.
#
# This is `/sys/fs/cgroup` when cgroup_mode() reports mode 2, and
# `/sys/fs/cgroup/unified` otherwise. The result is cached.
#
# This is a function, not a method!
sub cgroupv2_base_path() {
    $CGROUPV2_PATH //= cgroup_mode() == 2 ? '/sys/fs/cgroup' : '/sys/fs/cgroup/unified';
    return $CGROUPV2_PATH;
}
131
# Find a cgroup controller and return its path and version.
#
# LXC initializes the unified hierarchy first, so if a controller is available
# via both cgroup versions we favor cgroupv2 here as well. An undefined
# `$controller` always resolves to the cgroupv2 base path.
#
# Returns the path in scalar context and `(path, version)` in list context.
# Returns nothing if the controller is not available.
sub find_cgroup_controller($) {
    my ($controller) = @_;

    my ($v1, $v2) = get_cgroup_controllers();

    my ($path, $ver);
    if (!defined($controller) || $v2->{$controller}) {
        ($path, $ver) = (cgroupv2_base_path(), 2);
    } elsif ($v1->{$controller}) {
        ($path, $ver) = ("/sys/fs/cgroup/$controller", 1);
    } else {
        return;
    }

    return wantarray ? ($path, $ver) : $path;
}
156
# Caches for cpuset_controller_path(): controller path and cgroup version.
my $CG_PATH_CPUSET = undef;
my $CG_VER_CPUSET = undef;
# Find the cpuset cgroup controller.
#
# The lookup result is cached. Dies if no cpuset controller can be found.
#
# Returns the path in scalar context and `(path, version)` in list context.
#
# This is a function, not a method!
sub cpuset_controller_path() {
    if (!defined($CG_PATH_CPUSET)) {
        my @found = find_cgroup_controller('cpuset');
        die "failed to find cpuset controller\n" if !@found;
        ($CG_PATH_CPUSET, $CG_VER_CPUSET) = @found;
    }

    return ($CG_PATH_CPUSET, $CG_VER_CPUSET) if wantarray;
    return $CG_PATH_CPUSET;
}
170
# Get a subdirectory (without the cgroup mount point) for a controller.
#
# Abstract method: this base implementation always dies, subclasses have to
# provide the real lookup. It is called with `($controller, $limiting)`, and
# callers in this package treat a false return value as "no path available".
sub get_subdir {
    my ($self, $controller, $limiting) = @_;

    die "implement in subclass";
}
177
# Get path and version for a controller.
#
# `$controller` may be `undef`, in which case find_cgroup_controller() resolves
# to the cgroupv2 base path. `$limiting` is passed through to get_subdir()
# unchanged.
#
# Returns either just the path, or the path and cgroup version as a tuple.
# Returns `undef` if the controller is not available or get_subdir() yields no
# subdirectory.
sub get_path {
    my ($self, $controller, $limiting) = @_;

    # Find the controller first, so that the potentially more expensive
    # get_subdir() lookup is skipped when the controller does not exist:
    my @found = find_cgroup_controller($controller);
    return undef if !@found;
    my ($cgpath, $ver) = @found;

    my $subdir = $self->get_subdir($controller, $limiting);
    return undef if !$subdir;

    my $path = "$cgpath/$subdir";
    return wantarray ? ($path, $ver) : $path;
}
195
# Convenience method to get the path info of the first existing controller.
#
# The `@controllers` are tried in the given order via get_path(), each with the
# same `$limiting` value. The first one yielding a defined path wins.
#
# Returns the same as `get_path`. If none matched, the path (and version) are
# `undef`.
sub get_any_path {
    my ($self, $limiting, @controllers) = @_;

    my @found;
    for my $name (@controllers) {
        @found = $self->get_path($name, $limiting);
        last if defined($found[0]);
    }

    # always hand out exactly (path, version) in list context
    return wantarray ? @found[0, 1] : $found[0];
}
209
# Parse a 'Nested keyed' file:
#
#   KEY0 SUB_KEY0=VAL00 SUB_KEY1=VAL01...
#   KEY1 SUB_KEY0=VAL10 SUB_KEY1=VAL11...
#
# See kernel documentation `admin-guide/cgroup-v2.rst` 4.1.
#
# Returns a hash mapping each line's key to a hash of its `sub_key => value`
# pairs. Fields that are not of the form `key=value` are skipped with a
# warning.
my sub parse_nested_keyed_file($) {
    my ($data) = @_;

    my %parsed;
    for my $line (split(/\n/, $data)) {
        my ($name, @pairs) = split(/\s+/, $line);

        my $entry = $parsed{$name} = {};

        for my $pair (@pairs) {
            if ($pair =~ /^([^=]+)=(.*)$/) {
                $entry->{$1} = $2;
            } else {
                warn "bad key=value pair in nested keyed file\n";
            }
        }
    }

    return \%parsed;
}
231
# Parse a 'Flat keyed' file:
#
#   KEY0 VAL0
#   KEY1 VAL1
#
# See kernel documentation `admin-guide/cgroup-v2.rst` 4.1.
#
# Returns a hash mapping each key to the rest of its line. Lines that do not
# consist of a key followed by whitespace are skipped with a warning.
my sub parse_flat_keyed_file($) {
    my ($data) = @_;

    my %parsed;
    for my $line (split(/\n/, $data)) {
        if ($line =~ /^(\S+)\s+(.*)$/) {
            $parsed{$1} = $2;
        } else {
            warn "bad 'key value' pair in flat keyed file\n";
        }
    }

    return \%parsed;
}
247
# Parse out 'diskread' and 'diskwrite' values from I/O stats for this container.
#
# With cgroupv2 the `rbytes`/`wbytes` values of all devices listed in `io.stat`
# are summed up, with cgroupv1 the `Read`/`Write` lines of
# `blkio.throttle.io_service_bytes_recursive`.
#
# Returns a hash with the keys `diskread` and `diskwrite`, or `undef` if no
# cgroup path could be determined (container not running). Dies on an
# unexpected cgroup version or if the stat file cannot be read.
sub get_io_stats {
    my ($self) = @_;

    my $res = {
        diskread => 0,
        diskwrite => 0,
    };

    # With cgroupv1 we have a 'blkio' controller, with cgroupv2 it's just 'io':
    my ($path, $ver) = $self->get_any_path(1, 'io', 'blkio');

    # container not running
    return undef if !defined($path);

    if ($ver == 2) {
        # cgroupv2 environment, io controller enabled
        my $data = parse_nested_keyed_file(file_get_contents("$path/io.stat"));

        for my $dev_stats (values %$data) {
            # devices without (or with zero) counters contribute nothing
            $res->{diskread} += $dev_stats->{rbytes} if $dev_stats->{rbytes};
            $res->{diskwrite} += $dev_stats->{wbytes} if $dev_stats->{wbytes};
        }
    } elsif ($ver == 1) {
        # cgroupv1 environment:
        my $io = file_get_contents("$path/blkio.throttle.io_service_bytes_recursive");
        for my $line (split(/\n/, $io)) {
            # only the per-device Read/Write lines are of interest
            my ($type, $bytes) = $line =~ /^\d+:\d+\s+(Read|Write)\s+(\d+)$/
                or next;
            my $key = $type eq 'Read' ? 'diskread' : 'diskwrite';
            $res->{$key} += $bytes;
        }
    } else {
        die "bad cgroup version: $ver\n";
    }

    return $res;
}
296
# Read utime and stime for this container from the cpu accounting cgroup files.
# Values are in milliseconds!
#
# With cgroupv2 the `user_usec`/`system_usec` values of `cpu.stat` are used,
# with cgroupv1 the `user`/`system` clock tick counters of `cpuacct.stat`.
#
# Returns a hash with the keys `utime` and `stime`, or `undef` if no cgroup
# path could be determined (container not running) or the cgroupv2 `cpu.stat`
# file could not be read. Dies on an unexpected cgroup version.
sub get_cpu_stat {
    my ($self) = @_;

    my $res = {
        utime => 0,
        stime => 0,
    };

    my ($path, $ver) = $self->get_any_path(1, 'cpuacct', 'cpu');

    # container not running
    return undef if !defined($path);

    if ($ver == 2) {
        my $data = eval { file_get_contents("$path/cpu.stat") };

        # cpu.stat could not be read (e.g. the cgroup vanished in the meantime):
        return undef if !defined($data);

        my $stat = parse_flat_keyed_file($data);
        # microseconds => milliseconds; treat missing keys as zero
        $res->{utime} = int(($stat->{user_usec} // 0) / 1000);
        $res->{stime} = int(($stat->{system_usec} // 0) / 1000);
    } elsif ($ver == 1) {
        # cgroupv1 environment, values are counted in clock ticks:
        my $clock_ticks = POSIX::sysconf(POSIX::_SC_CLK_TCK());
        my $ticks_to_msec = 1000 / $clock_ticks;

        my $stat = parse_flat_keyed_file(file_get_contents("$path/cpuacct.stat"));
        $res->{utime} = int(($stat->{user} // 0) * $ticks_to_msec);
        $res->{stime} = int(($stat->{system} // 0) * $ticks_to_msec);
    } else {
        die "bad cgroup version: $ver\n";
    }

    return $res;
}
334
# Get memory and swap usage of this container from its memory cgroup.
#
# With cgroupv2 `mem` is `memory.current` minus the `file` value of
# `memory.stat`, and `swap` is `memory.swap.current`. With cgroupv1 `mem` is
# `memory.usage_in_bytes` minus the `total_cache` value of `memory.stat`, and
# `swap` is `memory.memsw.usage_in_bytes` minus `memory.usage_in_bytes`.
#
# Returns a hash with the keys `mem` and `swap`, or `undef` if no cgroup path
# could be determined (the container most likely isn't running). Dies on an
# unexpected cgroup version or if one of the files cannot be read.
sub get_memory_stat {
    my ($self) = @_;

    my ($path, $ver) = $self->get_path('memory', 1);

    # container most likely isn't running
    return undef if !defined($path);

    my ($mem_used, $swap_used);
    if ($ver == 2) {
        my $current = file_get_contents("$path/memory.current");
        my $swap_current = file_get_contents("$path/memory.swap.current");
        my $stat = parse_flat_keyed_file(file_get_contents("$path/memory.stat"));

        chomp($current, $swap_current);

        $mem_used = $current - $stat->{file};
        $swap_used = $swap_current;
    } elsif ($ver == 1) {
        # cgroupv1 environment:
        my $stat = parse_flat_keyed_file(file_get_contents("$path/memory.stat"));
        my $usage = file_get_contents("$path/memory.usage_in_bytes");
        my $memsw_usage = file_get_contents("$path/memory.memsw.usage_in_bytes");
        chomp($usage, $memsw_usage);

        $mem_used = $usage - $stat->{total_cache};
        $swap_used = $memsw_usage - $usage;
    } else {
        die "bad cgroup version: $ver\n";
    }

    return {
        mem => $mem_used,
        swap => $swap_used,
    };
}
372
0bc3dac9
AD
# Get the pressure values of this guest's cgroup.
#
# For each of `cpu`, `memory` and `io` the file `<type>.pressure` below the
# cgroupv2 path is handed to `PVE::ProcFSTools::parse_pressure`. Types for
# which nothing could be parsed keep their all-zero default.
#
# Returns a hash keyed by type, or:
# * the all-zero default hash if no cgroup path could be determined (container
#   or VM most likely isn't running)
# * `undef` for a cgroupv1 path, as v1 controllers do not provide pressure
#   stats
#
# Dies on an unexpected cgroup version.
sub get_pressure_stat {
    my ($self) = @_;

    # each call creates a fresh zeroed set of averages
    my $zero_avgs = sub { return { avg10 => 0, avg60 => 0, avg300 => 0 } };

    my $res = {
        cpu => {
            some => $zero_avgs->(),
        },
        memory => {
            some => $zero_avgs->(),
            full => $zero_avgs->(),
        },
        io => {
            some => $zero_avgs->(),
            full => $zero_avgs->(),
        },
    };

    my ($path, $version) = $self->get_path(undef, 1);

    # container or VM most likely isn't running, return zero stats
    return $res if !defined($path);
    # v1 controllers do not provide pressure stats
    return undef if $version == 1;
    die "bad cgroup version: $version\n" if $version != 2;

    for my $type (qw(cpu memory io)) {
        my $stats = PVE::ProcFSTools::parse_pressure("$path/$type.pressure");
        $res->{$type} = $stats if $stats;
    }

    return $res;
}
406
86dff11c
AD
# Change the memory limit for this container.
#
# `$mem_bytes` and `$swap_bytes` may each be `undef` to leave the respective
# limit untouched. `$mem_high_bytes` is only used with cgroupv2, where it is
# written to `memory.high` whenever `$mem_bytes` is given (`max` if undefined).
#
# Returns a true value.
#
# Dies on error (including a not-running or currently-shutting-down guest).
sub change_memory_limit {
    my ($self, $mem_bytes, $swap_bytes, $mem_high_bytes) = @_;

    my ($path, $ver) = $self->get_path('memory', 1);
    die "trying to change memory cgroup values: container not running\n"
        if !defined($path);

    if ($ver == 2) {
        if (defined($swap_bytes)) {
            PVE::ProcFSTools::write_proc_entry("$path/memory.swap.max", $swap_bytes);
        }
        if (defined($mem_bytes)) {
            # 'max' is the hard limit (triggers OOM), while 'high' throttles and
            # adds reclaim pressure
            my $high = $mem_high_bytes // 'max';
            PVE::ProcFSTools::write_proc_entry("$path/memory.high", $high);
            PVE::ProcFSTools::write_proc_entry("$path/memory.max", $mem_bytes);
        }
    } elsif ($ver == 1) {
        # With cgroupv1 we cannot control memory and swap limits separately.
        # Since the two values aren't independent, growing and shrinking need
        # to be handled separately.
        my $path_mem = "$path/memory.limit_in_bytes";
        my $path_memsw = "$path/memory.memsw.limit_in_bytes";

        my $old_mem_bytes = file_get_contents($path_mem);
        my $old_memsw_bytes = file_get_contents($path_memsw);
        chomp($old_mem_bytes, $old_memsw_bytes);

        # missing values default to what is currently configured
        $mem_bytes //= $old_mem_bytes;
        $swap_bytes //= $old_memsw_bytes - $old_mem_bytes;
        my $memsw_bytes = $mem_bytes + $swap_bytes;

        # Growing: raise the combined limit first, then pull the memory limit
        # up. Shrinking: lower the memory-only limit first, as the combined
        # limit cannot be shrunk below it.
        my @writes = $memsw_bytes > $old_memsw_bytes
            ? ([$path_memsw, $memsw_bytes], [$path_mem, $mem_bytes])
            : ([$path_mem, $mem_bytes], [$path_memsw, $memsw_bytes]);

        for my $write (@writes) {
            PVE::ProcFSTools::write_proc_entry($write->[0], $write->[1]);
        }
    } else {
        die "bad cgroup version: $ver\n";
    }

    # return a truth value
    return 1;
}
457
# Change the cpu quota for a container.
#
# An undefined `$quota` means unlimited. A `$quota` without a `$period` is
# rejected. An undefined `$period` keeps the current period with cgroupv2 (only
# the quota is written) and falls back to 100 ms with cgroupv1.
#
# Returns a true value.
#
# Dies on error (including a not-running or currently-shutting-down guest).
sub change_cpu_quota {
    my ($self, $quota, $period) = @_;

    die "quota without period not allowed\n" if !defined($period) && defined($quota);

    my ($path, $ver) = $self->get_path('cpu', 1);
    if (!defined($path)) {
        die "trying to change cpu quota cgroup values: container not running\n";
    } elsif ($ver == 2) {
        # cgroupv2 environment, an undefined (unlimited) quota is defined as "max"
        # in this interface:
        $quota //= 'max'; # unlimited

        # `cpu.max` takes "$MAX $PERIOD". We're allowed to only write the quota,
        # which is what has to happen when no period was given (the quota can
        # only be 'max' then, see the check at the top).
        my $value = defined($period) ? "$quota $period" : $quota;
        PVE::ProcFSTools::write_proc_entry("$path/cpu.max", $value);
    } elsif ($ver == 1) {
        $quota //= -1; # default (unlimited)
        $period //= 100_000; # default (100 ms)
        PVE::ProcFSTools::write_proc_entry("$path/cpu.cfs_period_us", $period);
        PVE::ProcFSTools::write_proc_entry("$path/cpu.cfs_quota_us", $quota);
    } else {
        die "bad cgroup version: $ver\n";
    }

    # return a truth value
    return 1;
}
491
07c10d58
TL
# Clamp an integer to the supported range of CPU shares from the booted CGroup
# version.
#
# Only one bound is enforced per version: with cgroupv2 values are capped at
# 10000 (v1 can be higher), with cgroupv1 values are raised to at least 2 (v2
# can be lower).
#
# Returns the version's default (100 for cgroupv2, 1024 for cgroupv1) if called
# with an undefined value.
#
# This is a function, not a method!
sub clamp_cpu_shares {
    my ($shares) = @_;

    my $is_cgroupv2 = cgroup_mode() == 2;

    if (!defined($shares)) {
        return $is_cgroupv2 ? 100 : 1024;
    }

    if ($is_cgroupv2) {
        return $shares >= 10000 ? 10000 : $shares;
    }

    return $shares < 2 ? 2 : $shares;
}
509
86dff11c
AD
# Change the cpu "shares" for a container.
#
# In cgroupv1 we used a value in `[0..500000]` with a default of 1024.
#
# In cgroupv2 we do not have "shares", we have "weights" in the range of
# `[1..10000]` with a default of 100.
#
# Since the default values don't match when scaling linearly, we use the values
# we get as-is and simply error for values outside of `[1..10000]` in cgroupv2.
#
# It is left to the user to figure this out for now.
#
# An undefined `$shares` selects the default of the respective cgroup version.
#
# Returns a true value.
#
# Dies on error (including a not-running or currently-shutting-down guest).
#
# NOTE: if you add a new param during 7.x you need to break older
# pve-container/qemu-server versions that previously passed a
# `$cgroupv1_default`, which got removed due to being ignored anyway. Otherwise
# you risk that an old module bogusly passes some cgroup default as your new
# param.
sub change_cpu_shares {
    my ($self, $shares) = @_;

    my ($path, $ver) = $self->get_path('cpu', 1);
    die "trying to change cpu shares/weight cgroup values: container not running\n"
        if !defined($path);

    if ($ver == 2) {
        # the cgroupv2 documentation defines the default to 100
        my $weight = $shares // 100;
        if ($weight < 1 || $weight > 10000) {
            die "cpu weight (shares) must be in range [1, 10000]\n";
        }
        PVE::ProcFSTools::write_proc_entry("$path/cpu.weight", $weight);
    } elsif ($ver == 1) {
        my $v1_shares = $shares // 1024;
        PVE::ProcFSTools::write_proc_entry("$path/cpu.shares", $v1_shares);
    } else {
        die "bad cgroup version: $ver\n";
    }

    # return a truth value
    return 1;
}
548
# Freeze or thaw a container via the cgroupv1 freezer controller.
#
# `$controller_path` is the mount point of the freezer controller, the
# container's subdirectory is looked up via get_subdir(). A true `$freeze`
# requests the `FROZEN` state, a false one `THAWED`.
#
# Only returns once `freezer.state` reports the requested state. There is no
# timeout. Dies if get_subdir() yields no subdirectory or on read/write errors.
my sub v1_freeze_thaw {
    my ($self, $controller_path, $freeze) = @_;
    my $path = $self->get_subdir('freezer', 1)
        or die "trying to freeze container: container not running\n";
    $path = "$controller_path/$path/freezer.state";

    my $data = $freeze ? 'FROZEN' : 'THAWED';
    PVE::ProcFSTools::write_proc_entry($path, $data);

    # Here we just poll the freezer.state once per second. The state is checked
    # before sleeping, so an already completed transition causes no delay.
    while (1) {
        my $state = file_get_contents($path);
        chomp $state;
        last if $state eq $data;
        sleep(1);
    }
}
565
# Freeze or thaw a container via the cgroupv2 `cgroup.freeze` file.
#
# `$controller_path` is the cgroupv2 base path, the container's subdirectory is
# looked up via get_subdir(). A true `$freeze` writes `1`, a false one `0`.
#
# Only returns once the `frozen` key of `cgroup.events` reports the requested
# state. Dies if get_subdir() yields no subdirectory, if `cgroup.events`
# cannot be opened or rewound, or on write errors.
my sub v2_freeze_thaw {
    my ($self, $controller_path, $freeze) = @_;

    my $subdir = $self->get_subdir(undef, 1)
        or die "trying to freeze container: container not running\n";
    my $cgroup_dir = "$controller_path/$subdir";
    my $events_file = "$cgroup_dir/cgroup.events";

    my $desired_state = $freeze ? 1 : 0;

    # cgroupv2 supports poll events on cgroup.events which contains the frozen
    # state. The file is opened before requesting the state change.
    my $fh = IO::File->new($events_file, 'r')
        or die "failed to open $events_file file: $!\n";
    my $select = IO::Select->new();
    $select->add($fh);

    PVE::ProcFSTools::write_proc_entry("$cgroup_dir/cgroup.freeze", $desired_state);
    while (1) {
        # slurp everything from the current file position
        my $content = do {
            local $/ = undef;
            <$fh>
        };
        my $events = parse_flat_keyed_file($content);
        last if $events->{frozen} == $desired_state;

        # wait for an exceptional condition on the file (no timeout given) and
        # rewind it for the next read if one was signalled
        my @ready = $select->has_exception();
        if (@ready) {
            seek($fh, 0, 0)
                or die "failed to rewind cgroup.events file: $!\n";
        }
    }
}
595
# Freeze or unfreeze a container.
#
# This will freeze the container at its outer (limiting) cgroup path. We use
# this instead of `lxc-freeze` as `lxc-freeze` from lxc4 will not be able to
# fetch the cgroup path from containers still running on lxc3.
#
# A true `$freeze` freezes the container, a false one thaws it.
sub freeze_thaw {
    my ($self, $freeze) = @_;

    # cgroupv2 always has a freezer, there can be both cgv1 and cgv2 freezers,
    # but we'll prefer v1 when it's available as that's what lxc does as well...
    my $v1_freezer_path = find_cgroup_controller('freezer');
    return v1_freeze_thaw($self, $v1_freezer_path, $freeze)
        if defined($v1_freezer_path);

    return v2_freeze_thaw($self, cgroupv2_base_path(), $freeze);
}
614
6151;