]> git.proxmox.com Git - pve-common.git/blame - src/PVE/CGroup.pm
bump version to 7.0-4
[pve-common.git] / src / PVE / CGroup.pm
CommitLineData
86dff11c
AD
1# cgroup handler
2#
3# This package should deal with figuring out the right cgroup path for a
4# container (via the command socket), reading and writing cgroup values, and
5# handling cgroup v1 & v2 differences.
6#
7# Note that the long term plan is to have resource management functions instead of
8# dealing with cgroup files on the outside.
9
10package PVE::CGroup;
11
12use strict;
13use warnings;
14
15use IO::File;
16use IO::Select;
17use POSIX qw();
18
19use PVE::ProcFSTools;
20use PVE::Tools qw(
21 file_get_contents
22 file_read_firstline
23);
24
86dff11c
AD
# Constructor.
#
# A command socket round trip for every cgroup read/write would be wasteful,
# so cgroup functions operate on an instance of this package which can cache
# the container's paths.
#
# LXC keeps separate paths by controller (although they're normally all the
# same, in our case anyway), so they are cached by controller as well.
#
# Parameters:
#   $class - the package name to bless the new object into
#   $vmid  - the guest ID, stored under the `vmid` key
#
# Returns the blessed hash reference.
sub new {
    my ($class, $vmid) = @_;

    return bless { vmid => $vmid }, $class;
}
38
# Get the v1 controller list by parsing `/proc/self/cgroup`.
#
# Every line has the form `ID:CONTROLLERS:PATH`. A non-empty, comma separated
# CONTROLLERS field names cgroupv1 controllers, while an empty one marks the
# unified (cgroupv2) hierarchy.
#
# In list context this returns a set (hash reference mapping names to `1`) of
# cgroupv1 controllers and a boolean telling whether a unified hierarchy
# exists. In scalar context only the set is returned.
#
# Deprecated: Use `get_cgroup_controllers()` instead.
sub get_v1_controllers {
    my %controllers;
    my $has_unified = 0;

    my $content = PVE::Tools::file_get_contents('/proc/self/cgroup');

    # With /g in list context and a single group we get one entry per line.
    for my $names ($content =~ /^\d+:([^:\n]*):.*$/gm) {
        if ($names eq '') {
            $has_unified = 1;
            next;
        }
        $controllers{$_} = 1 for split(/,/, $names);
    }

    return wantarray ? (\%controllers, $has_unified) : \%controllers;
}
59
# Get the set of v2 controllers from the `cgroup.controllers` file.
#
# The file is looked up at the cgroupv2-only mount point first and at the
# `unified` sub-mount second; the first one yielding a true value wins.
#
# Returns a set (hash reference mapping names to `1`), or `undef` if no
# content could be obtained.
my sub get_v2_controllers {
    my @candidates = (
        '/sys/fs/cgroup/cgroup.controllers',
        '/sys/fs/cgroup/unified/cgroup.controllers',
    );

    my $content;
    for my $file (@candidates) {
        $content = eval { file_get_contents($file) };
        last if $content;
    }
    return undef if !defined $content;

    # It's a simple space separated list:
    my %set = map { $_ => 1 } split(/\s+/, $content);
    return \%set;
}
69
# Cache for `get_cgroup_controllers()`: an array reference `[$v1, $v2]`.
my $CGROUP_CONTROLLERS = undef;

# Get a list of controllers enabled in each cgroup subsystem.
#
# This is a more complete version of `PVE::LXC::get_cgroup_subsystems`.
#
# Returns 2 sets (hashes mapping controller names to `1`), one for each cgroup
# version. The result is computed on the first call and cached afterwards.
sub get_cgroup_controllers() {
    $CGROUP_CONTROLLERS //= do {
        # List context makes get_v1_controllers() return (set, flag); only the
        # set is of interest here.
        my ($v1_set) = get_v1_controllers();
        my $v2_set = get_v2_controllers();
        [$v1_set, $v2_set];
    };

    return @$CGROUP_CONTROLLERS;
}
87
# Cache for `cgroup_mode()`.
my $CGROUP_MODE = undef;

# Figure out which cgroup mode we're operating under:
#
# Returns 1 if cgroupv1 controllers exist (hybrid or legacy mode), and 2 in a
# cgroupv2-only environment. Dies with "unknown cgroup mode" if neither kind
# of controller set was found.
#
# NOTE: To fully support a hybrid layout it is better to use functions like
# `cpuset_controller_path`.
#
# This is a function, not a method!
sub cgroup_mode() {
    return $CGROUP_MODE if defined($CGROUP_MODE);

    my ($v1_set, $v2_set) = get_cgroup_controllers();

    # Any v1 controller means hybrid or legacy mode.
    $CGROUP_MODE = 1 if keys %$v1_set;
    $CGROUP_MODE = 2 if !defined($CGROUP_MODE) && $v2_set;

    die "unknown cgroup mode\n" if !defined($CGROUP_MODE);
    return $CGROUP_MODE;
}
112
# Cache for `cgroupv2_base_path()`.
my $CGROUPV2_PATH = undef;

# Get the mount point of the cgroupv2 hierarchy.
#
# In a cgroupv2-only environment this is `/sys/fs/cgroup`, otherwise the
# `unified` sub-mount is used. The value is determined once and cached.
sub cgroupv2_base_path() {
    $CGROUPV2_PATH //= cgroup_mode() == 2
        ? '/sys/fs/cgroup'
        : '/sys/fs/cgroup/unified';

    return $CGROUPV2_PATH;
}
124
# Find a cgroup controller and return its path and version.
#
# LXC initializes the unified hierarchy first, so if a controller is
# available via both we favor cgroupv2 here as well. An undefined
# `$controller` always resolves to the cgroupv2 base path.
#
# In list context this returns `(path, version)`, in scalar context just the
# path.
#
# Returns nothing if the controller is not available.
sub find_cgroup_controller($) {
    my ($controller) = @_;

    my ($v1, $v2) = get_cgroup_controllers();

    my ($base, $version);
    if (!defined($controller) || $v2->{$controller}) {
        ($base, $version) = (cgroupv2_base_path(), 2);
    } elsif ($v1->{$controller}) {
        # cgroupv1 controllers each have their own mount point.
        ($base, $version) = ("/sys/fs/cgroup/$controller", 1);
    } else {
        return;
    }

    return wantarray ? ($base, $version) : $base;
}
149
# Cached path and cgroup version of the cpuset controller.
my $CG_PATH_CPUSET = undef;
my $CG_VER_CPUSET = undef;

# Find the cpuset cgroup controller.
#
# In list context this returns `(path, version)`, in scalar context just the
# path. Dies if the controller cannot be found. The lookup result is cached.
#
# This is a function, not a method!
sub cpuset_controller_path() {
    if (!defined($CG_PATH_CPUSET)) {
        my @found = find_cgroup_controller('cpuset');
        die "failed to find cpuset controller\n" if !@found;
        ($CG_PATH_CPUSET, $CG_VER_CPUSET) = @found;
    }

    return $CG_PATH_CPUSET if !wantarray;
    return ($CG_PATH_CPUSET, $CG_VER_CPUSET);
}
163
# Get a subdirectory (without the cgroup mount point) for a controller.
#
# This base implementation is a placeholder which always dies; subclasses
# have to override it. `get_path` appends the returned value to the
# controller's mount point.
#
# Parameters:
#   $controller - controller name; callers in this package also pass `undef`
#   $limiting   - NOTE(review): presumably selects the outer (limiting)
#                 cgroup of the guest - confirm against the subclasses
sub get_subdir {
    my ($self, $controller, $limiting) = @_;

    die "implement in subclass";
}
170
# Get path and version for a controller.
#
# `$controller` may be `undef`, in which case `find_cgroup_controller`
# resolves it to the cgroupv2 hierarchy.
#
# Returns either just the path, or the path and cgroup version as a tuple.
# Returns `undef` if the controller is not available or if `get_subdir`
# yields no (true) subdirectory.
sub get_path {
    my ($self, $controller, $limiting) = @_;

    # Find the controller before querying the lxc monitor via a socket:
    my @found = find_cgroup_controller($controller);
    return undef if !@found;
    my ($mount_point, $version) = @found;

    my $subdir = $self->get_subdir($controller, $limiting);
    return undef if !$subdir;

    my $full_path = "$mount_point/$subdir";
    return wantarray ? ($full_path, $version) : $full_path;
}
188
# Convenience method to get the path info of the first existing controller.
#
# The given controllers are tried in order via `get_path` and the first one
# yielding a defined path wins.
#
# Returns the same as `get_path`; path and version are `undef` if none of the
# controllers could be resolved.
sub get_any_path {
    my ($self, $limiting, @controllers) = @_;

    my @found;
    for my $name (@controllers) {
        @found = $self->get_path($name, $limiting);
        last if defined $found[0];
    }

    my ($path, $version) = @found;
    return wantarray ? ($path, $version) : $path;
}
202
# Parse a 'Nested keyed' file:
#
# Each line consists of a key followed by whitespace separated `subkey=value`
# pairs. See kernel documentation `admin-guide/cgroup-v2.rst` 4.1.
#
# Returns a hash reference mapping each line's key to a hash reference of its
# `subkey => value` pairs. Malformed pairs are skipped with a warning.
my sub parse_nested_keyed_file($) {
    my ($data) = @_;

    my %parsed;
    for my $line (split(/\n/, $data)) {
        my ($name, @pairs) = split(/\s+/, $line);

        my $entry = $parsed{$name} = {};

        for my $pair (@pairs) {
            if ($pair =~ /^([^=]+)=(.*)$/) {
                $entry->{$1} = $2;
                next;
            }
            warn "bad key=value pair in nested keyed file\n";
        }
    }

    return \%parsed;
}
224
# Parse a 'Flat keyed' file:
#
# Each line consists of a key, whitespace and the value (the rest of the
# line). See kernel documentation `admin-guide/cgroup-v2.rst` 4.1.
#
# Returns a hash reference mapping keys to values. Lines not matching the
# format are skipped with a warning.
my sub parse_flat_keyed_file($) {
    my ($data) = @_;

    my %parsed;
    for my $line (split(/\n/, $data)) {
        if ($line =~ /^(\S+)\s+(.*)$/) {
            $parsed{$1} = $2;
            next;
        }
        warn "bad 'key value' pair in flat keyed file\n";
    }

    return \%parsed;
}
240
# Parse out 'diskread' and 'diskwrite' values from I/O stats for this container.
#
# Returns a hash reference `{ diskread => BYTES, diskwrite => BYTES }` with
# the byte counts summed up over all devices, or `undef` if no cgroup path
# could be determined (the container is most likely not running).
#
# Dies on an unexpected cgroup version; errors raised while reading the stat
# files are not caught here.
sub get_io_stats {
    my ($self) = @_;

    my $res = {
        diskread => 0,
        diskwrite => 0,
    };

    # With cgroupv1 we have a 'blkio' controller, with cgroupv2 it's just 'io':
    my ($path, $ver) = $self->get_any_path(1, 'io', 'blkio');

    # container not running
    return undef if !defined($path);

    if ($ver == 2) {
        # cgroupv2 environment, io controller enabled
        my $io_stat = file_get_contents("$path/io.stat");

        # One entry per device, each with (among others) `rbytes` and `wbytes`.
        my $data = parse_nested_keyed_file($io_stat);
        foreach my $dev_stat (values %$data) {
            $res->{diskread} += $dev_stat->{rbytes} if $dev_stat->{rbytes};
            # Written bytes must be accounted as 'diskwrite', not 'diskread'.
            $res->{diskwrite} += $dev_stat->{wbytes} if $dev_stat->{wbytes};
        }
    } elsif ($ver == 1) {
        # cgroupv1 environment:
        my $io = file_get_contents("$path/blkio.throttle.io_service_bytes_recursive");
        foreach my $line (split(/\n/, $io)) {
            if (my ($type, $bytes) = ($line =~ /^\d+:\d+\s+(Read|Write)\s+(\d+)$/)) {
                $res->{diskread} += $bytes if $type eq 'Read';
                $res->{diskwrite} += $bytes if $type eq 'Write';
            }
        }
    } else {
        die "bad cgroup version: $ver\n";
    }

    return $res;
}
289
# Read utime and stime for this container from the cpuacct (cgroupv1) or cpu
# (cgroupv2) controller.
# Values are in milliseconds!
#
# Returns a hash reference `{ utime => MS, stime => MS }`, or `undef` if no
# cgroup path could be determined or (cgroupv2 only) `cpu.stat` could not be
# read. Dies on an unexpected cgroup version.
sub get_cpu_stat {
    my ($self) = @_;

    my ($path, $ver) = $self->get_any_path(1, 'cpuacct', 'cpu');

    # container not running
    return undef if !defined($path);

    my %times = (
        utime => 0,
        stime => 0,
    );

    if ($ver == 1) {
        # cgroupv1 environment, values are given in clock ticks:
        my $ticks_per_sec = POSIX::sysconf(POSIX::_SC_CLK_TCK());
        my $ms_per_tick = 1000 / $ticks_per_sec;

        my $stat = parse_flat_keyed_file(file_get_contents("$path/cpuacct.stat"));
        $times{utime} = int($stat->{user} * $ms_per_tick);
        $times{stime} = int($stat->{system} * $ms_per_tick);
    } elsif ($ver == 2) {
        my $raw = eval { file_get_contents("$path/cpu.stat") };

        # the stat file could not be read:
        return undef if !defined($raw);

        # cgroupv2 reports microseconds.
        my $stat = parse_flat_keyed_file($raw);
        $times{utime} = int($stat->{user_usec} / 1000);
        $times{stime} = int($stat->{system_usec} / 1000);
    } else {
        die "bad cgroup version: $ver\n";
    }

    return \%times;
}
327
# Read the memory and swap usage of this container.
#
# Returns a hash reference `{ mem => BYTES, swap => BYTES }`, or `undef` if
# no path for the memory controller could be determined. Dies on an
# unexpected cgroup version.
sub get_memory_stat {
    my ($self) = @_;

    my ($path, $ver) = $self->get_path('memory', 1);

    # container most likely isn't running
    return undef if !defined($path);

    my ($mem_used, $swap_used) = (0, 0);

    if ($ver == 1) {
        # cgroupv1 environment:
        my $stat = parse_flat_keyed_file(file_get_contents("$path/memory.stat"));
        my $usage = file_get_contents("$path/memory.usage_in_bytes");
        my $memsw_usage = file_get_contents("$path/memory.memsw.usage_in_bytes");
        chomp($usage, $memsw_usage);

        # The page cache is not counted as used memory, and the memsw counter
        # includes the memory usage.
        $mem_used = $usage - $stat->{total_cache};
        $swap_used = $memsw_usage - $usage;
    } elsif ($ver == 2) {
        $mem_used = file_get_contents("$path/memory.current");
        $swap_used = file_get_contents("$path/memory.swap.current");

        chomp($mem_used, $swap_used);

        # FIXME: For the cgv1 equivalent of `total_cache` we may need to sum up
        # the values in `memory.stat`...
    } else {
        die "bad cgroup version: $ver\n";
    }

    return {
        mem => $mem_used,
        swap => $swap_used,
    };
}
367
0bc3dac9
AD
# Read the pressure statistics of this guest's cgroup.
#
# Returns a hash reference with the keys `cpu`, `memory` and `io`. Every
# entry defaults to zeroed `avg10`/`avg60`/`avg300` values (`some` for cpu,
# `some` and `full` for memory and io) and is replaced by whatever
# `PVE::ProcFSTools::parse_pressure` returns for the respective `.pressure`
# file, if that is a true value.
#
# Returns the zeroed defaults if no cgroup path could be determined and
# `undef` on cgroupv1. Dies on an unexpected cgroup version.
sub get_pressure_stat {
    my ($self) = @_;

    my $zeroed = sub { return { avg10 => 0, avg60 => 0, avg300 => 0 } };

    my $res = {
        cpu => {
            some => $zeroed->(),
        },
        memory => {
            some => $zeroed->(),
            full => $zeroed->(),
        },
        io => {
            some => $zeroed->(),
            full => $zeroed->(),
        },
    };

    my ($path, $version) = $self->get_path(undef, 1);

    # container or VM most likely isn't running, return zero stats
    return $res if !defined($path);

    # the v1 controllers do not provide pressure stats
    return undef if $version == 1;

    die "bad cgroup version: $version\n" if $version != 2;

    foreach my $type ('cpu', 'memory', 'io') {
        my $stats = PVE::ProcFSTools::parse_pressure("$path/$type.pressure");
        next if !$stats;
        $res->{$type} = $stats;
    }

    return $res;
}
401
86dff11c
AD
# Change the memory limit for this container.
#
# Parameters:
#   $mem_bytes  - new memory limit, `undef` leaves it untouched
#   $swap_bytes - new swap limit, `undef` leaves it untouched
#
# Returns a true value.
#
# Dies on error (including a not-running or currently-shutting-down guest).
sub change_memory_limit {
    my ($self, $mem_bytes, $swap_bytes) = @_;

    my ($path, $ver) = $self->get_path('memory', 1);
    die "trying to change memory cgroup values: container not running\n"
        if !defined($path);

    if ($ver == 1) {
        # With cgroupv1 we cannot control memory and swap limits separately.
        # This also means that since the two values aren't independent, we need to handle
        # growing and shrinking separately.
        my $mem_file = "$path/memory.limit_in_bytes";
        my $memsw_file = "$path/memory.memsw.limit_in_bytes";

        my $cur_mem = file_get_contents($mem_file);
        my $cur_memsw = file_get_contents($memsw_file);
        chomp($cur_mem, $cur_memsw);

        # Missing values keep their current setting.
        $mem_bytes //= $cur_mem;
        $swap_bytes //= $cur_memsw - $cur_mem;
        my $new_memsw = $mem_bytes + $swap_bytes;

        # Growing the limit means growing the combined limit first, then
        # pulling the memory limit up. Shrinking means we first need to shrink
        # the memory-only limit, as memsw cannot be shrunk below it.
        my @writes = $new_memsw > $cur_memsw
            ? ([$memsw_file, $new_memsw], [$mem_file, $mem_bytes])
            : ([$mem_file, $mem_bytes], [$memsw_file, $new_memsw]);

        PVE::ProcFSTools::write_proc_entry($_->[0], $_->[1]) for @writes;
    } elsif ($ver == 2) {
        # cgroupv2 has independent limits for swap and memory.
        if (defined($swap_bytes)) {
            PVE::ProcFSTools::write_proc_entry("$path/memory.swap.max", $swap_bytes);
        }
        if (defined($mem_bytes)) {
            PVE::ProcFSTools::write_proc_entry("$path/memory.max", $mem_bytes);
        }
    } else {
        die "bad cgroup version: $ver\n";
    }

    # return a truth value
    return 1;
}
449
# Change the cpu quota for a container.
#
# Parameters:
#   $quota  - the quota, `undef` means unlimited
#   $period - the period the quota applies to; must be given when a quota is
#             given
#
# Returns a true value.
#
# Dies on error (including a not-running or currently-shutting-down guest).
sub change_cpu_quota {
    my ($self, $quota, $period) = @_;

    die "quota without period not allowed\n" if !defined($period) && defined($quota);

    my ($path, $ver) = $self->get_path('cpu', 1);
    if (!defined($path)) {
        die "trying to change cpu quota cgroup values: container not running\n";
    } elsif ($ver == 2) {
        # cgroupv2 environment, an undefined (unlimited) quota is defined as "max"
        # in this interface:
        $quota //= 'max'; # unlimited

        # The decision has to be made on the period: the quota is always
        # defined at this point, and without a period it can only be 'max'
        # (see the check at the top).
        if (defined($period)) {
            PVE::ProcFSTools::write_proc_entry("$path/cpu.max", "$quota $period");
        } else {
            # we're allowed to only write the quota:
            PVE::ProcFSTools::write_proc_entry("$path/cpu.max", 'max');
        }
    } elsif ($ver == 1) {
        $quota //= -1; # unlimited
        $period //= -1;
        PVE::ProcFSTools::write_proc_entry("$path/cpu.cfs_period_us", $period);
        PVE::ProcFSTools::write_proc_entry("$path/cpu.cfs_quota_us", $quota);
    } else {
        die "bad cgroup version: $ver\n";
    }

    # return a truth value
    return 1;
}
483
# Change the cpu "shares" for a container.
#
# In cgroupv1 we used a value in `[0..500000]` with a default of 1024.
#
# In cgroupv2 we do not have "shares", we have "weights" in the range
# of `[1..10000]` with a default of 100.
#
# Since the default values don't match when scaling linearly, we use the
# values we get as-is and simply error for values >10000 in cgroupv2.
#
# It is left to the user to figure this out for now.
#
# Parameters:
#   $shares           - the shares/weight value, `undef` selects the default
#   $cgroupv1_default - default to use on cgroupv1 when `$shares` is `undef`;
#                       falls back to 1024 if this is `undef` as well
#
# Returns a true value.
#
# Dies on error (including a not-running or currently-shutting-down guest).
sub change_cpu_shares {
    my ($self, $shares, $cgroupv1_default) = @_;

    my ($path, $ver) = $self->get_path('cpu', 1);
    if (!defined($path)) {
        die "trying to change cpu shares/weight cgroup values: container not running\n";
    } elsif ($ver == 2) {
        # the cgroupv2 documentation defines the default to 100
        $shares //= 100;
        die "cpu weight (shares) must be in range [1, 10000]\n" if $shares < 1 || $shares > 10000;
        PVE::ProcFSTools::write_proc_entry("$path/cpu.weight", $shares);
    } elsif ($ver == 1) {
        # Honor the caller supplied default before falling back to 1024.
        $shares //= $cgroupv1_default // 1024;
        PVE::ProcFSTools::write_proc_entry("$path/cpu.shares", $shares);
    } else {
        die "bad cgroup version: $ver\n";
    }

    # return a truth value
    return 1;
}
518
# Freeze or thaw a container via the cgroupv1 freezer controller.
#
# Parameters:
#   $self            - the cgroup handler, used to look up the subdirectory
#   $controller_path - mount point of the freezer controller
#   $freeze          - true to freeze, false to thaw
#
# Writes the desired state to `freezer.state` and only returns once the file
# reports that state. Dies if no subdirectory could be determined.
my sub v1_freeze_thaw {
    my ($self, $controller_path, $freeze) = @_;

    my $path = $self->get_subdir('freezer', 1)
        or die "trying to freeze container: container not running\n";
    $path = "$controller_path/$path/freezer.state";

    my $data = $freeze ? 'FROZEN' : 'THAWED';
    PVE::ProcFSTools::write_proc_entry($path, $data);

    # Here we just poll the freezer.state once per second.
    while (1) {
        my $state = file_get_contents($path);
        chomp $state;
        last if $state eq $data;

        # Wait between polls instead of spinning on the file.
        sleep(1);
    }
}
535
# Freeze or thaw a container via the cgroupv2 `cgroup.freeze` file.
#
# Parameters:
#   $self            - the cgroup handler, used to look up the subdirectory
#   $controller_path - mount point of the cgroupv2 hierarchy
#   $freeze          - true to freeze, false to thaw
#
# Writes the desired state to `cgroup.freeze` and only returns once the
# `frozen` key in `cgroup.events` reports that state. Dies if no subdirectory
# could be determined or the events file cannot be opened or rewound.
my sub v2_freeze_thaw {
    my ($self, $controller_path, $freeze) = @_;

    my $path = $self->get_subdir(undef, 1)
        or die "trying to freeze container: container not running\n";
    $path = "$controller_path/$path";

    my $desired_state = $freeze ? 1 : 0;

    # cgroupv2 supports poll events on cgroup.events which contains the frozen
    # state.
    my $fh = IO::File->new("$path/cgroup.events", 'r')
        or die "failed to open $path/cgroup.events file: $!\n";
    my $select = IO::Select->new();
    $select->add($fh);

    PVE::ProcFSTools::write_proc_entry("$path/cgroup.freeze", $desired_state);
    while (1) {
        my $data = do {
            local $/ = undef;
            <$fh>
        };
        $data = parse_flat_keyed_file($data // '');

        # Only a state that was actually read counts: an undefined value would
        # compare equal to 0 and end a thaw request prematurely.
        last if defined($data->{frozen}) && $data->{frozen} == $desired_state;

        # Wait for a change notification. Whether or not one was reported
        # (the wait may have been interrupted), the file has to be rewound
        # before reading it again, as we are at its end now.
        $select->has_exception();

        seek($fh, 0, 0)
            or die "failed to rewind cgroup.events file: $!\n";
    }
}
565
# Freeze or unfreeze a container.
#
# This will freeze the container at its outer (limiting) cgroup path. We use
# this instead of `lxc-freeze` as `lxc-freeze` from lxc4 will not be able to
# fetch the cgroup path from containers still running on lxc3.
#
# Parameters:
#   $freeze - true to freeze, false to thaw
sub freeze_thaw {
    my ($self, $freeze) = @_;

    # cgroupv2 always has a freezer, there can be both cgv1 and cgv2
    # freezers, but we'll prefer v1 when it's available as that's what lxc
    # does as well...
    my $v1_freezer = find_cgroup_controller('freezer');
    return v1_freeze_thaw($self, $v1_freezer, $freeze) if defined($v1_freezer);

    return v2_freeze_thaw($self, cgroupv2_base_path(), $freeze);
}
584
5851;