]> git.proxmox.com Git - pve-common.git/blame - src/PVE/CGroup.pm
cgroup: make get_v1_controllers private
[pve-common.git] / src / PVE / CGroup.pm
CommitLineData
86dff11c
AD
1# cgroup handler
2#
3# This package should deal with figuring out the right cgroup path for a
4# container (via the command socket), reading and writing cgroup values, and
5# handling cgroup v1 & v2 differences.
6#
7# Note that the long term plan is to have resource manage functions instead of
8# dealing with cgroup files on the outside.
9
10package PVE::CGroup;
11
12use strict;
13use warnings;
14
15use IO::File;
16use IO::Select;
17use POSIX qw();
18
19use PVE::ProcFSTools;
20use PVE::Tools qw(
21 file_get_contents
22 file_read_firstline
23);
24
86dff11c
AD
# We don't want to do a command socket round trip for every cgroup read/write,
# so any cgroup function needs to have the container's path cached, which is
# why this package has to be instantiated.
#
# LXC keeps separate paths by controller (although they're normally all the
# same, in our case anyway), so we cache them by controller as well.
#
# Parameters:
#   $class - package name to bless the new object into
#   $vmid  - ID of the guest, stored under the `vmid` key
sub new {
    my ($class, $vmid) = @_;

    return bless { vmid => $vmid }, $class;
}
38
# Get the v1 controller list from `/proc/self/cgroup`.
#
# Every matching line has the form `<number>:<controllers>:<rest>`. A
# non-empty, comma separated controller field names cgroupv1 controllers, an
# empty one marks the unified (cgroupv2) hierarchy.
#
# In list context returns a set (hash mapping names to `1`) of cgroupv1
# controllers and a boolean telling whether a unified (cgroupv2) hierarchy
# exists. In scalar context only the set is returned.
my sub get_v1_controllers {
    my %controllers;
    my $has_unified = 0;

    my $content = PVE::Tools::file_get_contents('/proc/self/cgroup');
    for my $line (split(/\n/, $content)) {
        my ($names) = ($line =~ /^\d+:([^:\n]*):.*$/)
            or next;

        if (length($names)) {
            $controllers{$_} = 1 for split(/,/, $names);
        } else {
            $has_unified = 1;
        }
    }

    return wantarray ? (\%controllers, $has_unified) : \%controllers;
}
57
# Get the set of v2 controllers from the `cgroup.controllers` file.
#
# `/sys/fs/cgroup` is tried first, `/sys/fs/cgroup/unified` second. Read errors
# are swallowed; if no content could be obtained `undef` is returned, otherwise
# a set (hash mapping controller names to `1`).
my sub get_v2_controllers {
    my $list = eval { file_get_contents('/sys/fs/cgroup/cgroup.controllers') };
    $list ||= eval { file_get_contents('/sys/fs/cgroup/unified/cgroup.controllers') };
    return undef if !defined($list);

    # It's a simple space separated list:
    my %set = map { $_ => 1 } split(/\s+/, $list);
    return \%set;
}
67
# Cache for get_cgroup_controllers(): `[ $v1_set, $v2_set ]` once filled.
my $CGROUP_CONTROLLERS = undef;
# Get a list of controllers enabled in each cgroup subsystem.
#
# This is a more complete version of `PVE::LXC::get_cgroup_subsystems`.
#
# Returns 2 sets (hashes mapping controller names to `1`), one for each cgroup
# version. The result is computed on the first call and cached afterwards.
sub get_cgroup_controllers() {
    $CGROUP_CONTROLLERS //= do {
        # Only the controller set is of interest here, not the v2 flag.
        my ($v1) = get_v1_controllers();
        my $v2 = get_v2_controllers();
        [$v1, $v2];
    };

    return @$CGROUP_CONTROLLERS;
}
85
# Cache for cgroup_mode().
my $CGROUP_MODE = undef;
# Figure out which cgroup mode we're operating under:
#
# Returns 1 if cgroupv1 controllers exist (hybrid or legacy mode), and 2 in a
# cgroupv2-only environment. Dies if neither can be determined.
#
# NOTE: To fully support a hybrid layout it is better to use functions like
# `cpuset_controller_path`.
#
# This is a function, not a method!
sub cgroup_mode() {
    return $CGROUP_MODE if defined($CGROUP_MODE);

    my ($v1, $v2) = get_cgroup_controllers();
    if (scalar(keys %$v1) > 0) {
        # hybrid or legacy mode
        $CGROUP_MODE = 1;
    } elsif ($v2) {
        $CGROUP_MODE = 2;
    } else {
        die "unknown cgroup mode\n";
    }

    return $CGROUP_MODE;
}
110
# Cache for cgroupv2_base_path().
my $CGROUPV2_PATH = undef;
# Get the mount point of the cgroupv2 hierarchy.
#
# In a cgroupv2-only environment this is `/sys/fs/cgroup`, otherwise the
# `unified` sub-mount is used. The result is cached after the first call.
sub cgroupv2_base_path() {
    $CGROUPV2_PATH //= (cgroup_mode() == 2)
        ? '/sys/fs/cgroup'
        : '/sys/fs/cgroup/unified';

    return $CGROUPV2_PATH;
}
122
# Find a cgroup controller and return its path and version.
#
# LXC initializes the unified hierarchy first, so if a controller is
# available via both we favor cgroupv2 here as well.
#
# An undefined `$controller` selects the cgroupv2 base path.
#
# Returns the path (scalar context) or the path and the cgroup version (list
# context). Returns nothing if the controller is not available.
sub find_cgroup_controller($) {
    my ($controller) = @_;

    my ($v1, $v2) = get_cgroup_controllers();

    my ($path, $version);
    if (!defined($controller) || $v2->{$controller}) {
        ($path, $version) = (cgroupv2_base_path(), 2);
    } elsif ($v1->{$controller}) {
        ($path, $version) = ("/sys/fs/cgroup/$controller", 1);
    } else {
        return;
    }

    return wantarray ? ($path, $version) : $path;
}
147
# Cached result of the cpuset controller lookup (path and cgroup version).
my $CG_PATH_CPUSET = undef;
my $CG_VER_CPUSET = undef;
# Find the cpuset cgroup controller.
#
# Returns the path (scalar context) or the path and the cgroup version (list
# context). Dies if no cpuset controller can be found.
#
# This is a function, not a method!
sub cpuset_controller_path() {
    if (!defined($CG_PATH_CPUSET)) {
        my @found = find_cgroup_controller('cpuset');
        die "failed to find cpuset controller\n" if !@found;

        ($CG_PATH_CPUSET, $CG_VER_CPUSET) = @found;
    }

    return wantarray ? ($CG_PATH_CPUSET, $CG_VER_CPUSET) : $CG_PATH_CPUSET;
}
161
# Get a subdirectory (without the cgroup mount point) for a controller.
#
# Called as `$self->get_subdir($controller, $limiting)`. This base
# implementation only dies; subclasses have to provide the real lookup.
sub get_subdir {
    my ($self, $controller, $limiting) = @_;

    die "implement in subclass";
}
168
# Get path and version for a controller.
#
# `$controller` may be `undef`, in which case the cgroupv2 hierarchy is used.
# `$limiting` is passed through to `get_subdir` unchanged.
#
# Returns either just the path, or the path and cgroup version as a tuple.
# Returns `undef` if the controller is not available or no subdirectory could
# be determined.
sub get_path {
    my ($self, $controller, $limiting) = @_;

    # Find the controller before querying the lxc monitor via a socket:
    my @found = find_cgroup_controller($controller);
    return undef if !@found;
    my ($base, $version) = @found;

    my $subdir = $self->get_subdir($controller, $limiting);
    return undef if !$subdir;

    my $full_path = "$base/$subdir";
    return wantarray ? ($full_path, $version) : $full_path;
}
186
# Convenience method to get the path info of the first existing controller.
#
# The controllers are tried in the given order via `get_path`; the first one
# yielding a defined path wins.
#
# Returns the same as `get_path`; undefined values if none of the controllers
# could be resolved.
sub get_any_path {
    my ($self, $limiting, @controllers) = @_;

    for my $controller (@controllers) {
        my ($path, $ver) = $self->get_path($controller, $limiting);
        next if !defined($path);

        return wantarray ? ($path, $ver) : $path;
    }

    return wantarray ? (undef, undef) : undef;
}
200
# Parse a 'Nested keyed' file:
#
# See kernel documentation `admin-guide/cgroup-v2.rst` 4.1.
#
# Every line consists of a key followed by whitespace separated `sub=value`
# pairs. Returns a hash reference mapping each line key to a hash of its
# sub-keys. Fields without a `=` only produce a warning.
my sub parse_nested_keyed_file($) {
    my ($data) = @_;

    my %parsed;
    for my $line (split(/\n/, $data)) {
        my ($name, @pairs) = split(/\s+/, $line);

        my $entry = {};
        $parsed{$name} = $entry;

        for my $pair (@pairs) {
            if ($pair =~ /^([^=]+)=(.*)$/) {
                $entry->{$1} = $2;
            } else {
                warn "bad key=value pair in nested keyed file\n";
            }
        }
    }

    return \%parsed;
}
222
# Parse a 'Flat keyed' file:
#
# See kernel documentation `admin-guide/cgroup-v2.rst` 4.1.
#
# Every line is a key, whitespace, and the value (the rest of the line).
# Returns a hash reference mapping keys to values. Lines not matching that
# shape only produce a warning.
my sub parse_flat_keyed_file($) {
    my ($data) = @_;

    my %parsed;
    for my $line (split(/\n/, $data)) {
        my ($name, $content) = ($line =~ /^(\S+)\s+(.*)$/);
        if (!defined($name)) {
            warn "bad 'key value' pair in flat keyed file\n";
            next;
        }
        $parsed{$name} = $content;
    }

    return \%parsed;
}
238
# Parse out 'diskread' and 'diskwrite' values from I/O stats for this container.
#
# Returns a hash reference with the summed up byte counters `diskread` and
# `diskwrite`, or `undef` if no cgroup path could be determined (container
# not running). Dies on an unexpected cgroup version.
sub get_io_stats {
    my ($self) = @_;

    # With cgroupv1 we have a 'blkio' controller, with cgroupv2 it's just 'io':
    my ($path, $ver) = $self->get_any_path(1, 'io', 'blkio');

    # container not running
    return undef if !defined($path);

    my ($bytes_read, $bytes_written) = (0, 0);

    if ($ver == 2) {
        # cgroupv2 environment, io controller enabled
        my $io_stat = file_get_contents("$path/io.stat");

        my $devices = parse_nested_keyed_file($io_stat);
        for my $counters (values %$devices) {
            # only add counters that are present and non-zero
            $bytes_read += $counters->{rbytes} if $counters->{rbytes};
            $bytes_written += $counters->{wbytes} if $counters->{wbytes};
        }
    } elsif ($ver == 1) {
        # cgroupv1 environment:
        my $io = file_get_contents("$path/blkio.throttle.io_service_bytes_recursive");
        for my $line (split(/\n/, $io)) {
            my ($type, $bytes) = ($line =~ /^\d+:\d+\s+(Read|Write)\s+(\d+)$/)
                or next;

            # the pattern only lets 'Read' and 'Write' through
            if ($type eq 'Read') {
                $bytes_read += $bytes;
            } else {
                $bytes_written += $bytes;
            }
        }
    } else {
        die "bad cgroup version: $ver\n";
    }

    return {
        diskread => $bytes_read,
        diskwrite => $bytes_written,
    };
}
287
# Read utime and stime for this container from the cpuacct (cgroupv1) or cpu
# (cgroupv2) controller.
# Values are in milliseconds!
#
# Returns a hash reference with `utime` and `stime`, or `undef` if no cgroup
# path could be determined or the cgroupv2 `cpu.stat` file cannot be read.
# Dies on an unexpected cgroup version.
sub get_cpu_stat {
    my ($self) = @_;

    my ($path, $ver) = $self->get_any_path(1, 'cpuacct', 'cpu');

    # container not running
    return undef if !defined($path);

    my ($user_ms, $system_ms);

    if ($ver == 2) {
        my $raw = eval { file_get_contents("$path/cpu.stat") };

        # file not readable, e.g. the guest went away in the meantime
        return undef if !defined($raw);

        # cpu.stat reports microseconds
        my $stat = parse_flat_keyed_file($raw);
        $user_ms = int($stat->{user_usec} / 1000);
        $system_ms = int($stat->{system_usec} / 1000);
    } elsif ($ver == 1) {
        # cgroupv1 environment, cpuacct.stat reports clock ticks:
        my $ticks_per_second = POSIX::sysconf(POSIX::_SC_CLK_TCK());
        my $ms_per_tick = 1000 / $ticks_per_second;

        my $stat = parse_flat_keyed_file(file_get_contents("$path/cpuacct.stat"));
        $user_ms = int($stat->{user} * $ms_per_tick);
        $system_ms = int($stat->{system} * $ms_per_tick);
    } else {
        die "bad cgroup version: $ver\n";
    }

    return {
        utime => $user_ms,
        stime => $system_ms,
    };
}
325
# Parse some memory data from `memory.stat` and the usage counters.
#
# Returns a hash reference with:
#   mem  - current memory usage minus the file/page cache share
#   swap - current swap usage
# or `undef` if no cgroup path could be determined. Dies on an unexpected
# cgroup version.
sub get_memory_stat {
    my ($self) = @_;

    my ($path, $ver) = $self->get_path('memory', 1);

    # container most likely isn't running
    return undef if !defined($path);

    my ($mem_used, $swap_used);

    if ($ver == 2) {
        my $current = file_get_contents("$path/memory.current");
        my $swap_current = file_get_contents("$path/memory.swap.current");
        my $stat = parse_flat_keyed_file(file_get_contents("$path/memory.stat"));

        chomp($current, $swap_current);

        $mem_used = $current - $stat->{file};
        $swap_used = $swap_current;
    } elsif ($ver == 1) {
        # cgroupv1 environment:
        my $stat = parse_flat_keyed_file(file_get_contents("$path/memory.stat"));
        my $usage = file_get_contents("$path/memory.usage_in_bytes");
        my $usage_with_swap = file_get_contents("$path/memory.memsw.usage_in_bytes");
        chomp($usage, $usage_with_swap);

        $mem_used = $usage - $stat->{total_cache};
        # memsw covers memory and swap, so the difference is the swap share
        $swap_used = $usage_with_swap - $usage;
    } else {
        die "bad cgroup version: $ver\n";
    }

    return {
        mem => $mem_used,
        swap => $swap_used,
    };
}
363
0bc3dac9
AD
# Read the pressure information for cpu, memory and io of this guest.
#
# Returns a hash reference keyed by `cpu`, `memory` and `io`. Each entry
# defaults to zeroed `avg10`/`avg60`/`avg300` values (`some` for all, `full`
# for memory and io) and is replaced by whatever
# `PVE::ProcFSTools::parse_pressure` returns for the matching `*.pressure`
# file, if that is a true value.
#
# Returns the zeroed stats if no cgroup path could be determined, and `undef`
# with cgroupv1. Dies on an unexpected cgroup version.
sub get_pressure_stat {
    my ($self) = @_;

    my $zeroed = sub { return { avg10 => 0, avg60 => 0, avg300 => 0 } };

    my $res = {
        cpu => {
            some => $zeroed->(),
        },
        memory => {
            some => $zeroed->(),
            full => $zeroed->(),
        },
        io => {
            some => $zeroed->(),
            full => $zeroed->(),
        },
    };

    my ($path, $version) = $self->get_path(undef, 1);

    # container or VM most likely isn't running, return zero stats
    return $res if !defined($path);

    # the v1 controllers do not provide pressure stats
    return undef if $version == 1;

    die "bad cgroup version: $version\n" if $version != 2;

    for my $type (qw(cpu memory io)) {
        my $stats = PVE::ProcFSTools::parse_pressure("$path/$type.pressure");
        $res->{$type} = $stats if $stats;
    }

    return $res;
}
397
86dff11c
AD
# Change the memory limit for this container.
#
# `$mem_bytes` and `$swap_bytes` may each be undefined to leave that limit
# alone (cgroupv2) or to reuse the currently configured value (cgroupv1).
#
# Returns a true value. Dies on error (including a not-running or
# currently-shutting-down guest).
sub change_memory_limit {
    my ($self, $mem_bytes, $swap_bytes) = @_;

    my ($path, $ver) = $self->get_path('memory', 1);
    die "trying to change memory cgroup values: container not running\n"
        if !defined($path);

    if ($ver == 2) {
        if (defined($swap_bytes)) {
            PVE::ProcFSTools::write_proc_entry("$path/memory.swap.max", $swap_bytes);
        }
        if (defined($mem_bytes)) {
            PVE::ProcFSTools::write_proc_entry("$path/memory.max", $mem_bytes);
        }
    } elsif ($ver == 1) {
        # With cgroupv1 we cannot control memory and swap limits separately.
        # This also means that since the two values aren't independent, we need to handle
        # growing and shrinking separately.
        my $mem_file = "$path/memory.limit_in_bytes";
        my $memsw_file = "$path/memory.memsw.limit_in_bytes";

        my $current_mem = file_get_contents($mem_file);
        my $current_memsw = file_get_contents($memsw_file);
        chomp($current_mem, $current_memsw);

        $mem_bytes = $current_mem if !defined($mem_bytes);
        $swap_bytes = $current_memsw - $current_mem if !defined($swap_bytes);
        my $new_memsw = $mem_bytes + $swap_bytes;

        # Shrinking means we first need to shrink the memory-only limit, as
        # memsw cannot be shrunk below it.
        my @writes = (
            [$mem_file, $mem_bytes],
            [$memsw_file, $new_memsw],
        );
        # Growing means raising the combined limit first, then pulling the
        # memory limit up.
        @writes = reverse(@writes) if $new_memsw > $current_memsw;

        PVE::ProcFSTools::write_proc_entry(@$_) for @writes;
    } else {
        die "bad cgroup version: $ver\n";
    }

    # return a truth value
    return 1;
}
445
# Change the cpu quota for a container.
#
# Parameters:
#   $quota  - quota value, `undef` means unlimited
#   $period - period value, may only be `undef` if `$quota` is `undef` too
#
# With cgroupv2 the values are written to `cpu.max`, with cgroupv1 to
# `cpu.cfs_period_us` and `cpu.cfs_quota_us`.
#
# Returns a true value. Dies on error (including a not-running or
# currently-shutting-down guest).
sub change_cpu_quota {
    my ($self, $quota, $period) = @_;

    die "quota without period not allowed\n" if !defined($period) && defined($quota);

    my ($path, $ver) = $self->get_path('cpu', 1);
    if (!defined($path)) {
        die "trying to change cpu quota cgroup values: container not running\n";
    } elsif ($ver == 2) {
        # cgroupv2 environment, an undefined (unlimited) quota is defined as "max"
        # in this interface:
        $quota //= 'max'; # unlimited
        if (defined($period)) {
            PVE::ProcFSTools::write_proc_entry("$path/cpu.max", "$quota $period");
        } else {
            # No period given, which (see the check above) implies an
            # unlimited quota. We're allowed to only write the quota:
            PVE::ProcFSTools::write_proc_entry("$path/cpu.max", 'max');
        }
    } elsif ($ver == 1) {
        $quota //= -1; # default (unlimited)
        $period //= 100_000; # default (100 ms)
        PVE::ProcFSTools::write_proc_entry("$path/cpu.cfs_period_us", $period);
        PVE::ProcFSTools::write_proc_entry("$path/cpu.cfs_quota_us", $quota);
    } else {
        die "bad cgroup version: $ver\n";
    }

    # return a truth value
    return 1;
}
479
# Change the cpu "shares" for a container.
#
# In cgroupv1 we used a value in `[0..500000]` with a default of 1024.
#
# In cgroupv2 we do not have "shares", we have "weights" in the range
# of `[1..10000]` with a default of 100.
#
# Since the default values don't match when scaling linearly, we use the
# values we get as-is and simply error for values >10000 in cgroupv2.
#
# It is left to the user to figure this out for now.
#
# Parameters:
#   $shares           - value to set, `undef` selects the default
#   $cgroupv1_default - default used with cgroupv1 if `$shares` is undefined,
#                       falls back to 1024 if undefined as well
#
# Returns a true value. Dies on error (including a not-running or
# currently-shutting-down guest).
sub change_cpu_shares {
    my ($self, $shares, $cgroupv1_default) = @_;

    my ($path, $ver) = $self->get_path('cpu', 1);
    if (!defined($path)) {
        die "trying to change cpu shares/weight cgroup values: container not running\n";
    } elsif ($ver == 2) {
        # the cgroupv2 documentation defines the default to 100
        $shares //= 100;
        die "cpu weight (shares) must be in range [1, 10000]\n" if $shares < 1 || $shares > 10000;
        PVE::ProcFSTools::write_proc_entry("$path/cpu.weight", $shares);
    } elsif ($ver == 1) {
        # Honor the caller's default before falling back to 1024; a
        # hard-coded default applied first would make the parameter dead.
        $shares //= $cgroupv1_default // 1024;
        PVE::ProcFSTools::write_proc_entry("$path/cpu.shares", $shares);
    } else {
        die "bad cgroup version: $ver\n";
    }

    # return a truth value
    return 1;
}
514
# Freeze or thaw a guest via the cgroupv1 `freezer` controller.
#
# Parameters:
#   $self            - cgroup object, used to look up the guest's subdirectory
#   $controller_path - path of the freezer controller hierarchy
#   $freeze          - true to freeze, false to thaw
#
# Writes the requested state to `freezer.state` and waits until that state is
# read back. Dies if the subdirectory cannot be determined.
my sub v1_freeze_thaw {
    my ($self, $controller_path, $freeze) = @_;
    my $path = $self->get_subdir('freezer', 1)
        or die "trying to freeze container: container not running\n";
    $path = "$controller_path/$path/freezer.state";

    my $data = $freeze ? 'FROZEN' : 'THAWED';
    PVE::ProcFSTools::write_proc_entry($path, $data);

    # Here we just poll the freezer.state once per second. The state is
    # checked before sleeping, so an already completed transition does not
    # cost any delay, while a slow one no longer busy-loops on the file.
    while (1) {
        my $state = file_get_contents($path);
        chomp $state;
        last if $state eq $data;
        sleep(1);
    }
}
531
# Freeze or thaw a guest via the cgroupv2 `cgroup.freeze` file.
#
# Parameters:
#   $self            - cgroup object, used to look up the guest's subdirectory
#   $controller_path - base path of the cgroupv2 hierarchy
#   $freeze          - true to freeze, false to thaw
#
# Writes the requested state to `cgroup.freeze` and waits until the `frozen`
# key in `cgroup.events` reports it. Dies if the subdirectory cannot be
# determined or `cgroup.events` cannot be opened or rewound.
my sub v2_freeze_thaw {
    my ($self, $controller_path, $freeze) = @_;
    my $path = $self->get_subdir(undef, 1)
        or die "trying to freeze container: container not running\n";
    $path = "$controller_path/$path";

    my $desired_state = $freeze ? 1 : 0;

    # cgroupv2 supports poll events on cgroup.events which contains the frozen
    # state.
    my $fh = IO::File->new("$path/cgroup.events", 'r')
        or die "failed to open $path/cgroup.events file: $!\n";
    my $select = IO::Select->new();
    $select->add($fh);

    PVE::ProcFSTools::write_proc_entry("$path/cgroup.freeze", $desired_state);
    while (1) {
        my $data = do {
            local $/ = undef;
            <$fh>
        };
        $data = parse_flat_keyed_file($data // '');

        # Only a `frozen` entry that is actually present may end the wait; a
        # missing one would otherwise compare equal to 0 and look like a
        # finished thaw.
        last if defined($data->{frozen}) && $data->{frozen} == $desired_state;

        # Wait for a change notification. The result is deliberately ignored:
        # if the wait returns without an event we simply re-read the file.
        $select->has_exception();

        # Always rewind before the next read, otherwise we would read at the
        # end of the file and get no data at all.
        seek($fh, 0, 0)
            or die "failed to rewind cgroup.events file: $!\n";
    }
}
561
# Freeze or unfreeze a container.
#
# This will freeze the container at its outer (limiting) cgroup path. We use
# this instead of `lxc-freeze` as `lxc-freeze` from lxc4 will not be able to
# fetch the cgroup path from containers still running on lxc3.
#
# `$freeze` is true to freeze and false to thaw.
sub freeze_thaw {
    my ($self, $freeze) = @_;

    # cgroupv2 always has a freezer, there can be both cgv1 and cgv2
    # freezers, but we'll prefer v1 when it's available as that's what lxc
    # does as well...
    my $v1_freezer_path = find_cgroup_controller('freezer');
    return v1_freeze_thaw($self, $v1_freezer_path, $freeze)
        if defined($v1_freezer_path);

    return v2_freeze_thaw($self, cgroupv2_base_path(), $freeze);
}
580
5811;