]> git.proxmox.com Git - pve-container.git/blob - src/PVE/LXC/CGroup.pm
ee5ea16565d4c65a1fe7a6498592a5f91c2998b5
[pve-container.git] / src / PVE / LXC / CGroup.pm
1 # cgroup handler
2 #
3 # This package should deal with figuring out the right cgroup path for a
4 # container (via the command socket), reading and writing cgroup values, and
5 # handling cgroup v1 & v2 differences.
6 #
7 # Note that the long term plan is to have resource management functions instead
8 # of dealing with cgroup files on the outside.
9
10 package PVE::LXC::CGroup;
11
12 use strict;
13 use warnings;
14
15 use POSIX qw();
16
17 use PVE::ProcFSTools;
18 use PVE::Tools qw(
19 file_get_contents
20 file_read_firstline
21 );
22
23 use PVE::LXC::Command;
24
# Every cgroup read or write needs the container's cgroup path, and we do not
# want a command socket round trip for each of them. The path is therefore
# cached on an object, which is why this package has to be instantiated.
#
# LXC keeps separate paths by controller (although they're normally all the
# same, in our case anyway), so we cache them by controller as well.
#
# Takes the container's `$vmid` and returns the blessed handler object.
sub new {
    my ($class, $vmid) = @_;

    return bless { vmid => $vmid }, $class;
}
38
# Get the v1 controller list by parsing `/proc/self/cgroup`.
#
# Each line there has the form `ID:CONTROLLERS:PATH`. A non-empty controller
# field is a comma separated list of cgroupv1 controllers, an empty one marks
# the unified (cgroupv2) hierarchy.
#
# In scalar context this returns a set (hash mapping names to `1`) of cgroupv1
# controllers. In list context it additionally returns a boolean telling
# whether a unified (cgroupv2) hierarchy exists.
#
# Deprecated: Use `get_cgroup_controllers()` instead.
sub get_v1_controllers {
    my %legacy;
    my $has_unified = 0;

    my $content = PVE::Tools::file_get_contents('/proc/self/cgroup');

    # In list context a /g match yields the captured controller field of
    # every line at once.
    for my $names ($content =~ /^\d+:([^:\n]*):.*$/gm) {
        if ($names eq '') {
            $has_unified = 1;
            next;
        }
        $legacy{$_} = 1 for split(/,/, $names);
    }

    return \%legacy if !wantarray;
    return (\%legacy, $has_unified);
}
59
# Get the set of v2 controllers from the `cgroup.controllers` file.
#
# The pure cgroupv2 mount point is tried first, then the `unified` directory.
# Read errors are swallowed.
#
# Returns a set (hash mapping names to `1`), or `undef` if no content could
# be obtained from either file.
my sub get_v2_controllers {
    my $listing;
    for my $file (
        '/sys/fs/cgroup/cgroup.controllers',
        '/sys/fs/cgroup/unified/cgroup.controllers',
    ) {
        $listing = eval { file_get_contents($file) };
        last if $listing;
    }
    return undef if !defined($listing);

    # It's a simple space separated list:
    my %enabled;
    $enabled{$_} = 1 for split(/\s+/, $listing);
    return \%enabled;
}
69
# Cache for `get_cgroup_controllers()`: an array `[ $v1_set, $v2_set ]`.
my $CGROUP_CONTROLLERS = undef;
# Get a list of controllers enabled in each cgroup subsystem.
#
# This is a more complete version of `PVE::LXC::get_cgroup_subsystems`.
#
# Returns 2 sets (hashes mapping controller names to `1`), one for each cgroup
# version. The result is computed once and cached afterwards.
sub get_cgroup_controllers() {
    $CGROUP_CONTROLLERS //= do {
        # Scalar context on purpose: only the v1 set is wanted here, not the
        # additional "unified hierarchy exists" flag.
        my $legacy = get_v1_controllers();
        my $unified = get_v2_controllers();
        [$legacy, $unified];
    };

    return @$CGROUP_CONTROLLERS;
}
87
# Cache for `cgroup_mode()`.
my $CGROUP_MODE = undef;
# Figure out which cgroup mode we're operating under:
#
# Returns 1 if cgroupv1 controllers exist (hybrid or legacy mode), and 2 in a
# cgroupv2-only environment. Dies if neither can be determined.
#
# NOTE: To fully support a hybrid layout it is better to use functions like
# `cpuset_controller_path`.
#
# This is a function, not a method!
sub cgroup_mode() {
    return $CGROUP_MODE if defined($CGROUP_MODE);

    my ($v1, $v2) = get_cgroup_controllers();
    if (scalar(keys %$v1) > 0) {
        # hybrid or legacy mode
        $CGROUP_MODE = 1;
    } elsif ($v2) {
        $CGROUP_MODE = 2;
    } else {
        die "unknown cgroup mode\n";
    }

    return $CGROUP_MODE;
}
112
# Cache for `cgroupv2_base_path()`.
my $CGROUPV2_PATH = undef;
# Get the mount point of the cgroupv2 hierarchy.
#
# In a cgroupv2-only environment this is `/sys/fs/cgroup`, otherwise the
# `unified` subdirectory is used. The result is cached.
sub cgroupv2_base_path() {
    $CGROUPV2_PATH //= (cgroup_mode() == 2)
        ? '/sys/fs/cgroup'
        : '/sys/fs/cgroup/unified';

    return $CGROUPV2_PATH;
}
124
# Find a cgroup controller and return its path and version.
#
# LXC initializes the unified hierarchy first, so if a controller is
# available via both we favor cgroupv2 here as well. An undefined
# `$controller` always selects the cgroupv2 base path.
#
# Returns the mount path in scalar context, or the path and cgroup version
# (1 or 2) in list context. Returns nothing if the controller is not
# available.
sub find_cgroup_controller($) {
    my ($controller) = @_;

    my ($v1, $v2) = get_cgroup_controllers();

    my ($mount, $version);
    if (!defined($controller) || $v2->{$controller}) {
        ($mount, $version) = (cgroupv2_base_path(), 2);
    } elsif ($v1->{$controller}) {
        ($mount, $version) = ("/sys/fs/cgroup/$controller", 1);
    } else {
        return;
    }

    return wantarray ? ($mount, $version) : $mount;
}
148
# Cached result of the cpuset controller lookup (path and cgroup version).
my $CG_PATH_CPUSET = undef;
my $CG_VER_CPUSET = undef;
# Find the cpuset cgroup controller.
#
# Returns the path in scalar context, or the path and cgroup version in list
# context. Dies if no cpuset controller can be found.
#
# This is a function, not a method!
sub cpuset_controller_path() {
    if (!defined($CG_PATH_CPUSET)) {
        my @found = find_cgroup_controller('cpuset');
        die "failed to find cpuset controller\n" if !@found;
        ($CG_PATH_CPUSET, $CG_VER_CPUSET) = @found;
    }

    return $CG_PATH_CPUSET if !wantarray;
    return ($CG_PATH_CPUSET, $CG_VER_CPUSET);
}
162
# Get a subdirectory (without the cgroup mount point) for a controller.
#
# If `$controller` is `undef`, get the unified (cgroupv2) path.
#
# Note that in cgroup v2, lxc uses the activated controller names
# (`cgroup.controllers` file) as list of controllers for the unified hierarchy,
# so this returns a result when a `controller` is provided even when using
# a pure cgroupv2 setup.
#
# Results are cached in `$self->{controllers}`, separately for the limiting
# and the namespace variant of each controller's path.
#
# Returns `undef` if the command socket query yields no path, and dies if the
# returned path contains `..`.
my sub get_subdir {
    my ($self, $controller, $limiting) = @_;

    my $kind = $limiting ? 'limit' : 'ns';
    my $cache = ($self->{controllers}->{ $controller || 'unified' } //= {});

    return $cache->{$kind} if defined($cache->{$kind});

    my $path = PVE::LXC::Command::get_cgroup_path(
        $self->{vmid},
        $controller,
        $limiting,
    );
    return undef if !$path;

    # Refuse anything that could climb out of the cgroup mount point, then
    # untaint by passing the value through a capture group.
    die "lxc returned suspicious path: '$path'\n" if $path =~ /\.\./;
    ($path) = ($path =~ /^(.*)$/s);

    $cache->{$kind} = $path;
    return $path;
}
198
# Get path and version for a controller.
#
# `$controller` may be `undef`, in which case the unified (cgroupv2) hierarchy
# is used. `$limiting` selects the limiting variant of the container's cgroup
# instead of the namespace one.
#
# Returns either just the path, or the path and cgroup version as a tuple.
# Returns `undef` if the controller does not exist or no subdirectory could be
# determined for the container.
sub get_path {
    my ($self, $controller, $limiting) = @_;

    # Find the controller before querying the lxc monitor via a socket:
    my @base = find_cgroup_controller($controller);
    return undef if !@base;
    my ($cgpath, $ver) = @base;

    my $path = get_subdir($self, $controller, $limiting);
    return undef if !$path;

    $path = "$cgpath/$path";
    return $path if !wantarray;
    return ($path, $ver);
}
217
# Convenience method to get the path info of the first existing controller.
#
# The `@controllers` are tried in order via `get_path`, stopping at the first
# one that yields a defined path.
#
# Returns the same as `get_path`; path and version are `undef` if none of the
# controllers matched.
sub get_any_path {
    my ($self, $limiting, @controllers) = @_;

    my @hit = (undef, undef);
    foreach my $name (@controllers) {
        my ($dir, $version) = $self->get_path($name, $limiting);
        next if !defined($dir);

        @hit = ($dir, $version);
        last;
    }

    return wantarray ? @hit : $hit[0];
}
231
# Parse a 'Nested keyed' file:
#
# Every line consists of a key followed by whitespace separated `name=value`
# pairs. The result maps each line's key to a hash of its pairs. Entries that
# are not of the form `name=value` are skipped with a warning.
#
# See kernel documentation `admin-guide/cgroup-v2.rst` 4.1.
my sub parse_nested_keyed_file($) {
    my ($data) = @_;

    my %parsed;
    for my $row (split(/\n/, $data)) {
        my ($outer, @pairs) = split(/\s+/, $row);

        my %fields;
        for my $pair (@pairs) {
            if ($pair =~ /^([^=]+)=(.*)$/) {
                $fields{$1} = $2;
            } else {
                warn "bad key=value pair in nested keyed file\n";
            }
        }

        $parsed{$outer} = \%fields;
    }

    return \%parsed;
}
253
# Parse a 'Flat keyed' file:
#
# Every line consists of a key, whitespace, and the value (the rest of the
# line). The result maps keys to values. Lines not matching that shape are
# skipped with a warning.
#
# See kernel documentation `admin-guide/cgroup-v2.rst` 4.1.
my sub parse_flat_keyed_file($) {
    my ($data) = @_;

    my %pairs;
    for my $row (split(/\n/, $data)) {
        if ($row !~ /^(\S+)\s+(.*)$/) {
            warn "bad 'key value' pair in flat keyed file\n";
            next;
        }
        $pairs{$1} = $2;
    }

    return \%pairs;
}
269
# Parse out 'diskread' and 'diskwrite' values from I/O stats for this container.
#
# The byte counters of all listed devices are summed up:
#  - cgroupv2: `rbytes` / `wbytes` from the nested keyed `io.stat` file.
#  - cgroupv1: the `Read` / `Write` lines of
#    `blkio.throttle.io_service_bytes_recursive`.
#
# Returns a hash with the keys `diskread` and `diskwrite`, or `undef` if no
# cgroup path could be determined (container not running).
#
# Dies on an unexpected cgroup version.
sub get_io_stats {
    my ($self) = @_;

    my $res = {
        diskread => 0,
        diskwrite => 0,
    };

    # With cgroupv1 we have a 'blkio' controller, with cgroupv2 it's just 'io':
    my ($path, $ver) = $self->get_any_path(1, 'io', 'blkio');

    # container not running
    return undef if !defined($path);

    if ($ver == 2) {
        # cgroupv2 environment, io controller enabled
        my $data = parse_nested_keyed_file(file_get_contents("$path/io.stat"));

        foreach my $dev_stats (values %$data) {
            if (my $bytes = $dev_stats->{rbytes}) {
                $res->{diskread} += $bytes;
            }
            # Written bytes belong to 'diskwrite'; they must not be added to
            # the read counter.
            if (my $bytes = $dev_stats->{wbytes}) {
                $res->{diskwrite} += $bytes;
            }
        }
    } elsif ($ver == 1) {
        # cgroupv1 environment:
        my %field_for = (Read => 'diskread', Write => 'diskwrite');

        my $io = file_get_contents("$path/blkio.throttle.io_service_bytes_recursive");
        foreach my $line (split(/\n/, $io)) {
            # Only the per-device `MAJ:MIN Read|Write BYTES` lines are of
            # interest, everything else is skipped.
            if (my ($type, $bytes) = ($line =~ /^\d+:\d+\s+(Read|Write)\s+(\d+)$/)) {
                $res->{ $field_for{$type} } += $bytes;
            }
        }
    } else {
        die "bad cgroup version: $ver\n";
    }

    return $res;
}
318
# Read utime and stime for this container from the cpuacct cgroup.
# Values are in milliseconds!
#
#  - cgroupv2: `user_usec` / `system_usec` from `cpu.stat`, converted from
#    microseconds.
#  - cgroupv1: `user` / `system` from `cpuacct.stat`, converted from clock
#    ticks.
#
# Returns a hash with the keys `utime` and `stime`, or `undef` if no cgroup
# path could be determined or (cgroupv2 only) `cpu.stat` could not be read.
#
# Dies on an unexpected cgroup version.
sub get_cpu_stat {
    my ($self) = @_;

    my ($path, $ver) = $self->get_any_path(1, 'cpuacct', 'cpu');

    # container not running
    return undef if !defined($path);

    my ($user_ms, $system_ms);
    if ($ver == 2) {
        my $raw = eval { file_get_contents("$path/cpu.stat") };

        # the stat file could not be read:
        return undef if !defined($raw);

        my $stat = parse_flat_keyed_file($raw);
        $user_ms = int($stat->{user_usec} / 1000);
        $system_ms = int($stat->{system_usec} / 1000);
    } elsif ($ver == 1) {
        # cgroupv1 environment, values are counted in clock ticks:
        my $ticks_per_sec = POSIX::sysconf(POSIX::_SC_CLK_TCK());
        my $ms_per_tick = 1000 / $ticks_per_sec;

        my $raw = file_get_contents("$path/cpuacct.stat");
        my $stat = parse_flat_keyed_file($raw);
        $user_ms = int($stat->{user} * $ms_per_tick);
        $system_ms = int($stat->{system} * $ms_per_tick);
    } else {
        die "bad cgroup version: $ver\n";
    }

    return {
        utime => $user_ms,
        stime => $system_ms,
    };
}
356
# Read the current memory and swap usage of this container.
#
#  - cgroupv2: `memory.current` and `memory.swap.current` are reported as-is.
#  - cgroupv1: `mem` is `memory.usage_in_bytes` minus `total_cache` from
#    `memory.stat`, `swap` is `memory.memsw.usage_in_bytes` minus
#    `memory.usage_in_bytes`.
#
# Returns a hash with the keys `mem` and `swap`, or `undef` if no cgroup path
# could be determined.
#
# Dies on an unexpected cgroup version.
sub get_memory_stat {
    my ($self) = @_;

    my ($path, $ver) = $self->get_path('memory', 1);

    # container most likely isn't running
    return undef if !defined($path);

    my %usage = (mem => 0, swap => 0);
    if ($ver == 2) {
        chomp(my $current = file_get_contents("$path/memory.current"));
        chomp(my $swapped = file_get_contents("$path/memory.swap.current"));

        # FIXME: For the cgv1 equivalent of `total_cache` we may need to sum up
        # the values in `memory.stat`...

        @usage{qw(mem swap)} = ($current, $swapped);
    } elsif ($ver == 1) {
        # cgroupv1 environment:
        my $stat = parse_flat_keyed_file(file_get_contents("$path/memory.stat"));
        chomp(my $used = file_get_contents("$path/memory.usage_in_bytes"));
        chomp(my $used_with_swap = file_get_contents("$path/memory.memsw.usage_in_bytes"));

        $usage{mem} = $used - $stat->{total_cache};
        $usage{swap} = $used_with_swap - $used;
    } else {
        die "bad cgroup version: $ver\n";
    }

    return \%usage;
}
396
# Change the memory limit for this container.
#
# `$mem_bytes` and `$swap_bytes` may each be `undef` to leave the respective
# limit alone.
#
# Dies on error (including a not-running or currently-shutting-down guest).
# Returns a true value otherwise.
sub change_memory_limit {
    my ($self, $mem_bytes, $swap_bytes) = @_;

    my ($path, $ver) = $self->get_path('memory', 1);
    die "trying to change memory cgroup values: container not running\n"
        if !defined($path);

    if ($ver == 2) {
        # Memory and swap are independent limits here.
        if (defined($swap_bytes)) {
            PVE::ProcFSTools::write_proc_entry("$path/memory.swap.max", $swap_bytes);
        }
        if (defined($mem_bytes)) {
            PVE::ProcFSTools::write_proc_entry("$path/memory.max", $mem_bytes);
        }
    } elsif ($ver == 1) {
        # With cgroupv1 we cannot control memory and swap limits separately:
        # there is a memory limit and a combined memory+swap limit. Since the
        # two values aren't independent, growing and shrinking need a
        # different order of writes.
        my $path_mem = "$path/memory.limit_in_bytes";
        my $path_memsw = "$path/memory.memsw.limit_in_bytes";

        chomp(my $old_mem_bytes = file_get_contents($path_mem));
        chomp(my $old_memsw_bytes = file_get_contents($path_memsw));

        $mem_bytes //= $old_mem_bytes;
        my $memsw_bytes = $old_memsw_bytes;
        $memsw_bytes = $mem_bytes + $swap_bytes if defined($swap_bytes);

        # Shrinking order: the memory-only limit goes first, as the combined
        # limit cannot be shrunk below it.
        my @writes = (
            [$path_mem, $mem_bytes],
            [$path_memsw, $memsw_bytes],
        );
        # Growing order: raise the combined limit first, then pull the memory
        # limit up.
        @writes = reverse(@writes) if $memsw_bytes > $old_memsw_bytes;

        for my $write (@writes) {
            PVE::ProcFSTools::write_proc_entry($write->[0], $write->[1]);
        }
    } else {
        die "bad cgroup version: $ver\n";
    }

    # return a truth value
    return 1;
}
443
# Change the cpu quota for a container.
#
# `$quota` and `$period` are passed to the kernel as given. An undefined
# `$quota` means "unlimited", an undefined `$period` selects the default
# period. Passing a `$quota` without a `$period` is an error.
#
# Dies on error (including a not-running or currently-shutting-down guest).
# Returns a true value otherwise.
sub change_cpu_quota {
    my ($self, $quota, $period) = @_;

    die "quota without period not allowed\n" if !defined($period) && defined($quota);

    my ($path, $ver) = $self->get_path('cpu', 1);
    if (!defined($path)) {
        die "trying to change cpu quota cgroup values: container not running\n";
    } elsif ($ver == 2) {
        # cgroupv2 environment, an undefined (unlimited) quota is defined as "max"
        # in this interface:
        $quota //= 'max'; # unlimited
        if (defined($period)) {
            PVE::ProcFSTools::write_proc_entry("$path/cpu.max", "$quota $period");
        } else {
            # Without a period the quota is necessarily 'max' at this point
            # (see the check at the top), and we're allowed to only write the
            # quota:
            PVE::ProcFSTools::write_proc_entry("$path/cpu.max", $quota);
        }
    } elsif ($ver == 1) {
        $quota //= -1; # unlimited
        # A negative period is not a valid value for `cpu.cfs_period_us`, so
        # fall back to the kernel's default period of 100ms.
        $period //= 100_000;
        PVE::ProcFSTools::write_proc_entry("$path/cpu.cfs_period_us", $period);
        PVE::ProcFSTools::write_proc_entry("$path/cpu.cfs_quota_us", $quota);
    } else {
        die "bad cgroup version: $ver\n";
    }

    # return a truth value
    return 1;
}
477
# Change the cpu "shares" for a container.
#
# In cgroupv1 we used a value in `[0..500000]` with a default of 1024.
#
# In cgroupv2 we do not have "shares", we have "weights" in the range
# of `[1..10000]` with a default of 100.
#
# Since the default values don't match when scaling linearly, we use the
# values we get as-is and simply error for values >10000 in cgroupv2.
#
# It is left to the user to figure this out for now.
#
# An undefined `$shares` selects the default of the cgroup version in use: 100
# for cgroupv2, and `$cgroupv1_default` (or 1024 if that is undefined as well)
# for cgroupv1.
#
# Dies on error (including a not-running or currently-shutting-down guest).
# Returns a true value otherwise.
sub change_cpu_shares {
    my ($self, $shares, $cgroupv1_default) = @_;

    my ($path, $ver) = $self->get_path('cpu', 1);
    if (!defined($path)) {
        die "trying to change cpu shares/weight cgroup values: container not running\n";
    } elsif ($ver == 2) {
        # the cgroupv2 documentation defines the default to 100
        $shares //= 100;
        die "cpu weight (shares) must be in range [1, 10000]\n" if $shares < 1 || $shares > 10000;
        PVE::ProcFSTools::write_proc_entry("$path/cpu.weight", $shares);
    } elsif ($ver == 1) {
        # The cgroupv2 default of 100 must not be applied here, otherwise the
        # caller provided cgroupv1 default could never take effect.
        $shares //= $cgroupv1_default // 1024;
        PVE::ProcFSTools::write_proc_entry("$path/cpu.shares", $shares);
    } else {
        die "bad cgroup version: $ver\n";
    }

    # return a truth value
    return 1;
}
512
513 1;