]>
git.proxmox.com Git - pve-container.git/blob - src/PVE/LXC/CGroup.pm
3 # This package should deal with figuring out the right cgroup path for a
4 # container (via the command socket), reading and writing cgroup values, and
5 # handling cgroup v1 & v2 differences.
# Note that the long term plan is to provide resource management functions
# instead of having callers deal with cgroup files from the outside.
10 package PVE
::LXC
::CGroup
;
23 use PVE
::LXC
::Command
;
# Looking up a container's cgroup path requires a round trip over the lxc
# command socket. To avoid paying for that on every cgroup read/write, the
# path is cached per object, which is why this package has to be instantiated.
#
# LXC keeps separate paths by controller (although they're normally all the
# same, in our case anyway), so the cache is keyed by controller as well.
#
# Parameters:
#   $class - the class to bless into
#   $vmid  - the container ID, stored as `vmid` in the object
sub new {
    my ($class, $vmid) = @_;

    return bless { vmid => $vmid }, $class;
}
# Get the v1 controller list.
#
# Scans `/proc/self/cgroup`: every entry with a non-empty controller field
# contributes its comma separated controller names, an entry with an empty
# controller field marks the presence of a unified (cgroupv2) hierarchy.
#
# Returns a set (hash mapping names to `1`) of cgroupv1 controllers and, in
# list context, additionally a boolean telling whether a unified hierarchy
# exists.
#
# Deprecated: Use `get_cgroup_controllers()` instead.
sub get_v1_controllers {
    my %v1_set;
    my $has_unified = 0;

    my $content = PVE::Tools::file_get_contents('/proc/self/cgroup');
    while ($content =~ /^\d+:([^:\n]*):.*$/gm) {
        my $names = $1;
        if (!length($names)) {
            $has_unified = 1;
            next;
        }
        $v1_set{$_} = 1 for split(/,/, $names);
    }

    return wantarray ? (\%v1_set, $has_unified) : \%v1_set;
}
# Get the set of cgroupv2 controllers from the `cgroup.controllers` file.
#
# The file is looked up at the pure cgroupv2 mount point first and, if that
# yields nothing, in the `unified` directory. Read errors are swallowed.
#
# Returns a set (hash mapping controller names to `1`), or `undef` if no
# controller list could be read.
my sub get_v2_controllers {
    my $list;
    for my $file (
        '/sys/fs/cgroup/cgroup.controllers',
        '/sys/fs/cgroup/unified/cgroup.controllers',
    ) {
        $list = eval { file_get_contents($file) };
        last if $list;
    }
    return undef if !defined $list;

    # It's a simple space separated list:
    my %set;
    $set{$_} = 1 for split(/\s+/, $list);
    return \%set;
}
my $CGROUP_CONTROLLERS = undef;
# Get a list of controllers enabled in each cgroup subsystem.
#
# This is a more complete version of `PVE::LXC::get_cgroup_subsystems`.
#
# The result is computed on first use and cached for subsequent calls.
#
# Returns 2 sets (hashes mapping controller names to `1`), one for each cgroup
# version: first the cgroupv1 set, then the cgroupv2 set.
sub get_cgroup_controllers() {
    $CGROUP_CONTROLLERS //= do {
        # Only the controller set is of interest, not the "has unified" flag.
        my ($v1_set) = get_v1_controllers();
        my $v2_set = get_v2_controllers();
        [$v1_set, $v2_set];
    };

    return @$CGROUP_CONTROLLERS;
}
my $CGROUP_MODE = undef;
# Figure out which cgroup mode we're operating under:
#
# Returns 1 if cgroupv1 controllers exist (hybrid or legacy mode), and 2 in a
# cgroupv2-only environment. The result is cached after the first call.
#
# Dies with "unknown cgroup mode" if neither kind of controller was found.
#
# NOTE: To fully support a hybrid layout it is better to use functions like
# `cpuset_controller_path`.
#
# This is a function, not a method!
sub cgroup_mode() {
    return $CGROUP_MODE if defined($CGROUP_MODE);

    my ($v1, $v2) = get_cgroup_controllers();
    if (keys %$v1) {
        # hybrid or legacy mode
        $CGROUP_MODE = 1;
    } elsif ($v2) {
        $CGROUP_MODE = 2;
    }

    die "unknown cgroup mode\n" if !defined($CGROUP_MODE);
    return $CGROUP_MODE;
}
# Find a cgroup controller and return its path and version.
#
# LXC initializes the unified hierarchy first, so if a controller is
# available via both we favor cgroupv2 here as well.
#
# An undefined `$controller` selects the unified (cgroupv2) hierarchy.
#
# Returns the mount point in scalar context, or the mount point and the cgroup
# version (1 or 2) in list context.
# Returns nothing if the controller is not available.
sub find_cgroup_controller($) {
    my ($controller) = @_;

    my ($v1, $v2) = get_cgroup_controllers();

    if (!defined($controller) || $v2->{$controller}) {
        # In a pure cgroupv2 setup the hierarchy is mounted directly, otherwise
        # it lives in the `unified` subdirectory.
        my $mount = cgroup_mode() == 2 ? '/sys/fs/cgroup' : '/sys/fs/cgroup/unified';
        return wantarray ? ($mount, 2) : $mount;
    }

    # `$controller` is known to be defined from here on.
    return if !$v1->{$controller};

    my $mount = "/sys/fs/cgroup/$controller";
    return wantarray ? ($mount, 1) : $mount;
}
my $CG_PATH_CPUSET = undef;
my $CG_VER_CPUSET = undef;
# Find the cpuset cgroup controller.
#
# The lookup happens once, the result is cached afterwards.
#
# Returns the controller path in scalar context, or the path and the cgroup
# version in list context. Dies if no cpuset controller can be found.
#
# This is a function, not a method!
sub cpuset_controller_path() {
    unless (defined($CG_PATH_CPUSET)) {
        my @found = find_cgroup_controller('cpuset');
        die "failed to find cpuset controller\n" if !@found;
        ($CG_PATH_CPUSET, $CG_VER_CPUSET) = @found;
    }

    return $CG_PATH_CPUSET if !wantarray;
    return ($CG_PATH_CPUSET, $CG_VER_CPUSET);
}
# Get a subdirectory (without the cgroup mount point) for a controller.
#
# If `$controller` is `undef`, get the unified (cgroupv2) path.
#
# Note that in cgroup v2, lxc uses the activated controller names
# (`cgroup.controllers` file) as list of controllers for the unified hierarchy,
# so this returns a result when a `controller` is provided even when using
# a pure cgroupv2 setup.
#
# Results are cached in `$self->{controllers}`, separately for the 'limit'
# (`$limiting` true) and 'ns' (`$limiting` false) kinds of path.
#
# Returns `undef` if lxc reports no path; dies if the reported path contains
# `..`.
my sub get_subdir {
    my ($self, $controller, $limiting) = @_;

    my $kind = $limiting ? 'limit' : 'ns';
    my $cache = ($self->{controllers}->{ $controller || 'unified' } //= {});

    my $cached = $cache->{$kind};
    return $cached if defined $cached;

    my $subdir = PVE::LXC::Command::get_cgroup_path(
        $self->{vmid},
        $controller,
        $limiting,
    ) or return undef;

    # untaint:
    die "lxc returned suspicious path: '$subdir'\n" if $subdir =~ /\.\./;
    ($subdir) = ($subdir =~ /^(.*)$/s);

    $cache->{$kind} = $subdir;
    return $subdir;
}
# Get path and version for a controller.
#
# `$controller` may be `undef`, in which case the unified (cgroupv2) hierarchy
# is used.
#
# Returns either just the path, or the path and cgroup version as a tuple.
# Returns `undef` if the controller is not available or the container has no
# cgroup path for it.
sub get_path {
    my ($self, $controller) = @_;

    # Find the controller before querying the lxc monitor via a socket:
    my @found = find_cgroup_controller($controller);
    return undef if !@found;
    my ($mount, $version) = @found;

    my $subdir = get_subdir($self, $controller);
    return undef if !$subdir;

    my $full_path = "$mount/$subdir";
    return wantarray ? ($full_path, $version) : $full_path;
}
# Convenience method to get the path info of the first existing controller.
#
# The controllers are tried in the given order; the first one for which
# `get_path` yields a defined path wins.
#
# Returns the same as `get_path`.
sub get_any_path {
    my ($self, @controllers) = @_;

    my @found;
    for my $name (@controllers) {
        @found = $self->get_path($name);
        last if defined $found[0];
    }

    my ($path, $ver) = @found;
    return wantarray ? ($path, $ver) : $path;
}
# Parse a 'Nested keyed' file:
#
# Each line consists of a key followed by whitespace separated `name=value`
# pairs. Entries not of the form `name=value` produce a warning and are
# skipped.
#
# Returns a hash reference mapping each line's key to a hash of its pairs.
#
# See kernel documentation `admin-guide/cgroup-v2.rst` 4.1.
my sub parse_nested_keyed_file($) {
    my ($content) = @_;

    my %parsed;
    for my $row (split(/\n/, $content)) {
        my ($name, @pairs) = split(/\s+/, $row);

        my $fields = ($parsed{$name} = {});

        for my $pair (@pairs) {
            if ($pair =~ /^([^=]+)=(.*)$/) {
                $fields->{$1} = $2;
                next;
            }
            warn "bad key=value pair in nested keyed file\n";
        }
    }

    return \%parsed;
}
# Parse a 'Flat keyed' file:
#
# Each line consists of a key, whitespace, and a value. Lines of any other
# form produce a warning and are skipped.
#
# Returns a hash reference mapping keys to values.
#
# See kernel documentation `admin-guide/cgroup-v2.rst` 4.1.
my sub parse_flat_keyed_file($) {
    my ($content) = @_;

    my %parsed;
    for my $row (split(/\n/, $content)) {
        if ($row =~ /^(\S+)\s+(.*)$/) {
            $parsed{$1} = $2;
            next;
        }
        warn "bad 'key value' pair in flat keyed file\n";
    }

    return \%parsed;
}
# Parse out 'diskread' and 'diskwrite' values from I/O stats for this container.
#
# Both values are byte counts, summed up over all devices listed in the
# container's I/O statistics.
#
# Returns a hash reference with the keys `diskread` and `diskwrite`, or `undef`
# if no cgroup path could be determined (container not running).
# Dies on an unexpected cgroup version or if the statistics file cannot be read.
sub get_io_stats {
    my ($self) = @_;

    my $res = {
        diskread => 0,
        diskwrite => 0,
    };

    # With cgroupv1 we have a 'blkio' controller, with cgroupv2 it's just 'io':
    my ($path, $ver) = $self->get_any_path('io', 'blkio');
    if (!defined($path)) {
        # container not running
        return undef;
    } elsif ($ver == 2) {
        # cgroupv2 environment, io controller enabled
        my $data = parse_nested_keyed_file(file_get_contents("$path/io.stat"));
        for my $dev (values %$data) {
            # A device line is not guaranteed to carry both counters.
            $res->{diskread} += $dev->{rbytes} // 0;
            $res->{diskwrite} += $dev->{wbytes} // 0;
        }
    } elsif ($ver == 1) {
        # cgroupv1 environment:
        my $io = file_get_contents("$path/blkio.throttle.io_service_bytes_recursive");
        for my $line (split(/\n/, $io)) {
            # Only the per-device 'Read' and 'Write' lines are of interest.
            if (my ($type, $bytes) = ($line =~ /^\d+:\d+\s+(Read|Write)\s+(\d+)$/)) {
                $res->{diskread} += $bytes if $type eq 'Read';
                $res->{diskwrite} += $bytes if $type eq 'Write';
            }
        }
    } else {
        die "bad cgroup version: $ver\n";
    }

    return $res;
}
# Read utime and stime for this container from the cpu accounting data.
# Values are in milliseconds!
#
# With cgroupv1 the 'cpuacct' controller is used, with cgroupv2 the 'cpu'
# controller's `cpu.stat` file.
#
# Returns a hash reference with the keys `utime` and `stime`, or `undef` if the
# container is not running or the cgroupv2 `cpu.stat` file cannot be read.
# Dies on an unexpected cgroup version.
sub get_cpu_stat {
    my ($self) = @_;

    my $res = {
        utime => 0,
        stime => 0,
    };

    my ($path, $ver) = $self->get_any_path('cpuacct', 'cpu');
    if (!defined($path)) {
        # container not running
        return undef;
    } elsif ($ver == 2) {
        my $data = eval { file_get_contents("$path/cpu.stat") };

        # container not running (anymore), or no cpu controller available:
        return undef if !defined($data);

        $data = parse_flat_keyed_file($data);
        # The `*_usec` values are microseconds, we report milliseconds.
        $res->{utime} = int(($data->{user_usec} // 0) / 1000);
        $res->{stime} = int(($data->{system_usec} // 0) / 1000);
    } elsif ($ver == 1) {
        # cgroupv1 environment, values are converted from clock ticks:
        my $clock_ticks = POSIX::sysconf(POSIX::_SC_CLK_TCK());
        my $ms_per_tick = 1000 / $clock_ticks;

        my $data = parse_flat_keyed_file(file_get_contents("$path/cpuacct.stat"));
        $res->{utime} = int(($data->{user} // 0) * $ms_per_tick);
        $res->{stime} = int(($data->{system} // 0) * $ms_per_tick);
    } else {
        die "bad cgroup version: $ver\n";
    }

    return $res;
}
# Parse some memory data from `memory.stat` and the memory usage files.
#
# Returns a hash reference with:
#   mem  - memory usage in bytes, without the file/page cache
#   swap - swap usage in bytes
# or `undef` if the memory cgroup path cannot be determined (the container
# most likely isn't running).
# Dies on an unexpected cgroup version or if one of the files cannot be read.
sub get_memory_stat {
    my ($self) = @_;

    my $res = {
        mem => 0,
        swap => 0,
    };

    my ($path, $ver) = $self->get_path('memory');
    if (!defined($path)) {
        # container most likely isn't running
        return undef;
    } elsif ($ver == 2) {
        my $mem = file_get_contents("$path/memory.current");
        my $swap = file_get_contents("$path/memory.swap.current");
        my $stat = parse_flat_keyed_file(file_get_contents("$path/memory.stat"));
        chomp($mem, $swap);

        # `file` is the cgroupv2 counterpart of the cgroupv1 `total_cache`
        # value; subtract it so both cgroup versions report the same quantity.
        $res->{mem} = $mem - ($stat->{file} // 0);
        $res->{swap} = $swap;
    } elsif ($ver == 1) {
        # cgroupv1 environment:
        my $stat = parse_flat_keyed_file(file_get_contents("$path/memory.stat"));
        my $mem = file_get_contents("$path/memory.usage_in_bytes");
        my $memsw = file_get_contents("$path/memory.memsw.usage_in_bytes");
        chomp($mem, $memsw);

        $res->{mem} = $mem - ($stat->{total_cache} // 0);
        # The `memsw` counter covers memory plus swap.
        $res->{swap} = $memsw - $mem;
    } else {
        die "bad cgroup version: $ver\n";
    }

    return $res;
}
# Change the memory limit for this container.
#
# Parameters:
#   $mem_bytes  - new memory limit in bytes, `undef` to leave it unchanged
#   $swap_bytes - new swap limit in bytes, `undef` to leave it unchanged
#
# Returns a true value on success.
# Dies on error (including a not-running or currently-shutting-down guest).
sub change_memory_limit {
    my ($self, $mem_bytes, $swap_bytes) = @_;

    my ($path, $ver) = $self->get_path('memory');
    die "trying to change memory cgroup values: container not running\n"
        if !defined($path);

    if ($ver == 2) {
        # Memory and swap are limited independently of each other.
        if (defined($swap_bytes)) {
            PVE::ProcFSTools::write_proc_entry("$path/memory.swap.max", $swap_bytes);
        }
        if (defined($mem_bytes)) {
            PVE::ProcFSTools::write_proc_entry("$path/memory.max", $mem_bytes);
        }
    } elsif ($ver == 1) {
        # With cgroupv1 we cannot control memory and swap limits separately.
        # This also means that since the two values aren't independent, we need
        # to handle growing and shrinking separately.
        my $mem_file = "$path/memory.limit_in_bytes";
        my $memsw_file = "$path/memory.memsw.limit_in_bytes";

        my $cur_mem = file_get_contents($mem_file);
        my $cur_memsw = file_get_contents($memsw_file);
        chomp($cur_mem, $cur_memsw);

        $mem_bytes //= $cur_mem;
        my $memsw_bytes = $cur_memsw;
        $memsw_bytes = $mem_bytes + $swap_bytes if defined($swap_bytes);

        # Growing: raise the combined limit first, then the memory-only limit.
        my @writes = (
            [$memsw_file, $memsw_bytes],
            [$mem_file, $mem_bytes],
        );
        # Shrinking: lower the memory-only limit first, then the combined one.
        @writes = reverse(@writes) if !($memsw_bytes > $cur_memsw);

        PVE::ProcFSTools::write_proc_entry($_->[0], $_->[1]) for @writes;
    } else {
        die "bad cgroup version: $ver\n";
    }

    # return a truth value
    return 1;
}
# Change the cpu quota for a container.
#
# Parameters:
#   $quota  - cpu time quota, `undef` means unlimited
#   $period - length of the period the quota applies to, `undef` leaves the
#             current period untouched (only allowed without a quota)
#
# Returns a true value on success.
# Dies on error (including a not-running or currently-shutting-down guest).
sub change_cpu_quota {
    my ($self, $quota, $period) = @_;

    die "quota without period not allowed\n" if !defined($period) && defined($quota);

    # The quota files belong to the 'cpu' controller.
    my ($path, $ver) = $self->get_path('cpu');
    if (!defined($path)) {
        die "trying to change cpu quota cgroup values: container not running\n";
    } elsif ($ver == 2) {
        # cgroupv2 environment, an undefined (unlimited) quota is defined as
        # "max" in this interface:
        $quota //= 'max'; # unlimited

        # `cpu.max` takes "$quota $period"; we're allowed to only write the
        # quota, which keeps the current period.
        my $value = defined($period) ? "$quota $period" : $quota;
        PVE::ProcFSTools::write_proc_entry("$path/cpu.max", $value);
    } elsif ($ver == 1) {
        $quota //= -1; # unlimited

        PVE::ProcFSTools::write_proc_entry("$path/cpu.cfs_period_us", $period)
            if defined($period);
        PVE::ProcFSTools::write_proc_entry("$path/cpu.cfs_quota_us", $quota);
    } else {
        die "bad cgroup version: $ver\n";
    }

    # return a truth value
    return 1;
}
# Change the cpu "shares" for a container.
#
# In cgroupv1 we used a value in `[0..500000]` with a default of 1024.
#
# In cgroupv2 we do not have "shares", we have "weights" in the range
# of `[1..10000]` with a default of 100.
#
# Since the default values don't match when scaling linearly, we use the
# values we get as-is and simply error for values >10000 in cgroupv2.
#
# It is left to the user to figure this out for now.
#
# Parameters:
#   $shares           - the shares/weight value, `undef` selects the default
#   $cgroupv1_default - value used with cgroupv1 if `$shares` is `undef`
#
# Returns a true value on success.
# Dies on error (including a not-running or currently-shutting-down guest).
sub change_cpu_shares {
    my ($self, $shares, $cgroupv1_default) = @_;

    # Both `cpu.weight` and `cpu.shares` belong to the 'cpu' controller, so
    # its path and version decide which file gets written.
    my ($path, $ver) = $self->get_path('cpu');
    if (!defined($path)) {
        die "trying to change cpu shares/weight cgroup values: container not running\n";
    } elsif ($ver == 2) {
        # the cgroupv2 documentation defines the default to 100
        $shares //= 100;
        die "cpu weight (shares) must be in range [1, 10000]\n" if $shares < 1 || $shares > 10000;
        PVE::ProcFSTools::write_proc_entry("$path/cpu.weight", $shares);
    } elsif ($ver == 1) {
        # Fall back to the caller's default, then to the cgroupv1 default.
        PVE::ProcFSTools::write_proc_entry(
            "$path/cpu.shares",
            $shares // $cgroupv1_default // 1024,
        );
    } else {
        die "bad cgroup version: $ver\n";
    }

    # return a truth value
    return 1;
}