# cgroup handler # # This package should deal with figuring out the right cgroup path for a # container (via the command socket), reading and writing cgroup values, and # handling cgroup v1 & v2 differences. # # Note that the long term plan is to have resource manage functions instead of # dealing with cgroup files on the outside. package PVE::CGroup; use strict; use warnings; use IO::File; use IO::Select; use POSIX qw(); use PVE::ProcFSTools; use PVE::Tools qw( file_get_contents file_read_firstline ); # We don't want to do a command socket round trip for every cgroup read/write, # so any cgroup function needs to have the container's path cached, so this # package has to be instantiated. # # LXC keeps separate paths by controller (although they're normally all the # same, in our # case anyway), so we cache them by controller as well. sub new { my ($class, $vmid) = @_; my $self = { vmid => $vmid }; return bless $self, $class; } # Get the v1 controller list. # # Returns a set (hash mapping names to `1`) of cgroupv1 controllers, and an # optional boolean whether a unified (cgroupv2) hierarchy exists. # # Deprecated: Use `get_cgroup_controllers()` instead. sub get_v1_controllers { my $v1 = {}; my $v2 = 0; my $data = PVE::Tools::file_get_contents('/proc/self/cgroup'); while ($data =~ /^\d+:([^:\n]*):.*$/gm) { my $type = $1; if (length($type)) { $v1->{$_} = 1 foreach split(/,/, $type); } else { $v2 = 1; } } return wantarray ? ($v1, $v2) : $v1; } # Get the set v2 controller list from the `cgroup.controllers` file. my sub get_v2_controllers { my $v2 = eval { file_get_contents('/sys/fs/cgroup/cgroup.controllers') } || eval { file_get_contents('/sys/fs/cgroup/unified/cgroup.controllers') }; return undef if !defined $v2; # It's a simple space separated list: return { map { $_ => 1 } split(/\s+/, $v2) }; } my $CGROUP_CONTROLLERS = undef; # Get a list of controllers enabled in each cgroup subsystem. # # This is a more complete version of `PVE::LXC::get_cgroup_subsystems`. # # Returns 2 sets (hashes mapping controller names to `1`), one for each cgroup # version. sub get_cgroup_controllers() { if (!defined($CGROUP_CONTROLLERS)) { my ($v1, undef) = get_v1_controllers(); my $v2 = get_v2_controllers(); $CGROUP_CONTROLLERS = [$v1, $v2]; } return $CGROUP_CONTROLLERS->@*; } my $CGROUP_MODE = undef; # Figure out which cgroup mode we're operating under: # # Returns 1 if cgroupv1 controllers exist (hybrid or legacy mode), and 2 in a # cgroupv2-only environment. # # NOTE: To fully support a hybrid layout it is better to use functions like # `cpuset_controller_path`. # # This is a function, not a method! sub cgroup_mode() { if (!defined($CGROUP_MODE)) { my ($v1, $v2) = get_cgroup_controllers(); if (keys %$v1) { # hybrid or legacy mode $CGROUP_MODE = 1; } elsif ($v2) { $CGROUP_MODE = 2; } } die "unknown cgroup mode\n" if !defined($CGROUP_MODE); return $CGROUP_MODE; } my $CGROUPV2_PATH = undef; sub cgroupv2_base_path() { if (!defined($CGROUPV2_PATH)) { if (cgroup_mode() == 2) { $CGROUPV2_PATH = '/sys/fs/cgroup'; } else { $CGROUPV2_PATH = '/sys/fs/cgroup/unified'; } } return $CGROUPV2_PATH; } # Find a cgroup controller and return its path and version. # # LXC initializes the unified hierarchy first, so if a controller is # available via both we favor cgroupv2 here as well. # # Returns nothing if the controller is not available. sub find_cgroup_controller($) { my ($controller) = @_; my ($v1, $v2) = get_cgroup_controllers(); if (!defined($controller) || $v2->{$controller}) { my $path = cgroupv2_base_path(); return wantarray ? ($path, 2) : $path; } if (defined($controller) && $v1->{$controller}) { my $path = "/sys/fs/cgroup/$controller"; return wantarray ? ($path, 1) : $path; } return; } my $CG_PATH_CPUSET = undef; my $CG_VER_CPUSET = undef; # Find the cpuset cgroup controller. # # This is a function, not a method! sub cpuset_controller_path() { if (!defined($CG_PATH_CPUSET)) { ($CG_PATH_CPUSET, $CG_VER_CPUSET) = find_cgroup_controller('cpuset') or die "failed to find cpuset controller\n"; } return wantarray ? ($CG_PATH_CPUSET, $CG_VER_CPUSET) : $CG_PATH_CPUSET; } # Get a subdirectory (without the cgroup mount point) for a controller. sub get_subdir { my ($self, $controller, $limiting) = @_; die "implement in subclass"; } # Get path and version for a controller. # # `$controller` may be `undef`, see get_subdir above for details. # # Returns either just the path, or the path and cgroup version as a tuple. sub get_path { my ($self, $controller, $limiting) = @_; # Find the controller before querying the lxc monitor via a socket: my ($cgpath, $ver) = find_cgroup_controller($controller) or return undef; my $path = $self->get_subdir($controller, $limiting) or return undef; $path = "$cgpath/$path"; return wantarray ? ($path, $ver) : $path; } # Convenience method to get the path info if the first existing controller. # # Returns the same as `get_path`. sub get_any_path { my ($self, $limiting, @controllers) = @_; my ($path, $ver); for my $c (@controllers) { ($path, $ver) = $self->get_path($c, $limiting); last if defined $path; } return wantarray ? ($path, $ver) : $path; } # Parse a 'Nested keyed' file: # # See kernel documentation `admin-guide/cgroup-v2.rst` 4.1. my sub parse_nested_keyed_file($) { my ($data) = @_; my $res = {}; foreach my $line (split(/\n/, $data)) { my ($key, @values) = split(/\s+/, $line); my $d = ($res->{$key} = {}); foreach my $value (@values) { if (my ($key, $value) = ($value =~ /^([^=]+)=(.*)$/)) { $d->{$key} = $value; } else { warn "bad key=value pair in nested keyed file\n"; } } } return $res; } # Parse a 'Flat keyed' file: # # See kernel documentation `admin-guide/cgroup-v2.rst` 4.1. my sub parse_flat_keyed_file($) { my ($data) = @_; my $res = {}; foreach my $line (split(/\n/, $data)) { if (my ($key, $value) = ($line =~ /^(\S+)\s+(.*)$/)) { $res->{$key} = $value; } else { warn "bad 'key value' pair in flat keyed file\n"; } } return $res; } # Parse out 'diskread' and 'diskwrite' values from I/O stats for this container. sub get_io_stats { my ($self) = @_; my $res = { diskread => 0, diskwrite => 0, }; # With cgroupv1 we have a 'blkio' controller, with cgroupv2 it's just 'io': my ($path, $ver) = $self->get_any_path(1, 'io', 'blkio'); if (!defined($path)) { # container not running return undef; } elsif ($ver == 2) { # cgroupv2 environment, io controller enabled my $io_stat = file_get_contents("$path/io.stat"); my $data = parse_nested_keyed_file($io_stat); foreach my $dev (keys %$data) { my $dev = $data->{$dev}; if (my $b = $dev->{rbytes}) { $res->{diskread} += $b; } if (my $b = $dev->{wbytes}) { $res->{diskread} += $b; } } return $res; } elsif ($ver == 1) { # cgroupv1 environment: my $io = file_get_contents("$path/blkio.throttle.io_service_bytes_recursive"); foreach my $line (split(/\n/, $io)) { if (my ($type, $bytes) = ($line =~ /^\d+:\d+\s+(Read|Write)\s+(\d+)$/)) { $res->{diskread} += $bytes if $type eq 'Read'; $res->{diskwrite} += $bytes if $type eq 'Write'; } } return $res; } else { die "bad cgroup version: $ver\n"; } # container not running return undef; } # Read utime and stime for this container from the cpuacct cgroup. # Values are in milliseconds! sub get_cpu_stat { my ($self) = @_; my $res = { utime => 0, stime => 0, }; my ($path, $ver) = $self->get_any_path(1, 'cpuacct', 'cpu'); if (!defined($path)) { # container not running return undef; } elsif ($ver == 2) { my $data = eval { file_get_contents("$path/cpu.stat") }; # or no io controller available: return undef if !defined($data); $data = parse_flat_keyed_file($data); $res->{utime} = int($data->{user_usec} / 1000); $res->{stime} = int($data->{system_usec} / 1000); } elsif ($ver == 1) { # cgroupv1 environment: my $clock_ticks = POSIX::sysconf(&POSIX::_SC_CLK_TCK); my $clk_to_usec = 1000 / $clock_ticks; my $data = parse_flat_keyed_file(file_get_contents("$path/cpuacct.stat")); $res->{utime} = int($data->{user} * $clk_to_usec); $res->{stime} = int($data->{system} * $clk_to_usec); } else { die "bad cgroup version: $ver\n"; } return $res; } # Parse some memory data from `memory.stat` sub get_memory_stat { my ($self) = @_; my $res = { mem => 0, swap => 0, }; my ($path, $ver) = $self->get_path('memory', 1); if (!defined($path)) { # container most likely isn't running return undef; } elsif ($ver == 2) { my $mem = file_get_contents("$path/memory.current"); my $swap = file_get_contents("$path/memory.swap.current"); chomp ($mem, $swap); # FIXME: For the cgv1 equivalent of `total_cache` we may need to sum up # the values in `memory.stat`... $res->{mem} = $mem; $res->{swap} = $swap; } elsif ($ver == 1) { # cgroupv1 environment: my $stat = parse_flat_keyed_file(file_get_contents("$path/memory.stat")); my $mem = file_get_contents("$path/memory.usage_in_bytes"); my $memsw = file_get_contents("$path/memory.memsw.usage_in_bytes"); chomp ($mem, $memsw); $res->{mem} = $mem - $stat->{total_cache}; $res->{swap} = $memsw - $mem; } else { die "bad cgroup version: $ver\n"; } return $res; } sub get_pressure_stat { my ($self) = @_; my $res = { cpu => { some => { avg10 => 0, avg60 => 0, avg300 => 0 } }, memory => { some => { avg10 => 0, avg60 => 0, avg300 => 0 }, full => { avg10 => 0, avg60 => 0, avg300 => 0 } }, io => { some => { avg10 => 0, avg60 => 0, avg300 => 0 }, full => { avg10 => 0, avg60 => 0, avg300 => 0 } }, }; my ($path, $version) = $self->get_path(undef, 1); if (!defined($path)) { return $res; # container or VM most likely isn't running, retrun zero stats } elsif ($version == 1) { return undef; # v1 controller does not provides pressure stat } elsif ($version == 2) { for my $type (qw(cpu memory io)) { my $stats = PVE::ProcFSTools::parse_pressure("$path/$type.pressure"); $res->{$type} = $stats if $stats; } } else { die "bad cgroup version: $version\n"; } return $res; } # Change the memory limit for this container. # # Dies on error (including a not-running or currently-shutting-down guest). sub change_memory_limit { my ($self, $mem_bytes, $swap_bytes) = @_; my ($path, $ver) = $self->get_path('memory', 1); if (!defined($path)) { die "trying to change memory cgroup values: container not running\n"; } elsif ($ver == 2) { PVE::ProcFSTools::write_proc_entry("$path/memory.swap.max", $swap_bytes) if defined($swap_bytes); PVE::ProcFSTools::write_proc_entry("$path/memory.max", $mem_bytes) if defined($mem_bytes); } elsif ($ver == 1) { # With cgroupv1 we cannot control memory and swap limits separately. # This also means that since the two values aren't independent, we need to handle # growing and shrinking separately. my $path_mem = "$path/memory.limit_in_bytes"; my $path_memsw = "$path/memory.memsw.limit_in_bytes"; my $old_mem_bytes = file_get_contents($path_mem); my $old_memsw_bytes = file_get_contents($path_memsw); chomp($old_mem_bytes, $old_memsw_bytes); $mem_bytes //= $old_mem_bytes; $swap_bytes //= $old_memsw_bytes - $old_mem_bytes; my $memsw_bytes = $mem_bytes + $swap_bytes; if ($memsw_bytes > $old_memsw_bytes) { # Growing the limit means growing the combined limit first, then pulling the # memory limitup. PVE::ProcFSTools::write_proc_entry($path_memsw, $memsw_bytes); PVE::ProcFSTools::write_proc_entry($path_mem, $mem_bytes); } else { # Shrinking means we first need to shrink the mem-only memsw cannot be # shrunk below it. PVE::ProcFSTools::write_proc_entry($path_mem, $mem_bytes); PVE::ProcFSTools::write_proc_entry($path_memsw, $memsw_bytes); } } else { die "bad cgroup version: $ver\n"; } # return a truth value return 1; } # Change the cpu quota for a container. # # Dies on error (including a not-running or currently-shutting-down guest). sub change_cpu_quota { my ($self, $quota, $period) = @_; die "quota without period not allowed\n" if !defined($period) && defined($quota); my ($path, $ver) = $self->get_path('cpu', 1); if (!defined($path)) { die "trying to change cpu quota cgroup values: container not running\n"; } elsif ($ver == 2) { # cgroupv2 environment, an undefined (unlimited) quota is defined as "max" # in this interface: $quota //= 'max'; # unlimited if (defined($quota)) { PVE::ProcFSTools::write_proc_entry("$path/cpu.max", "$quota $period"); } else { # we're allowed to only write the quota: PVE::ProcFSTools::write_proc_entry("$path/cpu.max", 'max'); } } elsif ($ver == 1) { $quota //= -1; # unlimited $period //= -1; PVE::ProcFSTools::write_proc_entry("$path/cpu.cfs_period_us", $period); PVE::ProcFSTools::write_proc_entry("$path/cpu.cfs_quota_us", $quota); } else { die "bad cgroup version: $ver\n"; } # return a truth value return 1; } # Change the cpu "shares" for a container. # # In cgroupv1 we used a value in `[0..500000]` with a default of 1024. # # In cgroupv2 we do not have "shares", we have "weights" in the range # of `[1..10000]` with a default of 100. # # Since the default values don't match when scaling linearly, we use the # values we get as-is and simply error for values >10000 in cgroupv2. # # It is left to the user to figure this out for now. # # Dies on error (including a not-running or currently-shutting-down guest). sub change_cpu_shares { my ($self, $shares, $cgroupv1_default) = @_; my ($path, $ver) = $self->get_path('cpu', 1); if (!defined($path)) { die "trying to change cpu shares/weight cgroup values: container not running\n"; } elsif ($ver == 2) { # the cgroupv2 documentation defines the default to 100 $shares //= 100; die "cpu weight (shares) must be in range [1, 10000]\n" if $shares < 1 || $shares > 10000; PVE::ProcFSTools::write_proc_entry("$path/cpu.weight", $shares); } elsif ($ver == 1) { $shares //= 1024; PVE::ProcFSTools::write_proc_entry("$path/cpu.shares", $shares // $cgroupv1_default); } else { die "bad cgroup version: $ver\n"; } # return a truth value return 1; } my sub v1_freeze_thaw { my ($self, $controller_path, $freeze) = @_; my $path = $self->get_subdir('freezer', 1) or die "trying to freeze container: container not running\n"; $path = "$controller_path/$path/freezer.state"; my $data = $freeze ? 'FROZEN' : 'THAWED'; PVE::ProcFSTools::write_proc_entry($path, $data); # Here we just poll the freezer.state once per second. while (1) { my $state = file_get_contents($path); chomp $state; last if $state eq $data; } } my sub v2_freeze_thaw { my ($self, $controller_path, $freeze) = @_; my $path = $self->get_subdir(undef, 1) or die "trying to freeze container: container not running\n"; $path = "$controller_path/$path"; my $desired_state = $freeze ? 1 : 0; # cgroupv2 supports poll events on cgroup.events which contains the frozen # state. my $fh = IO::File->new("$path/cgroup.events", 'r') or die "failed to open $path/cgroup.events file: $!\n"; my $select = IO::Select->new(); $select->add($fh); PVE::ProcFSTools::write_proc_entry("$path/cgroup.freeze", $desired_state); while (1) { my $data = do { local $/ = undef; <$fh> }; $data = parse_flat_keyed_file($data); last if $data->{frozen} == $desired_state; my @handles = $select->has_exception(); next if !@handles; seek($fh, 0, 0) or die "failed to rewind cgroup.events file: $!\n"; } } # Freeze or unfreeze a container. # # This will freeze the container at its outer (limiting) cgroup path. We use # this instead of `lxc-freeze` as `lxc-freeze` from lxc4 will not be able to # fetch the cgroup path from contaienrs still running on lxc3. sub freeze_thaw { my ($self, $freeze) = @_; my $controller_path = find_cgroup_controller('freezer'); if (defined($controller_path)) { return v1_freeze_thaw($self, $controller_path, $freeze); } else { # cgroupv2 always has a freezer, there can be both cgv1 and cgv2 # freezers, but we'll prefer v1 when it's available as that's what lxc # does as well... return v2_freeze_thaw($self, cgroupv2_base_path(), $freeze); } } 1;