file_read_firstline
);
-use PVE::LXC::Command;
-
# We don't want to do a command socket round trip for every cgroup read/write,
# so any cgroup function needs to have the container's path cached, so this
# package has to be instantiated.
#
# Returns a set (hash mapping names to `1`) of cgroupv1 controllers, and an
# optional boolean whether a unified (cgroupv2) hierarchy exists.
-#
-# Deprecated: Use `get_cgroup_controllers()` instead.
-sub get_v1_controllers {
+my sub get_v1_controllers {
my $v1 = {};
my $v2 = 0;
my $data = PVE::Tools::file_get_contents('/proc/self/cgroup');
my $CGROUP_MODE = undef;
# Figure out which cgroup mode we're operating under:
#
-# Returns 1 if cgroupv1 controllers exist (hybrid or legacy mode), and 2 in a
-# cgroupv2-only environment.
+# For this we check the file system type of `/sys/fs/cgroup` as it may well be possible that some
+# additional cgroupv1 mount points have been created by tools such as `systemd-nspawn`, or
+# manually.
+#
+# Returns 1 for what we consider the hybrid layout, 2 for what we consider the unified layout.
#
# NOTE: To fully support a hybrid layout it is better to use functions like
-# `cpuset_controller_path`.
+# `cpuset_controller_path` and not rely on this value for anything involving paths.
#
# This is a function, not a method!
sub cgroup_mode() {
if (!defined($CGROUP_MODE)) {
- my ($v1, $v2) = get_cgroup_controllers();
- if (keys %$v1) {
- # hybrid or legacy mode
- $CGROUP_MODE = 1;
- } elsif ($v2) {
- $CGROUP_MODE = 2;
+ my $mounts = PVE::ProcFSTools::parse_proc_mounts();
+ for my $entry (@$mounts) {
+ my ($what, $dir, $fstype, $opts) = @$entry;
+ if ($dir eq '/sys/fs/cgroup') {
+ if ($fstype eq 'cgroup2') {
+ $CGROUP_MODE = 2;
+ last;
+ } else {
+ $CGROUP_MODE = 1;
+ last;
+ }
+ }
}
}
# available via both we favor cgroupv2 here as well.
#
# Returns nothing if the controller is not available.
+
sub find_cgroup_controller($) {
my ($controller) = @_;
}
# Get a subdirectory (without the cgroup mount point) for a controller.
-#
-# If `$controller` is `undef`, get the unified (cgroupv2) path.
-#
-# Note that in cgroup v2, lxc uses the activated controller names
-# (`cgroup.controllers` file) as list of controllers for the unified hierarchy,
-# so this returns a result when a `controller` is provided even when using
-# a pure cgroupv2 setup.
-my sub get_subdir {
+sub get_subdir { # abstract stub: this implementation only dies, subclasses have to override it
my ($self, $controller, $limiting) = @_;
- my $entry_name = $controller || 'unified';
- my $entry = ($self->{controllers}->{$entry_name} //= {});
-
- my $kind = $limiting ? 'limit' : 'ns';
- my $path = $entry->{$kind};
-
- return $path if defined $path;
-
- $path = PVE::LXC::Command::get_cgroup_path(
- $self->{vmid},
- $controller,
- $limiting,
- ) or return undef;
-
- # untaint:
- if ($path =~ /\.\./) {
- die "lxc returned suspicious path: '$path'\n";
- }
- ($path) = ($path =~ /^(.*)$/s);
-
- $entry->{$kind} = $path;
-
- return $path;
+ die "implement in subclass"; # reached whenever `$self->get_subdir(...)` is not overridden
}
# Get path and version for a controller.
# Returns either just the path, or the path and cgroup version as a tuple.
sub get_path {
my ($self, $controller, $limiting) = @_;
-
# Find the controller before querying the lxc monitor via a socket:
my ($cgpath, $ver) = find_cgroup_controller($controller)
or return undef;
- my $path = get_subdir($self, $controller, $limiting)
+ my $path = $self->get_subdir($controller, $limiting)
or return undef;
$path = "$cgpath/$path";
$res->{diskread} += $b;
}
if (my $b = $dev->{wbytes}) {
- $res->{diskread} += $b;
+ $res->{diskwrite} += $b;
}
}
} elsif ($ver == 2) {
my $mem = file_get_contents("$path/memory.current");
my $swap = file_get_contents("$path/memory.swap.current");
+ my $stat = parse_flat_keyed_file(file_get_contents("$path/memory.stat"));
chomp ($mem, $swap);
- # FIXME: For the cgv1 equivalent of `total_cache` we may need to sum up
- # the values in `memory.stat`...
-
- $res->{mem} = $mem;
+ $res->{mem} = $mem - $stat->{file};
$res->{swap} = $swap;
} elsif ($ver == 1) {
# cgroupv1 environment:
return $res;
}
+# Collect the pressure statistics (`cpu.pressure`, `memory.pressure`, `io.pressure`) of this
+# guest's cgroup.
+#
+# Returns a hash with `cpu`, `memory` and `io` entries. All averages are zero if the cgroup path
+# cannot be resolved (the guest most likely isn't running). Returns `undef` for cgroupv1, which
+# does not provide pressure stats.
+#
+# Dies on an unknown cgroup version.
+sub get_pressure_stat {
+    my ($self) = @_;
+
+    # every entry gets its own fresh hash, nothing is shared between the types
+    my $zero_avgs = sub { return { avg10 => 0, avg60 => 0, avg300 => 0 } };
+
+    my $res = {
+        cpu => { some => $zero_avgs->() },
+        memory => { some => $zero_avgs->(), full => $zero_avgs->() },
+        io => { some => $zero_avgs->(), full => $zero_avgs->() },
+    };
+
+    my ($path, $version) = $self->get_path(undef, 1);
+
+    # container or VM most likely isn't running, return zero stats
+    return $res if !defined($path);
+
+    # the v1 controllers do not provide pressure stats
+    return undef if $version == 1;
+
+    die "bad cgroup version: $version\n" if $version != 2;
+
+    for my $type (qw(cpu memory io)) {
+        my $stats = PVE::ProcFSTools::parse_pressure("$path/$type.pressure");
+        $res->{$type} = $stats if $stats; # keep the zeroed defaults if nothing could be parsed
+    }
+
+    return $res;
+}
+
# Change the memory limit for this container.
#
# Dies on error (including a not-running or currently-shutting-down guest).
sub change_memory_limit {
- my ($self, $mem_bytes, $swap_bytes) = @_;
+ my ($self, $mem_bytes, $swap_bytes, $mem_high_bytes) = @_;
my ($path, $ver) = $self->get_path('memory', 1);
if (!defined($path)) {
} elsif ($ver == 2) {
PVE::ProcFSTools::write_proc_entry("$path/memory.swap.max", $swap_bytes)
if defined($swap_bytes);
- PVE::ProcFSTools::write_proc_entry("$path/memory.max", $mem_bytes)
- if defined($mem_bytes);
+ if (defined($mem_bytes)) {
+ # 'max' is the hard-limit (triggers OOM), while 'high' throttles & adds reclaim pressure
+ PVE::ProcFSTools::write_proc_entry("$path/memory.high", $mem_high_bytes // 'max');
+ PVE::ProcFSTools::write_proc_entry("$path/memory.max", $mem_bytes);
+ }
} elsif ($ver == 1) {
# With cgroupv1 we cannot control memory and swap limits separately.
# This also means that since the two values aren't independent, we need to handle
PVE::ProcFSTools::write_proc_entry("$path/cpu.max", 'max');
}
} elsif ($ver == 1) {
- $quota //= -1; # unlimited
- $period //= -1;
+ $quota //= -1; # default (unlimited)
+ $period //= 100_000; # default (100 ms)
PVE::ProcFSTools::write_proc_entry("$path/cpu.cfs_period_us", $period);
PVE::ProcFSTools::write_proc_entry("$path/cpu.cfs_quota_us", $quota);
} else {
return 1;
}
+# Clamp an integer to the supported range of CPU shares from the booted CGroup version
+#
+# In cgroupv2 mode the result is limited to `[1, 10000]`, the range `change_cpu_shares` accepts
+# for `cpu.weight`. In cgroupv1 mode only the lower bound of 2 is enforced.
+#
+# Returns the default (100 for cgroupv2, 1024 for cgroupv1) if called with an undefined value.
+#
+# This is a function, not a method!
+sub clamp_cpu_shares {
+    my ($shares) = @_;
+
+    my $is_cgroupv2 = cgroup_mode() == 2;
+
+    return $is_cgroupv2 ? 100 : 1024 if !defined($shares);
+
+    if ($is_cgroupv2) {
+        $shares = 10000 if $shares >= 10000; # v1 can be higher, so clamp v2 there
+        $shares = 1 if $shares < 1; # anything below 1 would be rejected as cpu.weight later on
+    } else {
+        $shares = 2 if $shares < 2; # v2 can be lower, so clamp v1 there
+    }
+    return $shares;
+}
+
# Change the cpu "shares" for a container.
#
# In cgroupv1 we used a value in `[0..500000]` with a default of 1024.
# It is left to the user to figure this out for now.
#
# Dies on error (including a not-running or currently-shutting-down guest).
+#
+# NOTE: if you add a new param during 7.x you need to break older pve-container/qemu-server versions
+# that previously passed a `$cgroupv1_default`, which got removed due to being ignored anyway.
+# Otherwise you risk that an old module bogusly passes some cgroup default as your new param.
sub change_cpu_shares {
- my ($self, $shares, $cgroupv1_default) = @_;
+ my ($self, $shares) = @_;
my ($path, $ver) = $self->get_path('cpu', 1);
if (!defined($path)) {
die "cpu weight (shares) must be in range [1, 10000]\n" if $shares < 1 || $shares > 10000;
PVE::ProcFSTools::write_proc_entry("$path/cpu.weight", $shares);
} elsif ($ver == 1) {
- $shares //= 100;
- PVE::ProcFSTools::write_proc_entry("$path/cpu.shares", $shares // $cgroupv1_default);
+ $shares //= 1024;
+ PVE::ProcFSTools::write_proc_entry("$path/cpu.shares", $shares);
} else {
die "bad cgroup version: $ver\n";
}
my sub v1_freeze_thaw {
my ($self, $controller_path, $freeze) = @_;
- my $path = get_subdir($self, 'freezer', 1)
+ my $path = $self->get_subdir('freezer', 1)
or die "trying to freeze container: container not running\n";
$path = "$controller_path/$path/freezer.state";
my sub v2_freeze_thaw {
my ($self, $controller_path, $freeze) = @_;
- my $path = get_subdir($self, undef, 1)
+ my $path = $self->get_subdir(undef, 1)
or die "trying to freeze container: container not running\n";
$path = "$controller_path/$path";