my $CGROUP_MODE = undef;
# Figure out which cgroup mode we're operating under:
#
-# Returns 1 if cgroupv1 controllers exist (hybrid or legacy mode), and 2 in a
-# cgroupv2-only environment.
+# For this we check the file system type of `/sys/fs/cgroup` as it may well be possible that some
+# additional cgroupv1 mount points have been created by tools such as `systemd-nspawn`, or
+# manually.
+#
+# Returns 1 for what we consider the hybrid layout, 2 for what we consider the unified layout.
#
# NOTE: To fully support a hybrid layout it is better to use functions like
-# `cpuset_controller_path`.
+# `cpuset_controller_path` and not rely on this value for anything involving paths.
#
# This is a function, not a method!
sub cgroup_mode() {
if (!defined($CGROUP_MODE)) {
- my ($v1, $v2) = get_cgroup_controllers();
- if (keys %$v1) {
- # hybrid or legacy mode
- $CGROUP_MODE = 1;
- } elsif ($v2) {
- $CGROUP_MODE = 2;
+ my $mounts = PVE::ProcFSTools::parse_proc_mounts();
+ for my $entry (@$mounts) {
+ my ($what, $dir, $fstype, $opts) = @$entry;
+ if ($dir eq '/sys/fs/cgroup') {
+ if ($fstype eq 'cgroup2') {
+ $CGROUP_MODE = 2;
+ last;
+ } else {
+ $CGROUP_MODE = 1;
+ last;
+ }
+ }
}
}
#
# Dies on error (including a not-running or currently-shutting-down guest).
sub change_memory_limit {
- my ($self, $mem_bytes, $swap_bytes) = @_;
+ my ($self, $mem_bytes, $swap_bytes, $mem_high_bytes) = @_;
my ($path, $ver) = $self->get_path('memory', 1);
if (!defined($path)) {
} elsif ($ver == 2) {
PVE::ProcFSTools::write_proc_entry("$path/memory.swap.max", $swap_bytes)
if defined($swap_bytes);
- PVE::ProcFSTools::write_proc_entry("$path/memory.max", $mem_bytes)
- if defined($mem_bytes);
+ if (defined($mem_bytes)) {
+ # 'max' is the hard-limit (triggers OOM), while 'high' throttles & adds reclaim pressure
+ PVE::ProcFSTools::write_proc_entry("$path/memory.high", $mem_high_bytes // 'max');
+ PVE::ProcFSTools::write_proc_entry("$path/memory.max", $mem_bytes);
+ }
} elsif ($ver == 1) {
# With cgroupv1 we cannot control memory and swap limits separately.
# This also means that since the two values aren't independent, we need to handle
return 1;
}
+# Clamp a CPU shares value to what the currently booted cgroup version is handled with here.
+#
+# Under cgroupv2 (`cgroup_mode() == 2`) values of 10000 and above are reduced to 10000.
+# Otherwise (cgroupv1) values below 2 are raised to 2. Any other value is returned unchanged.
+#
+# If `$shares` is undefined the default is returned instead: 100 for cgroupv2, 1024 for cgroupv1.
+sub clamp_cpu_shares {
+    my ($shares) = @_;
+
+    if (cgroup_mode() == 2) {
+        return 100 if !defined($shares);
+        # cgroupv1 accepts higher values, so the upper end is what needs capping for v2
+        return $shares >= 10000 ? 10000 : $shares;
+    }
+
+    return 1024 if !defined($shares);
+    # cgroupv2 accepts lower values, so the lower end is what needs raising for v1
+    return $shares < 2 ? 2 : $shares;
+}
+
# Change the cpu "shares" for a container.
#
# In cgroupv1 we used a value in `[0..500000]` with a default of 1024.
# It is left to the user to figure this out for now.
#
# Dies on error (including a not-running or currently-shutting-down guest).
+#
+# NOTE: If you add a new param during 7.x you need to break older pve-container/qemu-server versions
+# that previously passed a `$cgroupv1_default`, which got removed due to being ignored anyway.
+# Otherwise you risk that an old module bogusly passes some cgroup default as your new param.
sub change_cpu_shares {
- my ($self, $shares, $cgroupv1_default) = @_;
+ my ($self, $shares) = @_;
my ($path, $ver) = $self->get_path('cpu', 1);
if (!defined($path)) {
PVE::ProcFSTools::write_proc_entry("$path/cpu.weight", $shares);
} elsif ($ver == 1) {
$shares //= 1024;
- PVE::ProcFSTools::write_proc_entry("$path/cpu.shares", $shares // $cgroupv1_default);
+ PVE::ProcFSTools::write_proc_entry("$path/cpu.shares", $shares);
} else {
die "bad cgroup version: $ver\n";
}