]> git.proxmox.com Git - pve-container.git/blob - src/PVE/LXC/CGroup.pm
8ad1627e47340b18d3fce3d6d3d9d0fcb2d3d2a6
[pve-container.git] / src / PVE / LXC / CGroup.pm
1 # cgroup handler
2 #
3 # This package should deal with figuring out the right cgroup path for a
4 # container (via the command socket), reading and writing cgroup values, and
5 # handling cgroup v1 & v2 differences.
6 #
7 # Note that the long term plan is to have resource management functions instead of
8 # dealing with cgroup files on the outside.
9
10 package PVE::LXC::CGroup;
11
12 use strict;
13 use warnings;
14
15 use POSIX qw();
16
17 use PVE::ProcFSTools;
18 use PVE::Tools qw(
19 file_get_contents
20 file_read_firstline
21 );
22
23 use PVE::LXC::Command;
24
25 # We don't want to do a command socket round trip for every cgroup read/write,
26 # so any cgroup function needs to have the container's path cached, so this
27 # package has to be instantiated.
28 #
29 # LXC keeps separate paths by controller (although they're normally all the
30 # same, in our case anyway), so we cache them by controller as well.
sub new {
    my ($class, $vmid) = @_;

    # Only the VMID is stored at construction time; other entries are added to
    # the object hash later by the methods that need them.
    return bless { vmid => $vmid }, $class;
}
38
my $CPUSET_BASE = undef;
# Locate the mount point of the cpuset cgroup controller.
#
# The result is determined once and then cached in `$CPUSET_BASE`.
# Dies if none of the known locations contains the expected cpuset file.
#
# This is a function, not a method!
sub cpuset_controller_path() {
    return $CPUSET_BASE if defined($CPUSET_BASE);

    # Candidate mount points, each paired with the file whose presence
    # identifies it. The first match in this order wins.
    my @candidates = (
        # legacy cpuset cgroup:
        ['/sys/fs/cgroup/cpuset', 'cpuset.effective_cpus'],
        # pure cgroupv2 environment:
        ['/sys/fs/cgroup', 'cpuset.cpus.effective'],
        # hybrid, with cpuset moved to cgroupv2
        ['/sys/fs/cgroup/unified', 'cpuset.cpus.effective'],
    );

    for my $candidate (@candidates) {
        my ($base, $probe_file) = @$candidate;
        next if !-f "$base/$probe_file";

        $CPUSET_BASE = $base;
        return $CPUSET_BASE;
    }

    die "failed to find cpuset controller\n";
}
62
my $CGROUP_MODE = undef;
# Determine the cgroup mode of this host.
#
# Returns 1 if cgroupv1 controllers exist (hybrid or legacy mode), and 2 in a
# cgroupv2-only environment. A successfully detected mode is cached in
# `$CGROUP_MODE`; if neither case applies this dies and nothing is cached.
#
# This is a function, not a method!
sub cgroup_mode() {
    return $CGROUP_MODE if defined($CGROUP_MODE);

    my ($v1, $v2) = PVE::LXC::get_cgroup_subsystems();

    # Any v1 controller at all means hybrid or legacy mode.
    return $CGROUP_MODE = 1 if keys %$v1;

    # No v1 controllers, but v2 is there: pure cgroupv2.
    return $CGROUP_MODE = 2 if $v2;

    die "unknown cgroup mode\n";
}
84
# Get a subdirectory (without the cgroup mount point) for a controller.
#
# If `$controller` is `undef`, get the unified (cgroupv2) path.
#
# `$limiting` is passed through to the command socket query; the result is
# cached per controller under 'limit' when it is true and under 'ns' otherwise.
#
# Note that in cgroup v2, lxc uses the activated controller names
# (`cgroup.controllers` file) as list of controllers for the unified hierarchy,
# so this returns a result when a `controller` is provided even when using
# a pure cgroupv2 setup.
#
# Returns `undef` if the query yields no path, dies if the returned path
# contains `..`.
my sub get_subdir {
    my ($self, $controller, $limiting) = @_;

    my $cache_key = $controller || 'unified';
    my $slot = $limiting ? 'limit' : 'ns';

    my $cache = ($self->{controllers}->{$cache_key} //= {});
    return $cache->{$slot} if defined($cache->{$slot});

    my $subdir = PVE::LXC::Command::get_cgroup_path($self->{vmid}, $controller, $limiting)
        or return undef;

    # Refuse anything that could climb out of the cgroup mount point.
    die "lxc returned suspicious path: '$subdir'\n" if $subdir =~ /\.\./;

    # untaint:
    ($subdir) = ($subdir =~ /^(.*)$/s);

    return $cache->{$slot} = $subdir;
}
120
# Get the full path of this container's cgroup directory for a controller.
#
# `$controller` may be `undef`, in which case the unified (cgroupv2) hierarchy
# is used. Returns `undef` if no subdirectory could be determined.
sub get_path {
    my ($self, $controller) = @_;

    my $subdir = get_subdir($self, $controller);
    return undef if !$subdir;

    # The main mount point we currently assume to be in a standard location.
    my $mount_point;
    if (cgroup_mode() == 2) {
        # pure cgroupv2: everything lives directly below the root mount
        $mount_point = '/sys/fs/cgroup';
    } elsif (!defined($controller)) {
        # unified hierarchy on a host that also has v1 controllers
        $mount_point = '/sys/fs/cgroup/unified';
    } else {
        # v1 controller
        $mount_point = "/sys/fs/cgroup/$controller";
    }

    return "$mount_point/$subdir";
}
135
# Parse a 'Nested keyed' file:
#
# Every line consists of a key followed by whitespace separated `name=value`
# pairs. Returns a hash reference mapping each line's key to a hash reference
# of its pairs. Entries which are not of the form `name=value` are skipped
# with a warning.
#
# See kernel documentation `admin-guide/cgroup-v2.rst` 4.1.
my sub parse_nested_keyed_file($) {
    my ($data) = @_;

    my %parsed;
    for my $line (split(/\n/, $data)) {
        my ($name, @pairs) = split(/\s+/, $line);

        my $fields = ($parsed{$name} = {});

        for my $pair (@pairs) {
            if ($pair =~ /^([^=]+)=(.*)$/) {
                $fields->{$1} = $2;
            } else {
                warn "bad key=value pair in nested keyed file\n";
            }
        }
    }

    return \%parsed;
}
157
# Parse a 'Flat keyed' file:
#
# Every line consists of a key, whitespace, and the value (the rest of the
# line). Returns a hash reference of key => value. Lines not matching this
# format are skipped with a warning.
#
# See kernel documentation `admin-guide/cgroup-v2.rst` 4.1.
my sub parse_flat_keyed_file($) {
    my ($data) = @_;

    my %parsed;
    for my $line (split(/\n/, $data)) {
        my ($key, $value) = ($line =~ /^(\S+)\s+(.*)$/);

        if (!defined($key)) {
            warn "bad 'key value' pair in flat keyed file\n";
            next;
        }

        $parsed{$key} = $value;
    }

    return \%parsed;
}
173
# Parse out 'diskread' and 'diskwrite' values from I/O stats for this container.
#
# Returns a hash reference with the byte counters `diskread` and `diskwrite`,
# summed up over all devices listed in the stat file, or `undef` if no cgroup
# path is available (controller not enabled or container not running).
sub get_io_stats {
    my ($self) = @_;

    my $res = {
        diskread => 0,
        diskwrite => 0,
    };

    if (cgroup_mode() == 2) {
        if (defined(my $path = $self->get_path('io'))) {
            # cgroupv2 environment, io controller enabled
            my $io_stat = file_get_contents("$path/io.stat");

            # One entry per device; `rbytes` counts bytes read, `wbytes` bytes written.
            my $data = parse_nested_keyed_file($io_stat);
            foreach my $dev_stat (values %$data) {
                if (my $bytes = $dev_stat->{rbytes}) {
                    $res->{diskread} += $bytes;
                }
                if (my $bytes = $dev_stat->{wbytes}) {
                    # written bytes belong to 'diskwrite', not 'diskread'
                    $res->{diskwrite} += $bytes;
                }
            }
        } else {
            # io controller not enabled or container not running
            return undef;
        }
    } elsif (defined(my $path = $self->get_path('blkio'))) {
        # cgroupv1 environment:
        my $io = file_get_contents("$path/blkio.throttle.io_service_bytes_recursive");
        foreach my $line (split(/\n/, $io)) {
            # Only the per-device 'Read' and 'Write' lines are of interest.
            if (my ($type, $bytes) = ($line =~ /^\d+:\d+\s+(Read|Write)\s+(\d+)$/)) {
                $res->{diskread} += $bytes if $type eq 'Read';
                $res->{diskwrite} += $bytes if $type eq 'Write';
            }
        }
    } else {
        # container not running
        return undef;
    }

    return $res;
}
218
# Read the user and system CPU time consumed by this container.
#
# Returns a hash reference with `utime` and `stime`, or `undef` if the values
# are not available. Values are in milliseconds!
sub get_cpu_stat {
    my ($self) = @_;

    my $res = {
        utime => 0,
        stime => 0,
    };

    if (cgroup_mode() == 2) {
        my $path = $self->get_path('cpu');

        # cpu controller not enabled or container not running
        return undef if !defined($path);

        # Reading is best-effort: any error while reading the file yields undef.
        my $raw = eval { file_get_contents("$path/cpu.stat") };
        return undef if !defined($raw);

        # cgroupv2 reports microseconds
        my $stat = parse_flat_keyed_file($raw);
        $res->{utime} = int($stat->{user_usec} / 1000);
        $res->{stime} = int($stat->{system_usec} / 1000);

        return $res;
    }

    # cgroupv1 environment:
    my $path = $self->get_path('cpuacct');

    # container most likely isn't running
    return undef if !defined($path);

    # cpuacct.stat counts in clock ticks, so scale by milliseconds per tick
    my $clock_ticks = POSIX::sysconf(&POSIX::_SC_CLK_TCK);
    my $ms_per_tick = 1000 / $clock_ticks;

    my $stat = parse_flat_keyed_file(file_get_contents("$path/cpuacct.stat"));
    $res->{utime} = int($stat->{user} * $ms_per_tick);
    $res->{stime} = int($stat->{system} * $ms_per_tick);

    return $res;
}
258
# Read the current memory and swap usage of this container.
#
# Returns a hash reference with `mem` and `swap`, or `undef` if no memory
# cgroup path is available (controller not enabled or container not running).
sub get_memory_stat {
    my ($self) = @_;

    my $is_v2 = cgroup_mode() == 2;

    my $path = $self->get_path('memory');
    return undef if !defined($path);

    if ($is_v2) {
        my $mem = file_get_contents("$path/memory.current");
        my $swap = file_get_contents("$path/memory.swap.current");
        chomp($mem, $swap);

        # FIXME: For the cgv1 equivalent of `total_cache` we may need to sum up
        # the values in `memory.stat`...

        return {
            mem => $mem,
            swap => $swap,
        };
    }

    # cgroupv1 environment:
    my $stat = parse_flat_keyed_file(file_get_contents("$path/memory.stat"));
    my $usage = file_get_contents("$path/memory.usage_in_bytes");
    my $usage_memsw = file_get_contents("$path/memory.memsw.usage_in_bytes");
    chomp($usage, $usage_memsw);

    return {
        # usage without the `total_cache` value from memory.stat
        mem => $usage - $stat->{total_cache},
        # the memsw counter covers memory and swap together, so subtract the memory part
        swap => $usage_memsw - $usage,
    };
}
300
# Change the memory limit for this container.
#
# `$mem_bytes` and `$swap_bytes` may each be `undef` to leave the respective
# limit untouched.
#
# Dies on error (including a not-running or currently-shutting-down guest).
sub change_memory_limit {
    my ($self, $mem_bytes, $swap_bytes) = @_;

    my $is_v2 = cgroup_mode() == 2;

    my $path = $self->get_path('memory');
    die "trying to change memory cgroup values: container not running\n"
        if !defined($path);

    if ($is_v2) {
        # cgroupv2 has independent files for the swap and the memory limit.
        PVE::ProcFSTools::write_proc_entry("$path/memory.swap.max", $swap_bytes)
            if defined($swap_bytes);
        PVE::ProcFSTools::write_proc_entry("$path/memory.max", $mem_bytes)
            if defined($mem_bytes);
        return 1;
    }

    # With cgroupv1 we cannot control memory and swap limits separately.
    # This also means that since the two values aren't independent, we need to handle
    # growing and shrinking separately.
    my $path_mem = "$path/memory.limit_in_bytes";
    my $path_memsw = "$path/memory.memsw.limit_in_bytes";

    my $old_mem_bytes = file_get_contents($path_mem);
    my $old_memsw_bytes = file_get_contents($path_memsw);
    chomp($old_mem_bytes, $old_memsw_bytes);

    $mem_bytes //= $old_mem_bytes;
    my $memsw_bytes = defined($swap_bytes) ? ($mem_bytes + $swap_bytes) : $old_memsw_bytes;

    my $write_mem = [$path_mem, $mem_bytes];
    my $write_memsw = [$path_memsw, $memsw_bytes];

    # Growing the limit means growing the combined limit first, then pulling the
    # memory limit up. Shrinking means we first need to shrink the memory-only
    # limit, since memsw cannot be shrunk below it.
    my @writes = $memsw_bytes > $old_memsw_bytes
        ? ($write_memsw, $write_mem)
        : ($write_mem, $write_memsw);

    PVE::ProcFSTools::write_proc_entry(@$_) for @writes;

    return 1;
}
345
# Change the cpu quota for a container.
#
# `$quota` may be `undef` for "unlimited". `$period` may only be `undef` if
# `$quota` is `undef` as well.
#
# Dies on error (including a not-running or currently-shutting-down guest).
sub change_cpu_quota {
    my ($self, $quota, $period) = @_;

    die "quota without period not allowed\n" if !defined($period) && defined($quota);

    if (cgroup_mode() == 2) {
        if (defined(my $path = $self->get_path('cpu'))) {
            # cgroupv2 environment, an undefined (unlimited) quota is defined as "max"
            # in this interface:
            $quota //= 'max'; # unlimited

            # The file takes "$MAX $PERIOD". Without a period we're allowed to
            # only write the quota, which at this point can only be 'max' due
            # to the check above.
            my $value = defined($period) ? "$quota $period" : $quota;
            PVE::ProcFSTools::write_proc_entry("$path/cpu.max", $value);
            return 1;
        }
    } elsif (defined(my $path = $self->get_path('cpu'))) {
        $quota //= -1; # unlimited
        # NOTE(review): an undefined period is written as -1 here - verify that
        # the kernel accepts this for cpu.cfs_period_us, otherwise the period
        # write should be skipped in that case.
        $period //= -1;
        PVE::ProcFSTools::write_proc_entry("$path/cpu.cfs_period_us", $period);
        PVE::ProcFSTools::write_proc_entry("$path/cpu.cfs_quota_us", $quota);
        return 1;
    }

    die "trying to change cpu quota cgroup values: container not running\n";
}
377
# Change the cpu "shares" for a container.
#
# In cgroupv1 we used a value in `[0..500000]` with a default of 1024.
#
# In cgroupv2 we do not have "shares", we have "weights" in the range
# of `[1..10000]` with a default of 100.
#
# Since the default values don't match when scaling linearly, we use the
# values we get as-is and simply error for values >10000 in cgroupv2.
#
# It is left to the user to figure this out for now.
#
# If `$shares` is `undef`, the default is used: 100 in cgroupv2, and
# `$cgroupv1_default` (or 1024 if that is not given either) in cgroupv1.
#
# Dies on error (including a not-running or currently-shutting-down guest).
sub change_cpu_shares {
    my ($self, $shares, $cgroupv1_default) = @_;

    if (cgroup_mode() == 2) {
        if (defined(my $path = $self->get_path('cpu'))) {
            # the cgroupv2 documentation defines the default to 100
            $shares //= 100;
            die "cpu weight (shares) must be in range [1, 10000]\n" if $shares < 1 || $shares > 10000;
            PVE::ProcFSTools::write_proc_entry("$path/cpu.weight", $shares);
            return 1;
        }
    } elsif (defined(my $path = $self->get_path('cpu'))) {
        # The cgroupv2 default of 100 must not be applied here, otherwise the
        # caller provided cgroupv1 default would never be used.
        $shares //= $cgroupv1_default // 1024;
        PVE::ProcFSTools::write_proc_entry("$path/cpu.shares", $shares);
        return 1;
    }

    # container most likely isn't running
    die "trying to change cpu shares/weight cgroup values: container not running\n";
}
411
412 1;