]>
Commit | Line | Data |
---|---|---|
86dff11c AD |
1 | # cgroup handler |
2 | # | |
3 | # This package should deal with figuring out the right cgroup path for a | |
4 | # container (via the command socket), reading and writing cgroup values, and | |
5 | # handling cgroup v1 & v2 differences. | |
6 | # | |
7 | # Note that the long term plan is to have resource management functions instead of | |
8 | # dealing with cgroup files on the outside. | |
9 | ||
10 | package PVE::CGroup; | |
11 | ||
12 | use strict; | |
13 | use warnings; | |
14 | ||
15 | use IO::File; | |
16 | use IO::Select; | |
17 | use POSIX qw(); | |
18 | ||
19 | use PVE::ProcFSTools; | |
20 | use PVE::Tools qw( | |
21 | file_get_contents | |
22 | file_read_firstline | |
23 | ); | |
24 | ||
86dff11c AD |
# We don't want to do a command socket round trip for every cgroup read/write,
# so any cgroup function needs to have the container's path cached, which is
# why this package has to be instantiated.
#
# LXC keeps separate paths by controller (although they're normally all the
# same, in our case anyway), so we cache them by controller as well.
#
# Creates a new instance for the guest `$vmid`. The freshly created object
# only stores that id under the `vmid` key.
sub new {
    my ($class, $vmid) = @_;

    return bless { vmid => $vmid }, $class;
}
38 | ||
# Get the v1 controller list.
#
# Parses `/proc/self/cgroup`, where each line has the form
# `<id>:<controller[,controller...]>:<path>`. A line with an empty controller
# field denotes the unified (cgroupv2) hierarchy.
#
# Returns a set (hash mapping names to `1`) of cgroupv1 controllers, and, in
# list context, additionally a boolean telling whether a unified (cgroupv2)
# hierarchy exists.
#
# Deprecated: Use `get_cgroup_controllers()` instead.
sub get_v1_controllers {
    my %v1_set;
    my $has_unified = 0;

    my $content = PVE::Tools::file_get_contents('/proc/self/cgroup');
    for my $line (split(/\n/, $content)) {
        next if $line !~ /^\d+:([^:\n]*):.*$/;

        my $controllers = $1;
        if (!length($controllers)) {
            $has_unified = 1;
            next;
        }

        # Co-mounted controllers share one line, separated by commas.
        $v1_set{$_} = 1 for split(/,/, $controllers);
    }

    return \%v1_set if !wantarray;
    return (\%v1_set, $has_unified);
}
59 | ||
# Get the set of v2 controllers from the `cgroup.controllers` file.
#
# The unified-only mount point is tried first, then the hybrid layout's
# `unified` sub-mount. Read errors are swallowed by the `eval`s.
#
# Returns a set (hash mapping names to `1`), or `undef` if neither file
# yielded anything.
my sub get_v2_controllers {
    my $list = eval { file_get_contents('/sys/fs/cgroup/cgroup.controllers') };
    $list ||= eval { file_get_contents('/sys/fs/cgroup/unified/cgroup.controllers') };
    return undef if !defined $list;

    # It's a simple space separated list:
    my %set = map { $_ => 1 } split(/\s+/, $list);
    return \%set;
}
69 | ||
# Cache for `get_cgroup_controllers()`: `[ $v1_set, $v2_set ]` once filled.
my $CGROUP_CONTROLLERS = undef;
# Get a list of controllers enabled in each cgroup subsystem.
#
# This is a more complete version of `PVE::LXC::get_cgroup_subsystems`.
#
# Returns 2 sets (hashes mapping controller names to `1`), one for each cgroup
# version. The result is computed on the first call and cached afterwards; the
# v2 entry is whatever `get_v2_controllers()` returned (possibly `undef`).
sub get_cgroup_controllers() {
    return @$CGROUP_CONTROLLERS if defined($CGROUP_CONTROLLERS);

    # Only the controller set is of interest, not the "has unified" flag.
    my ($v1_set) = get_v1_controllers();
    my $v2_set = get_v2_controllers();
    $CGROUP_CONTROLLERS = [$v1_set, $v2_set];

    return @$CGROUP_CONTROLLERS;
}
87 | ||
# Cache for `cgroup_mode()`.
my $CGROUP_MODE = undef;
# Figure out which cgroup mode we're operating under:
#
# Returns 1 if cgroupv1 controllers exist (hybrid or legacy mode), and 2 in a
# cgroupv2-only environment. Dies if neither could be detected.
#
# NOTE: To fully support a hybrid layout it is better to use functions like
# `cpuset_controller_path`.
#
# This is a function, not a method!
sub cgroup_mode() {
    return $CGROUP_MODE if defined($CGROUP_MODE);

    my ($v1, $v2) = get_cgroup_controllers();
    if (scalar(keys %$v1)) {
        # hybrid or legacy mode
        $CGROUP_MODE = 1;
    } elsif ($v2) {
        $CGROUP_MODE = 2;
    } else {
        die "unknown cgroup mode\n";
    }

    return $CGROUP_MODE;
}
112 | ||
# Cache for `cgroupv2_base_path()`.
my $CGROUPV2_PATH = undef;
# Get the mount point of the cgroupv2 hierarchy.
#
# In a cgroupv2-only environment (`cgroup_mode() == 2`) this is
# `/sys/fs/cgroup`, otherwise the `unified` sub-mount is used.
#
# This is a function, not a method!
sub cgroupv2_base_path() {
    $CGROUPV2_PATH //= (cgroup_mode() == 2)
        ? '/sys/fs/cgroup'
        : '/sys/fs/cgroup/unified';

    return $CGROUPV2_PATH;
}
124 | ||
# Find a cgroup controller and return its path and version.
#
# LXC initializes the unified hierarchy first, so if a controller is
# available via both we favor cgroupv2 here as well.
#
# An undefined `$controller` always resolves to the cgroupv2 base path.
#
# Returns the path in scalar context, `($path, $version)` in list context, and
# nothing if the controller is not available.
sub find_cgroup_controller($) {
    my ($controller) = @_;

    my ($v1, $v2) = get_cgroup_controllers();

    my ($path, $version);
    if (!defined($controller) || $v2->{$controller}) {
        $path = cgroupv2_base_path();
        $version = 2;
    } elsif ($v1->{$controller}) {
        $path = "/sys/fs/cgroup/$controller";
        $version = 1;
    } else {
        return;
    }

    return $path if !wantarray;
    return ($path, $version);
}
149 | ||
# Cache for `cpuset_controller_path()`: controller path and cgroup version.
my $CG_PATH_CPUSET = undef;
my $CG_VER_CPUSET = undef;
# Find the cpuset cgroup controller.
#
# Returns the path in scalar context and `($path, $version)` in list context.
# Dies if no cpuset controller can be found.
#
# This is a function, not a method!
sub cpuset_controller_path() {
    if (!defined($CG_PATH_CPUSET)) {
        my @found = find_cgroup_controller('cpuset');
        die "failed to find cpuset controller\n" if !@found;

        ($CG_PATH_CPUSET, $CG_VER_CPUSET) = @found;
    }

    return $CG_PATH_CPUSET if !wantarray;
    return ($CG_PATH_CPUSET, $CG_VER_CPUSET);
}
163 | ||
# Get a subdirectory (without the cgroup mount point) for a controller.
#
# This base implementation only dies; subclasses have to override it. The other
# methods in this package call it as `get_subdir($controller, $limiting)`,
# treat a false return value as "container not running", and may pass an
# undefined `$controller`.
sub get_subdir {
    my ($self, $controller, $limiting) = @_;

    die("implement in subclass");
}
170 | ||
# Get path and version for a controller.
#
# `$controller` may be `undef`, in which case the cgroupv2 base path is used
# as mount point. `$limiting` is passed through to `get_subdir` unchanged.
#
# Returns either just the path, or the path and cgroup version as a tuple.
# Returns `undef` if the controller is unavailable or `get_subdir` yields
# nothing.
sub get_path {
    my ($self, $controller, $limiting) = @_;

    # Find the controller before asking `get_subdir` for the guest's
    # sub-directory, so an unavailable controller is rejected first:
    my @found = find_cgroup_controller($controller);
    return undef if !@found;
    my ($mount_point, $version) = @found;

    my $subdir = $self->get_subdir($controller, $limiting);
    return undef if !$subdir;

    my $full_path = "$mount_point/$subdir";
    return $full_path if !wantarray;
    return ($full_path, $version);
}
188 | ||
# Convenience method to get the path info of the first existing controller.
#
# Tries `@controllers` in order via `get_path` and stops at the first one that
# yields a defined path.
#
# Returns the same as `get_path`; if no controller matched, the path (and in
# list context also the version) is `undef`.
sub get_any_path {
    my ($self, $limiting, @controllers) = @_;

    for my $name (@controllers) {
        my ($path, $ver) = $self->get_path($name, $limiting);
        next if !defined $path;

        return wantarray ? ($path, $ver) : $path;
    }

    return wantarray ? (undef, undef) : undef;
}
202 | ||
# Parse a 'Nested keyed' file:
#
# Each line consists of a key followed by whitespace separated `sub=value`
# pairs. Returns a hash mapping each line's key to a hash of its pairs. Fields
# that are not of the form `sub=value` are skipped with a warning.
#
# See kernel documentation `admin-guide/cgroup-v2.rst` 4.1.
my sub parse_nested_keyed_file($) {
    my ($data) = @_;

    my %parsed;
    for my $line (split(/\n/, $data)) {
        my ($name, @pairs) = split(/\s+/, $line);

        my $entry = $parsed{$name} = {};

        for my $pair (@pairs) {
            if ($pair =~ /^([^=]+)=(.*)$/) {
                $entry->{$1} = $2;
            } else {
                warn "bad key=value pair in nested keyed file\n";
            }
        }
    }

    return \%parsed;
}
224 | ||
# Parse a 'Flat keyed' file:
#
# Each line is a key, whitespace, and the value (the rest of the line).
# Returns a hash mapping keys to values. Lines not matching that shape are
# skipped with a warning.
#
# See kernel documentation `admin-guide/cgroup-v2.rst` 4.1.
my sub parse_flat_keyed_file($) {
    my ($data) = @_;

    my %parsed;
    for my $line (split(/\n/, $data)) {
        if ($line =~ /^(\S+)\s+(.*)$/) {
            $parsed{$1} = $2;
        } else {
            warn "bad 'key value' pair in flat keyed file\n";
        }
    }

    return \%parsed;
}
240 | ||
# Parse out 'diskread' and 'diskwrite' values from I/O stats for this container.
#
# Returns a hash `{ diskread => $bytes, diskwrite => $bytes }` summed over all
# devices, or `undef` if no I/O controller path could be determined (container
# not running). Dies on an unexpected cgroup version or if the stat file
# cannot be read.
sub get_io_stats {
    my ($self) = @_;

    my $res = {
        diskread => 0,
        diskwrite => 0,
    };

    # With cgroupv1 we have a 'blkio' controller, with cgroupv2 it's just 'io':
    my ($path, $ver) = $self->get_any_path(1, 'io', 'blkio');

    # container not running
    return undef if !defined($path);

    if ($ver == 2) {
        # cgroupv2 environment, io controller enabled
        my $per_device = parse_nested_keyed_file(file_get_contents("$path/io.stat"));

        for my $dev_stats (values %$per_device) {
            $res->{diskread} += $dev_stats->{rbytes} || 0;
            # written bytes belong to 'diskwrite', not 'diskread'
            $res->{diskwrite} += $dev_stats->{wbytes} || 0;
        }
    } elsif ($ver == 1) {
        # cgroupv1 environment:
        my $io = file_get_contents("$path/blkio.throttle.io_service_bytes_recursive");
        for my $line (split(/\n/, $io)) {
            # lines look like `<major>:<minor> <Read|Write|...> <bytes>`
            if (my ($type, $bytes) = ($line =~ /^\d+:\d+\s+(Read|Write)\s+(\d+)$/)) {
                $res->{diskread} += $bytes if $type eq 'Read';
                $res->{diskwrite} += $bytes if $type eq 'Write';
            }
        }
    } else {
        die "bad cgroup version: $ver\n";
    }

    return $res;
}
289 | ||
# Read utime and stime for this container from the cpu/cpuacct cgroup.
# Values are in milliseconds!
#
# Returns a hash `{ utime => $ms, stime => $ms }`, or `undef` if no controller
# path could be determined (container not running) or, on cgroupv2, if
# `cpu.stat` could not be read.
sub get_cpu_stat {
    my ($self) = @_;

    my ($path, $ver) = $self->get_any_path(1, 'cpuacct', 'cpu');

    # container not running
    return undef if !defined($path);

    my ($utime, $stime);
    if ($ver == 2) {
        my $raw = eval { file_get_contents("$path/cpu.stat") };

        # stat file not readable:
        return undef if !defined($raw);

        # cgroupv2 reports microseconds
        my $stat = parse_flat_keyed_file($raw);
        $utime = int($stat->{user_usec} / 1000);
        $stime = int($stat->{system_usec} / 1000);
    } elsif ($ver == 1) {
        # cgroupv1 environment, values are in clock ticks:
        my $ticks_per_sec = POSIX::sysconf(POSIX::_SC_CLK_TCK());
        my $ms_per_tick = 1000 / $ticks_per_sec;

        my $stat = parse_flat_keyed_file(file_get_contents("$path/cpuacct.stat"));
        $utime = int($stat->{user} * $ms_per_tick);
        $stime = int($stat->{system} * $ms_per_tick);
    } else {
        die "bad cgroup version: $ver\n";
    }

    return { utime => $utime, stime => $stime };
}
327 | ||
# Read the current memory and swap usage of this container.
#
# Returns a hash `{ mem => $bytes, swap => $bytes }`, or `undef` if the memory
# controller path could not be determined (container most likely isn't
# running). Dies on an unexpected cgroup version or unreadable files.
sub get_memory_stat {
    my ($self) = @_;

    my ($path, $ver) = $self->get_path('memory', 1);

    # container most likely isn't running
    return undef if !defined($path);

    my ($mem_used, $swap_used);
    if ($ver == 2) {
        chomp($mem_used = file_get_contents("$path/memory.current"));
        chomp($swap_used = file_get_contents("$path/memory.swap.current"));

        # FIXME: For the cgv1 equivalent of `total_cache` we may need to sum up
        # the values in `memory.stat`...
    } elsif ($ver == 1) {
        # cgroupv1 environment:
        my $stat = parse_flat_keyed_file(file_get_contents("$path/memory.stat"));
        chomp(my $usage = file_get_contents("$path/memory.usage_in_bytes"));
        chomp(my $memsw = file_get_contents("$path/memory.memsw.usage_in_bytes"));

        # usage includes the page cache, memsw is memory plus swap
        $mem_used = $usage - $stat->{total_cache};
        $swap_used = $memsw - $usage;
    } else {
        die "bad cgroup version: $ver\n";
    }

    return { mem => $mem_used, swap => $swap_used };
}
367 | ||
# Change the memory limit for this container.
#
# `$mem_bytes` and `$swap_bytes` may each be `undef` to leave that limit
# unchanged.
#
# Returns 1 on success.
# Dies on error (including a not-running or currently-shutting-down guest).
sub change_memory_limit {
    my ($self, $mem_bytes, $swap_bytes) = @_;

    my ($path, $ver) = $self->get_path('memory', 1);
    die "trying to change memory cgroup values: container not running\n"
        if !defined($path);

    if ($ver == 2) {
        if (defined($swap_bytes)) {
            PVE::ProcFSTools::write_proc_entry("$path/memory.swap.max", $swap_bytes);
        }
        if (defined($mem_bytes)) {
            PVE::ProcFSTools::write_proc_entry("$path/memory.max", $mem_bytes);
        }
    } elsif ($ver == 1) {
        # With cgroupv1 we cannot control memory and swap limits separately.
        # This also means that since the two values aren't independent, we need
        # to handle growing and shrinking separately.
        my $file_mem = "$path/memory.limit_in_bytes";
        my $file_memsw = "$path/memory.memsw.limit_in_bytes";

        chomp(my $cur_mem = file_get_contents($file_mem));
        chomp(my $cur_memsw = file_get_contents($file_memsw));

        $mem_bytes //= $cur_mem;
        $swap_bytes //= $cur_memsw - $cur_mem;
        my $memsw_bytes = $mem_bytes + $swap_bytes;

        # Growing means raising the combined limit first, then pulling the
        # memory limit up. Shrinking needs the reverse order, as memsw cannot
        # be shrunk below the memory-only limit.
        my @writes = ([$file_memsw, $memsw_bytes], [$file_mem, $mem_bytes]);
        @writes = reverse(@writes) if !($memsw_bytes > $cur_memsw);

        PVE::ProcFSTools::write_proc_entry($_->[0], $_->[1]) for @writes;
    } else {
        die "bad cgroup version: $ver\n";
    }

    # return a truth value
    return 1;
}
415 | ||
# Change the cpu quota for a container.
#
# `$quota` and `$period` are passed to the kernel as-is. An undefined `$quota`
# means "unlimited". A `$quota` without a `$period` is rejected.
#
# Returns 1 on success.
# Dies on error (including a not-running or currently-shutting-down guest).
sub change_cpu_quota {
    my ($self, $quota, $period) = @_;

    die "quota without period not allowed\n" if !defined($period) && defined($quota);

    my ($path, $ver) = $self->get_path('cpu', 1);
    if (!defined($path)) {
        die "trying to change cpu quota cgroup values: container not running\n";
    } elsif ($ver == 2) {
        # cgroupv2 environment, an undefined (unlimited) quota is defined as "max"
        # in this interface:
        $quota //= 'max'; # unlimited
        if (defined($period)) {
            PVE::ProcFSTools::write_proc_entry("$path/cpu.max", "$quota $period");
        } else {
            # we're allowed to only write the quota (which can only be 'max'
            # here, see the check above); never interpolate an undefined period:
            PVE::ProcFSTools::write_proc_entry("$path/cpu.max", $quota);
        }
    } elsif ($ver == 1) {
        $quota //= -1; # unlimited
        # -1 is only valid for the quota; fall back to the kernel's default
        # period of 100ms instead of writing an invalid value:
        $period //= 100_000;
        PVE::ProcFSTools::write_proc_entry("$path/cpu.cfs_period_us", $period);
        PVE::ProcFSTools::write_proc_entry("$path/cpu.cfs_quota_us", $quota);
    } else {
        die "bad cgroup version: $ver\n";
    }

    # return a truth value
    return 1;
}
449 | ||
# Change the cpu "shares" for a container.
#
# In cgroupv1 we used a value in `[0..500000]` with a default of 1024.
#
# In cgroupv2 we do not have "shares", we have "weights" in the range
# of `[1..10000]` with a default of 100.
#
# Since the default values don't match when scaling linearly, we use the
# values we get as-is and simply error for values >10000 in cgroupv2.
#
# It is left to the user to figure this out for now.
#
# An undefined `$shares` selects the default: 100 on cgroupv2, and on cgroupv1
# the caller supplied `$cgroupv1_default`, or 1024 if that is undefined too.
#
# Returns 1 on success.
# Dies on error (including a not-running or currently-shutting-down guest).
sub change_cpu_shares {
    my ($self, $shares, $cgroupv1_default) = @_;

    my ($path, $ver) = $self->get_path('cpu', 1);
    if (!defined($path)) {
        die "trying to change cpu shares/weight cgroup values: container not running\n";
    } elsif ($ver == 2) {
        # the cgroupv2 documentation defines the default to 100
        $shares //= 100;
        die "cpu weight (shares) must be in range [1, 10000]\n" if $shares < 1 || $shares > 10000;
        PVE::ProcFSTools::write_proc_entry("$path/cpu.weight", $shares);
    } elsif ($ver == 1) {
        # honour the caller's default before falling back to the built-in one
        $shares //= $cgroupv1_default // 1024;
        PVE::ProcFSTools::write_proc_entry("$path/cpu.shares", $shares);
    } else {
        die "bad cgroup version: $ver\n";
    }

    # return a truth value
    return 1;
}
484 | ||
# Freeze or thaw a container via the cgroupv1 freezer controller.
#
# `$controller_path` is the mount point of the freezer controller, a true
# `$freeze` requests freezing, a false one thawing. Blocks until
# `freezer.state` reports the requested state.
#
# Dies if the container's freezer sub-directory cannot be determined or a
# file access fails.
my sub v1_freeze_thaw {
    my ($self, $controller_path, $freeze) = @_;
    my $path = $self->get_subdir('freezer', 1)
        or die "trying to freeze container: container not running\n";
    $path = "$controller_path/$path/freezer.state";

    my $data = $freeze ? 'FROZEN' : 'THAWED';
    PVE::ProcFSTools::write_proc_entry($path, $data);

    # Here we just poll the freezer.state once per second.
    while (1) {
        my $state = file_get_contents($path);
        chomp $state;
        last if $state eq $data;

        # wait between polls instead of spinning on the file
        sleep(1);
    }
}
501 | ||
# Freeze or thaw a container via the cgroupv2 `cgroup.freeze` interface.
#
# `$controller_path` is the cgroupv2 mount point, a true `$freeze` requests
# freezing, a false one thawing. Blocks until the `frozen` key in
# `cgroup.events` reports the requested state.
#
# Dies if the container's sub-directory cannot be determined or a file access
# fails.
my sub v2_freeze_thaw {
    my ($self, $controller_path, $freeze) = @_;
    my $path = $self->get_subdir(undef, 1)
        or die "trying to freeze container: container not running\n";
    $path = "$controller_path/$path";

    my $desired_state = $freeze ? 1 : 0;

    # cgroupv2 supports poll events on cgroup.events which contains the frozen
    # state. Open it before writing so that no state change is missed.
    my $fh = IO::File->new("$path/cgroup.events", 'r')
        or die "failed to open $path/cgroup.events file: $!\n";
    my $select = IO::Select->new();
    $select->add($fh);

    PVE::ProcFSTools::write_proc_entry("$path/cgroup.freeze", $desired_state);
    while (1) {
        my $content = do {
            local $/ = undef;
            <$fh>
        };
        my $events = parse_flat_keyed_file($content // '');

        # Require the key to be present: an undefined value would compare
        # equal to 0 and falsely signal a completed thaw.
        last if defined($events->{frozen}) && $events->{frozen} == $desired_state;

        # Wait for a change notification. Whatever the outcome (an empty
        # result is possible if the wait got interrupted), rewind before
        # reading again so we never parse stale end-of-file data.
        $select->has_exception();
        seek($fh, 0, 0)
            or die "failed to rewind cgroup.events file: $!\n";
    }
}
531 | ||
# Freeze or unfreeze a container.
#
# This will freeze the container at its outer (limiting) cgroup path. We use
# this instead of `lxc-freeze` as `lxc-freeze` from lxc4 will not be able to
# fetch the cgroup path from containers still running on lxc3.
#
# A true `$freeze` freezes the container, a false one thaws it.
sub freeze_thaw {
    my ($self, $freeze) = @_;

    # cgroupv2 always has a freezer, there can be both cgv1 and cgv2
    # freezers, but we'll prefer v1 when it's available as that's what lxc
    # does as well...
    my $freezer_path = find_cgroup_controller('freezer');
    return v1_freeze_thaw($self, $freezer_path, $freeze)
        if defined($freezer_path);

    return v2_freeze_thaw($self, cgroupv2_base_path(), $freeze);
}
550 | ||
551 | 1; |