]>
Commit | Line | Data |
---|---|---|
80c7e72f WB |
1 | # cgroup handler |
2 | # | |
3 | # This package should deal with figuring out the right cgroup path for a | |
4 | # container (via the command socket), reading and writing cgroup values, and | |
5 | # handling cgroup v1 & v2 differences. | |
6 | # | |
6973a214 | 7 | # Note that the long term plan is to have resource management functions instead of |
80c7e72f WB |
8 | # dealing with cgroup files on the outside. |
9 | ||
10 | package PVE::LXC::CGroup; | |
11 | ||
12 | use strict; | |
13 | use warnings; | |
14 | ||
a7d10aa3 WB |
15 | use POSIX qw(); |
16 | ||
397b1203 | 17 | use PVE::ProcFSTools; |
acb5fabf WB |
18 | use PVE::Tools qw( |
19 | file_get_contents | |
20 | file_read_firstline | |
21 | ); | |
22 | ||
80c7e72f WB |
23 | use PVE::LXC::Command; |
24 | ||
# Every cgroup read or write needs the container's cgroup path. Asking the
# command socket for it each time would be wasteful, so the paths are cached
# on an instance of this package - which is why it has to be instantiated.
#
# LXC keeps a separate path per controller (although they are normally all the
# same, in our case anyway), so the cache is keyed by controller as well.
#
# Takes the container's `$vmid` and returns the new, blessed object.
sub new {
    my ($class, $vmid) = @_;

    return bless { vmid => $vmid }, $class;
}
38 | ||
# Cached result of cpuset_controller_path(), filled in on first success.
my $CPUSET_BASE = undef;

# Locate the mount point of the cpuset cgroup controller.
#
# Each candidate directory is probed for the file that exposes the effective
# cpu set in that layout; the first directory containing its probe file wins
# and is remembered for subsequent calls. Dies if no candidate matches.
#
# This is a function, not a method!
sub cpuset_controller_path() {
    return $CPUSET_BASE if defined($CPUSET_BASE);

    my @candidates = (
        # legacy cpuset cgroup:
        ['/sys/fs/cgroup/cpuset', 'cpuset.effective_cpus'],
        # pure cgroupv2 environment:
        ['/sys/fs/cgroup', 'cpuset.cpus.effective'],
        # hybrid, with cpuset moved to cgroupv2
        ['/sys/fs/cgroup/unified', 'cpuset.cpus.effective'],
    );

    for my $candidate (@candidates) {
        my ($base, $probe_file) = @$candidate;
        next unless -f "$base/$probe_file";

        $CPUSET_BASE = $base;
        return $CPUSET_BASE;
    }

    die "failed to find cpuset controller\n";
}
62 | ||
# Cached result of cgroup_mode(), filled in once a mode could be determined.
my $CGROUP_MODE = undef;

# Determine which cgroup mode the host is operating under.
#
# Returns 1 if cgroupv1 controllers exist (hybrid or legacy mode), and 2 in a
# cgroupv2-only environment. Dies if neither can be determined; in that case
# nothing is cached and the next call probes again.
#
# This is a function, not a method!
sub cgroup_mode() {
    return $CGROUP_MODE if defined($CGROUP_MODE);

    my ($v1, $v2) = PVE::LXC::get_cgroup_subsystems();

    if (keys %$v1) {
        # at least one v1 controller: hybrid or legacy mode
        $CGROUP_MODE = 1;
    } elsif ($v2) {
        $CGROUP_MODE = 2;
    }

    return $CGROUP_MODE if defined($CGROUP_MODE);

    die "unknown cgroup mode\n";
}
84 | ||
# Get the container's cgroup subdirectory (without the cgroup mount point) for
# a controller, querying LXC only when the value is not cached on `$self` yet.
#
# If `$controller` is `undef`, the unified (cgroupv2) path is returned.
# `$limiting` selects which of the two paths is requested from LXC; the two
# variants are cached separately under the keys 'limit' and 'ns'.
#
# Note that in cgroup v2, lxc uses the activated controller names
# (`cgroup.controllers` file) as list of controllers for the unified hierarchy,
# so this returns a result when a `controller` is provided even when using
# a pure cgroupv2 setup.
#
# Returns `undef` if LXC reports no path, and dies if the reported path
# contains `..`.
my sub get_subdir {
    my ($self, $controller, $limiting) = @_;

    my $cache_key = $controller || 'unified';
    my $cache = ($self->{controllers}->{$cache_key} //= {});

    my $variant = $limiting ? 'limit' : 'ns';
    my $cached = $cache->{$variant};
    return $cached if defined($cached);

    my $subdir = PVE::LXC::Command::get_cgroup_path($self->{vmid}, $controller, $limiting);
    return undef if !$subdir;

    # Refuse anything that could escape the cgroup mount point.
    die "lxc returned suspicious path: '$subdir'\n" if $subdir =~ /\.\./;

    # untaint:
    ($subdir) = ($subdir =~ /^(.*)$/s);

    $cache->{$variant} = $subdir;

    return $subdir;
}
120 | ||
# Get the full filesystem path of the container's cgroup for a controller.
#
# `$controller` may be `undef`, in which case the unified (cgroupv2) hierarchy
# is used; see get_subdir for details.
#
# Returns `undef` if no subdirectory could be determined for the container.
sub get_path {
    my ($self, $controller) = @_;

    my $subdir = get_subdir($self, $controller);
    return undef if !$subdir;

    # The main mount point we currently assume to be in a standard location.
    my @parts = ('/sys/fs/cgroup');

    # Only outside of a pure cgroupv2 setup is there a per-hierarchy directory
    # below the mount point: the controller name, or 'unified' for cgroupv2.
    push @parts, $controller // 'unified' if cgroup_mode() != 2;

    return join('/', @parts, $subdir);
}
135 | ||
acb5fabf WB |
# Parse a 'Nested keyed' file:
#
# Every line consists of a key followed by whitespace separated `name=value`
# pairs. The result is a hash ref mapping each line's key to a hash ref of its
# pairs. Tokens that are not of the form `name=value` trigger a warning and are
# skipped.
#
# See kernel documentation `admin-guide/cgroup-v2.rst` 4.1.
my sub parse_nested_keyed_file($) {
    my ($content) = @_;

    my %parsed;
    for my $row (split(/\n/, $content)) {
        my ($name, @pairs) = split(/\s+/, $row);

        my $fields = ($parsed{$name} = {});

        for my $pair (@pairs) {
            if ($pair =~ /^([^=]+)=(.*)$/) {
                $fields->{$1} = $2;
                next;
            }
            warn "bad key=value pair in nested keyed file\n";
        }
    }

    return \%parsed;
}
157 | ||
a7d10aa3 WB |
# Parse a 'Flat keyed' file:
#
# Every line consists of a key, whitespace, and the value (the remainder of
# the line). Returns a hash ref mapping keys to values; lines which do not
# match that shape trigger a warning and are skipped.
#
# See kernel documentation `admin-guide/cgroup-v2.rst` 4.1.
my sub parse_flat_keyed_file($) {
    my ($content) = @_;

    my %parsed;
    for my $row (split(/\n/, $content)) {
        if ($row =~ /^(\S+)\s+(.*)$/) {
            $parsed{$1} = $2;
            next;
        }
        warn "bad 'key value' pair in flat keyed file\n";
    }

    return \%parsed;
}
173 | ||
# Sum up the bytes read and written by this container over all block devices.
#
# Returns a hash ref `{ diskread => BYTES, diskwrite => BYTES }`, or `undef` if
# no cgroup path is available for the relevant controller (controller not
# enabled, or the container is not running).
#
# In a cgroupv2 environment the values come from the `rbytes` and `wbytes`
# fields in `io.stat`, otherwise from the `Read` and `Write` lines in
# `blkio.throttle.io_service_bytes_recursive`.
sub get_io_stats {
    my ($self) = @_;

    my $res = {
        diskread => 0,
        diskwrite => 0,
    };

    if (cgroup_mode() == 2) {
        my $path = $self->get_path('io');

        # io controller not enabled or container not running
        return undef if !defined($path);

        # cgroupv2 environment, io controller enabled
        my $data = parse_nested_keyed_file(file_get_contents("$path/io.stat"));

        for my $dev_stat (values %$data) {
            if (my $read_bytes = $dev_stat->{rbytes}) {
                $res->{diskread} += $read_bytes;
            }
            # Written bytes must be accounted as 'diskwrite', not 'diskread'.
            if (my $written_bytes = $dev_stat->{wbytes}) {
                $res->{diskwrite} += $written_bytes;
            }
        }

        return $res;
    }

    my $path = $self->get_path('blkio');

    # container not running
    return undef if !defined($path);

    # cgroupv1 environment:
    my $io = file_get_contents("$path/blkio.throttle.io_service_bytes_recursive");
    for my $line (split(/\n/, $io)) {
        # Only per-device "MAJ:MIN Read|Write BYTES" lines are of interest.
        if (my ($type, $bytes) = ($line =~ /^\d+:\d+\s+(Read|Write)\s+(\d+)$/)) {
            $res->{diskread} += $bytes if $type eq 'Read';
            $res->{diskwrite} += $bytes if $type eq 'Write';
        }
    }

    return $res;
}
218 | ||
a7d10aa3 WB |
# Read the user and system cpu time consumed by this container.
# Values are in milliseconds!
#
# Returns a hash ref `{ utime => MS, stime => MS }`, or `undef` if no cgroup
# path is available for the controller (most likely the container is not
# running) or, in a cgroupv2 environment, if `cpu.stat` could not be read.
sub get_cpu_stat {
    my ($self) = @_;

    my ($utime, $stime) = (0, 0);

    if (cgroup_mode() == 2) {
        my $path = $self->get_path('cpu');

        # cpu controller not enabled or container not running
        return undef if !defined($path);

        my $raw = eval { file_get_contents("$path/cpu.stat") };

        # the stat file could not be read
        return undef if !defined($raw);

        # cgroupv2 reports microseconds
        my $stat = parse_flat_keyed_file($raw);
        $utime = int($stat->{user_usec} / 1000);
        $stime = int($stat->{system_usec} / 1000);
    } else {
        my $path = $self->get_path('cpuacct');

        # container most likely isn't running
        return undef if !defined($path);

        # cgroupv1 environment: the values are clock ticks, so scale them by
        # the number of milliseconds per tick.
        my $ticks_per_sec = POSIX::sysconf(&POSIX::_SC_CLK_TCK);
        my $msec_per_tick = 1000 / $ticks_per_sec;

        my $stat = parse_flat_keyed_file(file_get_contents("$path/cpuacct.stat"));
        $utime = int($stat->{user} * $msec_per_tick);
        $stime = int($stat->{system} * $msec_per_tick);
    }

    return {
        utime => $utime,
        stime => $stime,
    };
}
258 | ||
8b882cd4 WB |
# Read the current memory and swap usage of this container.
#
# Returns a hash ref `{ mem => BYTES, swap => BYTES }`, or `undef` if no cgroup
# path is available for the memory controller (controller not enabled, or the
# container is not running).
sub get_memory_stat {
    my ($self) = @_;

    my $is_v2 = cgroup_mode() == 2;

    my $path = $self->get_path('memory');

    # memory controller not enabled or container not running
    return undef if !defined($path);

    my ($mem_used, $swap_used);

    if ($is_v2) {
        my $current = file_get_contents("$path/memory.current");
        my $swap_current = file_get_contents("$path/memory.swap.current");
        chomp($current, $swap_current);

        # FIXME: For the cgv1 equivalent of `total_cache` we may need to sum up
        # the values in `memory.stat`...

        ($mem_used, $swap_used) = ($current, $swap_current);
    } else {
        # cgroupv1 environment:
        my $stat = parse_flat_keyed_file(file_get_contents("$path/memory.stat"));
        my $usage = file_get_contents("$path/memory.usage_in_bytes");
        my $usage_with_swap = file_get_contents("$path/memory.memsw.usage_in_bytes");
        chomp($usage, $usage_with_swap);

        # The cache is not counted as used memory, and the memsw counter holds
        # memory and swap combined.
        $mem_used = $usage - $stat->{total_cache};
        $swap_used = $usage_with_swap - $usage;
    }

    return {
        mem => $mem_used,
        swap => $swap_used,
    };
}
300 | ||
397b1203 WB |
# Change the memory limit for this container.
#
# `$mem_bytes` and `$swap_bytes` may each be `undef` to leave that limit
# untouched.
#
# Returns 1 on success.
# Dies on error (including a not-running or currently-shutting-down guest).
sub change_memory_limit {
    my ($self, $mem_bytes, $swap_bytes) = @_;

    my $is_v2 = cgroup_mode() == 2;

    my $path = $self->get_path('memory');
    die "trying to change memory cgroup values: container not running\n"
        if !defined($path);

    if ($is_v2) {
        # cgroupv2 has independent limits for swap and memory.
        if (defined($swap_bytes)) {
            PVE::ProcFSTools::write_proc_entry("$path/memory.swap.max", $swap_bytes);
        }
        if (defined($mem_bytes)) {
            PVE::ProcFSTools::write_proc_entry("$path/memory.max", $mem_bytes);
        }
        return 1;
    }

    # With cgroupv1 we cannot control memory and swap limits separately: there
    # is a memory limit and a combined memory+swap limit. Since the two values
    # aren't independent, growing and shrinking need a different write order.
    my $mem_file = "$path/memory.limit_in_bytes";
    my $memsw_file = "$path/memory.memsw.limit_in_bytes";

    my $cur_mem = file_get_contents($mem_file);
    my $cur_memsw = file_get_contents($memsw_file);
    chomp($cur_mem, $cur_memsw);

    $mem_bytes //= $cur_mem;

    my $memsw_bytes = $cur_memsw;
    $memsw_bytes = $mem_bytes + $swap_bytes if defined($swap_bytes);

    # Default order is for shrinking: lower the memory limit first, since the
    # combined limit cannot be shrunk below it.
    my @writes = (
        [$mem_file, $mem_bytes],
        [$memsw_file, $memsw_bytes],
    );

    # Growing means raising the combined limit first, then pulling the memory
    # limit up.
    @writes = reverse(@writes) if $memsw_bytes > $cur_memsw;

    for my $write (@writes) {
        my ($file, $value) = @$write;
        PVE::ProcFSTools::write_proc_entry($file, $value);
    }

    return 1;
}
345 | ||
26b645e2 WB |
# Change the cpu quota for a container.
#
# `$quota` may be `undef` to remove the limit. `$period` may only be `undef`
# if `$quota` is `undef` as well.
#
# Returns 1 on success.
# Dies on error (including a not-running or currently-shutting-down guest).
sub change_cpu_quota {
    my ($self, $quota, $period) = @_;

    die "quota without period not allowed\n" if !defined($period) && defined($quota);

    if (cgroup_mode() == 2) {
        if (defined(my $path = $self->get_path('cpu'))) {
            # cgroupv2 environment, an undefined (unlimited) quota is defined as "max"
            # in this interface:
            $quota //= 'max'; # unlimited

            # `cpu.max` takes "$MAX $PERIOD"; we're allowed to only write the
            # quota, which is what happens when no period was given (the quota
            # can only be 'max' then, due to the check at the top).
            my $value = defined($period) ? "$quota $period" : $quota;

            PVE::ProcFSTools::write_proc_entry("$path/cpu.max", $value);
            return 1;
        }
    } elsif (defined(my $path = $self->get_path('cpu'))) {
        $quota //= -1; # unlimited
        # NOTE(review): an undefined period is written as -1 here; verify that
        # the cgroupv1 `cpu.cfs_period_us` file accepts this value.
        $period //= -1;
        PVE::ProcFSTools::write_proc_entry("$path/cpu.cfs_period_us", $period);
        PVE::ProcFSTools::write_proc_entry("$path/cpu.cfs_quota_us", $quota);
        return 1;
    }

    die "trying to change cpu quota cgroup values: container not running\n";
}
377 | ||
# Change the cpu "shares" for a container.
#
# In cgroupv1 we used a value in `[0..500000]` with a default of 1024.
#
# In cgroupv2 we do not have "shares", we have "weights" in the range
# of `[1..10000]` with a default of 100.
#
# Since the default values don't match when scaling linearly, we use the
# values we get as-is and simply error for values >10000 in cgroupv2.
#
# It is left to the user to figure this out for now.
#
# If `$shares` is `undef`, the default is applied: 100 in cgroupv2, and in
# cgroupv1 `$cgroupv1_default` if given, otherwise 1024.
#
# Returns 1 on success.
# Dies on error (including a not-running or currently-shutting-down guest).
sub change_cpu_shares {
    my ($self, $shares, $cgroupv1_default) = @_;

    if (cgroup_mode() == 2) {
        if (defined(my $path = $self->get_path('cpu'))) {
            # the cgroupv2 documentation defines the default to 100
            $shares //= 100;
            die "cpu weight (shares) must be in range [1, 10000]\n" if $shares < 1 || $shares > 10000;
            PVE::ProcFSTools::write_proc_entry("$path/cpu.weight", $shares);
            return 1;
        }
    } elsif (defined(my $path = $self->get_path('cpu'))) {
        # Do not apply the cgroupv2 default of 100 here, otherwise the
        # caller's cgroupv1 default could never take effect.
        my $value = $shares // $cgroupv1_default // 1024;
        PVE::ProcFSTools::write_proc_entry("$path/cpu.shares", $value);
        return 1;
    }

    # container most likely isn't running
    die "trying to change cpu shares/weight cgroup values: container not running\n";
}
411 | ||
80c7e72f | 412 | 1; |