]>
Commit | Line | Data |
---|---|---|
a0649da2 AD |
1 | package PVE::QemuServer::Memory; |
2 | ||
3 | use strict; | |
4 | use warnings; | |
71aba4ea | 5 | |
0e32bf5b | 6 | use PVE::JSONSchema qw(parse_property_string); |
7023f3ea | 7 | use PVE::Tools qw(run_command lock_file lock_file_full file_read_firstline dir_glob_foreach); |
9a1c413f | 8 | use PVE::Exception qw(raise raise_param_exc); |
a0649da2 | 9 | |
0e32bf5b | 10 | use PVE::QemuServer::Helpers qw(parse_number_sets); |
0a13e08e | 11 | use PVE::QemuServer::Monitor qw(mon_cmd); |
87b0f305 | 12 | use PVE::QemuServer::QMPHelpers qw(qemu_devicedel qemu_objectdel); |
71aba4ea | 13 | |
7f8c8087 AD |
14 | use base qw(Exporter); |
15 | ||
16 | our @EXPORT_OK = qw( | |
17 | get_current_memory | |
18 | ); | |
19 | ||
0e32bf5b FE |
# Highest number of 'numaN' config entries that are looked at (numa0 .. numa7).
our $MAX_NUMA = 8;

# Property-string schema of a single 'numaN' config entry.
my $numa_fmt = {
    # list of guest CPU ids / id ranges, separated by ';'
    cpus => {
        type               => 'string',
        pattern            => qr/\d+(?:-\d+)?(?:;\d+(?:-\d+)?)*/,
        description        => 'CPUs accessing this NUMA node.',
        format_description => 'id[-id];...',
    },
    memory => {
        type        => 'number',
        description => 'Amount of memory this NUMA node provides.',
        optional    => 1,
    },
    # list of host NUMA node ids / id ranges, same syntax as 'cpus'
    hostnodes => {
        type               => 'string',
        pattern            => qr/\d+(?:-\d+)?(?:;\d+(?:-\d+)?)*/,
        description        => 'Host NUMA nodes to use.',
        format_description => 'id[-id];...',
        optional           => 1,
    },
    policy => {
        type        => 'string',
        enum        => [ 'preferred', 'bind', 'interleave' ],
        description => 'NUMA allocation policy.',
        optional    => 1,
    },
};
PVE::JSONSchema::register_format('pve-qm-numanode', $numa_fmt);

# Standard option wrapping the format above.
our $numadesc = {
    optional    => 1,
    type        => 'string',
    format      => $numa_fmt,
    description => 'NUMA topology.',
};
PVE::JSONSchema::register_standard_option('pve-qm-numanode', $numadesc);
55 | ||
# Parse a 'numaN' property string into a hash reference. When present, the
# 'cpus' and 'hostnodes' values are replaced by the result of
# parse_number_sets() on their string form.
sub parse_numa {
    my ($data) = @_;

    my $numa = parse_property_string($numa_fmt, $data);

    for my $key (qw(cpus hostnodes)) {
        next if !defined($numa->{$key});
        $numa->{$key} = parse_number_sets($numa->{$key});
    }

    return $numa;
}
64 | ||
# Base amount of non-hotpluggable memory used when memory hotplug is enabled;
# the rest of the configured memory is provided through DIMM devices.
my $STATICMEM = 1024;

# Property-string schema of the 'memory' config option.
our $memory_fmt = {
    current => {
        type        => 'integer',
        default_key => 1,
        minimum     => 16,
        default     => 512,
        description => "Current amount of online RAM for the VM in MiB. This is the maximum available memory when"
            ." you use the balloon device.",
    },
};
77 | ||
# Serialize a parsed memory hash (as returned by parse_memory) back into its
# property-string representation.
sub print_memory {
    my ($memory) = @_;

    return PVE::JSONSchema::print_property_string($memory, $memory_fmt);
}
83 | ||
# Parse the 'memory' config value into a hash reference. An undefined value
# yields a hash holding only the schema default for 'current'.
sub parse_memory {
    my ($value) = @_;

    my $parsed = defined($value)
        ? PVE::JSONSchema::parse_property_string($memory_fmt, $value)
        : { current => $memory_fmt->{current}->{default} };

    return $parsed;
}
93 | ||
# Cache for get_host_phys_address_bits(); stays undef until a value was found.
my $_host_bits;

# Return the number of physical address bits reported by the first
# "address sizes" line in /proc/cpuinfo. The value is cached after the first
# successful lookup. Returns nothing (undef in scalar context) if the file
# cannot be opened or contains no such line.
sub get_host_phys_address_bits {
    return $_host_bits if defined($_host_bits);

    open(my $cpuinfo, '<', '/proc/cpuinfo') or return;

    my $found;
    while (defined(my $line = <$cpuinfo>)) {
        # hopefully we never need to care about mixed (big.LITTLE) archs
        next if $line !~ m/^address sizes\s*:\s*(\d+)\s*bits physical/i;
        $found = int($1);
        last;
    }
    close($cpuinfo);

    return if !defined($found); # undef, cannot really do anything..

    $_host_bits = $found;
    return $_host_bits;
}
110 | ||
# Compute the maximum amount of memory (in MB) a VM with the given config may
# have, based on the number of physical address bits of its CPU.
#
# The bit count comes from the 'phys-bits' CPU property when set ('host' means
# the host's value). Otherwise the host's value is used (36 if unknown), and
# limited to 40 unless the CPU type is 'host' or 'max'.
# The result never exceeds 4*1024*1024.
my sub get_max_mem {
    my ($conf) = @_;

    my $cpu = {};
    if (my $cpu_prop_str = $conf->{cpu}) {
        $cpu = PVE::JSONSchema::parse_property_string('pve-vm-cpu-conf', $cpu_prop_str)
            or die "Cannot parse cpu description: $cpu_prop_str\n";
    }

    my $bits;
    my $phys_bits = $cpu->{'phys-bits'};
    if ($phys_bits) {
        if ($phys_bits eq 'host') {
            $bits = get_host_phys_address_bits();
        } elsif ($phys_bits =~ /^\d+$/) {
            $bits = int($phys_bits);
        }
    }

    if (!defined($bits)) {
        my $host_bits = get_host_phys_address_bits() // 36; # fixme: what fallback?
        my $follows_host = $cpu->{cputype} && $cpu->{cputype} =~ /^(host|max)$/;
        # non-host CPU models get at most 40 bits
        $bits = ($follows_host || $host_bits <= 40) ? $host_bits : 40;
    }

    $bits &= ~1; # round down to nearest even as limit is lower with odd bit sizes

    # heuristic: remove 20 bits to get MB and half that as QEMU needs some overhead
    my $limit_from_bits = int(1 << ($bits - 21));
    my $hard_limit = 4 * 1024 * 1024;

    return $hard_limit if $limit_from_bits > $hard_limit;
    return $limit_from_bits;
}
144 | ||
7f8c8087 AD |
# Return the 'current' memory amount encoded in a 'memory' config value
# (the schema default when the value is undefined, see parse_memory).
sub get_current_memory {
    my ($value) = @_;

    return parse_memory($value)->{current};
}
151 | ||
aaff69ad WB |
# Return the list of guest NUMA node ids. These are the indexes of all
# non-empty 'numaN' config entries that parse successfully; if there are none,
# one node per socket (0 .. sockets-1) is returned instead.
sub get_numa_node_list {
    my ($conf) = @_;

    my @configured;
    for my $node (0 .. $MAX_NUMA - 1) {
        my $entry = $conf->{"numa$node"};
        next if !$entry;

        my $numa = parse_numa($entry);
        push @configured, $node if $numa;
    }
    return @configured if @configured;

    my $sockets = $conf->{sockets} || 1;
    return (0..($sockets-1));
}
164 | ||
2166f6a9 AD |
# Check whether the host exposes a NUMA node with the given id, i.e. whether
# its directory exists below /sys/devices/system/node/.
sub host_numanode_exists {
    my ($id) = @_;

    my $node_dir = "/sys/devices/system/node/node$id/";
    return -d $node_dir;
}
170 | ||
aaff69ad WB |
# Return a hash reference mapping guest NUMA node ids to host NUMA nodes.
# Custom 'numaN' entries map to their printed 'hostnodes' value; without any
# custom entry every socket index maps to the host node of the same id.
#
# only valid when numa nodes map to a single host node
sub get_numa_guest_to_host_map {
    my ($conf) = @_;

    my %host_of;
    for my $guest_node (0 .. $MAX_NUMA - 1) {
        my $entry = $conf->{"numa$guest_node"} or next;
        my $numa = parse_numa($entry) or next;
        $host_of{$guest_node} = print_numa_hostnodes($numa->{hostnodes});
    }
    return \%host_of if %host_of;

    my $sockets = $conf->{sockets} || 1;
    my %identity = map { ($_ => $_) } (0..($sockets-1));
    return \%identity;
}
184 | ||
# Walk over the DIMMs needed to grow from $static_memory up to $memory and
# call $func for each one with:
#   ($conf, $vmid, $name, $dimm_size, $numanode, $current_size, $memory)
# where $current_size already includes the DIMM being visited.
#
# DIMMs are handed out in up to 8 rounds of 32 modules; the module size starts
# at 512 (1024 with 1024 hugepages) and doubles after every round. NUMA nodes
# are assigned round-robin within a round. Returns the reached size as soon as
# it is >= $memory, and nothing if no DIMM is needed at all.
sub foreach_dimm{
    my ($conf, $vmid, $memory, $static_memory, $func) = @_;

    my $current_size = $static_memory;
    my $dimm_size = ($conf->{hugepages} && $conf->{hugepages} == 1024) ? 1024 : 512;

    return if $current_size == $memory;

    my @numa_map = get_numa_node_list($conf);

    my $dimm_id = 0;
    for my $round (0 .. 7) {
        for my $slot (0 .. 31) {
            my $name = "dimm${dimm_id}";
            $dimm_id++;

            my $numanode = $numa_map[$slot % @numa_map];
            $current_size += $dimm_size;

            $func->($conf, $vmid, $name, $dimm_size, $numanode, $current_size, $memory);
            return $current_size if $current_size >= $memory;
        }
        $dimm_size *= 2;
    }
}
214 | ||
# Change the memory of a running VM to $value by plugging or unplugging DIMMs.
#
# Returns $value unchanged if the VM is not running locally or the amount does
# not change. Otherwise the config is updated and written after every
# successfully (un)plugged module, and the resulting 'memory' config value is
# returned. Dies if the requested amount is below the static memory, above the
# maximum for this VM, or if adding a module fails.
sub qemu_memory_hotplug {
    my ($vmid, $conf, $value) = @_;

    return $value if !PVE::QemuServer::Helpers::vm_running_locally($vmid);

    my $oldmem = parse_memory($conf->{memory});
    my $newmem = parse_memory($value);

    return $value if $newmem->{current} == $oldmem->{current};

    my $memory = $oldmem->{current};
    $value = $newmem->{current};

    my $sockets = $conf->{sockets} || 1;
    my $use_1g_pages = $conf->{hugepages} && $conf->{hugepages} == 1024;
    my $static_memory = $use_1g_pages ? $STATICMEM * $sockets : $STATICMEM;

    die "memory can't be lower than $static_memory MB" if $value < $static_memory;
    my $MAX_MEM = get_max_mem($conf);
    die "you cannot add more memory than max mem $MAX_MEM MB!\n" if $value > $MAX_MEM;

    if ($value <= $memory) {
        # shrinking: unplug DIMMs, highest dimm index first
        my $dimms = qemu_memdevices_list($vmid, 'dimm');
        my $dimm_index = sub { return ($_[0] =~ /^dimm(\d+)$/)[0] };

        my $current_size = $memory;
        for my $name (sort { $dimm_index->($b) <=> $dimm_index->($a) } keys %$dimms) {
            last if $current_size <= $value;

            my $dimm_size = $dimms->{$name}->{size} / 1024 / 1024;

            print "try to unplug memory dimm $name\n";

            # errors of the delete request itself are ignored; what counts is
            # whether the device disappears from the device list
            for (my $retry = 0; ; $retry++) {
                eval { qemu_devicedel($vmid, $name) };
                sleep 3;
                my $dimm_list = qemu_memdevices_list($vmid, 'dimm');
                last if !$dimm_list->{$name};
                raise_param_exc({ $name => "error unplug memory module" }) if $retry > 5;
            }
            $current_size -= $dimm_size;

            # update conf after each successful module unplug
            $newmem->{current} = $current_size;
            $conf->{memory} = print_memory($newmem);

            eval { qemu_objectdel($vmid, "mem-$name"); };
            PVE::QemuConfig->write_config($vmid, $conf);
        }

        return $conf->{memory};
    }

    # growing: plug every DIMM beyond the currently configured size
    my $numa_hostmap;

    my $plug_dimm = sub {
        my ($conf, $vmid, $name, $dimm_size, $numanode, $current_size) = @_;

        # this module is already part of the running VM
        return if $current_size <= get_current_memory($conf->{memory});

        my $backend_id = "mem-$name";
        my $add_err;

        if ($conf->{hugepages}) {
            $numa_hostmap = get_numa_guest_to_host_map($conf) if !$numa_hostmap;

            my $hugepages_size = hugepages_size($conf, $dimm_size);
            my $path = hugepages_mount_path($hugepages_size);
            my $host_numanode = $numa_hostmap->{$numanode};
            my $pages = hugepages_nr($dimm_size, $hugepages_size);
            my $hugepages_topology = { $hugepages_size => { $host_numanode => $pages } };

            my $code = sub {
                my $hugepages_host_topology = hugepages_host_topology();
                hugepages_allocate($hugepages_topology, $hugepages_host_topology);

                eval {
                    mon_cmd(
                        $vmid,
                        "object-add",
                        'qom-type' => "memory-backend-file",
                        id => $backend_id,
                        size => int($dimm_size*1024*1024),
                        'mem-path' => $path,
                        share => JSON::true,
                        prealloc => JSON::true,
                    );
                };
                if (my $err = $@) {
                    # restore the host's previous hugepage counts
                    hugepages_reset($hugepages_host_topology);
                    die $err;
                }

                hugepages_pre_deallocate($hugepages_topology);
            };
            eval { hugepages_update_locked($code); };
            $add_err = $@;
        } else {
            eval {
                mon_cmd(
                    $vmid,
                    "object-add",
                    'qom-type' => "memory-backend-ram",
                    id => $backend_id,
                    size => int($dimm_size*1024*1024),
                );
            };
            $add_err = $@;
        }

        if ($add_err) {
            eval { qemu_objectdel($vmid, $backend_id); };
            die $add_err;
        }

        eval {
            mon_cmd(
                $vmid,
                "device_add",
                driver => "pc-dimm",
                id => "$name",
                memdev => $backend_id,
                node => $numanode,
            );
        };
        if (my $err = $@) {
            # do not leave the unused backend object behind
            eval { qemu_objectdel($vmid, $backend_id); };
            die $err;
        }

        # update conf after each successful module hotplug
        $newmem->{current} = $current_size;
        $conf->{memory} = print_memory($newmem);
        PVE::QemuConfig->write_config($vmid, $conf);
    };

    foreach_dimm($conf, $vmid, $value, $static_memory, $plug_dimm);

    return $conf->{memory};
}
320 | ||
1e28e8ba AD |
# Query the memory devices of a running VM and return them as a hash
# reference keyed by device id; each value holds the id, node, addr, size and
# slot of the device. With $type set, only devices whose id is $type followed
# by a number are returned.
sub qemu_memdevices_list {
    my ($vmid, $type) = @_;

    my $devices = mon_cmd($vmid, "query-memory-devices");

    my $dimms = {};
    for my $device (@$devices) {
        my $data = $device->{data};
        my $id = $data->{id};
        next if $type && $id !~ /^$type(\d+)$/;

        $dimms->{$id} = { map { ($_ => $data->{$_}) } qw(id node addr size slot) };
    }

    return $dimms;
}
337 | ||
# Append the QEMU command line arguments for the VM's memory setup to @$cmd:
#  * the '-m' option,
#  * with NUMA enabled, one memory backend object and '-numa' node per guest
#    NUMA node, either from the custom 'numaN' entries or, without those, by
#    splitting static memory and cores evenly over $sockets nodes,
#  * with $hotplug set, the backend object and pc-dimm device of every DIMM
#    needed to reach the configured memory size.
# Dies on inconsistent or unsupported settings.
sub config {
    my ($conf, $vmid, $sockets, $cores, $hotplug, $cmd) = @_;

    my $memory = get_current_memory($conf->{memory});
    my $static_memory;

    if (!$hotplug) {
        $static_memory = $memory;
        push @$cmd, '-m', $static_memory;
    } else {
        die "NUMA needs to be enabled for memory hotplug\n" if !$conf->{numa};

        my $MAX_MEM = get_max_mem($conf);
        die "Total memory is bigger than ${MAX_MEM}MB\n" if $memory > $MAX_MEM;

        for my $i (0 .. $MAX_NUMA - 1) {
            die "cannot enable memory hotplugging with custom NUMA topology\n"
                if $conf->{"numa$i"};
        }

        # the socket count for the static part is read from the config here;
        # the $sockets argument is only used for the NUMA split further down
        my $conf_sockets = $conf->{sockets} || 1;
        my $use_1g_pages = $conf->{hugepages} && $conf->{hugepages} == 1024;
        $static_memory = $use_1g_pages ? $STATICMEM * $conf_sockets : $STATICMEM;

        die "minimum memory must be ${static_memory}MB\n" if($memory < $static_memory);
        push @$cmd, '-m', "size=${static_memory},slots=255,maxmem=${MAX_MEM}M";
    }

    die "numa needs to be enabled to use hugepages" if $conf->{hugepages} && !$conf->{numa};

    if ($conf->{numa}) {
        my $numa_totalmemory = undef;

        # custom topology: one backend object and guest node per 'numaN' entry
        for my $i (0 .. $MAX_NUMA - 1) {
            my $entry = $conf->{"numa$i"};
            next if !$entry;
            my $numa = parse_numa($entry);
            next if !$numa;

            # memory
            my $numa_memory = $numa->{memory};
            die "missing NUMA node$i memory value\n" if !$numa_memory;
            $numa_totalmemory += $numa_memory;

            my $mem_object = print_mem_object($conf, "ram-node$i", $numa_memory);

            # cpus
            my $cpulists = $numa->{cpus};
            die "missing NUMA node$i cpus\n" if !defined($cpulists);
            my @cpu_ranges;
            for my $range (@$cpulists) {
                my ($start, $end) = @$range;
                push @cpu_ranges, defined($end) ? "$start-$end" : $start;
            }
            my $cpus = join(',cpus=', @cpu_ranges);

            # hostnodes
            my $hostnodelists = $numa->{hostnodes};
            if (!defined($hostnodelists)) {
                die "numa hostnodes need to be defined to use hugepages" if $conf->{hugepages};
            } else {
                my $hostnodes = print_numa_hostnodes($hostnodelists);

                # policy
                my $policy = $numa->{policy};
                die "you need to define a policy for hostnode $hostnodes\n" if !$policy;
                $mem_object .= ",host-nodes=$hostnodes,policy=$policy";
            }

            push @$cmd, '-object', $mem_object;
            push @$cmd, '-numa', "node,nodeid=$i,cpus=$cpus,memdev=ram-node$i";
        }

        die "total memory for NUMA nodes must be equal to vm static memory\n"
            if $numa_totalmemory && $numa_totalmemory != $static_memory;

        # if no custom topology, we split memory and cores across numa nodes
        if (!$numa_totalmemory) {
            my $numa_memory = ($static_memory / $sockets);

            for my $i (0 .. $sockets - 1) {
                die "host NUMA node$i doesn't exist\n"
                    if !host_numanode_exists($i) && $conf->{hugepages};

                my $mem_object = print_mem_object($conf, "ram-node$i", $numa_memory);
                push @$cmd, '-object', $mem_object;

                # node $i gets the cores [$i * $cores, ($i + 1) * $cores - 1]
                my $cpus = ($cores * $i);
                $cpus .= "-" . ($cpus + $cores - 1) if $cores > 1;

                push @$cmd, '-numa', "node,nodeid=$i,cpus=$cpus,memdev=ram-node$i";
            }
        }
    }

    if ($hotplug) {
        foreach_dimm($conf, $vmid, $memory, $static_memory, sub {
            my ($conf, $vmid, $name, $dimm_size, $numanode, $current_size, $memory) = @_;

            my $mem_object = print_mem_object($conf, "mem-$name", $dimm_size);

            push @$cmd, "-object", $mem_object;
            push @$cmd, "-device", "pc-dimm,id=$name,memdev=mem-$name,node=$numanode";

            # the last DIMM overshoots if the size is not reachable exactly
            die "memory size ($memory) must be aligned to $dimm_size for hotplugging\n"
                if $current_size > $memory;
        });
    }
}
446 | ||
7023f3ea AD |
# Build the value of a QEMU '-object' option for a memory backend named $id
# providing $size MB: a file backend on the matching hugepages mount when
# hugepages are configured, a plain RAM backend otherwise.
sub print_mem_object {
    my ($conf, $id, $size) = @_;

    return "memory-backend-ram,id=$id,size=${size}M" if !$conf->{hugepages};

    my $hugepages_size = hugepages_size($conf, $size);
    my $path = hugepages_mount_path($hugepages_size);

    return "memory-backend-file,id=$id,size=${size}M,mem-path=$path,share=on,prealloc=yes";
}
461 | ||
ac7b7087 AD |
# Render a parsed host node list (array of [start] or [start, end] entries)
# in the comma separated form used for QEMU's 'host-nodes' property, e.g.
# "0,2-3". Dies if any referenced host NUMA node does not exist.
# Returns undef for an empty list.
sub print_numa_hostnodes {
    my ($hostnodelists) = @_;

    my @ranges;
    foreach my $hostnoderange (@$hostnodelists) {
        my ($start, $end) = @$hostnoderange;

        # Collect the pieces and join them afterwards. Deciding on the
        # separator by the truthiness of the string built so far loses the
        # comma after a leading node "0", because "0" is false in Perl.
        push @ranges, defined($end) ? "$start-$end" : "$start";

        $end //= $start;
        for (my $i = $start; $i <= $end; ++$i ) {
            die "host NUMA node$i doesn't exist\n" if !host_numanode_exists($i);
        }
    }

    return @ranges ? join(',', @ranges) : undef;
}
478 | ||
7023f3ea AD |
# Make sure a hugetlbfs instance is mounted below /run/hugepages/kvm/ for
# every hugepage size (2048kB, 1048576kB) the kernel exposes in sysfs.
# Missing mount points are created and mounted.
sub hugepages_mount {

    my $mountdata = PVE::ProcFSTools::parse_proc_mounts();

    foreach my $size (qw(2048 1048576)) {
        # skip page sizes this kernel does not offer
        next if (! -d "/sys/kernel/mm/hugepages/hugepages-${size}kB");

        my $path = "/run/hugepages/kvm/${size}kB";

        my $already_mounted = grep {
            $_->[2] =~ /^hugetlbfs/ && $_->[1] eq $path
        } @$mountdata;
        next if $already_mounted;

        File::Path::make_path($path) if (!-d $path);

        my $cmd = ['/bin/mount', '-t', 'hugetlbfs', '-o', "pagesize=${size}k", 'hugetlbfs', $path];
        run_command($cmd, errmsg => "hugepage mount error");
    }
}
501 | ||
# Return the hugetlbfs mount point used for hugepages of $size MB.
sub hugepages_mount_path {
    my ($size) = @_;

    my $size_kb = $size * 1024;
    return "/run/hugepages/kvm/${size_kb}kB";
}
509 | ||
# Return how many hugepages of $hugepages_size are needed to cover $size
# (both given in the same unit).
sub hugepages_nr {
    my ($size, $hugepages_size) = @_;

    my $page_count = $size / $hugepages_size;
    return $page_count;
}
515 | ||
2166f6a9 AD |
# Check whether the host kernel offers hugepages of $size MB, i.e. whether the
# matching directory exists below /sys/kernel/mm/hugepages/.
sub hugepages_chunk_size_supported {
    my ($size) = @_;

    my $size_kb = $size * 1024;
    return -d "/sys/kernel/mm/hugepages/hugepages-${size_kb}kB";
}
521 | ||
# Select the hugepage size (in MB, either 2 or 1024) used to back a memory
# region of $size MB.
#
# With hugepages set to 'any', 1024 MB pages are preferred when the host
# supports them and $size is a multiple of 1024; otherwise 2 MB pages are
# used. With an explicit page size, that size is returned after checking that
# the host supports it and that $size is a multiple of it.
# Dies if hugepages are not enabled, $size is not a positive even integer, or
# no suitable page size is available.
sub hugepages_size {
    my ($conf, $size) = @_;
    die "hugepages option is not enabled" if !$conf->{hugepages};
    die "memory size '$size' is not a positive even integer; cannot use for hugepages\n"
        if $size <= 0 || $size & 1;

    die "your system doesn't support hugepages\n"
        if !hugepages_chunk_size_supported(2) && !hugepages_chunk_size_supported(1024);

    if ($conf->{hugepages} eq 'any') {

        # try to use 1GB if available && memory size is matching
        if (hugepages_chunk_size_supported(1024) && ($size & 1023) == 0) {
            return 1024;
        } elsif (hugepages_chunk_size_supported(2)) {
            return 2;
        } else {
            # only reached when the host offers 1024 MB pages but no 2 MB ones
            die "host only supports 1024 MB hugepages, but requested size '$size' is not a multiple of 1024 MB\n"
        }
    } else {

        my $hugepagesize = $conf->{hugepages};

        if (!hugepages_chunk_size_supported($hugepagesize)) {
            die "your system doesn't support hugepages of $hugepagesize MB\n";
        } elsif (($size % $hugepagesize) != 0) {
            die "Memory size $size is not a multiple of the requested hugepages size $hugepagesize\n";
        }

        return $hugepagesize
    }
}
554 | ||
# Compute how many hugepages the VM needs, as a hash reference of the form
#   { <hugepage size in MB> => { <host NUMA node> => <number of pages> } }
#
# The static memory is accounted per custom 'numaN' entry or, without custom
# topology, split evenly over the sockets. With $hotplug set, the DIMMs needed
# to reach the configured memory are added on top.
# Returns nothing if NUMA is not enabled in the config.
sub hugepages_topology {
    my ($conf, $hotplug) = @_;

    my $hugepages_topology = {};

    return if !$conf->{numa};

    my $memory = get_current_memory($conf->{memory});
    my $static_memory = 0;
    my $sockets = $conf->{sockets} || 1;
    my $numa_custom_topology = undef;

    if ($hotplug) {
        $static_memory = $STATICMEM;
        $static_memory = $static_memory * $sockets if ($conf->{hugepages} && $conf->{hugepages} == 1024);
    } else {
        $static_memory = $memory;
    }

    # custom numa topology
    for (my $i = 0; $i < $MAX_NUMA; $i++) {
        next if !$conf->{"numa$i"};
        my $numa = parse_numa($conf->{"numa$i"});
        next if !$numa;

        $numa_custom_topology = 1;
        my $numa_memory = $numa->{memory};
        my $hostnodelists = $numa->{hostnodes};
        my $hostnodes = print_numa_hostnodes($hostnodelists);

        # Exactly one host node is allowed per guest node. Node ids may have
        # more than one digit, so accept any plain number, and additionally
        # require a single list entry so that only a lone id passes.
        my $single_hostnode = defined($hostnodes)
            && $hostnodes =~ m/^\d+$/
            && scalar(@$hostnodelists) == 1;
        die "more than 1 hostnode value in numa node is not supported when hugepages are enabled"
            if !$single_hostnode;

        my $hugepages_size = hugepages_size($conf, $numa_memory);
        $hugepages_topology->{$hugepages_size}->{$hostnodes} += hugepages_nr($numa_memory, $hugepages_size);
    }

    # if no custom numa topology, we split memory and cores across numa nodes
    if (!$numa_custom_topology) {

        my $numa_memory = ($static_memory / $sockets);

        for (my $i = 0; $i < $sockets; $i++) {
            my $hugepages_size = hugepages_size($conf, $numa_memory);
            $hugepages_topology->{$hugepages_size}->{$i} += hugepages_nr($numa_memory, $hugepages_size);
        }
    }

    if ($hotplug) {
        my $numa_hostmap = get_numa_guest_to_host_map($conf);

        foreach_dimm($conf, undef, $memory, $static_memory, sub {
            my ($conf, undef, $name, $dimm_size, $numanode, $current_size, $memory) = @_;

            # pages are accounted on the host node backing the guest node
            $numanode = $numa_hostmap->{$numanode};

            my $hugepages_size = hugepages_size($conf, $dimm_size);
            $hugepages_topology->{$hugepages_size}->{$numanode} += hugepages_nr($dimm_size, $hugepages_size);
        });
    }

    return $hugepages_topology;
}
618 | ||
# Read the host's current hugepage pools from sysfs and return them as
#   { <hugepage size in MB> => { <host NUMA node> => <nr_hugepages> } }
sub hugepages_host_topology {

    my $host_topology = {};

    # NOTE(review): dir_glob_foreach presumably calls back with the matching
    # entry name and the regex capture - confirm against PVE::Tools.
    my $scan_node = sub {
        my ($nodepath, $numanode) = @_;

        my $pool_dir = "/sys/devices/system/node/$nodepath/hugepages/";

        dir_glob_foreach($pool_dir, 'hugepages\-(\d+)kB', sub {
            my ($pool_name, $size_kb) = @_;

            my $size_mb = $size_kb / 1024;
            my $nr = PVE::Tools::file_read_firstline("${pool_dir}${pool_name}/nr_hugepages");
            $host_topology->{$size_mb}->{$numanode} = $nr;
        });
    };

    dir_glob_foreach("/sys/devices/system/node/", 'node(\d+)', $scan_node);

    return $host_topology;
}
638 | ||
# Make sure the host has enough free hugepages for the given topology
# ({ size in MB => { host node => pages } }). Where the free pages of a node
# do not suffice, its nr_hugepages is raised by the missing amount. If the
# pages are still not available afterwards, the host is reset to
# $hugepages_host_topology and the function dies.
sub hugepages_allocate {
    my ($hugepages_topology, $hugepages_host_topology) = @_;

    # allocate new hugepages if needed
    for my $size (sort keys %$hugepages_topology) {
        my $size_kb = $size * 1024;
        my $requested_on = $hugepages_topology->{$size};

        for my $numanode (keys %$requested_on) {
            my $requested = $requested_on->{$numanode};
            my $sysfs = "/sys/devices/system/node/node${numanode}/hugepages/hugepages-${size_kb}kB/";

            my $free = PVE::Tools::file_read_firstline($sysfs."free_hugepages");
            my $total = PVE::Tools::file_read_firstline($sysfs."nr_hugepages");

            next if !($requested > $free);

            my $missing = $requested - $free;
            PVE::ProcFSTools::write_proc_entry($sysfs."nr_hugepages", $total + $missing);

            # verify that is correctly allocated
            $free = PVE::Tools::file_read_firstline($sysfs."free_hugepages");
            next if !($free < $requested);

            # rollback to initial host config
            hugepages_reset($hugepages_host_topology);
            die "hugepage allocation failed";
        }
    }

}
671 | ||
ca0ef6b1 KT |
# Return the number of hugepages of $size MB (2 or 1024) that were requested
# on the kernel command line, or 0 if there is no such request.
#
# A 'hugepages=<n>' argument applies to the page size selected by the closest
# preceding 'hugepagesz=2M' / 'hugepagesz=1G' argument (2M if there is none).
sub hugepages_default_nr_hugepages {
    my ($size) = @_;

    my $cmdline = PVE::Tools::file_read_firstline("/proc/cmdline");
    my $args = PVE::Tools::split_args($cmdline);

    my $parsed_size = 2; # default is 2M

    foreach my $arg (@$args) {
        if ($arg eq "hugepagesz=2M") {
            $parsed_size = 2;
        } elsif ($arg eq "hugepagesz=1G") {
            $parsed_size = 1024;
        } elsif ($arg =~ m/^hugepages=(\d+)$/) {
            # The count is mandatory: a bare 'hugepages=' carries no number
            # and must not make this function return undef.
            return $1 if $parsed_size == $size;
        }
    }

    return 0;
}
694 | ||
7023f3ea AD |
695 | sub hugepages_pre_deallocate { |
696 | my ($hugepages_topology) = @_; | |
697 | ||
698 | foreach my $size (sort keys %$hugepages_topology) { | |
699 | ||
700 | my $hugepages_size = $size * 1024; | |
701 | my $path = "/sys/kernel/mm/hugepages/hugepages-${hugepages_size}kB/"; | |
ca0ef6b1 KT |
702 | my $hugepages_nr = hugepages_default_nr_hugepages($size); |
703 | PVE::ProcFSTools::write_proc_entry($path."nr_hugepages", $hugepages_nr); | |
7023f3ea AD |
704 | } |
705 | } | |
706 | ||
# Write the per-node hugepage counts of the given topology
# ({ size in MB => { host node => nr_hugepages } }) back to sysfs.
sub hugepages_reset {
    my ($hugepages_topology) = @_;

    for my $size (sort keys %$hugepages_topology) {
        my $size_kb = $size * 1024;
        my $count_on = $hugepages_topology->{$size};

        for my $numanode (keys %$count_on) {
            my $sysfs = "/sys/devices/system/node/node${numanode}/hugepages/hugepages-${size_kb}kB/";
            PVE::ProcFSTools::write_proc_entry($sysfs."nr_hugepages", $count_on->{$numanode});
        }
    }
}
723 | ||
# Run $code (with @param) through lock_file() on the hugepages lock file and
# return its result. Re-throws the error if $@ is set afterwards.
sub hugepages_update_locked {
    my ($code, @param) = @_;

    # could be long if a lot of hugepages need to be allocated
    my $timeout = 60;

    my $res = lock_file("/var/lock/hugepages.lck", $timeout, $code, @param);
    die $@ if $@;

    return $res;
}
3f669af2 | 736 | 1; |
a0649da2 | 737 |