]>
Commit | Line | Data |
---|---|---|
a0649da2 AD |
1 | package PVE::QemuServer::Memory; |
2 | ||
3 | use strict; | |
4 | use warnings; | |
71aba4ea | 5 | |
7023f3ea | 6 | use PVE::Tools qw(run_command lock_file lock_file_full file_read_firstline dir_glob_foreach); |
9a1c413f | 7 | use PVE::Exception qw(raise raise_param_exc); |
a0649da2 | 8 | |
71aba4ea | 9 | use PVE::QemuServer; |
0a13e08e | 10 | use PVE::QemuServer::Monitor qw(mon_cmd); |
71aba4ea | 11 | |
# Number of guest NUMA node slots scanned in the VM config (numa0 .. numa7).
my $MAX_NUMA = 8;

# Upper bound for the total VM memory, in MB (also passed to QEMU as maxmem).
my $MAX_MEM = 4194304;

# Base amount of memory, in MB, that is not backed by hot-pluggable DIMMs when
# memory hotplug is enabled.
my $STATICMEM = 1024;
a0649da2 | 15 | |
aaff69ad WB |
# Return the list of guest NUMA node ids for the given VM config.
#
# Custom topology entries (numa0, numa1, ...) take precedence: every index whose
# entry is set and accepted by PVE::QemuServer::parse_numa is returned, in
# ascending order. Without any such entry one node per socket is assumed
# ('sockets' defaults to 1), i.e. 0 .. sockets-1.
sub get_numa_node_list {
    my ($conf) = @_;

    my @node_ids;
    for my $idx (0 .. $MAX_NUMA - 1) {
        my $raw = $conf->{"numa$idx"};
        next if !$raw;
        next if !PVE::QemuServer::parse_numa($raw);
        push @node_ids, $idx;
    }
    return @node_ids if @node_ids;

    my $socket_count = $conf->{sockets} || 1;
    return (0 .. ($socket_count - 1));
}
28 | ||
# Build a hash reference mapping guest NUMA node id => host node specification.
#
# only valid when numa nodes map to a single host node
#
# For every parsable numaN entry the value is whatever print_numa_hostnodes
# returns for that entry's 'hostnodes'. Without any custom entry each socket
# index maps to the host node of the same number ('sockets' defaults to 1).
sub get_numa_guest_to_host_map {
    my ($conf) = @_;

    my %host_of;
    for my $idx (0 .. $MAX_NUMA - 1) {
        my $raw = $conf->{"numa$idx"};
        next if !$raw;
        my $parsed = PVE::QemuServer::parse_numa($raw);
        next if !$parsed;
        $host_of{$idx} = print_numa_hostnodes($parsed->{hostnodes});
    }
    return \%host_of if %host_of;

    my $socket_count = $conf->{sockets} || 1;
    return +{ map { $_ => $_ } (0 .. ($socket_count - 1)) };
}
42 | ||
3f669af2 AD |
# Walk the virtual DIMM layout upwards until $memory (MB) is reached.
#
# The layout starts at the static memory size (1024 MB, or 1024 MB per socket
# with 1 GB hugepages) and consists of up to 8 groups of 32 DIMMs; the DIMM
# size starts at 512 MB (1024 MB with 1 GB hugepages) and doubles per group.
# DIMMs are spread round-robin over the guest NUMA nodes.
#
# For each DIMM $func is called as
#   $func->($conf, $vmid, $name, $dimm_size, $numanode, $current_size, $memory)
# where $current_size is the total size including that DIMM.
#
# Returns the reached total size once it is >= $memory, and nothing when
# $memory equals the static size or the layout is exhausted.
sub foreach_dimm {
    my ($conf, $vmid, $memory, $sockets, $func) = @_;

    my $dimm_id = 0;
    my $current_size;
    my $dimm_size;

    # 'hugepages' may hold the non-numeric value 'any', so compare as string to
    # avoid an "isn't numeric" warning.
    if ($conf->{hugepages} && $conf->{hugepages} eq '1024') {
        $current_size = 1024 * $sockets;
        $dimm_size = 1024;
    } else {
        $current_size = 1024;
        $dimm_size = 512;
    }

    return if $current_size == $memory;

    my @numa_map = get_numa_node_list($conf);

    for my $group (0 .. 7) {
        for my $slot (0 .. 31) {
            my $name = "dimm${dimm_id}";
            $dimm_id++;
            my $numanode = $numa_map[$slot % @numa_map];
            $current_size += $dimm_size;
            $func->($conf, $vmid, $name, $dimm_size, $numanode, $current_size, $memory);
            return $current_size if $current_size >= $memory;
        }
        $dimm_size *= 2;
    }

    return;
}
74 | ||
# Walk the virtual DIMM layout downwards (highest DIMM first) until $memory
# (MB) is reached. This is the counterpart of foreach_dimm used for unplugging.
#
# The walk starts at dimm253 with the total size of a fully populated layout
# (4177920 MB with 65536 MB DIMMs, or 8355840 MB with 131072 MB DIMMs for 1 GB
# hugepages) and covers up to 8 groups of 32 DIMMs, halving the DIMM size
# after each group.
#
# For each DIMM $func is called as
#   $func->($conf, $vmid, $name, $dimm_size, $numanode, $current_size, $memory)
# where $current_size is the total size without that DIMM.
#
# Returns the reached total size once it is <= $memory, and nothing when
# $memory equals the start size or the layout is exhausted.
sub foreach_reverse_dimm {
    my ($conf, $vmid, $memory, $sockets, $func) = @_;

    my $dimm_id = 253;
    my $current_size;
    my $dimm_size;

    # 'hugepages' may hold the non-numeric value 'any', so compare as string to
    # avoid an "isn't numeric" warning.
    if ($conf->{hugepages} && $conf->{hugepages} eq '1024') {
        $current_size = 8355840;
        $dimm_size = 131072;
    } else {
        $current_size = 4177920;
        $dimm_size = 65536;
    }

    return if $current_size == $memory;

    my @numa_map = get_numa_node_list($conf);

    for my $group (0 .. 7) {
        for my $slot (0 .. 31) {
            my $name = "dimm${dimm_id}";
            $dimm_id--;
            my $numanode = $numa_map[(31 - $slot) % @numa_map];
            $current_size -= $dimm_size;
            $func->($conf, $vmid, $name, $dimm_size, $numanode, $current_size, $memory);
            return $current_size if $current_size <= $memory;
        }
        $dimm_size /= 2;
    }

    return;
}
106 | ||
6779f1ac AD |
# Hot-plug or hot-unplug memory of a running VM until it matches $value (MB).
#
# Returns $value unchanged when the VM is not running or already has that
# amount of memory. Otherwise DIMMs are plugged (foreach_dimm) or unplugged
# (foreach_reverse_dimm) one at a time; $conf->{memory} is updated and the
# config is written after every successful step. The value returned by the
# respective DIMM iterator is passed on to the caller.
#
# $defaults supplies the fallback memory size; $opt is accepted but not used.
#
# Dies if $value is below the static memory size or above $MAX_MEM, and if
# plugging or unplugging a DIMM fails.
sub qemu_memory_hotplug {
    my ($vmid, $conf, $defaults, $opt, $value) = @_;

    return $value if !PVE::QemuServer::check_running($vmid);

    my $sockets = 1;
    $sockets = $conf->{sockets} if $conf->{sockets};

    my $memory = $conf->{memory} || $defaults->{memory};
    $value = $defaults->{memory} if !$value;
    return $value if $value == $memory;

    my $static_memory = $STATICMEM;
    # 'hugepages' may be the non-numeric value 'any', hence the string compare
    $static_memory = $static_memory * $sockets
        if $conf->{hugepages} && $conf->{hugepages} eq '1024';

    die "memory can't be lower than $static_memory MB" if $value < $static_memory;
    # the limit applies to the requested size, not to the current one
    die "you cannot add more memory than $MAX_MEM MB!\n" if $value > $MAX_MEM;

    if ($value > $memory) {

        # declared separately: 'my ... if ...' has undefined behaviour in Perl
        my $numa_hostmap;
        $numa_hostmap = get_numa_guest_to_host_map($conf) if $conf->{hugepages};

        return foreach_dimm($conf, $vmid, $value, $sockets, sub {
            my ($conf, $vmid, $name, $dimm_size, $numanode, $current_size, $memory) = @_;

            # skip DIMMs that are already part of the current memory size
            return if $current_size <= $conf->{memory};

            if ($conf->{hugepages}) {

                my $hugepages_size = hugepages_size($conf, $dimm_size);
                my $path = hugepages_mount_path($hugepages_size);
                my $host_numanode = $numa_hostmap->{$numanode};
                my $hugepages_count = hugepages_nr($dimm_size, $hugepages_size);
                my $hugepages_topology = {
                    $hugepages_size => { $host_numanode => $hugepages_count },
                };

                my $code = sub {
                    my $hugepages_host_topology = hugepages_host_topology();
                    hugepages_allocate($hugepages_topology, $hugepages_host_topology);

                    eval { mon_cmd($vmid, "object-add", 'qom-type' => "memory-backend-file", id => "mem-$name", props => {
                        size => int($dimm_size*1024*1024), 'mem-path' => $path, share => JSON::true, prealloc => JSON::true } ); };
                    if (my $err = $@) {
                        # restore the host hugepage counts recorded before allocating
                        hugepages_reset($hugepages_host_topology);
                        die $err;
                    }

                    hugepages_pre_deallocate($hugepages_topology);
                };
                eval { hugepages_update_locked($code); };

            } else {
                eval { mon_cmd($vmid, "object-add", 'qom-type' => "memory-backend-ram", id => "mem-$name", props => { size => int($dimm_size*1024*1024) } ) };
            }

            # $@ stems from whichever eval ran in the branch above
            if (my $err = $@) {
                eval { PVE::QemuServer::qemu_objectdel($vmid, "mem-$name"); };
                die $err;
            }

            eval { mon_cmd($vmid, "device_add", driver => "pc-dimm", id => "$name", memdev => "mem-$name", node => $numanode) };
            if (my $err = $@) {
                eval { PVE::QemuServer::qemu_objectdel($vmid, "mem-$name"); };
                die $err;
            }
            # update conf after each successful module hotplug
            $conf->{memory} = $current_size;
            PVE::QemuConfig->write_config($vmid, $conf);
        });
    }

    return foreach_reverse_dimm($conf, $vmid, $value, $sockets, sub {
        my ($conf, $vmid, $name, $dimm_size, $numanode, $current_size, $memory) = @_;

        # skip DIMMs above the current memory size, they are not plugged
        return if $current_size >= $conf->{memory};
        print "try to unplug memory dimm $name\n";

        my $retry = 0;
        while (1) {
            eval { PVE::QemuServer::qemu_devicedel($vmid, $name) };
            sleep 3;
            my $dimm_list = qemu_dimm_list($vmid);
            last if !$dimm_list->{$name};
            raise_param_exc({ $name => "error unplug memory module" }) if $retry > 5;
            $retry++;
        }

        # update conf after each successful module unplug
        $conf->{memory} = $current_size;

        eval { PVE::QemuServer::qemu_objectdel($vmid, "mem-$name"); };
        PVE::QemuConfig->write_config($vmid, $conf);
    });
}
201 | ||
# Query the memory devices of a running VM via the 'query-memory-devices'
# monitor command.
#
# Returns a hash reference keyed by device id; each value is a hash with the
# id, node, addr, size and slot fields copied from the device's 'data' entry.
sub qemu_dimm_list {
    my ($vmid) = @_;

    my $devices = mon_cmd($vmid, "query-memory-devices");

    my %info_of;
    for my $device (@$devices) {
        my $data = $device->{data};
        my $id = $data->{id};
        for my $field (qw(id node addr size slot)) {
            $info_of{$id}->{$field} = $data->{$field};
        }
    }

    return \%info_of;
}
218 | ||
0567a4d5 AD |
# Append the memory related QEMU command line arguments to @$cmd.
#
# Parameters:
#   $conf             - VM config hash
#   $vmid             - VM id (used when the config has to be written back)
#   $sockets, $cores  - CPU topology used to spread memory/CPUs over NUMA nodes
#   $defaults         - config defaults (fallback for 'memory')
#   $hotplug_features - hash of enabled hotplug features ('memory' is read)
#   $cmd              - array reference the arguments are pushed onto
#
# With memory hotplug only the static memory is passed via '-m' and the rest is
# added as pc-dimm devices; otherwise the full memory size is used. With NUMA
# enabled a memory backend object and a '-numa' node are emitted per node,
# either from the custom numaN entries or by splitting evenly over the sockets.
#
# Dies on inconsistent settings (see the individual messages below).
sub config {
    my ($conf, $vmid, $sockets, $cores, $defaults, $hotplug_features, $cmd) = @_;

    my $memory = $conf->{memory} || $defaults->{memory};
    my $static_memory = 0;

    if ($hotplug_features->{memory}) {
        die "NUMA needs to be enabled for memory hotplug\n" if !$conf->{numa};
        die "Total memory is bigger than ${MAX_MEM}MB\n" if $memory > $MAX_MEM;

        for (my $i = 0; $i < $MAX_NUMA; $i++) {
            die "cannot enable memory hotplugging with custom NUMA topology\n"
                if $conf->{"numa$i"};
        }

        # socket count as configured; kept apart from the $sockets parameter
        my $conf_sockets = $conf->{sockets} || 1;

        $static_memory = $STATICMEM;
        # 'hugepages' may be the non-numeric value 'any', hence the string compare
        $static_memory = $static_memory * $conf_sockets
            if $conf->{hugepages} && $conf->{hugepages} eq '1024';

        die "minimum memory must be ${static_memory}MB\n" if $memory < $static_memory;
        push @$cmd, '-m', "size=${static_memory},slots=255,maxmem=${MAX_MEM}M";

    } else {

        $static_memory = $memory;
        push @$cmd, '-m', $static_memory;
    }

    die "numa needs to be enabled to use hugepages" if $conf->{hugepages} && !$conf->{numa};

    if ($conf->{numa}) {

        my $numa_totalmemory = undef;
        for (my $i = 0; $i < $MAX_NUMA; $i++) {
            next if !$conf->{"numa$i"};
            my $numa = PVE::QemuServer::parse_numa($conf->{"numa$i"});
            next if !$numa;
            # memory
            die "missing NUMA node$i memory value\n" if !$numa->{memory};
            my $numa_memory = $numa->{memory};
            $numa_totalmemory += $numa_memory;

            my $mem_object = print_mem_object($conf, "ram-node$i", $numa_memory);

            # cpus
            my $cpulists = $numa->{cpus};
            die "missing NUMA node$i cpus\n" if !defined($cpulists);
            my $cpus = join(',cpus=', map {
                my ($start, $end) = @$_;
                defined($end) ? "$start-$end" : $start
            } @$cpulists);

            # hostnodes
            my $hostnodelists = $numa->{hostnodes};
            if (defined($hostnodelists)) {

                my $hostnodes = print_numa_hostnodes($hostnodelists);

                # policy
                my $policy = $numa->{policy};
                die "you need to define a policy for hostnode $hostnodes\n" if !$policy;
                $mem_object .= ",host-nodes=$hostnodes,policy=$policy";
            } else {
                die "numa hostnodes need to be defined to use hugepages" if $conf->{hugepages};
            }

            push @$cmd, '-object', $mem_object;
            push @$cmd, '-numa', "node,nodeid=$i,cpus=$cpus,memdev=ram-node$i";
        }

        die "total memory for NUMA nodes must be equal to vm static memory\n"
            if $numa_totalmemory && $numa_totalmemory != $static_memory;

        # if no custom topology, we split memory and cores across numa nodes
        if (!$numa_totalmemory) {

            my $numa_memory = ($static_memory / $sockets);

            for (my $i = 0; $i < $sockets; $i++) {
                die "host NUMA node$i doesn't exist\n" if ! -d "/sys/devices/system/node/node$i/" && $conf->{hugepages};

                my $cpustart = ($cores * $i);
                # declared separately: 'my ... if ...' has undefined behaviour
                # and could carry a value over between loop iterations
                my $cpuend;
                $cpuend = ($cpustart + $cores - 1) if $cores && $cores > 1;
                my $cpus = $cpustart;
                $cpus .= "-$cpuend" if $cpuend;

                my $mem_object = print_mem_object($conf, "ram-node$i", $numa_memory);

                push @$cmd, '-object', $mem_object;
                push @$cmd, '-numa', "node,nodeid=$i,cpus=$cpus,memdev=ram-node$i";
            }
        }
    }

    if ($hotplug_features->{memory}) {
        foreach_dimm($conf, $vmid, $memory, $sockets, sub {
            my ($conf, $vmid, $name, $dimm_size, $numanode, $current_size, $memory) = @_;

            my $mem_object = print_mem_object($conf, "mem-$name", $dimm_size);

            push @$cmd, "-object" , $mem_object;
            push @$cmd, "-device", "pc-dimm,id=$name,memdev=mem-$name,node=$numanode";

            # if the configured memory is not aligned to the dimm map, round
            # it up to the size actually reached and persist that
            if ($current_size > $memory) {
                $conf->{memory} = $current_size;
                PVE::QemuConfig->write_config($vmid, $conf);
            }
        });
    }
}
332 | ||
7023f3ea AD |
# Build the QEMU '-object' argument for a memory backend of $size MB with the
# given object id.
#
# Without hugepages a 'memory-backend-ram' object is described. With hugepages
# enabled in $conf a preallocated, shared 'memory-backend-file' object located
# on the hugetlbfs mount matching the selected hugepage size is described.
sub print_mem_object {
    my ($conf, $id, $size) = @_;

    return "memory-backend-ram,id=$id,size=${size}M" if !$conf->{hugepages};

    my $page_size = hugepages_size($conf, $size);
    my $path = hugepages_mount_path($page_size);

    return "memory-backend-file,id=$id,size=${size}M,mem-path=$path,share=on,prealloc=yes";
}
347 | ||
ac7b7087 AD |
# Format a list of host NUMA node ranges as a comma separated string.
#
# $hostnodelists is an array reference of [start] or [start, end] pairs; the
# result looks like "0", "0-1" or "0,2-3". Returns undef for an empty list.
#
# Dies if any node inside one of the ranges does not exist on the host.
sub print_numa_hostnodes {
    my ($hostnodelists) = @_;

    my @ranges;
    foreach my $hostnoderange (@$hostnodelists) {
        my ($start, $end) = @$hostnoderange;
        push @ranges, defined($end) ? "$start-$end" : "$start";

        $end //= $start;
        for (my $i = $start; $i <= $end; ++$i) {
            die "host NUMA node$i doesn't exist\n" if ! -d "/sys/devices/system/node/node$i/";
        }
    }

    # join() instead of appending ',' only when the string is true: a leading
    # node "0" is a false value and would otherwise lose its separator
    my $hostnodes;
    $hostnodes = join(',', @ranges) if @ranges;
    return $hostnodes;
}
364 | ||
7023f3ea AD |
# Make sure a hugetlbfs instance is mounted below /run/hugepages/kvm/ for every
# hugepage size (2048 kB and 1048576 kB) the kernel exposes under
# /sys/kernel/mm/hugepages. Missing mount points are created first; sizes that
# are already mounted are left untouched.
sub hugepages_mount {

    my $mounts = PVE::ProcFSTools::parse_proc_mounts();

    for my $size_kb (qw(2048 1048576)) {
        # size not supported by this host
        next if ! -d "/sys/kernel/mm/hugepages/hugepages-${size_kb}kB";

        my $mountpoint = "/run/hugepages/kvm/${size_kb}kB";

        my $mounted = grep {
            $_->[2] =~ /^hugetlbfs/ &&
            $_->[1] eq $mountpoint
        } @$mounts;
        next if $mounted;

        File::Path::make_path($mountpoint) if !-d $mountpoint;
        run_command(
            ['/bin/mount', '-t', 'hugetlbfs', '-o', "pagesize=${size_kb}k", 'hugetlbfs', $mountpoint],
            errmsg => "hugepage mount error",
        );
    }
}
387 | ||
# Return the hugetlbfs mount point used for hugepages of $size MB, e.g.
# "/run/hugepages/kvm/2048kB" for a size of 2.
sub hugepages_mount_path {
    my ($size) = @_;

    my $size_kb = $size * 1024;

    return "/run/hugepages/kvm/" . $size_kb . "kB";
}
395 | ||
# Number of hugepages of $hugepages_size needed to back $size of memory
# (both given in the same unit); a plain division without rounding.
sub hugepages_nr {
    my ($size, $hugepages_size) = @_;

    my $page_count = $size / $hugepages_size;

    return $page_count;
}
401 | ||
# Select the hugepage size (in MB: 2 or 1024) used to back $size MB of memory.
#
# $conf->{hugepages} is either 'any' or an explicit size in MB:
#   - 'any': 1024 if the host supports it and $size is a multiple of 1024,
#     otherwise 2 if the host supports it.
#   - explicit size: returned as is once the host supports it and $size is a
#     multiple of it.
#
# Dies if hugepages are not enabled in $conf, if $size is not a positive even
# number, or if the host cannot provide a suitable hugepage size.
sub hugepages_size {
    my ($conf, $size) = @_;
    die "hugepages option is not enabled" if !$conf->{hugepages};
    die "memory size '$size' is not a positive even integer; cannot use for hugepages\n"
        if $size <= 0 || $size & 1;

    # true if the host kernel exposes hugepages of the given size (in MB)
    my $page_chunk = sub {
        my ($size_mb) = @_;
        my $sysfs_dir = "/sys/kernel/mm/hugepages/hugepages-" . ($size_mb * 1024) . "kB";
        return -d $sysfs_dir;
    };
    die "your system doesn't support hugepages\n" if !$page_chunk->(2) && !$page_chunk->(1024);

    if ($conf->{hugepages} eq 'any') {

        # try to use 1GB if available && memory size is matching
        if ($page_chunk->(1024) && ($size & 1023) == 0) {
            return 1024;
        } elsif ($page_chunk->(2)) {
            return 2;
        } else {
            # only reached when 1024 MB pages are the sole supported size
            die "host only supports 1024 MB hugepages, but requested size '$size' is not a multiple of 1024 MB\n";
        }
    }

    my $hugepagesize = $conf->{hugepages};

    if (!$page_chunk->($hugepagesize)) {
        die "your system doesn't support hugepages of $hugepagesize MB\n";
    } elsif (($size % $hugepagesize) != 0) {
        die "Memory size $size is not a multiple of the requested hugepages size $hugepagesize\n";
    }

    return $hugepagesize;
}
434 | ||
# Compute how many hugepages of which size the VM needs on each host NUMA node.
#
# Returns a hash reference { hugepage size in MB => { host node => count } },
# or nothing when NUMA is not enabled in $conf. The static memory is taken
# from the custom numaN entries if present, otherwise split evenly over the
# sockets; with memory hotplug enabled the DIMMs up to the configured memory
# size are added on top.
#
# Dies (via hugepages_size / print_numa_hostnodes) on unsupported settings and
# when a custom NUMA node is bound to more than one host node.
sub hugepages_topology {
    my ($conf) = @_;

    my $hugepages_topology = {};

    return if !$conf->{numa};

    my $defaults = PVE::QemuServer::load_defaults();
    my $memory = $conf->{memory} || $defaults->{memory};
    my $static_memory = 0;
    my $sockets = 1;
    $sockets = $conf->{smp} if $conf->{smp}; # old style - no longer used
    $sockets = $conf->{sockets} if $conf->{sockets};
    my $numa_custom_topology = undef;
    my $hotplug_features = PVE::QemuServer::parse_hotplug_features(defined($conf->{hotplug}) ? $conf->{hotplug} : '1');

    if ($hotplug_features->{memory}) {
        $static_memory = $STATICMEM;
        # 'hugepages' may be the non-numeric value 'any', hence the string compare
        $static_memory = $static_memory * $sockets
            if $conf->{hugepages} && $conf->{hugepages} eq '1024';
    } else {
        $static_memory = $memory;
    }

    # custom numa topology
    for (my $i = 0; $i < $MAX_NUMA; $i++) {
        next if !$conf->{"numa$i"};
        my $numa = PVE::QemuServer::parse_numa($conf->{"numa$i"});
        next if !$numa;

        $numa_custom_topology = 1;
        my $numa_memory = $numa->{memory};
        my $hostnodelists = $numa->{hostnodes};
        my $hostnodes = print_numa_hostnodes($hostnodelists);

        # NOTE(review): this only accepts single-digit host node ids
        die "more than 1 hostnode value in numa node is not supported when hugepages are enabled" if $hostnodes !~ m/^(\d)$/;
        my $hugepages_size = hugepages_size($conf, $numa_memory);
        $hugepages_topology->{$hugepages_size}->{$hostnodes} += hugepages_nr($numa_memory, $hugepages_size);
    }

    # if no custom numa topology, we split memory and cores across numa nodes
    if (!$numa_custom_topology) {

        my $numa_memory = ($static_memory / $sockets);

        for (my $i = 0; $i < $sockets; $i++) {
            my $hugepages_size = hugepages_size($conf, $numa_memory);
            $hugepages_topology->{$hugepages_size}->{$i} += hugepages_nr($numa_memory, $hugepages_size);
        }
    }

    if ($hotplug_features->{memory}) {
        my $numa_hostmap = get_numa_guest_to_host_map($conf);

        foreach_dimm($conf, undef, $memory, $sockets, sub {
            my ($conf, undef, $name, $dimm_size, $numanode, $current_size, $memory) = @_;

            # translate the guest node into the host node it is bound to
            $numanode = $numa_hostmap->{$numanode};

            my $hugepages_size = hugepages_size($conf, $dimm_size);
            $hugepages_topology->{$hugepages_size}->{$numanode} += hugepages_nr($dimm_size, $hugepages_size);
        });
    }

    return $hugepages_topology;
}
502 | ||
# Read the currently configured hugepage counts of the host from sysfs.
#
# Returns a hash reference { hugepage size in MB => { host node => count } },
# where count is the first line of the node's nr_hugepages file for that size.
sub hugepages_host_topology {

    my $topology = {};

    dir_glob_foreach("/sys/devices/system/node/", 'node(\d+)', sub {
        my ($node_dir, $node_id) = @_;

        my $pages_dir = "/sys/devices/system/node/$node_dir/hugepages/";

        dir_glob_foreach($pages_dir, 'hugepages\-(\d+)kB', sub {
            my ($size_dir, $size_kb) = @_;

            my $size_mb = $size_kb / 1024;
            my $count = PVE::Tools::file_read_firstline("/sys/devices/system/node/$node_dir/hugepages/$size_dir/nr_hugepages");
            $topology->{$size_mb}->{$node_id} = $count;
        });
    });

    return $topology;
}
522 | ||
# Make sure the host has enough free hugepages for the requested topology.
#
# $hugepages_topology is { size in MB => { host node => pages requested } }.
# For every node with fewer free pages than requested, nr_hugepages is raised
# by the missing amount. If the free count is still too low afterwards, the
# host is reset to $hugepages_host_topology and the function dies.
sub hugepages_allocate {
    my ($hugepages_topology, $hugepages_host_topology) = @_;

    # allocate new hugepages if needed
    for my $size_mb (sort keys %$hugepages_topology) {

        my $requested_of = $hugepages_topology->{$size_mb};
        my $size_kb = $size_mb * 1024;

        for my $node (keys %$requested_of) {

            my $requested = $requested_of->{$node};
            my $sysfs_dir = "/sys/devices/system/node/node${node}/hugepages/hugepages-${size_kb}kB/";
            my $free = PVE::Tools::file_read_firstline($sysfs_dir."free_hugepages");
            my $configured = PVE::Tools::file_read_firstline($sysfs_dir."nr_hugepages");

            # enough free pages already, nothing to do for this node
            next unless $requested > $free;

            my $missing = $requested - $free;
            PVE::ProcFSTools::write_proc_entry($sysfs_dir."nr_hugepages", $configured + $missing);

            # verify that the pages were really allocated
            $free = PVE::Tools::file_read_firstline($sysfs_dir."free_hugepages");
            if ($free < $requested) {
                # rollback to initial host config
                hugepages_reset($hugepages_host_topology);
                die "hugepage allocation failed";
            }
        }
    }
}
555 | ||
ca0ef6b1 KT |
# Look up how many hugepages of $size MB were requested on the kernel command
# line (/proc/cmdline).
#
# The arguments are scanned in order: 'hugepagesz=2M' / 'hugepagesz=1G' select
# the size the following 'hugepages=N' applies to (2 MB until one is seen).
# The first 'hugepages=' argument met while the selected size equals $size
# determines the result (its N); 0 is returned if there is none.
sub hugepages_default_nr_hugepages {
    my ($size) = @_;

    my $cmdline = PVE::Tools::file_read_firstline("/proc/cmdline");
    my $args = PVE::Tools::split_args($cmdline);

    my $selected_size = 2; # default is 2M

    for my $arg (@$args) {
        if ($arg eq "hugepagesz=2M") {
            $selected_size = 2;
            next;
        }
        if ($arg eq "hugepagesz=1G") {
            $selected_size = 1024;
            next;
        }
        if ($arg =~ m/^hugepages=(\d+)?$/ && $selected_size == $size) {
            return $1;
        }
    }

    return 0;
}
578 | ||
7023f3ea AD |
# For every hugepage size (in MB) present in $hugepages_topology, write the
# count taken from the kernel command line (hugepages_default_nr_hugepages)
# to the global /sys/kernel/mm/hugepages/.../nr_hugepages file.
sub hugepages_pre_deallocate {
    my ($hugepages_topology) = @_;

    for my $size_mb (sort keys %$hugepages_topology) {

        my $size_kb = $size_mb * 1024;
        my $sysfs_dir = "/sys/kernel/mm/hugepages/hugepages-${size_kb}kB/";
        my $default_count = hugepages_default_nr_hugepages($size_mb);

        PVE::ProcFSTools::write_proc_entry($sysfs_dir."nr_hugepages", $default_count);
    }
}
590 | ||
# Write the hugepage counts from $hugepages_topology back to the host.
#
# $hugepages_topology is { size in MB => { host node => count } }; each count
# is written to the nr_hugepages file of that node and size.
sub hugepages_reset {
    my ($hugepages_topology) = @_;

    for my $size_mb (sort keys %$hugepages_topology) {

        my $count_of = $hugepages_topology->{$size_mb};
        my $size_kb = $size_mb * 1024;

        for my $node (keys %$count_of) {

            my $count = $count_of->{$node};
            my $sysfs_dir = "/sys/devices/system/node/node${node}/hugepages/hugepages-${size_kb}kB/";

            PVE::ProcFSTools::write_proc_entry($sysfs_dir."nr_hugepages", $count);
        }
    }
}
607 | ||
# Run $code (called with @param) through lock_file on the hugepages lock file
# /var/lock/hugepages.lck, with a timeout of 60.
#
# Returns what lock_file returned; re-throws any error left in $@ afterwards.
sub hugepages_update_locked {
    my ($code, @param) = @_;

    # could be long if a lot of hugepages need to be allocated
    my $lock_timeout = 60;
    my $lock_path = "/var/lock/hugepages.lck";

    my $result = lock_file($lock_path, $lock_timeout, $code, @param);
    die $@ if $@;

    return $result;
}
3f669af2 | 620 | 1; |
a0649da2 | 621 |