]>
Commit | Line | Data |
---|---|---|
a0649da2 AD |
1 | package PVE::QemuServer::Memory; |
2 | ||
3 | use strict; | |
4 | use warnings; | |
6779f1ac | 5 | use PVE::QemuServer; |
7023f3ea | 6 | use PVE::Tools qw(run_command lock_file lock_file_full file_read_firstline dir_glob_foreach); |
9a1c413f | 7 | use PVE::Exception qw(raise raise_param_exc); |
a0649da2 | 8 | |
0567a4d5 | 9 | my $MAX_NUMA = 8; |
6779f1ac AD |
10 | my $MAX_MEM = 4194304; |
11 | my $STATICMEM = 1024; | |
a0649da2 | 12 | |
aaff69ad WB |
# Return the list of guest NUMA node ids described by a VM config.
#
# Every "numaN" key (0 <= N < $MAX_NUMA) that is set and that
# PVE::QemuServer::parse_numa() accepts contributes its index N.
# When no such entry exists, one node per socket is assumed and
# (0 .. sockets-1) is returned, with sockets defaulting to 1.
sub get_numa_node_list {
    my ($conf) = @_;

    my @node_ids;
    for my $idx (0 .. $MAX_NUMA - 1) {
        my $raw = $conf->{"numa$idx"};
        next if !$raw;
        next if !PVE::QemuServer::parse_numa($raw);
        push @node_ids, $idx;
    }
    return @node_ids if @node_ids;

    # no custom topology: fall back to one node per socket
    my $socket_count = $conf->{sockets} || 1;
    return (0..($socket_count-1));
}
25 | ||
# Build a hash reference mapping guest NUMA node id => host NUMA node(s).
#
# only valid when numa nodes map to a single host node
#
# For every "numaN" config entry that parses, the value is the string
# produced by print_numa_hostnodes() for that entry's hostnodes. If the
# config has no usable "numaN" entry, an identity mapping for
# 0 .. sockets-1 is returned (sockets defaults to 1).
#
# Always returns a HASH reference. Callers store the result in a scalar
# and dereference it, so the fallback must not return a plain list: a
# map() evaluated in scalar context yields an element count, not a hash.
sub get_numa_guest_to_host_map {
    my ($conf) = @_;

    my $map = {};
    for (my $i = 0; $i < $MAX_NUMA; $i++) {
        my $entry = $conf->{"numa$i"} or next;
        my $numa = PVE::QemuServer::parse_numa($entry) or next;
        $map->{$i} = print_numa_hostnodes($numa->{hostnodes});
    }
    return $map if %$map;

    # no custom topology: guest node N is backed by host node N
    my $sockets = $conf->{sockets} || 1;
    my %identity = map { ($_ => $_) } (0 .. $sockets - 1);
    return \%identity;
}
39 | ||
3f669af2 AD |
# Walk the DIMM layout upwards from the static memory size and invoke
# $func once per DIMM until the running total reaches $memory.
#
# Layout: 8 groups of 32 DIMMs; the DIMM size doubles after each group.
# With hugepages == 1024 the walk starts at 1024 * $sockets with 1024-sized
# DIMMs, otherwise at 1024 with 512-sized DIMMs (units presumably MB -
# TODO confirm against callers).
#
# $func is called as
#   $func->($conf, $vmid, $name, $dimm_size, $numanode, $current_size, $memory)
# where $name is "dimmN" and $numanode cycles through the list returned by
# get_numa_node_list().
#
# Returns nothing if $memory already equals the starting size; otherwise
# returns the running total as soon as it is >= $memory.
sub foreach_dimm {
    my ($conf, $vmid, $memory, $sockets, $func) = @_;

    my $gigabyte_pages = $conf->{hugepages} && $conf->{hugepages} == 1024;
    my $current_size = $gigabyte_pages ? 1024 * $sockets : 1024;
    my $dimm_size = $gigabyte_pages ? 1024 : 512;

    return if $current_size == $memory;

    my @numa_map = get_numa_node_list($conf);
    my $dimm_id = 0;

    foreach my $group (0 .. 7) {
        foreach my $slot (0 .. 31) {
            my $name = "dimm${dimm_id}";
            $dimm_id++;
            my $numanode = $numa_map[$slot % @numa_map];
            $current_size += $dimm_size;
            $func->($conf, $vmid, $name, $dimm_size, $numanode, $current_size, $memory);
            return $current_size if $current_size >= $memory;
        }
        $dimm_size *= 2;
    }
}
71 | ||
# Walk the DIMM layout downwards from the largest configuration and invoke
# $func once per DIMM until the running total drops to $memory.
#
# Mirror image of foreach_dimm(): DIMM ids count down from 253, the DIMM
# size is halved after each group of 32, and the running total is
# decremented before each callback. Start values are 8355840 / 131072 when
# hugepages == 1024 and 4177920 / 65536 otherwise.
#
# $func is called as
#   $func->($conf, $vmid, $name, $dimm_size, $numanode, $current_size, $memory)
#
# Returns nothing if $memory already equals the starting size; otherwise
# returns the running total as soon as it is <= $memory.
sub foreach_reverse_dimm {
    my ($conf, $vmid, $memory, $sockets, $func) = @_;

    my ($current_size, $dimm_size) = (4177920, 65536);
    if ($conf->{hugepages} && $conf->{hugepages} == 1024) {
        ($current_size, $dimm_size) = (8355840, 131072);
    }

    return if $current_size == $memory;

    my @numa_map = get_numa_node_list($conf);
    my $dimm_id = 253;

    foreach my $group (0 .. 7) {
        foreach my $slot (0 .. 31) {
            my $name = "dimm${dimm_id}";
            $dimm_id--;
            my $numanode = $numa_map[(31 - $slot) % @numa_map];
            $current_size -= $dimm_size;
            $func->($conf, $vmid, $name, $dimm_size, $numanode, $current_size, $memory);
            return $current_size if $current_size <= $memory;
        }
        $dimm_size /= 2;
    }
}
103 | ||
6779f1ac AD |
# Hot-plug or hot-unplug memory DIMMs of a running VM so that its memory
# matches $value (in MB, per the error messages below).
#
# Parameters:
#   $vmid     - VM id
#   $conf     - VM config hash; $conf->{memory} is updated and written back
#               after every successfully (un)plugged DIMM
#   $defaults - default config values (only {memory} is read)
#   $opt      - unused here
#   $value    - requested memory size; falls back to $defaults->{memory}
#
# Returns $value unchanged if the VM is not running or the size does not
# change. Otherwise returns whatever foreach_dimm()/foreach_reverse_dimm()
# returns (the size reached once the target is met).
#
# Dies if the request is below the static memory, above $MAX_MEM, or if a
# monitor command / unplug fails.
sub qemu_memory_hotplug {
    my ($vmid, $conf, $defaults, $opt, $value) = @_;

    return $value if !PVE::QemuServer::check_running($vmid);

    my $sockets = 1;
    $sockets = $conf->{sockets} if $conf->{sockets};

    my $memory = $conf->{memory} || $defaults->{memory};
    $value = $defaults->{memory} if !$value;
    return $value if $value == $memory;

    my $static_memory = $STATICMEM;
    $static_memory = $static_memory * $sockets if ($conf->{hugepages} && $conf->{hugepages} == 1024);

    die "memory can't be lower than $static_memory MB" if $value < $static_memory;
    # the limit applies to the size being requested, not to the current size
    die "you cannot add more memory than $MAX_MEM MB!\n" if $value > $MAX_MEM;

    if ($value > $memory) {

        # declared unconditionally: 'my $x = ... if COND' has undefined behaviour
        my $numa_hostmap;
        $numa_hostmap = get_numa_guest_to_host_map($conf) if $conf->{hugepages};

        return foreach_dimm($conf, $vmid, $value, $sockets, sub {
            my ($conf, $vmid, $name, $dimm_size, $numanode, $current_size, $memory) = @_;

            # skip DIMMs that are already part of the current configuration
            return if $current_size <= $conf->{memory};

            if ($conf->{hugepages}) {

                my $hugepages_size = hugepages_size($conf, $dimm_size);
                my $path = hugepages_mount_path($hugepages_size);
                my $host_numanode = $numa_hostmap->{$numanode};

                # hugepages needed for this single DIMM on its host node
                my $hugepages_topology = {
                    $hugepages_size => {
                        $host_numanode => hugepages_nr($dimm_size, $hugepages_size),
                    },
                };

                my $code = sub {
                    my $hugepages_host_topology = hugepages_host_topology();
                    hugepages_allocate($hugepages_topology, $hugepages_host_topology);

                    eval {
                        PVE::QemuServer::vm_mon_cmd($vmid, "object-add", 'qom-type' => "memory-backend-file", id => "mem-$name", props => {
                            size => int($dimm_size*1024*1024), 'mem-path' => $path, share => JSON::true, prealloc => JSON::true } );
                    };
                    if (my $err = $@) {
                        # restore the host hugepage counts recorded before allocating
                        hugepages_reset($hugepages_host_topology);
                        die $err;
                    }

                    hugepages_pre_deallocate($hugepages_topology);
                };
                # errors are picked up from $@ right below
                eval { hugepages_update_locked($code); };

            } else {
                eval { PVE::QemuServer::vm_mon_cmd($vmid, "object-add", 'qom-type' => "memory-backend-ram", id => "mem-$name", props => { size => int($dimm_size*1024*1024) } ) };
            }

            if (my $err = $@) {
                eval { PVE::QemuServer::qemu_objectdel($vmid, "mem-$name"); };
                die $err;
            }

            eval { PVE::QemuServer::vm_mon_cmd($vmid, "device_add", driver => "pc-dimm", id => "$name", memdev => "mem-$name", node => $numanode) };
            if (my $err = $@) {
                # remove the backend object again, the device was not created
                eval { PVE::QemuServer::qemu_objectdel($vmid, "mem-$name"); };
                die $err;
            }

            # update conf after each successful module hotplug
            $conf->{memory} = $current_size;
            PVE::QemuConfig->write_config($vmid, $conf);
        });
    }

    return foreach_reverse_dimm($conf, $vmid, $value, $sockets, sub {
        my ($conf, $vmid, $name, $dimm_size, $numanode, $current_size, $memory) = @_;

        # skip DIMMs that are not plugged in the current configuration
        return if $current_size >= $conf->{memory};
        print "try to unplug memory dimm $name\n";

        my $retry = 0;
        while (1) {
            # request the unplug, then poll the device list until the DIMM is gone
            eval { PVE::QemuServer::qemu_devicedel($vmid, $name) };
            sleep 3;
            my $dimm_list = qemu_dimm_list($vmid);
            last if !$dimm_list->{$name};
            raise_param_exc({ $name => "error unplug memory module" }) if $retry > 5;
            $retry++;
        }

        # update conf after each successful module unplug
        $conf->{memory} = $current_size;

        eval { PVE::QemuServer::qemu_objectdel($vmid, "mem-$name"); };
        PVE::QemuConfig->write_config($vmid, $conf);
    });
}
198 | ||
# Query the memory devices of a VM via the "query-memory-devices" monitor
# command and return them as a hash reference keyed by device id.
#
# Each value is a hash with the keys id, node, addr, size and slot, copied
# from the device's {data} record.
sub qemu_dimm_list {
    my ($vmid) = @_;

    my $devices = PVE::QemuServer::vm_mon_cmd_nocheck($vmid, "query-memory-devices");

    my %dimm_by_id;
    for my $device (@$devices) {
        my $id = $device->{data}->{id};
        for my $field (qw(id node addr size slot)) {
            $dimm_by_id{$id}->{$field} = $device->{data}->{$field};
        }
    }

    return \%dimm_by_id;
}
215 | ||
0567a4d5 AD |
# Append the memory related QEMU command line arguments ('-m', '-object',
# '-numa', '-device pc-dimm') for a VM to @$cmd.
#
# Parameters:
#   $conf             - VM config hash
#   $vmid             - VM id (used when the config has to be rewritten)
#   $sockets, $cores  - CPU topology used to spread memory/cpus over NUMA
#                       nodes when no custom "numaN" entries exist
#   $defaults         - default config values (only {memory} is read)
#   $hotplug_features - hash; {memory} enables the DIMM based layout
#   $cmd              - array reference the arguments are pushed onto
#
# With memory hotplug enabled only the static part is passed to '-m' and the
# remainder is added as pc-dimm devices via foreach_dimm(). If the configured
# size does not fall on a DIMM boundary, $conf->{memory} is rounded up to the
# size actually reached and the config is written back.
#
# Dies on inconsistent settings (hotplug or hugepages without NUMA, memory
# out of range, incomplete or mismatching NUMA node definitions).
sub config {
    my ($conf, $vmid, $sockets, $cores, $defaults, $hotplug_features, $cmd) = @_;

    my $memory = $conf->{memory} || $defaults->{memory};
    my $static_memory = 0;

    if ($hotplug_features->{memory}) {
        die "NUMA needs to be enabled for memory hotplug\n" if !$conf->{numa};
        die "Total memory is bigger than ${MAX_MEM}MB\n" if $memory > $MAX_MEM;

        # socket count taken from the config here, independent of the
        # $sockets argument used further down
        my $conf_sockets = 1;
        $conf_sockets = $conf->{sockets} if $conf->{sockets};

        $static_memory = $STATICMEM;
        $static_memory = $static_memory * $conf_sockets if ($conf->{hugepages} && $conf->{hugepages} == 1024);

        die "minimum memory must be ${static_memory}MB\n" if($memory < $static_memory);
        push @$cmd, '-m', "size=${static_memory},slots=255,maxmem=${MAX_MEM}M";

    } else {

        $static_memory = $memory;
        push @$cmd, '-m', $static_memory;
    }

    die "numa needs to be enabled to use hugepages" if $conf->{hugepages} && !$conf->{numa};

    if ($conf->{numa}) {

        my $numa_totalmemory = undef;
        for (my $i = 0; $i < $MAX_NUMA; $i++) {
            next if !$conf->{"numa$i"};
            my $numa = PVE::QemuServer::parse_numa($conf->{"numa$i"});
            next if !$numa;
            # memory
            die "missing NUMA node$i memory value\n" if !$numa->{memory};
            my $numa_memory = $numa->{memory};
            $numa_totalmemory += $numa_memory;

            my $mem_object = print_mem_object($conf, "ram-node$i", $numa_memory);

            # cpus
            my $cpulists = $numa->{cpus};
            die "missing NUMA node$i cpus\n" if !defined($cpulists);
            my $cpus = join(',cpus=', map {
                my ($start, $end) = @$_;
                defined($end) ? "$start-$end" : $start
            } @$cpulists);

            # hostnodes
            my $hostnodelists = $numa->{hostnodes};
            if (defined($hostnodelists)) {

                my $hostnodes = print_numa_hostnodes($hostnodelists);

                # policy
                my $policy = $numa->{policy};
                die "you need to define a policy for hostnode $hostnodes\n" if !$policy;
                $mem_object .= ",host-nodes=$hostnodes,policy=$policy";
            } else {
                die "numa hostnodes need to be defined to use hugepages" if $conf->{hugepages};
            }

            push @$cmd, '-object', $mem_object;
            push @$cmd, '-numa', "node,nodeid=$i,cpus=$cpus,memdev=ram-node$i";
        }

        die "total memory for NUMA nodes must be equal to vm static memory\n"
            if $numa_totalmemory && $numa_totalmemory != $static_memory;

        # if no custom topology, we split memory and cores across numa nodes
        if (!$numa_totalmemory) {

            my $numa_memory = ($static_memory / $sockets);

            for (my $i = 0; $i < $sockets; $i++) {
                die "host NUMA node$i doesn't exist\n" if ! -d "/sys/devices/system/node/node$i/" && $conf->{hugepages};

                my $cpustart = ($cores * $i);
                my $cpus = $cpustart;
                # a range is only emitted for more than one core; computed in
                # a plain block because 'my $x = ... if COND' is undefined
                # behaviour and could leak a value between loop iterations
                if ($cores && $cores > 1) {
                    my $cpuend = $cpustart + $cores - 1;
                    $cpus .= "-$cpuend";
                }

                my $mem_object = print_mem_object($conf, "ram-node$i", $numa_memory);

                push @$cmd, '-object', $mem_object;
                push @$cmd, '-numa', "node,nodeid=$i,cpus=$cpus,memdev=ram-node$i";
            }
        }
    }

    if ($hotplug_features->{memory}) {
        foreach_dimm($conf, $vmid, $memory, $sockets, sub {
            my ($conf, $vmid, $name, $dimm_size, $numanode, $current_size, $memory) = @_;

            my $mem_object = print_mem_object($conf, "mem-$name", $dimm_size);

            push @$cmd, "-object" , $mem_object;
            push @$cmd, "-device", "pc-dimm,id=$name,memdev=mem-$name,node=$numanode";

            # if dimm_memory is not aligned to dimm map
            if ($current_size > $memory) {
                $conf->{memory} = $current_size;
                PVE::QemuConfig->write_config($vmid, $conf);
            }
        });
    }
}
323 | ||
7023f3ea AD |
# Render the value for a QEMU '-object' memory backend option.
#
#   $conf - VM config hash; {hugepages} selects the backend type
#   $id   - object id to embed
#   $size - backend size, emitted with an "M" suffix
#
# Without hugepages a "memory-backend-ram" object is described. With
# hugepages a "memory-backend-file" object is described whose mem-path is
# the mount path for the page size chosen by hugepages_size().
sub print_mem_object {
    my ($conf, $id, $size) = @_;

    return "memory-backend-ram,id=$id,size=${size}M" if !$conf->{hugepages};

    my $page_size = hugepages_size($conf, $size);
    my $path = hugepages_mount_path($page_size);

    return "memory-backend-file,id=$id,size=${size}M,mem-path=$path,share=on,prealloc=yes";
}
338 | ||
ac7b7087 AD |
# Render a list of host NUMA node ranges as a comma separated string and
# verify that every referenced node exists on this host.
#
#   $hostnodelists - array reference of [start] or [start, end] pairs
#
# Each pair becomes "start" or "start-end". Returns undef when the list is
# empty. Dies with "host NUMA node<N> doesn't exist" if
# /sys/devices/system/node/node<N>/ is missing for any node in a range.
sub print_numa_hostnodes {
    my ($hostnodelists) = @_;

    my @rendered;
    foreach my $hostnoderange (@$hostnodelists) {
        my ($start, $end) = @$hostnoderange;

        # collected in a list and joined at the end: appending ',' only when
        # the string built so far is true drops the separator after a
        # leading "0", because the string "0" is false in Perl
        push @rendered, defined($end) ? "$start-$end" : "$start";

        $end //= $start;
        for (my $i = $start; $i <= $end; ++$i ) {
            die "host NUMA node$i doesn't exist\n" if ! -d "/sys/devices/system/node/node$i/";
        }
    }

    return @rendered ? join(',', @rendered) : undef;
}
355 | ||
7023f3ea AD |
# Make sure a hugetlbfs instance is mounted below /run/hugepages/kvm/ for
# each hugepage size (2048kB and 1048576kB) that the host exposes under
# /sys/kernel/mm/hugepages/.
#
# Sizes without a sysfs directory are skipped individually, so a missing
# size does not prevent the remaining ones from being mounted. An existing
# mount is detected from the parsed mount table (mount point equals the
# target path and the type field starts with "hugetlbfs"). Mount failures
# are reported by run_command() with "hugepage mount error".
sub hugepages_mount {

    my $mountdata = PVE::ProcFSTools::parse_proc_mounts();

    foreach my $size (qw(2048 1048576)) {
        # skip only this size; other sizes may still be supported
        next if (! -d "/sys/kernel/mm/hugepages/hugepages-${size}kB");

        my $path = "/run/hugepages/kvm/${size}kB";

        my $found = grep {
            $_->[2] =~ /^hugetlbfs/ &&
            $_->[1] eq $path
        } @$mountdata;

        if (!$found) {

            File::Path::make_path($path) if (!-d $path);
            my $cmd = ['/bin/mount', '-t', 'hugetlbfs', '-o', "pagesize=${size}k", 'hugetlbfs', $path];
            run_command($cmd, errmsg => "hugepage mount error");
        }
    }
}
378 | ||
# Return the hugetlbfs mount point used for a given hugepage size.
# The argument is multiplied by 1024 to form the "<N>kB" directory name
# (so it is presumably given in MB - confirm against callers).
sub hugepages_mount_path {
    my ($size) = @_;

    my $size_kb = $size * 1024;

    return "/run/hugepages/kvm/${size_kb}kB";
}
386 | ||
# Number of hugepages of size $hugepages_size needed to back $size
# (plain division; both values must use the same unit).
sub hugepages_nr {
    my ($size, $hugepages_size) = @_;

    my $page_count = $size / $hugepages_size;

    return $page_count;
}
392 | ||
# Decide which hugepage size backs a memory region of $size.
#
#   $conf->{hugepages} eq 'any' - returns 1024 when the host exposes
#       /sys/kernel/mm/hugepages/hugepages-1048576kB and $size is a multiple
#       of 1024, otherwise 2
#   any other value             - returns that value after checking that the
#       host exposes the matching sysfs directory and that $size is a
#       multiple of it
#
# Dies if hugepages are not enabled in $conf, if the host lacks the
# requested page size, or if $size is not a multiple of it.
sub hugepages_size {
    my ($conf, $size) = @_;

    my $requested = $conf->{hugepages};
    die "hugepages option is not enabled" if !$requested;

    if ($requested eq 'any') {
        #try to use 1GB if available && memory size is matching
        my $use_1gb = -d "/sys/kernel/mm/hugepages/hugepages-1048576kB" && ($size % 1024 == 0);
        return $use_1gb ? 1024 : 2;
    }

    my $hugepagesize = $requested * 1024 . "kB";

    die "your system doesn't support hugepages of $hugepagesize"
        if ! -d "/sys/kernel/mm/hugepages/hugepages-$hugepagesize";
    die "Memory size $size is not a multiple of the requested hugepages size $hugepagesize"
        if ($size % $requested) != 0;

    return $requested;
}
419 | ||
# Compute how many hugepages of each size a VM needs on each host NUMA node.
#
# Returns a hash reference { <hugepage size> => { <host node> => <count> } },
# or nothing when NUMA is not enabled in $conf.
#
# Three sources are summed up:
#   - custom "numaN" entries (each must name exactly one host node),
#   - otherwise the static memory split evenly across the sockets,
#   - with memory hotplug enabled, every DIMM reported by foreach_dimm(),
#     translated to its host node via get_numa_guest_to_host_map().
#
# Dies if a custom NUMA entry has no hostnodes or more than one, and on any
# error raised by hugepages_size().
sub hugepages_topology {
    my ($conf) = @_;

    my $hugepages_topology = {};

    return if !$conf->{numa};

    my $defaults = PVE::QemuServer::load_defaults();
    my $memory = $conf->{memory} || $defaults->{memory};
    my $static_memory = 0;
    my $sockets = 1;
    $sockets = $conf->{smp} if $conf->{smp}; # old style - no longer used
    $sockets = $conf->{sockets} if $conf->{sockets};
    my $numa_custom_topology = undef;
    my $hotplug_features = PVE::QemuServer::parse_hotplug_features(defined($conf->{hotplug}) ? $conf->{hotplug} : '1');

    if ($hotplug_features->{memory}) {
        $static_memory = $STATICMEM;
        $static_memory = $static_memory * $sockets if ($conf->{hugepages} && $conf->{hugepages} == 1024);
    } else {
        $static_memory = $memory;
    }

    # custom numa topology
    for (my $i = 0; $i < $MAX_NUMA; $i++) {
        next if !$conf->{"numa$i"};
        my $numa = PVE::QemuServer::parse_numa($conf->{"numa$i"});
        next if !$numa;

        $numa_custom_topology = 1;
        my $numa_memory = $numa->{memory};
        my $hostnodelists = $numa->{hostnodes};
        my $hostnodes = print_numa_hostnodes($hostnodelists);

        # a missing hostnodes setting gets its own message instead of being
        # reported as "more than 1 hostnode"
        die "numa hostnodes need to be defined to use hugepages" if !defined($hostnodes);
        # exactly one host node id; ids may have more than one digit
        die "more than 1 hostnode value in numa node is not supported when hugepages are enabled" if $hostnodes !~ m/^\d+$/;
        my $hugepages_size = hugepages_size($conf, $numa_memory);
        $hugepages_topology->{$hugepages_size}->{$hostnodes} += hugepages_nr($numa_memory, $hugepages_size);

    }

    # if no custom numa topology, we split memory and cores across numa nodes
    if (!$numa_custom_topology) {

        my $numa_memory = ($static_memory / $sockets);

        for (my $i = 0; $i < $sockets; $i++) {

            my $hugepages_size = hugepages_size($conf, $numa_memory);
            $hugepages_topology->{$hugepages_size}->{$i} += hugepages_nr($numa_memory, $hugepages_size);
        }
    }

    if ($hotplug_features->{memory}) {
        my $numa_hostmap = get_numa_guest_to_host_map($conf);

        foreach_dimm($conf, undef, $memory, $sockets, sub {
            my ($conf, undef, $name, $dimm_size, $numanode, $current_size, $memory) = @_;

            # account the DIMM on the host node backing its guest node
            $numanode = $numa_hostmap->{$numanode};

            my $hugepages_size = hugepages_size($conf, $dimm_size);
            $hugepages_topology->{$hugepages_size}->{$numanode} += hugepages_nr($dimm_size, $hugepages_size);
        });
    }

    return $hugepages_topology;
}
487 | ||
# Read the hugepage pool sizes currently configured on the host.
#
# Scans /sys/devices/system/node/node<N>/hugepages/hugepages-<S>kB/ and
# returns a hash reference { <S / 1024> => { <N> => <first line of
# nr_hugepages> } }.
sub hugepages_host_topology {

    my %pool_by_size;

    # handles one node<N> directory
    my $scan_node = sub {
        my ($nodepath, $numanode) = @_;

        my $hugepages_dir = "/sys/devices/system/node/$nodepath/hugepages/";

        dir_glob_foreach($hugepages_dir, 'hugepages\-(\d+)kB', sub {
            my ($entry, $size_kb) = @_;

            my $size = $size_kb / 1024;
            my $count = PVE::Tools::file_read_firstline("${hugepages_dir}${entry}/nr_hugepages");
            $pool_by_size{$size}->{$numanode} = $count;
        });
    };

    #read host hugepages
    dir_glob_foreach("/sys/devices/system/node/", 'node(\d+)', $scan_node);

    return \%pool_by_size;
}
507 | ||
# Grow the per-node hugepage pools of the host so that the requested number
# of pages is free on every node.
#
#   $hugepages_topology      - { size => { node => pages wanted } }
#   $hugepages_host_topology - pool sizes recorded beforehand; handed to
#                              hugepages_reset() if an allocation fails
#
# For each size/node the free and total counts are read from sysfs. When
# fewer pages are free than requested, nr_hugepages is raised by the
# shortfall and the free count is re-read to verify the result. Dies with
# "hugepage allocation failed" after rolling back if the pool did not grow
# enough.
sub hugepages_allocate {
    my ($hugepages_topology, $hugepages_host_topology) = @_;

    #allocate new hugepages if needed
    for my $size (sort keys %$hugepages_topology) {

        my $wanted_by_node = $hugepages_topology->{$size};
        my $size_kb = $size * 1024;

        for my $numanode (keys %$wanted_by_node) {

            my $wanted = $hugepages_topology->{$size}->{$numanode};
            my $sysfs_dir = "/sys/devices/system/node/node${numanode}/hugepages/hugepages-${size_kb}kB/";
            my $free = PVE::Tools::file_read_firstline($sysfs_dir."free_hugepages");
            my $total = PVE::Tools::file_read_firstline($sysfs_dir."nr_hugepages");

            next if !($wanted > $free);

            my $shortfall = $wanted - $free;
            PVE::ProcFSTools::write_proc_entry($sysfs_dir."nr_hugepages", $total + $shortfall);

            #verify that is correctly allocated
            $free = PVE::Tools::file_read_firstline($sysfs_dir."free_hugepages");
            if ($free < $wanted) {
                #rollback to initial host config
                hugepages_reset($hugepages_host_topology);
                die "hugepage allocation failed";
            }
        }
    }

}
540 | ||
# Write 0 to the global nr_hugepages entry
# (/sys/kernel/mm/hugepages/hugepages-<size>kB/nr_hugepages) of every
# hugepage size that appears as a key in $hugepages_topology.
#
# NOTE(review): presumably this only releases pages that are not in use,
# so the pools shrink once the VM frees its memory - confirm against the
# kernel hugetlb documentation.
sub hugepages_pre_deallocate {
    my ($hugepages_topology) = @_;

    foreach my $size (sort keys %$hugepages_topology) {

        my $hugepages_size = $size * 1024;
        my $path = "/sys/kernel/mm/hugepages/hugepages-${hugepages_size}kB/";
        # the current value does not need to be read first; it was unused
        PVE::ProcFSTools::write_proc_entry($path."nr_hugepages", 0);
    }
}
552 | ||
# Write the page counts from $hugepages_topology
# ({ size => { node => count } }) to the per-node nr_hugepages entries in
# sysfs. Used to restore pool sizes recorded earlier by
# hugepages_host_topology().
sub hugepages_reset {
    my ($hugepages_topology) = @_;

    for my $size (sort keys %$hugepages_topology) {

        my $count_by_node = $hugepages_topology->{$size};
        my $size_kb = $size * 1024;

        for my $numanode (keys %$count_by_node) {

            my $sysfs_dir = "/sys/devices/system/node/node${numanode}/hugepages/hugepages-${size_kb}kB/";
            my $count = $hugepages_topology->{$size}->{$numanode};

            PVE::ProcFSTools::write_proc_entry($sysfs_dir."nr_hugepages", $count);
        }
    }
}
569 | ||
# Run $code (with @param) through lock_file() on /var/lock/hugepages.lck
# with a timeout of 60.
#
# Returns whatever lock_file() returns; re-throws the error left in $@
# after the call.
sub hugepages_update_locked {
    my ($code, @param) = @_;

    #could be long if a lot of hugepages need to be allocated
    my $wait_limit = 60;
    my $lock_path = "/var/lock/hugepages.lck";

    my $result = lock_file($lock_path, $wait_limit, $code, @param);
    if ($@) {
        die $@;
    }

    return $result;
}
3f669af2 | 582 | 1; |
a0649da2 | 583 |