]> git.proxmox.com Git - qemu-server.git/blame - PVE/QemuServer/Memory.pm
memory hotplug: rework max memory handling, make phys-bits dependent
[qemu-server.git] / PVE / QemuServer / Memory.pm
CommitLineData
a0649da2
AD
1package PVE::QemuServer::Memory;
2
3use strict;
4use warnings;
71aba4ea 5
7023f3ea 6use PVE::Tools qw(run_command lock_file lock_file_full file_read_firstline dir_glob_foreach);
9a1c413f 7use PVE::Exception qw(raise raise_param_exc);
a0649da2 8
71aba4ea 9use PVE::QemuServer;
0a13e08e 10use PVE::QemuServer::Monitor qw(mon_cmd);
71aba4ea 11
# Exclusive upper bound for the index of the "numa$i" config keys scanned in this module.
my $MAX_NUMA = 8;

# Memory size (the error messages in this module state it in MB) that is passed
# as the fixed "-m size=" value when memory hotplug is enabled.
my $STATICMEM = 1024;
a0649da2 14
33b0d3b7
TL
# Cache for the host's physical address width; set by the first successful lookup.
my $_host_bits;

# Returns the number of physical address bits taken from the first
# "address sizes : N bits physical" line found in /proc/cpuinfo.
#
# Returns nothing (undef in scalar context) if the file cannot be opened or no
# such line exists. Only a successful result is cached in $_host_bits, so a
# failed lookup is retried on the next call.
my sub get_host_phys_address_bits {
    return $_host_bits if defined($_host_bits);

    # Use the open() builtin with a lexical handle rather than IO::File, which
    # this file never loads itself.
    open(my $fh, '<', '/proc/cpuinfo') or return;
    while (defined(my $line = <$fh>)) {
        # hopefully we never need to care about mixed (big.LITTLE) archs
        if ($line =~ m/^address sizes\s*:\s*(\d+)\s*bits physical/i) {
            $_host_bits = int($1);
            close($fh);
            return $_host_bits;
        }
    }
    close($fh);
    return; # undef, cannot really do anything..
}
31
# Derives the upper limit for the VM's total memory (in MB) from the guest's
# physical address width.
#
# The width comes from the 'phys-bits' property of the parsed $conf->{cpu}
# property string: 'host' selects the host's width, a plain number is used
# as-is. If that yields nothing, the host's width (36 when it cannot be read)
# is used, limited to 40 bits unless the cputype is 'host' or 'max'.
# The returned value never exceeds 4*1024*1024.
#
# Dies if the cpu property string cannot be parsed.
my sub get_max_mem {
    my ($conf) = @_;

    my $cpu_conf = {};
    my $cpu_prop_str = $conf->{cpu};
    if ($cpu_prop_str) {
        $cpu_conf = PVE::JSONSchema::parse_property_string('pve-vm-cpu-conf', $cpu_prop_str)
            or die "Cannot parse cpu description: $cpu_prop_str\n";
    }

    my $width;
    my $phys_bits = $cpu_conf->{'phys-bits'};
    if ($phys_bits) {
        if ($phys_bits eq 'host') {
            $width = get_host_phys_address_bits();
        } elsif ($phys_bits =~ /^(\d+)$/) {
            $width = int($phys_bits);
        }
    }

    if (!defined($width)) {
        my $host_width = get_host_phys_address_bits() // 36; # fixme: what fallback?
        my $follows_host = $cpu_conf->{cputype} && $cpu_conf->{cputype} =~ /^(host|max)$/;
        if ($follows_host || $host_width <= 40) {
            $width = $host_width;
        } else {
            $width = 40; # take the smaller one
        }
    }

    # remove 20 bits to get MB and half that as QEMU needs some overhead
    my $limit = int(1 << ($width - 21));
    my $hard_cap = 4*1024*1024;

    return $hard_cap if $limit > $hard_cap;
    return $limit;
}
63
aaff69ad
WB
# Returns the list of guest NUMA node indices for $conf.
#
# These are the indices i (0 .. $MAX_NUMA-1) whose "numa$i" entry is set and
# parses successfully. If there is no such entry, the list 0 .. sockets-1 is
# returned instead, with sockets defaulting to 1.
sub get_numa_node_list {
    my ($conf) = @_;

    my @nodes;
    for my $idx (0 .. $MAX_NUMA - 1) {
        my $raw = $conf->{"numa$idx"};
        next if !$raw;
        next if !PVE::QemuServer::parse_numa($raw);
        push @nodes, $idx;
    }
    return @nodes if @nodes;

    my $socket_count = $conf->{sockets} || 1;
    return (0..($socket_count-1));
}
76
77# only valid when numa nodes map to a single host node
# Returns a hash reference mapping guest NUMA node index => host node string.
#
# For every "numa$i" entry that is set and parses, the value is the result of
# print_numa_hostnodes() on its 'hostnodes' property. Without any such entry,
# each index in 0 .. sockets-1 (sockets defaults to 1) maps to itself.
sub get_numa_guest_to_host_map {
    my ($conf) = @_;

    my %guest_to_host;
    for my $idx (0 .. $MAX_NUMA - 1) {
        my $raw = $conf->{"numa$idx"};
        next if !$raw;
        my $parsed = PVE::QemuServer::parse_numa($raw);
        next if !$parsed;
        $guest_to_host{$idx} = print_numa_hostnodes($parsed->{hostnodes});
    }
    return \%guest_to_host if %guest_to_host;

    my $socket_count = $conf->{sockets} || 1;
    return {map { $_ => $_ } (0..($socket_count-1))};
}
90
3f669af2
AD
# Walks the DIMM layout upwards and calls $func for each DIMM until the
# accumulated size reaches $memory.
#
# The walk starts at a base size of 1024 with 512-sized DIMMs, or at
# 1024 * $sockets with 1024-sized DIMMs when $conf->{hugepages} == 1024.
# There are 8 rounds of 32 DIMMs; the DIMM size doubles after every round.
# DIMMs are named "dimm0", "dimm1", ... and assigned round-robin to the nodes
# from get_numa_node_list().
#
# $func is called as
#   $func->($conf, $vmid, $name, $dimm_size, $numanode, $current_size, $memory)
# where $current_size already includes the DIMM being visited.
#
# Returns nothing if $memory equals the base size, otherwise the accumulated
# size as soon as it is >= $memory.
sub foreach_dimm{
    my ($conf, $vmid, $memory, $sockets, $func) = @_;

    my $gigabyte_pages = $conf->{hugepages} && $conf->{hugepages} == 1024;
    my $current_size = $gigabyte_pages ? 1024 * $sockets : 1024;
    my $dimm_size = $gigabyte_pages ? 1024 : 512;

    return if $current_size == $memory;

    my @numa_map = get_numa_node_list($conf);

    my $dimm_id = 0;
    for my $round (0 .. 7) {
        for my $slot (0 .. 31) {
            my $name = "dimm${dimm_id}";
            $dimm_id++;
            my $numanode = $numa_map[$slot % @numa_map];
            $current_size += $dimm_size;
            $func->($conf, $vmid, $name, $dimm_size, $numanode, $current_size, $memory);
            return $current_size if $current_size >= $memory;
        }
        $dimm_size *= 2;
    }
}
122
# Walks the DIMM layout downwards and calls $func for each DIMM until the
# remaining size has dropped to $memory.
#
# The walk starts at a size of 4177920 with 65536-sized DIMMs, or at 8355840
# with 131072-sized DIMMs when $conf->{hugepages} == 1024. There are 8 rounds
# of 32 DIMMs; the DIMM size halves after every round. DIMMs are named
# "dimm253", "dimm252", ... and mapped to the nodes from get_numa_node_list()
# using slot index (31 - i).
#
# $func is called as
#   $func->($conf, $vmid, $name, $dimm_size, $numanode, $current_size, $memory)
# where $current_size already excludes the DIMM being visited.
#
# Returns nothing if $memory equals the start size, otherwise the remaining
# size as soon as it is <= $memory.
sub foreach_reverse_dimm {
    my ($conf, $vmid, $memory, $sockets, $func) = @_;

    my $gigabyte_pages = $conf->{hugepages} && $conf->{hugepages} == 1024;
    my $current_size = $gigabyte_pages ? 8355840 : 4177920;
    my $dimm_size = $gigabyte_pages ? 131072 : 65536;

    return if $current_size == $memory;

    my @numa_map = get_numa_node_list($conf);

    my $dimm_id = 253;
    for my $round (0 .. 7) {
        for my $slot (0 .. 31) {
            my $name = "dimm${dimm_id}";
            $dimm_id--;
            my $numanode = $numa_map[(31-$slot) % @numa_map];
            $current_size -= $dimm_size;
            $func->($conf, $vmid, $name, $dimm_size, $numanode, $current_size, $memory);
            return $current_size if $current_size <= $memory;
        }
        $dimm_size /= 2;
    }
}
154
6779f1ac
AD
# Hot-plugs or hot-unplugs memory DIMMs of a running VM until its memory
# matches the requested size.
#
# Parameters:
#   $vmid     - ID of the VM
#   $conf     - VM config hash; $conf->{memory} is updated and written back
#               with PVE::QemuConfig->write_config() after every DIMM that was
#               plugged or unplugged
#   $defaults - config defaults; its 'memory' value is used when $conf has no
#               memory set or when $value is empty
#   $opt      - not used by this function
#   $value    - requested memory size (MB)
#
# Returns $value unchanged if the VM is not running or already has that size.
#
# Dies if $value is below the static memory, if $value exceeds the limit from
# get_max_mem(), if adding a memory backend or DIMM device fails, or (via
# raise_param_exc) if a DIMM is still present after the unplug retries.
sub qemu_memory_hotplug {
    my ($vmid, $conf, $defaults, $opt, $value) = @_;

    return $value if !PVE::QemuServer::check_running($vmid);

    my $sockets = 1;
    $sockets = $conf->{sockets} if $conf->{sockets};

    my $memory = $conf->{memory} || $defaults->{memory};
    $value = $defaults->{memory} if !$value;
    return $value if $value == $memory;

    my $static_memory = $STATICMEM;
    $static_memory = $static_memory * $sockets if ($conf->{hugepages} && $conf->{hugepages} == 1024);

    die "memory can't be lower than $static_memory MB" if $value < $static_memory;
    my $MAX_MEM = get_max_mem($conf);
    # The limit has to be enforced on the requested size; checking the currently
    # configured size would let an over-limit request through.
    die "you cannot add more memory than max mem $MAX_MEM MB!\n" if $value > $MAX_MEM;

    if ($value > $memory) {

        # guest node => host node map, only computed once hugepages are needed
        my $numa_hostmap;

        foreach_dimm($conf, $vmid, $value, $sockets, sub {
            my ($conf, $vmid, $name, $dimm_size, $numanode, $current_size, $memory) = @_;

            # skip DIMMs that are already covered by the current memory size
            return if $current_size <= $conf->{memory};

            if ($conf->{hugepages}) {
                $numa_hostmap = get_numa_guest_to_host_map($conf) if !$numa_hostmap;

                my $hugepages_size = hugepages_size($conf, $dimm_size);
                my $path = hugepages_mount_path($hugepages_size);
                my $host_numanode = $numa_hostmap->{$numanode};
                my $hugepages_topology = {};
                $hugepages_topology->{$hugepages_size}->{$host_numanode} = hugepages_nr($dimm_size, $hugepages_size);

                my $code = sub {
                    my $hugepages_host_topology = hugepages_host_topology();
                    hugepages_allocate($hugepages_topology, $hugepages_host_topology);

                    eval { mon_cmd($vmid, "object-add", 'qom-type' => "memory-backend-file", id => "mem-$name", size => int($dimm_size*1024*1024), 'mem-path' => $path, share => JSON::true, prealloc => JSON::true ) };
                    if (my $err = $@) {
                        # restore the host's hugepage counts before propagating the error
                        hugepages_reset($hugepages_host_topology);
                        die $err;
                    }

                    hugepages_pre_deallocate($hugepages_topology);
                };
                eval { hugepages_update_locked($code); };

            } else {
                eval { mon_cmd($vmid, "object-add", 'qom-type' => "memory-backend-ram", id => "mem-$name", size => int($dimm_size*1024*1024) ) };
            }

            # $@ stems from whichever eval ran in the branch above
            if (my $err = $@) {
                eval { PVE::QemuServer::qemu_objectdel($vmid, "mem-$name"); };
                die $err;
            }

            eval { mon_cmd($vmid, "device_add", driver => "pc-dimm", id => "$name", memdev => "mem-$name", node => $numanode) };
            if (my $err = $@) {
                eval { PVE::QemuServer::qemu_objectdel($vmid, "mem-$name"); };
                die $err;
            }
            # update conf after each successful module hotplug
            $conf->{memory} = $current_size;
            PVE::QemuConfig->write_config($vmid, $conf);
        });

    } else {

        foreach_reverse_dimm($conf, $vmid, $value, $sockets, sub {
            my ($conf, $vmid, $name, $dimm_size, $numanode, $current_size, $memory) = @_;

            # skip DIMMs that lie above the current memory size
            return if $current_size >= $conf->{memory};
            print "try to unplug memory dimm $name\n";

            my $retry = 0;
            while (1) {
                eval { PVE::QemuServer::qemu_devicedel($vmid, $name) };
                sleep 3;
                my $dimm_list = qemu_dimm_list($vmid);
                last if !$dimm_list->{$name};
                raise_param_exc({ $name => "error unplug memory module" }) if $retry > 5;
                $retry++;
            }

            # update conf after each successful module unplug
            $conf->{memory} = $current_size;

            eval { PVE::QemuServer::qemu_objectdel($vmid, "mem-$name"); };
            PVE::QemuConfig->write_config($vmid, $conf);
        });
    }
}
250
# Queries the VM's memory devices with the "query-memory-devices" monitor
# command and returns a hash reference keyed by device id. Each value is a
# hash holding the id, node, addr, size and slot fields copied from the
# device's 'data' entry.
sub qemu_dimm_list {
    my ($vmid) = @_;

    my $devices = mon_cmd($vmid, "query-memory-devices");

    my %dimm_by_id;
    for my $device (@$devices) {
        my $data = $device->{data};
        my $id = $data->{id};
        $dimm_by_id{$id}->{$_} = $data->{$_} for qw(id node addr size slot);
    }

    return \%dimm_by_id;
}
267
0567a4d5
AD
# Appends the memory related QEMU command line arguments to @$cmd.
#
# Parameters:
#   $conf             - VM config hash
#   $vmid             - VM ID, only handed through to foreach_dimm()
#   $sockets, $cores  - CPU topology used to spread memory and CPUs over nodes
#   $defaults         - config defaults, source of the memory size when unset
#   $hotplug_features - hash; a true 'memory' entry enables memory hotplug
#   $cmd              - array reference the arguments are pushed onto
#
# With memory hotplug, '-m' gets a fixed static size plus slots/maxmem and the
# rest of the memory is added as pc-dimm devices. Without it, '-m' gets the
# whole memory size. If $conf->{numa} is set, one memory backend object and one
# '-numa' node are emitted per custom "numa$i" entry, or per socket when there
# is no custom topology.
#
# Dies on inconsistent settings (see the individual messages below).
sub config {
    my ($conf, $vmid, $sockets, $cores, $defaults, $hotplug_features, $cmd) = @_;

    my $memory = $conf->{memory} || $defaults->{memory};
    my $static_memory = 0;
    my $hotplug_enabled = $hotplug_features->{memory};

    if ($hotplug_enabled) {
        die "NUMA needs to be enabled for memory hotplug\n" if !$conf->{numa};
        my $MAX_MEM = get_max_mem($conf);
        die "Total memory is bigger than ${MAX_MEM}MB\n" if $memory > $MAX_MEM;

        for my $idx (0 .. $MAX_NUMA - 1) {
            die "cannot enable memory hotplugging with custom NUMA topology\n"
                if $conf->{"numa$idx"};
        }

        # socket count from the config; only used for the static memory size here
        my $conf_sockets = $conf->{sockets} ? $conf->{sockets} : 1;

        $static_memory = $STATICMEM;
        $static_memory *= $conf_sockets if $conf->{hugepages} && $conf->{hugepages} == 1024;

        die "minimum memory must be ${static_memory}MB\n" if $memory < $static_memory;
        push @$cmd, '-m', "size=${static_memory},slots=255,maxmem=${MAX_MEM}M";
    } else {
        $static_memory = $memory;
        push @$cmd, '-m', $static_memory;
    }

    die "numa needs to be enabled to use hugepages" if $conf->{hugepages} && !$conf->{numa};

    if ($conf->{numa}) {

        my $numa_totalmemory = undef;

        # custom topology: one backend object and one node per valid numa$i entry
        for my $idx (0 .. $MAX_NUMA - 1) {
            my $raw = $conf->{"numa$idx"};
            next if !$raw;
            my $numa = PVE::QemuServer::parse_numa($raw);
            next if !$numa;

            # memory
            my $node_memory = $numa->{memory};
            die "missing NUMA node$idx memory value\n" if !$node_memory;
            $numa_totalmemory += $node_memory;

            my $mem_object = print_mem_object($conf, "ram-node$idx", $node_memory);

            # cpus
            my $cpulists = $numa->{cpus};
            die "missing NUMA node$idx cpus\n" if !defined($cpulists);
            my @cpu_ranges;
            for my $range (@$cpulists) {
                my ($first, $last) = @$range;
                push @cpu_ranges, defined($last) ? "$first-$last" : $first;
            }
            my $cpus = join(',cpus=', @cpu_ranges);

            # hostnodes
            my $hostnodelists = $numa->{hostnodes};
            if (!defined($hostnodelists)) {
                die "numa hostnodes need to be defined to use hugepages" if $conf->{hugepages};
            } else {
                my $hostnodes = print_numa_hostnodes($hostnodelists);

                # policy
                my $policy = $numa->{policy};
                die "you need to define a policy for hostnode $hostnodes\n" if !$policy;
                $mem_object .= ",host-nodes=$hostnodes,policy=$policy";
            }

            push @$cmd, '-object', $mem_object;
            push @$cmd, '-numa', "node,nodeid=$idx,cpus=$cpus,memdev=ram-node$idx";
        }

        die "total memory for NUMA nodes must be equal to vm static memory\n"
            if $numa_totalmemory && $numa_totalmemory != $static_memory;

        # if no custom topology, we split memory and cores across numa nodes
        if (!$numa_totalmemory) {
            my $node_memory = ($static_memory / $sockets);

            for my $idx (0 .. $sockets - 1) {
                die "host NUMA node$idx doesn't exist\n"
                    if !-d "/sys/devices/system/node/node$idx/" && $conf->{hugepages};

                my $mem_object = print_mem_object($conf, "ram-node$idx", $node_memory);
                push @$cmd, '-object', $mem_object;

                my $first_cpu = ($cores * $idx);
                my $cpus = $cores > 1 ? $first_cpu . "-" . ($first_cpu + $cores - 1) : $first_cpu;

                push @$cmd, '-numa', "node,nodeid=$idx,cpus=$cpus,memdev=ram-node$idx";
            }
        }
    }

    if ($hotplug_enabled) {
        foreach_dimm($conf, $vmid, $memory, $sockets, sub {
            my ($conf, $vmid, $name, $dimm_size, $numanode, $current_size, $memory) = @_;

            my $mem_object = print_mem_object($conf, "mem-$name", $dimm_size);

            push @$cmd, "-object", $mem_object;
            push @$cmd, "-device", "pc-dimm,id=$name,memdev=mem-$name,node=$numanode";

            die "memory size ($memory) must be aligned to $dimm_size for hotplugging\n"
                if $current_size > $memory;
        });
    }
}
376
7023f3ea
AD
# Returns the QEMU memory backend object string for a backend named $id with
# a size of $size MB.
#
# With $conf->{hugepages} set, a file backend on the mount path matching the
# page size chosen by hugepages_size() is described; otherwise a plain RAM
# backend.
sub print_mem_object {
    my ($conf, $id, $size) = @_;

    return "memory-backend-ram,id=$id,size=${size}M" if !$conf->{hugepages};

    my $page_size = hugepages_size($conf, $size);
    my $mount_path = hugepages_mount_path($page_size);

    return "memory-backend-file,id=$id,size=${size}M,mem-path=$mount_path,share=on,prealloc=yes";
}
391
ac7b7087
AD
# Formats a list of host NUMA node ranges as a comma separated string.
#
# $hostnodelists is an array reference of [$start] or [$start, $end] pairs;
# a pair with a defined $end is rendered as "$start-$end". For example
# [[0], [2, 3]] becomes "0,2-3".
#
# Returns undef for an empty list. Dies if any node in a range has no
# /sys/devices/system/node/node<N>/ directory on the host.
sub print_numa_hostnodes {
    my ($hostnodelists) = @_;

    my $hostnodes;
    foreach my $hostnoderange (@$hostnodelists) {
        my ($start, $end) = @$hostnoderange;
        # Check definedness, not truth: after a sole leading node "0" the
        # string is false and the separator would be dropped ("01" not "0,1").
        $hostnodes .= ',' if defined($hostnodes);
        $hostnodes .= $start;
        $hostnodes .= "-$end" if defined($end);
        $end //= $start;
        for (my $i = $start; $i <= $end; ++$i ) {
            die "host NUMA node$i doesn't exist\n" if ! -d "/sys/devices/system/node/node$i/";
        }
    }
    return $hostnodes;
}
408
7023f3ea
AD
# Mounts a hugetlbfs instance at /run/hugepages/kvm/<size>kB for each of the
# page sizes 2048 kB and 1048576 kB that has a matching directory below
# /sys/kernel/mm/hugepages/, unless the parsed mount table already lists a
# hugetlbfs mount at that path. The mount point is created if missing.
sub hugepages_mount {

    my $mounts = PVE::ProcFSTools::parse_proc_mounts();

    for my $size_kb (qw(2048 1048576)) {
        next if !-d "/sys/kernel/mm/hugepages/hugepages-${size_kb}kB";

        my $target = "/run/hugepages/kvm/${size_kb}kB";

        my $already_mounted = grep {
            $_->[2] =~ /^hugetlbfs/ && $_->[1] eq $target
        } @$mounts;
        next if $already_mounted;

        File::Path::make_path($target) if !-d $target;
        run_command(
            ['/bin/mount', '-t', 'hugetlbfs', '-o', "pagesize=${size_kb}k", 'hugetlbfs', $target],
            errmsg => "hugepage mount error",
        );
    }
}
431
# Returns the hugetlbfs mount path used for pages of $size MB; the directory
# name carries the size converted to kB.
sub hugepages_mount_path {
    my ($size) = @_;

    my $size_kb = $size * 1024;

    return "/run/hugepages/kvm/${size_kb}kB";
}
439
# Returns how many pages of $hugepages_size are needed to cover $size
# (both given in the same unit); this is a plain division.
sub hugepages_nr {
    my ($size, $hugepages_size) = @_;

    my $page_count = $size / $hugepages_size;

    return $page_count;
}
445
# Picks the hugepage size (in MB) to back a memory region of $size MB.
#
# With $conf->{hugepages} eq 'any', 1024 is returned when the host has
# 1024 MB pages and $size is a multiple of 1024, otherwise 2 when the host has
# 2 MB pages. Any other value of $conf->{hugepages} is taken as the requested
# page size and returned after validation.
#
# Host support is detected through the /sys/kernel/mm/hugepages/hugepages-<N>kB
# directories.
#
# Dies if hugepages are not enabled in $conf, if $size is not a positive even
# integer, if the host offers neither 2 MB nor 1024 MB pages, if the requested
# page size is not available, or if $size is not a multiple of the page size.
sub hugepages_size {
    my ($conf, $size) = @_;
    die "hugepages option is not enabled" if !$conf->{hugepages};
    die "memory size '$size' is not a positive even integer; cannot use for hugepages\n"
        if $size <= 0 || $size & 1;

    # true if the host exposes hugepages of the given size (in MB)
    my $page_chunk = sub { -d "/sys/kernel/mm/hugepages/hugepages-". ($_[0] * 1024) ."kB" };
    die "your system doesn't support hugepages\n" if !$page_chunk->(2) && !$page_chunk->(1024);

    if ($conf->{hugepages} eq 'any') {

        # try to use 1GB if available && memory size is matching
        if ($page_chunk->(1024) && ($size & 1023) == 0) {
            return 1024;
        } elsif ($page_chunk->(2)) {
            return 2;
        } else {
            # the only available page size is 1024 MB (1 GB), not "1024 GB"
            die "host only supports 1024 MB hugepages, but requested size '$size' is not a multiple of 1024 MB\n";
        }
    } else {

        my $hugepagesize = $conf->{hugepages};

        if (!$page_chunk->($hugepagesize)) {
            die "your system doesn't support hugepages of $hugepagesize MB\n";
        } elsif (($size % $hugepagesize) != 0) {
            die "Memory size $size is not a multiple of the requested hugepages size $hugepagesize\n";
        }

        return $hugepagesize;
    }
}
478
# Computes how many hugepages of which size the VM described by $conf needs on
# each host NUMA node.
#
# Returns nothing if $conf->{numa} is not set, otherwise a hash reference of
# the form { <page size in MB> => { <host node> => <number of pages> } }.
#
# The count covers the custom "numa$i" nodes, or an even split of the static
# memory over the sockets when there is no custom topology, plus all DIMMs
# visited by foreach_dimm() when memory hotplug is enabled.
#
# Dies if a custom NUMA node is bound to anything but exactly one host node,
# and on any error raised by hugepages_size() or print_numa_hostnodes().
sub hugepages_topology {
    my ($conf) = @_;

    my $hugepages_topology = {};

    return if !$conf->{numa};

    my $defaults = PVE::QemuServer::load_defaults();
    my $memory = $conf->{memory} || $defaults->{memory};
    my $static_memory = 0;
    my $sockets = 1;
    $sockets = $conf->{smp} if $conf->{smp}; # old style - no longer used
    $sockets = $conf->{sockets} if $conf->{sockets};
    my $numa_custom_topology = undef;
    my $hotplug_features = PVE::QemuServer::parse_hotplug_features(defined($conf->{hotplug}) ? $conf->{hotplug} : '1');

    if ($hotplug_features->{memory}) {
        $static_memory = $STATICMEM;
        $static_memory = $static_memory * $sockets if ($conf->{hugepages} && $conf->{hugepages} == 1024);
    } else {
        $static_memory = $memory;
    }

    # custom numa topology
    for (my $i = 0; $i < $MAX_NUMA; $i++) {
        next if !$conf->{"numa$i"};
        my $numa = PVE::QemuServer::parse_numa($conf->{"numa$i"});
        next if !$numa;

        $numa_custom_topology = 1;
        my $numa_memory = $numa->{memory};
        my $hostnodelists = $numa->{hostnodes};
        my $hostnodes = print_numa_hostnodes($hostnodelists);

        # Accept any single node number; lists and ranges contain ',' or '-'.
        # Matching a single digit only would wrongly reject host nodes >= 10.
        die "more than 1 hostnode value in numa node is not supported when hugepages are enabled" if $hostnodes !~ m/^\d+$/;
        my $hugepages_size = hugepages_size($conf, $numa_memory);
        $hugepages_topology->{$hugepages_size}->{$hostnodes} += hugepages_nr($numa_memory, $hugepages_size);

    }

    # if no custom numa topology, we split memory and cores across numa nodes
    if(!$numa_custom_topology) {

        my $numa_memory = ($static_memory / $sockets);

        for (my $i = 0; $i < $sockets; $i++) {

            my $hugepages_size = hugepages_size($conf, $numa_memory);
            $hugepages_topology->{$hugepages_size}->{$i} += hugepages_nr($numa_memory, $hugepages_size);
        }
    }

    if ($hotplug_features->{memory}) {
        my $numa_hostmap = get_numa_guest_to_host_map($conf);

        foreach_dimm($conf, undef, $memory, $sockets, sub {
            my ($conf, undef, $name, $dimm_size, $numanode, $current_size, $memory) = @_;

            # translate the guest node into the host node it is bound to
            $numanode = $numa_hostmap->{$numanode};

            my $hugepages_size = hugepages_size($conf, $dimm_size);
            $hugepages_topology->{$hugepages_size}->{$numanode} += hugepages_nr($dimm_size, $hugepages_size);
        });
    }

    return $hugepages_topology;
}
546
# Reads the host's current hugepage counts from sysfs.
#
# Returns a hash reference { <page size in MB> => { <node> => <nr_hugepages> } }
# built from /sys/devices/system/node/node<N>/hugepages/hugepages-<S>kB/nr_hugepages.
sub hugepages_host_topology {

    my %host_pages;

    dir_glob_foreach("/sys/devices/system/node/", 'node(\d+)', sub {
        my ($node_dir, $node_id) = @_;

        my $pages_dir = "/sys/devices/system/node/$node_dir/hugepages/";

        dir_glob_foreach($pages_dir, 'hugepages\-(\d+)kB', sub {
            my ($size_dir, $size_kb) = @_;

            my $size_mb = $size_kb / 1024;
            my $page_count = PVE::Tools::file_read_firstline("/sys/devices/system/node/$node_dir/hugepages/$size_dir/nr_hugepages");
            $host_pages{$size_mb}->{$node_id} = $page_count;
        });
    });

    return \%host_pages;
}
566
# Makes sure that, for every page size and node in $hugepages_topology, at
# least the requested number of hugepages is free on the host.
#
# Where too few pages are free, the node's nr_hugepages is raised by the
# missing amount and the free count is read again. If it is still too low,
# the host is reset to $hugepages_host_topology via hugepages_reset() and the
# function dies.
sub hugepages_allocate {
    my ($hugepages_topology, $hugepages_host_topology) = @_;

    # allocate new hugepages if needed
    for my $size_mb (sort keys %$hugepages_topology) {

        my $size_kb = $size_mb * 1024;
        my $per_node = $hugepages_topology->{$size_mb};

        for my $node (keys %$per_node) {

            my $requested = $per_node->{$node};
            my $sysfs_dir = "/sys/devices/system/node/node${node}/hugepages/hugepages-${size_kb}kB/";
            my $free = PVE::Tools::file_read_firstline($sysfs_dir."free_hugepages");
            my $total = PVE::Tools::file_read_firstline($sysfs_dir."nr_hugepages");

            next if !($requested > $free);

            my $missing = $requested - $free;
            PVE::ProcFSTools::write_proc_entry($sysfs_dir."nr_hugepages", $total + $missing);

            # verify that the pages were really allocated
            $free = PVE::Tools::file_read_firstline($sysfs_dir."free_hugepages");
            next if !($free < $requested);

            # rollback to initial host config
            hugepages_reset($hugepages_host_topology);
            die "hugepage allocation failed";
        }
    }
}
599
ca0ef6b1
KT
# Looks up the number of hugepages of $size MB configured on the kernel
# command line (/proc/cmdline).
#
# The arguments are scanned in order: "hugepagesz=2M" and "hugepagesz=1G"
# select the page size (2 MB until one is seen) that a following
# "hugepages=<n>" refers to. The captured count of the first "hugepages="
# argument belonging to $size is returned, or 0 if there is none.
sub hugepages_default_nr_hugepages {
    my ($size) = @_;

    my $cmdline = PVE::Tools::file_read_firstline("/proc/cmdline");
    my $args = PVE::Tools::split_args($cmdline);

    my %size_for_arg = (
        "hugepagesz=2M" => 2,
        "hugepagesz=1G" => 1024,
    );

    my $active_size = 2; # default is 2M

    for my $arg (@$args) {
        if (exists($size_for_arg{$arg})) {
            $active_size = $size_for_arg{$arg};
            next;
        }
        if ($arg =~ m/^hugepages=(\d+)?$/ && $active_size == $size) {
            return $1;
        }
    }

    return 0;
}
622
7023f3ea
AD
# For every page size present in $hugepages_topology, writes the count from
# hugepages_default_nr_hugepages() to the global
# /sys/kernel/mm/hugepages/hugepages-<S>kB/nr_hugepages entry.
sub hugepages_pre_deallocate {
    my ($hugepages_topology) = @_;

    for my $size_mb (sort keys %$hugepages_topology) {

        my $size_kb = $size_mb * 1024;
        my $sysfs_dir = "/sys/kernel/mm/hugepages/hugepages-${size_kb}kB/";
        my $default_count = hugepages_default_nr_hugepages($size_mb);

        PVE::ProcFSTools::write_proc_entry($sysfs_dir."nr_hugepages", $default_count);
    }
}
634
# Writes the page counts stored in $hugepages_topology
# ({ <page size in MB> => { <node> => <count> } }) to the per-node
# nr_hugepages entries below /sys/devices/system/node/.
sub hugepages_reset {
    my ($hugepages_topology) = @_;

    for my $size_mb (sort keys %$hugepages_topology) {

        my $size_kb = $size_mb * 1024;
        my $per_node = $hugepages_topology->{$size_mb};

        for my $node (keys %$per_node) {

            my $sysfs_dir = "/sys/devices/system/node/node${node}/hugepages/hugepages-${size_kb}kB/";

            PVE::ProcFSTools::write_proc_entry($sysfs_dir."nr_hugepages", $per_node->{$node});
        }
    }
}
651
# Runs $code (called with @param) through lock_file() on
# /var/lock/hugepages.lck with a timeout of 60. Re-throws whatever is left in
# $@ afterwards, otherwise returns the value lock_file() returned.
sub hugepages_update_locked {
    my ($code, @param) = @_;

    my $lock_filename = "/var/lock/hugepages.lck";
    my $timeout = 60; # could be long if a lot of hugepages need to be allocated

    my $result = lock_file($lock_filename, $timeout, $code, @param);
    my $err = $@;
    die $err if $err;

    return $result;
}
3f669af2 6641;
a0649da2 665