X-Git-Url: https://git.proxmox.com/?a=blobdiff_plain;f=PVE%2FQemuServer%2FMemory.pm;h=b7cf5d57e066013c5b4d87c53117f01ca585a9bc;hb=746232eeb12aa2e55941afd800b8fbe12241a10d;hp=58c7217ca8480035cc6144eccc3ef28565dfb4a8;hpb=a2a5ffc92868c839b2e0c206a904b440bcec2c87;p=qemu-server.git diff --git a/PVE/QemuServer/Memory.pm b/PVE/QemuServer/Memory.pm index 58c7217c..b7cf5d57 100644 --- a/PVE/QemuServer/Memory.pm +++ b/PVE/QemuServer/Memory.pm @@ -2,14 +2,44 @@ package PVE::QemuServer::Memory; use strict; use warnings; -use PVE::QemuServer; + use PVE::Tools qw(run_command lock_file lock_file_full file_read_firstline dir_glob_foreach); use PVE::Exception qw(raise raise_param_exc); +use PVE::QemuServer; +use PVE::QemuServer::Monitor qw(mon_cmd); + my $MAX_NUMA = 8; my $MAX_MEM = 4194304; my $STATICMEM = 1024; +sub get_numa_node_list { + my ($conf) = @_; + my @numa_map; + for (my $i = 0; $i < $MAX_NUMA; $i++) { + my $entry = $conf->{"numa$i"} or next; + my $numa = PVE::QemuServer::parse_numa($entry) or next; + push @numa_map, $i; + } + return @numa_map if @numa_map; + my $sockets = $conf->{sockets} || 1; + return (0..($sockets-1)); +} + +# only valid when numa nodes map to a single host node +sub get_numa_guest_to_host_map { + my ($conf) = @_; + my $map = {}; + for (my $i = 0; $i < $MAX_NUMA; $i++) { + my $entry = $conf->{"numa$i"} or next; + my $numa = PVE::QemuServer::parse_numa($entry) or next; + $map->{$i} = print_numa_hostnodes($numa->{hostnodes}); + } + return $map if %$map; + my $sockets = $conf->{sockets} || 1; + return {map { $_ => $_ } (0..($sockets-1))}; +} + sub foreach_dimm{ my ($conf, $vmid, $memory, $sockets, $func) = @_; @@ -27,11 +57,13 @@ sub foreach_dimm{ return if $current_size == $memory; + my @numa_map = get_numa_node_list($conf); + for (my $j = 0; $j < 8; $j++) { for (my $i = 0; $i < 32; $i++) { my $name = "dimm${dimm_id}"; $dimm_id++; - my $numanode = $i % $sockets; + my $numanode = $numa_map[$i % @numa_map]; $current_size += $dimm_size; &$func($conf, $vmid, $name, $dimm_size, $numanode, $current_size, $memory); return $current_size if $current_size >= $memory; @@ -57,11 +89,13 @@ sub foreach_reverse_dimm { return if $current_size == $memory; + my @numa_map = get_numa_node_list($conf); + for (my $j = 0; $j < 8; $j++) { for (my $i = 0; $i < 32; $i++) { my $name = "dimm${dimm_id}"; $dimm_id--; - my $numanode = $i % $sockets; + my $numanode = $numa_map[(31-$i) % @numa_map]; $current_size -= $dimm_size; &$func($conf, $vmid, $name, $dimm_size, $numanode, $current_size, $memory); return $current_size if $current_size <= $memory; @@ -90,6 +124,8 @@ sub qemu_memory_hotplug { if($value > $memory) { + my $numa_hostmap = get_numa_guest_to_host_map($conf) if $conf->{hugepages}; + foreach_dimm($conf, $vmid, $value, $sockets, sub { my ($conf, $vmid, $name, $dimm_size, $numanode, $current_size, $memory) = @_; @@ -99,13 +135,14 @@ sub qemu_memory_hotplug { my $hugepages_size = hugepages_size($conf, $dimm_size); my $path = hugepages_mount_path($hugepages_size); - my $hugepages_topology->{$hugepages_size}->{$numanode} = hugepages_nr($dimm_size, $hugepages_size); + my $host_numanode = $numa_hostmap->{$numanode}; + my $hugepages_topology->{$hugepages_size}->{$host_numanode} = hugepages_nr($dimm_size, $hugepages_size); my $code = sub { my $hugepages_host_topology = hugepages_host_topology(); hugepages_allocate($hugepages_topology, $hugepages_host_topology); - eval { PVE::QemuServer::vm_mon_cmd($vmid, "object-add", 'qom-type' => "memory-backend-file", id => "mem-$name", props => { + eval { mon_cmd($vmid, "object-add", 'qom-type' => "memory-backend-file", id => "mem-$name", props => { size => int($dimm_size*1024*1024), 'mem-path' => $path, share => JSON::true, prealloc => JSON::true } ); }; if (my $err = $@) { hugepages_reset($hugepages_host_topology); @@ -117,7 +154,7 @@ sub qemu_memory_hotplug { eval { hugepages_update_locked($code); }; } else { - eval { PVE::QemuServer::vm_mon_cmd($vmid, "object-add", 'qom-type' => "memory-backend-ram", id => "mem-$name", props => { size => int($dimm_size*1024*1024) } ) }; + eval { mon_cmd($vmid, "object-add", 'qom-type' => "memory-backend-ram", id => "mem-$name", props => { size => int($dimm_size*1024*1024) } ) }; } if (my $err = $@) { @@ -125,7 +162,7 @@ sub qemu_memory_hotplug { die $err; } - eval { PVE::QemuServer::vm_mon_cmd($vmid, "device_add", driver => "pc-dimm", id => "$name", memdev => "mem-$name", node => $numanode) }; + eval { mon_cmd($vmid, "device_add", driver => "pc-dimm", id => "$name", memdev => "mem-$name", node => $numanode) }; if (my $err = $@) { eval { PVE::QemuServer::qemu_objectdel($vmid, "mem-$name"); }; die $err; @@ -165,7 +202,7 @@ sub qemu_memory_hotplug { sub qemu_dimm_list { my ($vmid) = @_; - my $dimmarray = PVE::QemuServer::vm_mon_cmd_nocheck($vmid, "query-memory-devices"); + my $dimmarray = mon_cmd($vmid, "query-memory-devices"); my $dimms = {}; foreach my $dimm (@$dimmarray) { @@ -181,13 +218,19 @@ sub qemu_dimm_list { sub config { my ($conf, $vmid, $sockets, $cores, $defaults, $hotplug_features, $cmd) = @_; - + my $memory = $conf->{memory} || $defaults->{memory}; my $static_memory = 0; if ($hotplug_features->{memory}) { die "NUMA needs to be enabled for memory hotplug\n" if !$conf->{numa}; die "Total memory is bigger than ${MAX_MEM}MB\n" if $memory > $MAX_MEM; + + for (my $i = 0; $i < $MAX_NUMA; $i++) { + die "cannot enable memory hotplugging with custom NUMA topology\n" + if $conf->{"numa$i"}; + } + my $sockets = 1; $sockets = $conf->{sockets} if $conf->{sockets}; @@ -222,7 +265,7 @@ sub config { # cpus my $cpulists = $numa->{cpus}; die "missing NUMA node$i cpus\n" if !defined($cpulists); - my $cpus = join(',', map { + my $cpus = join(',cpus=', map { my ($start, $end) = @$_; defined($end) ? "$start-$end" : $start } @$cpulists); @@ -230,17 +273,8 @@ sub config { # hostnodes my $hostnodelists = $numa->{hostnodes}; if (defined($hostnodelists)) { - my $hostnodes; - foreach my $hostnoderange (@$hostnodelists) { - my ($start, $end) = @$hostnoderange; - $hostnodes .= ',' if $hostnodes; - $hostnodes .= $start; - $hostnodes .= "-$end" if defined($end); - $end //= $start; - for (my $i = $start; $i <= $end; ++$i ) { - die "host NUMA node$i doesn't exist\n" if ! -d "/sys/devices/system/node/node$i/"; - } - } + + my $hostnodes = print_numa_hostnodes($hostnodelists); # policy my $policy = $numa->{policy}; @@ -287,11 +321,8 @@ sub config { push @$cmd, "-object" , $mem_object; push @$cmd, "-device", "pc-dimm,id=$name,memdev=mem-$name,node=$numanode"; - #if dimm_memory is not aligned to dimm map - if($current_size > $memory) { - $conf->{memory} = $current_size; - PVE::QemuConfig->write_config($vmid, $conf); - } + die "memory size ($memory) must be aligned to $dimm_size for hotplugging\n" + if $current_size > $memory; }); } } @@ -311,12 +342,29 @@ sub print_mem_object { } +sub print_numa_hostnodes { + my ($hostnodelists) = @_; + + my $hostnodes; + foreach my $hostnoderange (@$hostnodelists) { + my ($start, $end) = @$hostnoderange; + $hostnodes .= ',' if $hostnodes; + $hostnodes .= $start; + $hostnodes .= "-$end" if defined($end); + $end //= $start; + for (my $i = $start; $i <= $end; ++$i ) { + die "host NUMA node$i doesn't exist\n" if ! -d "/sys/devices/system/node/node$i/"; + } + } + return $hostnodes; +} + sub hugepages_mount { my $mountdata = PVE::ProcFSTools::parse_proc_mounts(); foreach my $size (qw(2048 1048576)) { - return if (! -d "/sys/kernel/mm/hugepages/hugepages-${size}kB"); + next if (! -d "/sys/kernel/mm/hugepages/hugepages-${size}kB"); my $path = "/run/hugepages/kvm/${size}kB"; @@ -349,30 +397,36 @@ sub hugepages_nr { } sub hugepages_size { - my ($conf, $size) = @_; + my ($conf, $size) = @_; + die "hugepages option is not enabled" if !$conf->{hugepages}; + die "memory size '$size' is not a positive even integer; cannot use for hugepages\n" + if $size <= 0 || $size & 1; - die "hugepages option is not enabled" if !$conf->{hugepages}; + my $page_chunk = sub { -d "/sys/kernel/mm/hugepages/hugepages-". ($_[0] * 1024) ."kB" }; + die "your system doesn't support hugepages\n" if !$page_chunk->(2) && !$page_chunk->(1024); - if ($conf->{hugepages} eq 'any') { + if ($conf->{hugepages} eq 'any') { - #try to use 1GB if available && memory size is matching - if (-d "/sys/kernel/mm/hugepages/hugepages-1048576kB" && ($size % 1024 == 0)) { + # try to use 1GB if available && memory size is matching + if ($page_chunk->(1024) && ($size & 1023) == 0) { return 1024; - } else { + } elsif ($page_chunk->(2)) { return 2; + } else { + die "host only supports 1024 GB hugepages, but requested size '$size' is not a multiple of 1024 MB\n" } + } else { - } else { - - my $hugepagesize = $conf->{hugepages} * 1024 . "kB"; + my $hugepagesize = $conf->{hugepages}; - if (! -d "/sys/kernel/mm/hugepages/hugepages-$hugepagesize") { - die "your system doesn't support hugepages of $hugepagesize"; + if (!$page_chunk->($hugepagesize)) { + die "your system doesn't support hugepages of $hugepagesize MB\n"; + } elsif (($size % $hugepagesize) != 0) { + die "Memory size $size is not a multiple of the requested hugepages size $hugepagesize\n"; } - die "Memory size $size is not a multiple of the requested hugepages size $hugepagesize" if ($size % $conf->{hugepages}) != 0; - return $conf->{hugepages}; - } + return $hugepagesize + } } sub hugepages_topology { @@ -406,9 +460,12 @@ sub hugepages_topology { $numa_custom_topology = 1; my $numa_memory = $numa->{memory}; + my $hostnodelists = $numa->{hostnodes}; + my $hostnodes = print_numa_hostnodes($hostnodelists); + die "more than 1 hostnode value in numa node is not supported when hugepages are enabled" if $hostnodes !~ m/^(\d)$/; my $hugepages_size = hugepages_size($conf, $numa_memory); - $hugepages_topology->{$hugepages_size}->{$i} += hugepages_nr($numa_memory, $hugepages_size); + $hugepages_topology->{$hugepages_size}->{$hostnodes} += hugepages_nr($numa_memory, $hugepages_size); } @@ -425,9 +482,13 @@ sub hugepages_topology { } if ($hotplug_features->{memory}) { + my $numa_hostmap = get_numa_guest_to_host_map($conf); + foreach_dimm($conf, undef, $memory, $sockets, sub { my ($conf, undef, $name, $dimm_size, $numanode, $current_size, $memory) = @_; + $numanode = $numa_hostmap->{$numanode}; + my $hugepages_size = hugepages_size($conf, $dimm_size); $hugepages_topology->{$hugepages_size}->{$numanode} += hugepages_nr($dimm_size, $hugepages_size); }); @@ -489,6 +550,29 @@ sub hugepages_allocate { } +sub hugepages_default_nr_hugepages { + my ($size) = @_; + + my $cmdline = PVE::Tools::file_read_firstline("/proc/cmdline"); + my $args = PVE::Tools::split_args($cmdline); + + my $parsed_size = 2; # default is 2M + + foreach my $arg (@$args) { + if ($arg eq "hugepagesz=2M") { + $parsed_size = 2; + } elsif ($arg eq "hugepagesz=1G") { + $parsed_size = 1024; + } elsif ($arg =~ m/^hugepages=(\d+)?$/) { + if ($parsed_size == $size) { + return $1; + } + } + } + + return 0; +} + sub hugepages_pre_deallocate { my ($hugepages_topology) = @_; @@ -496,8 +580,8 @@ sub hugepages_pre_deallocate { my $hugepages_size = $size * 1024; my $path = "/sys/kernel/mm/hugepages/hugepages-${hugepages_size}kB/"; - my $hugepages_nr = PVE::Tools::file_read_firstline($path."nr_hugepages"); - PVE::ProcFSTools::write_proc_entry($path."nr_hugepages", 0); + my $hugepages_nr = hugepages_default_nr_hugepages($size); + PVE::ProcFSTools::write_proc_entry($path."nr_hugepages", $hugepages_nr); } }