1 package PVE
::Service
::pvestatd
;
11 use Time
::HiRes qw
(gettimeofday
);
12 use PVE
::Tools
qw(dir_glob_foreach file_read_firstline);
18 use PVE
::Cluster
qw(cfs_read_file);
21 use PVE
::QemuServer
::Monitor
;
25 use PVE
::RPCEnvironment
;
26 use PVE
::API2
::Subscription
;
28 use PVE
::AccessControl
;
29 use PVE
::Ceph
::Services
;
34 use PVE
::Status
::Plugin
;
36 use base
qw(PVE::Daemon);
40 require PVE
::Network
::SDN
;
47 my $nodename = PVE
::INotify
::nodename
();
49 my $cmdline = [$0, @ARGV];
51 my %daemon_options = (restart_on_error
=> 5, stop_wait_time
=> 5);
52 my $daemon = __PACKAGE__-
>new('pvestatd', $cmdline, %daemon_options);
57 $opt_debug = $self->{debug
};
59 PVE
::Cluster
::cfs_update
();
65 syslog
('info' , "server closing");
68 1 while (waitpid(-1, POSIX
::WNOHANG
()) > 0);
70 $self->exit_daemon(0);
79 my $cached_kvm_version = '';
80 my $next_flag_update_time;
81 my $failed_flag_update_delay_sec = 120;
83 sub update_supported_cpuflags
{
84 my $kvm_version = PVE
::QemuServer
::kvm_user_version
();
86 # only update when QEMU/KVM version has changed, as that is the only reason
87 # why flags could change without restarting pvestatd
88 return if $cached_kvm_version && $cached_kvm_version eq $kvm_version;
90 if ($next_flag_update_time && $next_flag_update_time > time()) {
93 $next_flag_update_time = 0;
95 my $supported_cpuflags = eval { PVE
::QemuServer
::query_supported_cpu_flags
() };
98 if (!$supported_cpuflags ||
99 (!$supported_cpuflags->{tcg
} && !$supported_cpuflags->{kvm
})) {
100 # something went wrong, clear broadcast flags and set try-again delay
101 warn "CPU flag detection failed, will try again after delay\n";
102 $next_flag_update_time = time() + $failed_flag_update_delay_sec;
104 $supported_cpuflags = {};
106 # only set cached version if there's actually something to broadcast
107 $cached_kvm_version = $kvm_version;
110 for my $accel ("tcg", "kvm") {
111 if ($supported_cpuflags->{$accel}) {
112 PVE
::Cluster
::broadcast_node_kv
("cpuflags-$accel", join(' ', @{$supported_cpuflags->{$accel}}));
114 # clear potentially invalid data
115 PVE
::Cluster
::broadcast_node_kv
("cpuflags-$accel", '');
120 my $generate_rrd_string = sub {
123 return join(':', map { $_ // 'U' } @$data);
126 my sub broadcast_static_node_info
{
127 my ($cpus, $memory) = @_;
129 my $old = PVE
::Cluster
::get_node_kv
('static-info', $nodename);
130 $old = eval { decode_json
($old->{$nodename}) } if defined($old->{$nodename});
133 !defined($old->{cpus
}) || $old->{cpus
} != $cpus
134 || !defined($old->{memory
}) || $old->{memory
} != $memory
140 PVE
::Cluster
::broadcast_node_kv
('static-info', encode_json
($info));
144 sub update_node_status
{
145 my ($status_cfg) = @_;
147 my ($uptime) = PVE
::ProcFSTools
::read_proc_uptime
();
149 my ($avg1, $avg5, $avg15) = PVE
::ProcFSTools
::read_loadavg
();
150 my $stat = PVE
::ProcFSTools
::read_proc_stat
();
151 my $cpuinfo = PVE
::ProcFSTools
::read_cpuinfo
();
152 my $maxcpu = $cpuinfo->{cpus
};
154 update_supported_cpuflags
();
156 my $subinfo = PVE
::API2
::Subscription
::read_etc_subscription
();
157 my $sublevel = $subinfo->{level
} || '';
159 my $netdev = PVE
::ProcFSTools
::read_proc_net_dev
();
160 # traffic from/to physical interface cards
161 my ($netin, $netout) = (0, 0);
162 for my $dev (grep { /^$PVE::Network::PHYSICAL_NIC_RE$/ } keys %$netdev) {
163 $netin += $netdev->{$dev}->{receive
};
164 $netout += $netdev->{$dev}->{transmit
};
167 my $meminfo = PVE
::ProcFSTools
::read_meminfo
();
169 my $dinfo = df
('/', 1); # output is bytes
170 # everything not free is considered to be used
171 my $dused = $dinfo->{blocks
} - $dinfo->{bfree
};
175 my $data = $generate_rrd_string->(
176 [$uptime, $sublevel, $ctime, $avg1, $maxcpu, $stat->{cpu
}, $stat->{wait},
177 $meminfo->{memtotal
}, $meminfo->{memused
},
178 $meminfo->{swaptotal
}, $meminfo->{swapused
},
179 $dinfo->{blocks
}, $dused, $netin, $netout]
181 PVE
::Cluster
::broadcast_rrd
("pve2-node/$nodename", $data);
190 $node_metric->{cpustat
}->@{qw(avg1 avg5 avg15)} = ($avg1, $avg5, $avg15);
191 $node_metric->{cpustat
}->{cpus
} = $maxcpu;
193 my $transactions = PVE
::ExtMetric
::transactions_start
($status_cfg);
194 PVE
::ExtMetric
::update_all
($transactions, 'node', $nodename, $node_metric, $ctime);
195 PVE
::ExtMetric
::transactions_finish
($transactions);
197 broadcast_static_node_info
($maxcpu, $meminfo->{memtotal
});
203 my $log = sub { $opt_debug and printf @_ };
205 my $hostmeminfo = PVE
::ProcFSTools
::read_meminfo
();
206 # NOTE: to debug, run 'pvestatd -d' and set memtotal here
207 #$hostmeminfo->{memtotal} = int(2*1024*1024*1024/0.8); # you can set this to test
208 my $hostfreemem = $hostmeminfo->{memtotal
} - $hostmeminfo->{memused
};
210 # try to use ~80% host memory; goal is the change amount required to achieve that
211 my $goal = int($hostmeminfo->{memtotal
} * 0.8 - $hostmeminfo->{memused
});
212 $log->("host goal: $goal free: $hostfreemem total: $hostmeminfo->{memtotal}\n");
214 my $maxchange = 100*1024*1024;
215 my $res = PVE
::AutoBalloon
::compute_alg1
($vmstatus, $goal, $maxchange);
217 for my $vmid (sort keys %$res) {
218 my $target = int($res->{$vmid});
219 my $current = int($vmstatus->{$vmid}->{balloon
});
220 next if $target == $current; # no need to change
222 $log->("BALLOON $vmid to $target (%d)\n", $target - $current);
223 eval { PVE
::QemuServer
::Monitor
::mon_cmd
($vmid, "balloon", value
=> int($target)) };
228 sub update_qemu_status
{
229 my ($status_cfg) = @_;
232 my $vmstatus = PVE
::QemuServer
::vmstatus
(undef, 1);
234 eval { auto_balloning
($vmstatus); };
235 syslog
('err', "auto ballooning error: $@") if $@;
237 my $transactions = PVE
::ExtMetric
::transactions_start
($status_cfg);
238 foreach my $vmid (keys %$vmstatus) {
239 my $d = $vmstatus->{$vmid};
241 my $status = $d->{qmpstatus
} || $d->{status
} || 'stopped';
242 my $template = $d->{template
} ?
$d->{template
} : "0";
243 if ($d->{pid
}) { # running
244 $data = $generate_rrd_string->(
245 [$d->{uptime
}, $d->{name
}, $status, $template, $ctime, $d->{cpus
}, $d->{cpu
},
246 $d->{maxmem
}, $d->{mem
}, $d->{maxdisk
}, $d->{disk
},
247 $d->{netin
}, $d->{netout
}, $d->{diskread
}, $d->{diskwrite
}]);
249 $data = $generate_rrd_string->(
250 [0, $d->{name
}, $status, $template, $ctime, $d->{cpus
}, undef,
251 $d->{maxmem
}, undef, $d->{maxdisk
}, $d->{disk
}, undef, undef, undef, undef]);
253 PVE
::Cluster
::broadcast_rrd
("pve2.3-vm/$vmid", $data);
255 PVE
::ExtMetric
::update_all
($transactions, 'qemu', $vmid, $d, $ctime, $nodename);
258 PVE
::ExtMetric
::transactions_finish
($transactions);
261 sub remove_stale_lxc_consoles
{
263 my $vmstatus = PVE
::LXC
::vmstatus
();
264 my $pidhash = PVE
::LXC
::find_lxc_console_pids
();
266 foreach my $vmid (keys %$pidhash) {
267 next if defined($vmstatus->{$vmid});
268 syslog
('info', "remove stale lxc-console for CT $vmid");
269 foreach my $pid (@{$pidhash->{$vmid}}) {
275 my $rebalance_error_count = {};
278 sub rebalance_lxc_containers
{
279 # Make sure we can find the cpuset controller path:
280 return if $NO_REBALANCE;
281 my $cpuset_base = eval { PVE
::CGroup
::cpuset_controller_path
() };
283 syslog
('info', "could not get cpuset controller path: $err");
286 if (!defined($cpuset_base)) {
291 # Figure out the cpu count & highest ID
292 my $all_cpus = PVE
::CpuSet-
>new_from_path($cpuset_base, 1);
293 my @allowed_cpus = $all_cpus->members();
294 my $cpucount = scalar(@allowed_cpus);
295 my $max_cpuid = $allowed_cpus[-1];
297 my @cpu_ctcount = (0) x
($max_cpuid+1);
300 # A mapping { vmid => cgroup_payload_path } for containers where namespace
301 # separation is active and recognized.
304 my $modify_cpuset = sub {
305 my ($vmid, $cpuset, $newset) = @_;
307 if (!$rebalance_error_count->{$vmid}) {
308 syslog
('info', "modified cpu set for lxc/$vmid: " . $newset->short_string());
312 my $cgbase = $ctinfo->{$vmid};
314 if (defined($cgbase)) {
315 # allow all, so that we can set new cpuset in /ns
316 $all_cpus->write_to_path($cgbase);
318 $newset->write_to_path("$cgbase/ns");
321 warn $err if !$rebalance_error_count->{$vmid}++;
323 $cpuset->write_to_path($cgbase);
325 # also apply to container root cgroup
326 $newset->write_to_path($cgbase);
327 $rebalance_error_count->{$vmid} = 0;
330 # old style container
331 $newset->write_to_path($cgbase);
332 $rebalance_error_count->{$vmid} = 0;
336 warn $err if !$rebalance_error_count->{$vmid}++;
340 my $ctlist = PVE
::LXC
::config_list
();
342 foreach my $vmid (sort keys %$ctlist) {
343 my $cgpath = "$cpuset_base/lxc/$vmid";
344 if (-d
"$cgpath/ns") {
345 $ctinfo->{$vmid} = $cgpath;
347 next; # old style container
350 my ($conf, $cpuset) = eval {(
351 PVE
::LXC
::Config-
>load_config($vmid),
352 PVE
::CpuSet-
>new_from_path($cgpath),
359 my @cpuset_members = $cpuset->members();
361 if (!PVE
::LXC
::Config-
>has_lxc_entry($conf, 'lxc.cgroup.cpuset.cpus')
362 && !PVE
::LXC
::Config-
>has_lxc_entry($conf, 'lxc.cgroup2.cpuset.cpus')
364 my $cores = $conf->{cores
} || $cpucount;
365 $cores = $cpucount if $cores > $cpucount;
367 # see if the number of cores was hot-reduced or hasn't been enacted at all yet
368 my $newset = PVE
::CpuSet-
>new();
369 if ($cores < scalar(@cpuset_members)) {
370 for (my $i = 0; $i < $cores; $i++) {
371 $newset->insert($cpuset_members[$i]);
373 } elsif ($cores > scalar(@cpuset_members)) {
374 my $count = $newset->insert(@cpuset_members);
375 foreach my $cpu (@allowed_cpus) {
376 $count += $newset->insert($cpu);
377 last if $count >= $cores;
380 $newset->insert(@cpuset_members);
383 # Apply hot-plugged changes if any:
384 if (!$newset->is_equal($cpuset)) {
385 @cpuset_members = $newset->members();
386 $modify_cpuset->($vmid, $cpuset, $newset);
389 # Note: no need to rebalance if we already use all cores
390 push @balanced_cts, [$vmid, $cores, $newset]
391 if defined($conf->{cores
}) && ($cores != $cpucount);
394 foreach my $cpu (@cpuset_members) {
395 $cpu_ctcount[$cpu]++ if $cpu <= $max_cpuid;
399 my $find_best_cpu = sub {
400 my ($cpulist, $cpu) = @_;
402 my $cur_cost = $cpu_ctcount[$cpu];
405 foreach my $candidate (@$cpulist) {
406 my $cost = $cpu_ctcount[$candidate];
407 if ($cost < ($cur_cost - 1)) {
409 $cur_cpu = $candidate;
416 foreach my $bct (@balanced_cts) {
417 my ($vmid, $cores, $cpuset) = @$bct;
419 my $rest = [ grep { !$cpuset->has($_) } @allowed_cpus ];
421 my $newset = PVE
::CpuSet-
>new();
422 for my $cpu ($cpuset->members()) {
423 my $best = $find_best_cpu->($rest, $cpu);
425 $cpu_ctcount[$best]++;
426 $cpu_ctcount[$cpu]--;
428 $newset->insert($best);
431 if (!$newset->is_equal($cpuset)) {
432 $modify_cpuset->($vmid, $cpuset, $newset);
437 sub update_lxc_status
{
438 my ($status_cfg) = @_;
441 my $vmstatus = PVE
::LXC
::vmstatus
();
443 my $transactions = PVE
::ExtMetric
::transactions_start
($status_cfg);
445 foreach my $vmid (keys %$vmstatus) {
446 my $d = $vmstatus->{$vmid};
447 my $template = $d->{template
} ?
$d->{template
} : "0";
449 if ($d->{status
} eq 'running') { # running
450 $data = $generate_rrd_string->(
451 [$d->{uptime
}, $d->{name
}, $d->{status
}, $template,
452 $ctime, $d->{cpus
}, $d->{cpu
},
453 $d->{maxmem
}, $d->{mem
},
454 $d->{maxdisk
}, $d->{disk
},
455 $d->{netin
}, $d->{netout
},
456 $d->{diskread
}, $d->{diskwrite
}]);
458 $data = $generate_rrd_string->(
459 [0, $d->{name
}, $d->{status
}, $template, $ctime, $d->{cpus
}, undef,
460 $d->{maxmem
}, undef, $d->{maxdisk
}, $d->{disk
}, undef, undef, undef, undef]);
462 PVE
::Cluster
::broadcast_rrd
("pve2.3-vm/$vmid", $data);
464 PVE
::ExtMetric
::update_all
($transactions, 'lxc', $vmid, $d, $ctime, $nodename);
466 PVE
::ExtMetric
::transactions_finish
($transactions);
469 sub update_storage_status
{
470 my ($status_cfg) = @_;
472 my $cfg = PVE
::Storage
::config
();
474 my $info = PVE
::Storage
::storage_info
($cfg);
476 my $transactions = PVE
::ExtMetric
::transactions_start
($status_cfg);
478 foreach my $storeid (keys %$info) {
479 my $d = $info->{$storeid};
480 next if !$d->{active
};
482 my $data = $generate_rrd_string->([$ctime, $d->{total
}, $d->{used
}]);
484 my $key = "pve2-storage/${nodename}/$storeid";
485 PVE
::Cluster
::broadcast_rrd
($key, $data);
487 PVE
::ExtMetric
::update_all
($transactions, 'storage', $nodename, $storeid, $d, $ctime);
489 PVE
::ExtMetric
::transactions_finish
($transactions);
492 sub rotate_authkeys
{
493 PVE
::AccessControl
::rotate_authkey
() if !PVE
::AccessControl
::check_authkey
(1);
496 sub update_ceph_metadata
{
497 return if !PVE
::Ceph
::Tools
::check_ceph_inited
(1); # nothing to do
499 PVE
::Ceph
::Services
::broadcast_ceph_services
();
501 PVE
::Ceph
::Services
::broadcast_ceph_versions
();
504 sub update_sdn_status
{
507 my ($transport_status, $vnet_status) = PVE
::Network
::SDN
::status
();
509 my $status = $transport_status ? encode_json
($transport_status) : undef;
510 PVE
::Cluster
::broadcast_node_kv
("sdn", $status);
514 my $broadcast_version_info_done = 0;
515 my sub broadcast_version_info
: prototype() {
516 if (!$broadcast_version_info_done) {
517 PVE
::Cluster
::broadcast_node_kv
(
519 encode_json
(PVE
::pvecfg
::version_info
()),
521 $broadcast_version_info_done = 1;
526 # update worker list. This is not really required, but we want to make sure that we also have a
527 # correct list in case of an unexpected crash.
528 my $rpcenv = PVE
::RPCEnvironment
::get
();
531 my $tlist = $rpcenv->active_workers();
532 PVE
::Cluster
::broadcast_tasklist
($tlist);
535 syslog
('err', $err) if $err;
537 my $status_cfg = PVE
::Cluster
::cfs_read_file
('status.cfg');
540 update_node_status
($status_cfg);
543 syslog
('err', "node status update error: $err") if $err;
546 update_qemu_status
($status_cfg);
549 syslog
('err', "qemu status update error: $err") if $err;
552 update_lxc_status
($status_cfg);
555 syslog
('err', "lxc status update error: $err") if $err;
558 rebalance_lxc_containers
();
561 syslog
('err', "lxc cpuset rebalance error: $err") if $err;
564 update_storage_status
($status_cfg);
567 syslog
('err', "storage status update error: $err") if $err;
570 remove_stale_lxc_consoles
();
573 syslog
('err', "lxc console cleanup error: $err") if $err;
579 syslog
('err', "authkey rotation error: $err") if $err;
582 update_ceph_metadata
();
585 syslog
('err', "ceph metadata update error: $err") if $err;
591 syslog
('err', "sdn status update error: $err") if $err;
594 broadcast_version_info
();
597 syslog
('err', "version info update error: $err") if $err;
602 # do not update directly after startup, because install scripts
603 # have a problem with that
607 my $initial_memory_usage;
614 $next_update = time() + $updatetime;
617 my ($ccsec, $cusec) = gettimeofday
();
619 # syslog('info', "start status update");
620 PVE
::Cluster
::cfs_update
();
626 syslog
('err', "status update error: $err");
629 my ($ccsec_end, $cusec_end) = gettimeofday
();
630 my $cptime = ($ccsec_end-$ccsec) + ($cusec_end - $cusec)/1000000;
632 syslog
('info', sprintf("status update time (%.3f seconds)", $cptime))
638 my $mem = PVE
::ProcFSTools
::read_memory_usage
();
639 my $resident_kb = $mem->{resident
} / 1024;
641 if (!defined($initial_memory_usage) || ($cycle < 10)) {
642 $initial_memory_usage = $resident_kb;
644 my $diff = $resident_kb - $initial_memory_usage;
645 if ($diff > 15 * 1024) {
646 syslog
('info', "restarting server after $cycle cycles to " .
647 "reduce memory usage (free $resident_kb ($diff) KB)");
648 $self->restart_daemon();
653 while ((time() < $next_update) &&
654 ($wcount < $updatetime) && # protect against time wrap
655 !$restart_request) { $wcount++; sleep (1); };
657 $self->restart_daemon() if $restart_request;
661 $daemon->register_start_command();
662 $daemon->register_restart_command(1);
663 $daemon->register_stop_command();
664 $daemon->register_status_command();
667 start
=> [ __PACKAGE__
, 'start', []],
668 restart
=> [ __PACKAGE__
, 'restart', []],
669 stop
=> [ __PACKAGE__
, 'stop', []],
670 status
=> [ __PACKAGE__
, 'status', [], undef, sub { print shift . "\n";} ],