]> git.proxmox.com Git - pve-manager.git/blob - PVE/Service/pvestatd.pm
72445ec0c2b20fa695c0434d21f851030c017ba7
[pve-manager.git] / PVE / Service / pvestatd.pm
1 package PVE::Service::pvestatd;
2
3 use strict;
4 use warnings;
5
6 use PVE::SafeSyslog;
7 use PVE::Daemon;
8
9 use JSON;
10
11 use Time::HiRes qw (gettimeofday);
12 use PVE::Tools qw(dir_glob_foreach file_read_firstline);
13 use PVE::ProcFSTools;
14 use PVE::CpuSet;
15 use Filesys::Df;
16 use PVE::INotify;
17 use PVE::Network;
18 use PVE::Cluster qw(cfs_read_file);
19 use PVE::Storage;
20 use PVE::QemuServer;
21 use PVE::QemuServer::Monitor;
22 use PVE::LXC;
23 use PVE::CGroup;
24 use PVE::LXC::Config;
25 use PVE::RPCEnvironment;
26 use PVE::API2::Subscription;
27 use PVE::AutoBalloon;
28 use PVE::AccessControl;
29 use PVE::Ceph::Services;
30 use PVE::Ceph::Tools;
31 use PVE::pvecfg;
32
33 use PVE::ExtMetric;
34 use PVE::Status::Plugin;
35
36 use base qw(PVE::Daemon);
37
# SDN support is optional: $have_sdn is only true if the module could be loaded.
my $have_sdn = eval {
    require PVE::Network::SDN;
    1;
};

# File-level state shared by the daemon hooks below:
#   $opt_debug       - copy of the daemon's debug flag, set in init()
#   $restart_request - set by hup(), polled by the main loop in run()
my ($opt_debug, $restart_request);

# Name of the local node, used in broadcast keys and metric updates.
my $nodename = PVE::INotify::nodename();

# Command line this process was started with, handed to the daemon object.
my $cmdline = [$0, @ARGV];

my %daemon_options = (
    restart_on_error => 5,
    stop_wait_time => 5,
);
my $daemon = __PACKAGE__->new('pvestatd', $cmdline, %daemon_options);
53
# Daemon 'init' hook: records the daemon's debug flag in $opt_debug (used for
# the debug output of auto_balloning) and calls PVE::Cluster::cfs_update().
sub init {
    my $self = shift;

    $opt_debug = $self->{debug};

    PVE::Cluster::cfs_update();
}
61
# Daemon 'shutdown' hook: logs the shutdown, collects child processes and then
# leaves via exit_daemon(0).
sub shutdown {
    my $self = shift;

    syslog('info' , "server closing");

    # wait for children: keep reaping for as long as waitpid() reports a
    # collected child; WNOHANG makes sure we never block here
    while (waitpid(-1, POSIX::WNOHANG()) > 0) {
        # nothing to do, the exit status is not needed
    }

    $self->exit_daemon(0);
}
72
# Daemon 'hup' hook: only flags the restart request; the main loop in run()
# checks $restart_request and performs the actual restart.
sub hup {
    # the daemon object passed as first argument is not needed here
    $restart_request = 1;
}
78
# QEMU/KVM version for which CPU flags were last broadcast successfully.
my $cached_kvm_version = '';
# Earliest time (epoch seconds) at which a failed detection may be retried.
my $next_flag_update_time;
# Back-off in seconds applied after a failed detection.
my $failed_flag_update_delay_sec = 120;

# Detects the CPU flags supported with the 'tcg' and 'kvm' accelerators and
# broadcasts them as node key/value pairs 'cpuflags-tcg' and 'cpuflags-kvm'.
#
# The detection is skipped if flags were already broadcast for the current
# QEMU/KVM version, or if a previous attempt failed less than
# $failed_flag_update_delay_sec seconds ago.
sub update_supported_cpuflags {
    my $kvm_version = PVE::QemuServer::kvm_user_version();

    # only update when QEMU/KVM version has changed, as that is the only reason
    # why flags could change without restarting pvestatd
    return if $cached_kvm_version && $cached_kvm_version eq $kvm_version;

    # still inside the retry delay of a previously failed detection
    return if $next_flag_update_time && $next_flag_update_time > time();
    $next_flag_update_time = 0;

    my $supported_cpuflags = eval { PVE::QemuServer::query_supported_cpu_flags() };
    warn $@ if $@;

    my $detected = $supported_cpuflags
        && ($supported_cpuflags->{tcg} || $supported_cpuflags->{kvm});

    if ($detected) {
        # only set cached version if there's actually something to broadcast
        $cached_kvm_version = $kvm_version;
    } else {
        # something went wrong, clear broadcast flags and set try-again delay
        warn "CPU flag detection failed, will try again after delay\n";
        $next_flag_update_time = time() + $failed_flag_update_delay_sec;

        $supported_cpuflags = {};
    }

    for my $accel (qw(tcg kvm)) {
        my $flags = $supported_cpuflags->{$accel};
        # an empty string clears potentially invalid data
        my $value = $flags ? join(' ', @$flags) : '';
        PVE::Cluster::broadcast_node_kv("cpuflags-$accel", $value);
    }
}
119
# Builds the colon separated string passed to broadcast_rrd() from an array
# reference of values; undefined entries are written as 'U'.
my $generate_rrd_string = sub {
    my ($data) = @_;

    my @fields;
    for my $value (@$data) {
        push @fields, defined($value) ? $value : 'U';
    }

    return join(':', @fields);
};
125
# Collects the status of the local node (uptime, load, CPU, memory, usage of
# the root file system and traffic on physical NICs), broadcasts it as RRD
# data under "pve2-node/$nodename" and passes the collected metrics to the
# external metric plugins configured in $status_cfg.
sub update_node_status {
    my ($status_cfg) = @_;

    my ($uptime) = PVE::ProcFSTools::read_proc_uptime();

    my ($avg1, $avg5, $avg15) = PVE::ProcFSTools::read_loadavg();
    my $stat = PVE::ProcFSTools::read_proc_stat();
    my $maxcpu = PVE::ProcFSTools::read_cpuinfo()->{cpus};

    update_supported_cpuflags();

    my $sublevel = PVE::INotify::read_file('subscription')->{level} || '';

    # traffic from/to physical interface cards
    my $netdev = PVE::ProcFSTools::read_proc_net_dev();
    my ($netin, $netout) = (0, 0);
    for my $dev (keys %$netdev) {
        next if $dev !~ /^$PVE::Network::PHYSICAL_NIC_RE$/;

        my $counters = $netdev->{$dev};
        $netin += $counters->{receive};
        $netout += $counters->{transmit};
    }

    my $meminfo = PVE::ProcFSTools::read_meminfo();

    my $dinfo = df('/', 1); # output is bytes
    # everything not free is considered to be used
    my $dused = $dinfo->{blocks} - $dinfo->{bfree};

    my $ctime = time();

    my @rrd_fields = (
        $uptime, $sublevel, $ctime, $avg1, $maxcpu,
        $stat->{cpu}, $stat->{wait},
        $meminfo->{memtotal}, $meminfo->{memused},
        $meminfo->{swaptotal}, $meminfo->{swapused},
        $dinfo->{blocks}, $dused,
        $netin, $netout,
    );
    PVE::Cluster::broadcast_rrd("pve2-node/$nodename", $generate_rrd_string->(\@rrd_fields));

    # the external metrics additionally carry the load averages and the CPU
    # count as part of the CPU statistics
    @{$stat}{qw(avg1 avg5 avg15)} = ($avg1, $avg5, $avg15);
    $stat->{cpus} = $maxcpu;

    my $node_metric = {
        uptime => $uptime,
        cpustat => $stat,
        memory => $meminfo,
        blockstat => $dinfo,
        nics => $netdev,
    };

    my $transactions = PVE::ExtMetric::transactions_start($status_cfg);
    PVE::ExtMetric::update_all($transactions, 'node', $nodename, $node_metric, $ctime);
    PVE::ExtMetric::transactions_finish($transactions);
}
179
# Adjusts the balloon targets of the guests in $vmstatus so that the host
# uses about 80% of its memory. The new targets are computed by
# PVE::AutoBalloon::compute_alg1() and applied with the 'balloon' monitor
# command; a failing monitor command only produces a warning.
sub auto_balloning {
    my ($vmstatus) = @_;

    # printf-style debug output, only active in debug mode
    my $log = sub {
        return if !$opt_debug;
        printf @_;
    };

    my $hostmeminfo = PVE::ProcFSTools::read_meminfo();
    # NOTE: to debug, run 'pvestatd -d' and set memtotal here
    #$hostmeminfo->{memtotal} = int(2*1024*1024*1024/0.8); # you can set this to test
    my $hostfreemem = $hostmeminfo->{memtotal} - $hostmeminfo->{memused};

    # try to use ~80% host memory; goal is the change amount required to achieve that
    my $goal = int($hostmeminfo->{memtotal} * 0.8 - $hostmeminfo->{memused});
    $log->("host goal: $goal free: $hostfreemem total: $hostmeminfo->{memtotal}\n");

    # upper bound passed to the algorithm as maximum change (100 MiB)
    my $maxchange = 100*1024*1024;
    my $res = PVE::AutoBalloon::compute_alg1($vmstatus, $goal, $maxchange);

    for my $vmid (sort keys %$res) {
        my $target = int($res->{$vmid});
        my $current = int($vmstatus->{$vmid}->{balloon});

        if ($target != $current) { # otherwise there is no need to change
            $log->("BALLOON $vmid to $target (%d)\n", $target - $current);
            eval { PVE::QemuServer::Monitor::mon_cmd($vmid, "balloon", value => int($target)) };
            warn $@ if $@;
        }
    }
}
207
# Queries the status of all QEMU guests, runs the automatic ballooning (errors
# are only logged), broadcasts one RRD entry per guest under
# "pve2.3-vm/$vmid" and passes the guest status to the external metric
# plugins configured in $status_cfg.
sub update_qemu_status {
    my ($status_cfg) = @_;

    my $ctime = time();
    my $vmstatus = PVE::QemuServer::vmstatus(undef, 1);

    eval { auto_balloning($vmstatus); };
    syslog('err', "auto ballooning error: $@") if $@;

    my $transactions = PVE::ExtMetric::transactions_start($status_cfg);
    for my $vmid (keys %$vmstatus) {
        my $d = $vmstatus->{$vmid};
        my $status = $d->{qmpstatus} || $d->{status} || 'stopped';
        my $template = $d->{template} || "0";

        my @fields;
        if ($d->{pid}) { # running
            @fields = (
                @{$d}{qw(uptime name)}, $status, $template, $ctime,
                @{$d}{qw(cpus cpu maxmem mem maxdisk disk netin netout diskread diskwrite)},
            );
        } else {
            # no runtime values available: uptime is 0, usage counters stay undefined
            @fields = (
                0, $d->{name}, $status, $template, $ctime, $d->{cpus}, undef,
                $d->{maxmem}, undef, @{$d}{qw(maxdisk disk)}, (undef) x 4,
            );
        }
        PVE::Cluster::broadcast_rrd("pve2.3-vm/$vmid", $generate_rrd_string->(\@fields));

        PVE::ExtMetric::update_all($transactions, 'qemu', $vmid, $d, $ctime, $nodename);
    }

    PVE::ExtMetric::transactions_finish($transactions);
}
240
# Sends SIGKILL to all lxc-console processes found for container IDs that do
# not show up in the LXC status list.
sub remove_stale_lxc_consoles {
    my $vmstatus = PVE::LXC::vmstatus();
    my $pidhash = PVE::LXC::find_lxc_console_pids();

    my @stale_vmids = grep { !defined($vmstatus->{$_}) } keys %$pidhash;

    for my $vmid (@stale_vmids) {
        syslog('info', "remove stale lxc-console for CT $vmid");
        kill(9, $_) for @{$pidhash->{$vmid}};
    }
}
254
# Number of consecutive failed cpuset updates per container ID. Used to log
# and warn only for the first failure of a series; reset to 0 on success.
my $rebalance_error_count = {};

# Set once the cpuset controller path could not be determined; rebalancing is
# then skipped for the rest of the daemon's lifetime.
my $NO_REBALANCE;

# Assigns CPUs to containers and spreads containers with a limited number of
# cores over the CPUs allowed by the cpuset controller.
#
# In a first pass the cpuset of every container with a 'ns' sub-cgroup is
# adapted to its configured 'cores' value (containers with a manual
# 'lxc.cgroup.cpuset.cpus' / 'lxc.cgroup2.cpuset.cpus' entry are only counted,
# not modified). In a second pass containers that do not use all CPUs are
# moved away from CPUs that are used by clearly more containers than others.
sub rebalance_lxc_containers {
    # Make sure we can find the cpuset controller path:
    return if $NO_REBALANCE;
    my $cpuset_base = eval { PVE::CGroup::cpuset_controller_path() };
    if (my $err = $@) {
        syslog('info', "could not get cpuset controller path: $err");
    }

    if (!defined($cpuset_base)) {
        $NO_REBALANCE = 1;
        return;
    }

    # Figure out the cpu count & highest ID
    my $all_cpus = PVE::CpuSet->new_from_path($cpuset_base, 1);
    my @allowed_cpus = $all_cpus->members();
    my $cpucount = scalar(@allowed_cpus);

    # without any allowed CPU there is no highest ID and nothing to distribute
    return if !$cpucount;

    # NOTE(review): relies on members() returning the IDs in ascending order
    my $max_cpuid = $allowed_cpus[-1];

    # number of containers using each CPU, indexed by CPU ID
    my @cpu_ctcount = (0) x ($max_cpuid+1);
    # [vmid, cores, cpuset] entries of containers to handle in the second pass
    my @balanced_cts;

    # A mapping { vmid => cgroup_payload_path } for containers where namespace
    # separation is active and recognized.
    my $ctinfo = {};

    # Writes $newset for container $vmid; $cpuset is the current set, which
    # gets restored if the new one cannot be applied to the 'ns' sub-cgroup.
    my $modify_cpuset = sub {
        my ($vmid, $cpuset, $newset) = @_;

        if (!$rebalance_error_count->{$vmid}) {
            syslog('info', "modified cpu set for lxc/$vmid: " . $newset->short_string());
        }

        eval {
            my $cgbase = $ctinfo->{$vmid};

            if (defined($cgbase)) {
                # allow all, so that we can set new cpuset in /ns
                $all_cpus->write_to_path($cgbase);
                eval {
                    $newset->write_to_path("$cgbase/ns");
                };
                if (my $err = $@) {
                    warn $err if !$rebalance_error_count->{$vmid}++;
                    # restore original
                    $cpuset->write_to_path($cgbase);
                } else {
                    # also apply to container root cgroup
                    $newset->write_to_path($cgbase);
                    $rebalance_error_count->{$vmid} = 0;
                }
            } else {
                # old style container: there is no entry in $ctinfo, so
                # $cgbase is undefined here and must not be used as path;
                # write to the container's cgroup directory instead
                $newset->write_to_path("$cpuset_base/lxc/$vmid");
                $rebalance_error_count->{$vmid} = 0;
            }
        };
        if (my $err = $@) {
            warn $err if !$rebalance_error_count->{$vmid}++;
        }
    };

    my $ctlist = PVE::LXC::config_list();

    foreach my $vmid (sort keys %$ctlist) {
        my $cgpath = "$cpuset_base/lxc/$vmid";
        if (-d "$cgpath/ns") {
            $ctinfo->{$vmid} = $cgpath;
        } else {
            next; # old style container
        }

        my ($conf, $cpuset) = eval {(
            PVE::LXC::Config->load_config($vmid),
            PVE::CpuSet->new_from_path($cgpath),
        )};
        if (my $err = $@) {
            warn $err;
            next;
        }

        my @cpuset_members = $cpuset->members();

        if (!PVE::LXC::Config->has_lxc_entry($conf, 'lxc.cgroup.cpuset.cpus')
            && !PVE::LXC::Config->has_lxc_entry($conf, 'lxc.cgroup2.cpuset.cpus')
        ) {
            # no configured limit means all allowed CPUs; never more than that
            my $cores = $conf->{cores} || $cpucount;
            $cores = $cpucount if $cores > $cpucount;

            # see if the number of cores was hot-reduced or hasn't been enacted at all yet
            my $newset = PVE::CpuSet->new();
            if ($cores < scalar(@cpuset_members)) {
                # too many CPUs assigned: keep only the first $cores members
                for (my $i = 0; $i < $cores; $i++) {
                    $newset->insert($cpuset_members[$i]);
                }
            } elsif ($cores > scalar(@cpuset_members)) {
                # too few CPUs assigned: keep the current members and fill up
                # from the allowed CPUs until $cores is reached
                my $count = $newset->insert(@cpuset_members);
                foreach my $cpu (@allowed_cpus) {
                    $count += $newset->insert($cpu);
                    last if $count >= $cores;
                }
            } else {
                $newset->insert(@cpuset_members);
            }

            # Apply hot-plugged changes if any:
            if (!$newset->is_equal($cpuset)) {
                @cpuset_members = $newset->members();
                $modify_cpuset->($vmid, $cpuset, $newset);
            }

            # Note: no need to rebalance if we already use all cores
            push @balanced_cts, [$vmid, $cores, $newset]
                if defined($conf->{cores}) && ($cores != $cpucount);
        }

        foreach my $cpu (@cpuset_members) {
            $cpu_ctcount[$cpu]++ if $cpu <= $max_cpuid;
        }
    }

    # Returns the CPU from $cpulist used by the fewest containers, but only if
    # it is used by at least two containers less than $cpu; otherwise $cpu.
    my $find_best_cpu = sub {
        my ($cpulist, $cpu) = @_;

        my $cur_cost = $cpu_ctcount[$cpu];
        my $cur_cpu = $cpu;

        foreach my $candidate (@$cpulist) {
            my $cost = $cpu_ctcount[$candidate];
            if ($cost < ($cur_cost - 1)) {
                $cur_cost = $cost;
                $cur_cpu = $candidate;
            }
        }

        return $cur_cpu;
    };

    foreach my $bct (@balanced_cts) {
        my ($vmid, $cores, $cpuset) = @$bct;

        # candidates are all allowed CPUs the container does not use yet
        my $rest = [ grep { !$cpuset->has($_) } @allowed_cpus ];

        my $newset = PVE::CpuSet->new();
        for my $cpu ($cpuset->members()) {
            my $best = $find_best_cpu->($rest, $cpu);
            if ($best != $cpu) {
                # keep the usage counters in sync with the planned move
                $cpu_ctcount[$best]++;
                $cpu_ctcount[$cpu]--;
            }
            $newset->insert($best);
        }

        if (!$newset->is_equal($cpuset)) {
            $modify_cpuset->($vmid, $cpuset, $newset);
        }
    }
}
416
# Queries the status of all containers, broadcasts one RRD entry per container
# under "pve2.3-vm/$vmid" and passes the container status to the external
# metric plugins configured in $status_cfg.
sub update_lxc_status {
    my ($status_cfg) = @_;

    my $ctime = time();
    my $vmstatus = PVE::LXC::vmstatus();

    my $transactions = PVE::ExtMetric::transactions_start($status_cfg);

    for my $vmid (keys %$vmstatus) {
        my $d = $vmstatus->{$vmid};
        my $template = $d->{template} || "0";

        my @fields;
        if ($d->{status} eq 'running') {
            @fields = (
                @{$d}{qw(uptime name status)}, $template, $ctime,
                @{$d}{qw(cpus cpu maxmem mem maxdisk disk netin netout diskread diskwrite)},
            );
        } else {
            # no runtime values available: uptime is 0, usage counters stay undefined
            @fields = (
                0, @{$d}{qw(name status)}, $template, $ctime, $d->{cpus}, undef,
                $d->{maxmem}, undef, @{$d}{qw(maxdisk disk)}, (undef) x 4,
            );
        }
        PVE::Cluster::broadcast_rrd("pve2.3-vm/$vmid", $generate_rrd_string->(\@fields));

        PVE::ExtMetric::update_all($transactions, 'lxc', $vmid, $d, $ctime, $nodename);
    }
    PVE::ExtMetric::transactions_finish($transactions);
}
448
# Broadcasts total and used space of every active storage as RRD data under
# "pve2-storage/$nodename/$storeid" and passes the storage information to the
# external metric plugins configured in $status_cfg.
sub update_storage_status {
    my ($status_cfg) = @_;

    my $cfg = PVE::Storage::config();
    my $ctime = time();
    my $info = PVE::Storage::storage_info($cfg);

    my $transactions = PVE::ExtMetric::transactions_start($status_cfg);

    for my $storeid (keys %$info) {
        my $d = $info->{$storeid};
        next unless $d->{active}; # inactive storages are not reported

        my $key = "pve2-storage/${nodename}/$storeid";
        my @fields = ($ctime, $d->{total}, $d->{used});
        PVE::Cluster::broadcast_rrd($key, $generate_rrd_string->(\@fields));

        PVE::ExtMetric::update_all($transactions, 'storage', $nodename, $storeid, $d, $ctime);
    }
    PVE::ExtMetric::transactions_finish($transactions);
}
471
# Rotates the authentication key whenever PVE::AccessControl::check_authkey(1)
# reports a false value.
sub rotate_authkeys {
    if (!PVE::AccessControl::check_authkey(1)) {
        PVE::AccessControl::rotate_authkey();
    }
}
475
# Broadcasts the Ceph services and Ceph versions; does nothing if
# PVE::Ceph::Tools::check_ceph_inited(1) reports that Ceph is not initialized.
sub update_ceph_metadata {
    PVE::Ceph::Tools::check_ceph_inited(1) or return; # nothing to do

    PVE::Ceph::Services::broadcast_ceph_services();
    PVE::Ceph::Services::broadcast_ceph_versions();
}
483
# Broadcasts the SDN transport status as JSON under the node key 'sdn'. Only
# active if the optional PVE::Network::SDN module could be loaded.
sub update_sdn_status {
    if ($have_sdn) {
        # status() returns further values, only the first one is used here
        my ($transport_status) = PVE::Network::SDN::status();

        my $status;
        $status = encode_json($transport_status) if $transport_status;

        PVE::Cluster::broadcast_node_kv("sdn", $status);
    }
}
493
# Becomes true after the version info was broadcast successfully.
my $broadcast_version_info_done = 0;

# Broadcasts the JSON encoded result of PVE::pvecfg::version_info() under the
# node key 'version-info'. The broadcast is done only once per process; if it
# dies, the flag stays unset and the next call tries again.
my sub broadcast_version_info : prototype() {
    return if $broadcast_version_info_done;

    my $version_info = encode_json(PVE::pvecfg::version_info());
    PVE::Cluster::broadcast_node_kv('version-info', $version_info);

    $broadcast_version_info_done = 1;
}
504
# Runs one complete status update cycle. Every step is executed inside its
# own eval, so that a failing step is logged but does not prevent the
# following steps from running.
sub update_status {
    # update worker list. This is not really required, but we want to make sure that we also have a
    # correct list in case of an unexpected crash.
    my $rpcenv = PVE::RPCEnvironment::get();

    eval {
        my $tlist = $rpcenv->active_workers();
        PVE::Cluster::broadcast_tasklist($tlist);
    };
    if (my $err = $@) {
        syslog('err', $err);
    }

    # not protected by an eval: an error here ends the whole cycle
    my $status_cfg = PVE::Cluster::cfs_read_file('status.cfg');

    # [ log message prefix, code ] pairs, executed in this order
    my @steps = (
        ["node status update error: ", sub { update_node_status($status_cfg) }],
        ["qemu status update error: ", sub { update_qemu_status($status_cfg) }],
        ["lxc status update error: ", sub { update_lxc_status($status_cfg) }],
        ["lxc cpuset rebalance error: ", sub { rebalance_lxc_containers() }],
        ["storage status update error: ", sub { update_storage_status($status_cfg) }],
        ["lxc console cleanup error: ", sub { remove_stale_lxc_consoles() }],
        ["authkey rotation error: ", sub { rotate_authkeys() }],
        ["ceph metadata update error: ", sub { update_ceph_metadata() }],
        ["sdn status update error: ", sub { update_sdn_status() }],
        ["version info update error: ", sub { broadcast_version_info() }],
    );

    for my $step (@steps) {
        my ($error_prefix, $code) = @$step;

        eval { $code->() };
        if (my $err = $@) {
            syslog('err', $error_prefix . $err);
        }
    }
}
579
# Time (epoch seconds) at which the next update cycle is due.
my $next_update = 0;

# do not update directly after startup, because install scripts
# have a problem with that
my $cycle = 0;
# Seconds between two update cycles.
my $updatetime = 10;

# Resident memory (in KB) used as reference for the memory growth check.
my $initial_memory_usage;

# Main loop of the daemon: runs update_status() every $updatetime seconds and
# restarts the daemon on request (see hup) or when the resident memory grew
# too much compared to the reference taken during the first cycles.
sub run {
    my ($self) = @_;

    while (1) { # forever

        $next_update = time() + $updatetime;

        if ($cycle) {
            my @started = gettimeofday();
            eval {
                # syslog('info', "start status update");
                PVE::Cluster::cfs_update();
                update_status();
            };
            if (my $err = $@) {
                syslog('err', "status update error: $err");
            }

            my @finished = gettimeofday();
            my $cptime = ($finished[0] - $started[0]) + ($finished[1] - $started[1])/1000000;

            # only report update cycles that took suspiciously long
            if ($cptime > 5) {
                syslog('info', sprintf("status update time (%.3f seconds)", $cptime));
            }
        }

        $cycle++;

        my $resident_kb = PVE::ProcFSTools::read_memory_usage()->{resident} / 1024;

        if (defined($initial_memory_usage) && $cycle >= 10) {
            my $diff = $resident_kb - $initial_memory_usage;
            # restart once the usage grew by more than 15 * 1024 KB
            if ($diff > 15 * 1024) {
                syslog ('info', "restarting server after $cycle cycles to " .
                    "reduce memory usage (free $resident_kb ($diff) KB)");
                $self->restart_daemon();
            }
        } else {
            # keep updating the reference value during the first cycles
            $initial_memory_usage = $resident_kb;
        }

        # sleep in one second steps until the next update is due or a restart
        # was requested; the step counter protects against time wrap
        for (my $wcount = 0; $wcount < $updatetime; $wcount++) {
            last if time() >= $next_update || $restart_request;
            sleep(1);
        }

        $self->restart_daemon() if $restart_request;
    }
}
640
# Register the standard daemon commands on the daemon object.
$daemon->register_start_command();
$daemon->register_restart_command(1);
$daemon->register_stop_command();
$daemon->register_status_command();

# Command definitions: start, restart and stop share the same layout, status
# additionally carries a formatter that prints the returned text as a line.
our $cmddef = {
    (map { ($_ => [ __PACKAGE__, $_, [] ]) } qw(start restart stop)),
    status => [
        __PACKAGE__,
        'status',
        [],
        undef,
        sub {
            my $text = shift;
            print $text . "\n";
        },
    ],
};

1;
654
655
656
657
658