PVE/Service/pvestatd.pm

   1 package PVE::Service::pvestatd;
   2
   3 use strict;
   4 use warnings;
   5
   6 use PVE::SafeSyslog;
   7 use PVE::Daemon;
   8
   9 use JSON;
  10
  11 use Time::HiRes qw (gettimeofday);
  12 use PVE::Tools qw(dir_glob_foreach file_read_firstline);
  13 use PVE::ProcFSTools;
  14 use PVE::CpuSet;
  15 use Filesys::Df;
  16 use PVE::INotify;
  17 use PVE::Network;
  18 use PVE::Cluster qw(cfs_read_file);
  19 use PVE::Storage;
  20 use PVE::QemuServer;
  21 use PVE::QemuServer::Monitor;
  22 use PVE::LXC;
  23 use PVE::CGroup;
  24 use PVE::LXC::Config;
  25 use PVE::RPCEnvironment;
  26 use PVE::API2::Subscription;
  27 use PVE::AutoBalloon;
  28 use PVE::AccessControl;
  29 use PVE::Ceph::Services;
  30 use PVE::Ceph::Tools;
  31 use PVE::pvecfg;
  32
  33 use PVE::ExtMetric;
  34 use PVE::Status::Plugin;
  35
  36 use base qw(PVE::Daemon);
  37
  38 my $have_sdn;
  39 eval {
  40     require PVE::Network::SDN;
  41     $have_sdn = 1;
  42 };
  43
  44 my $opt_debug;
  45 my $restart_request;
  46
  47 my $nodename = PVE::INotify::nodename();
  48
  49 my $cmdline = [$0, @ARGV];
  50
  51 my %daemon_options = (restart_on_error => 5, stop_wait_time => 5);
  52 my $daemon = __PACKAGE__->new('pvestatd', $cmdline, %daemon_options);
  53
  54 sub init {
  55     my ($self) = @_;
  56
  57     $opt_debug = $self->{debug};
  58
  59     PVE::Cluster::cfs_update();
  60 }
  61
  62 sub shutdown {
  63     my ($self) = @_;
  64
  65     syslog('info' , "server closing");
  66
  67     # wait for children
  68     1 while (waitpid(-1, POSIX::WNOHANG()) > 0);
  69
  70     $self->exit_daemon(0);
  71 }
  72
  73 sub hup {
  74     my ($self) = @_;
  75
  76     $restart_request = 1;
  77 }
  78
  79 my $cached_kvm_version = '';
  80 my $next_flag_update_time;
  81 my $failed_flag_update_delay_sec = 120;
  82
  83 sub update_supported_cpuflags {
  84     my $kvm_version = PVE::QemuServer::kvm_user_version();
  85
  86     # only update when QEMU/KVM version has changed, as that is the only reason
  87     # why flags could change without restarting pvestatd
  88     return if $cached_kvm_version && $cached_kvm_version eq $kvm_version;
  89
  90     if ($next_flag_update_time && $next_flag_update_time > time()) {
  91         return;
  92     }
  93     $next_flag_update_time = 0;
  94
  95     my $supported_cpuflags = eval { PVE::QemuServer::query_supported_cpu_flags() };
  96     warn $@ if $@;
  97
  98     if (!$supported_cpuflags ||
  99         (!$supported_cpuflags->{tcg} && !$supported_cpuflags->{kvm})) {
 100         # something went wrong, clear broadcast flags and set try-again delay
 101         warn "CPU flag detection failed, will try again after delay\n";
 102         $next_flag_update_time = time() + $failed_flag_update_delay_sec;
 103
 104         $supported_cpuflags = {};
 105     } else {
 106         # only set cached version if there's actually something to braodcast
 107         $cached_kvm_version = $kvm_version;
 108     }
 109
 110     for my $accel ("tcg", "kvm") {
 111         if ($supported_cpuflags->{$accel}) {
 112             PVE::Cluster::broadcast_node_kv("cpuflags-$accel", join(' ', @{$supported_cpuflags->{$accel}}));
 113         } else {
 114             # clear potentially invalid data
 115             PVE::Cluster::broadcast_node_kv("cpuflags-$accel", '');
 116         }
 117     }
 118 }
 119
 120 my $generate_rrd_string = sub {
 121     my ($data) = @_;
 122
 123     return join(':', map { $_ // 'U' } @$data);
 124 };
 125
 126 sub update_node_status {
 127     my ($status_cfg) = @_;
 128
 129     my ($uptime) = PVE::ProcFSTools::read_proc_uptime();
 130
 131     my ($avg1, $avg5, $avg15) = PVE::ProcFSTools::read_loadavg();
 132     my $stat = PVE::ProcFSTools::read_proc_stat();
 133     my $cpuinfo = PVE::ProcFSTools::read_cpuinfo();
 134     my $maxcpu = $cpuinfo->{cpus};
 135
 136     update_supported_cpuflags();
 137
 138     my $subinfo = PVE::INotify::read_file('subscription');
 139     my $sublevel = $subinfo->{level} || '';
 140
 141     my $netdev = PVE::ProcFSTools::read_proc_net_dev();
 142     # traffic from/to physical interface cards
 143     my ($netin, $netout) = (0, 0);
 144     for my $dev (grep { /^$PVE::Network::PHYSICAL_NIC_RE$/ } keys %$netdev) {
 145         $netin += $netdev->{$dev}->{receive};
 146         $netout += $netdev->{$dev}->{transmit};
 147     }
 148
 149     my $meminfo = PVE::ProcFSTools::read_meminfo();
 150
 151     my $dinfo = df('/', 1);     # output is bytes
 152     # everything not free is considered to be used
 153     my $dused = $dinfo->{blocks} - $dinfo->{bfree};
 154
 155     my $ctime = time();
 156
 157     my $data = $generate_rrd_string->(
 158         [$uptime, $sublevel, $ctime, $avg1, $maxcpu, $stat->{cpu}, $stat->{wait},
 159          $meminfo->{memtotal}, $meminfo->{memused},
 160          $meminfo->{swaptotal}, $meminfo->{swapused},
 161          $dinfo->{blocks}, $dused, $netin, $netout]
 162     );
 163     PVE::Cluster::broadcast_rrd("pve2-node/$nodename", $data);
 164
 165     my $node_metric = {
 166         uptime => $uptime,
 167         cpustat => $stat,
 168         memory => $meminfo,
 169         blockstat => $dinfo,
 170         nics => $netdev,
 171     };
 172     $node_metric->{cpustat}->@{qw(avg1 avg5 avg15)} = ($avg1, $avg5, $avg15);
 173     $node_metric->{cpustat}->{cpus} = $maxcpu;
 174
 175     my $transactions = PVE::ExtMetric::transactions_start($status_cfg);
 176     PVE::ExtMetric::update_all($transactions, 'node', $nodename, $node_metric, $ctime);
 177     PVE::ExtMetric::transactions_finish($transactions);
 178 }
 179
 180 sub auto_balloning {
 181     my ($vmstatus) =  @_;
 182
 183     my $log = sub { $opt_debug and printf @_ };
 184
 185     my $hostmeminfo = PVE::ProcFSTools::read_meminfo();
 186     # NOTE: to debug, run 'pvestatd -d' and set  memtotal here
 187     #$hostmeminfo->{memtotal} = int(2*1024*1024*1024/0.8); # you can set this to test
 188     my $hostfreemem = $hostmeminfo->{memtotal} - $hostmeminfo->{memused};
 189
 190     # try to use ~80% host memory; goal is the change amount required to achieve that
 191     my $goal = int($hostmeminfo->{memtotal} * 0.8 - $hostmeminfo->{memused});
 192     $log->("host goal: $goal free: $hostfreemem total: $hostmeminfo->{memtotal}\n");
 193
 194     my $maxchange = 100*1024*1024;
 195     my $res = PVE::AutoBalloon::compute_alg1($vmstatus, $goal, $maxchange);
 196
 197     for my $vmid (sort keys %$res) {
 198         my $target = int($res->{$vmid});
 199         my $current = int($vmstatus->{$vmid}->{balloon});
 200         next if $target == $current; # no need to change
 201
 202         $log->("BALLOON $vmid to $target (%d)\n", $target - $current);
 203         eval { PVE::QemuServer::Monitor::mon_cmd($vmid, "balloon", value => int($target)) };
 204         warn $@ if $@;
 205     }
 206 }
 207
 208 sub update_qemu_status {
 209     my ($status_cfg) = @_;
 210
 211     my $ctime = time();
 212     my $vmstatus = PVE::QemuServer::vmstatus(undef, 1);
 213
 214     eval { auto_balloning($vmstatus); };
 215     syslog('err', "auto ballooning error: $@") if $@;
 216
 217     my $transactions = PVE::ExtMetric::transactions_start($status_cfg);
 218     foreach my $vmid (keys %$vmstatus) {
 219         my $d = $vmstatus->{$vmid};
 220         my $data;
 221         my $status = $d->{qmpstatus} || $d->{status} || 'stopped';
 222         my $template = $d->{template} ? $d->{template} : "0";
 223         if ($d->{pid}) { # running
 224             $data = $generate_rrd_string->(
 225                 [$d->{uptime}, $d->{name}, $status, $template, $ctime, $d->{cpus}, $d->{cpu},
 226                  $d->{maxmem}, $d->{mem}, $d->{maxdisk}, $d->{disk},
 227                  $d->{netin}, $d->{netout}, $d->{diskread}, $d->{diskwrite}]);
 228         } else {
 229             $data = $generate_rrd_string->(
 230                 [0, $d->{name}, $status, $template, $ctime, $d->{cpus}, undef,
 231                  $d->{maxmem}, undef, $d->{maxdisk}, $d->{disk}, undef, undef, undef, undef]);
 232         }
 233         PVE::Cluster::broadcast_rrd("pve2.3-vm/$vmid", $data);
 234
 235         PVE::ExtMetric::update_all($transactions, 'qemu', $vmid, $d, $ctime, $nodename);
 236     }
 237
 238     PVE::ExtMetric::transactions_finish($transactions);
 239 }
 240
 241 sub remove_stale_lxc_consoles {
 242
 243     my $vmstatus = PVE::LXC::vmstatus();
 244     my $pidhash = PVE::LXC::find_lxc_console_pids();
 245
 246     foreach my $vmid (keys %$pidhash) {
 247         next if defined($vmstatus->{$vmid});
 248         syslog('info', "remove stale lxc-console for CT $vmid");
 249         foreach my $pid (@{$pidhash->{$vmid}}) {
 250             kill(9, $pid);
 251         }
 252     }
 253 }
 254
 255 my $rebalance_error_count = {};
 256
 257 my $NO_REBALANCE;
 258 sub rebalance_lxc_containers {
 259     # Make sure we can find the cpuset controller path:
 260     return if $NO_REBALANCE;
 261     my $cpuset_base = eval { PVE::CGroup::cpuset_controller_path() };
 262     if (my $err = $@) {
 263         syslog('info', "could not get cpuset controller path: $err");
 264     }
 265
 266     if (!defined($cpuset_base)) {
 267         $NO_REBALANCE = 1;
 268         return;
 269     }
 270
 271     # Figure out the cpu count & highest ID
 272     my $all_cpus = PVE::CpuSet->new_from_path($cpuset_base, 1);
 273     my @allowed_cpus = $all_cpus->members();
 274     my $cpucount = scalar(@allowed_cpus);
 275     my $max_cpuid = $allowed_cpus[-1];
 276
 277     my @cpu_ctcount = (0) x ($max_cpuid+1);
 278     my @balanced_cts;
 279
 280     # A mapping { vmid => cgroup_payload_path } for containers where namespace
 281     # separation is active and recognized.
 282     my $ctinfo = {};
 283
 284     my $modify_cpuset = sub {
 285         my ($vmid, $cpuset, $newset) = @_;
 286
 287         if (!$rebalance_error_count->{$vmid}) {
 288             syslog('info', "modified cpu set for lxc/$vmid: " . $newset->short_string());
 289         }
 290
 291         eval {
 292             my $cgbase = $ctinfo->{$vmid};
 293
 294             if (defined($cgbase)) {
 295                 # allow all, so that we can set new cpuset in /ns
 296                 $all_cpus->write_to_path($cgbase);
 297                 eval {
 298                     $newset->write_to_path("$cgbase/ns");
 299                 };
 300                 if (my $err = $@) {
 301                     warn $err if !$rebalance_error_count->{$vmid}++;
 302                     # restore original
 303                     $cpuset->write_to_path($cgbase);
 304                 } else {
 305                     # also apply to container root cgroup
 306                     $newset->write_to_path($cgbase);
 307                     $rebalance_error_count->{$vmid} = 0;
 308                 }
 309             } else {
 310                 # old style container
 311                 $newset->write_to_path($cgbase);
 312                 $rebalance_error_count->{$vmid} = 0;
 313             }
 314         };
 315         if (my $err = $@) {
 316             warn $err if !$rebalance_error_count->{$vmid}++;
 317         }
 318     };
 319
 320     my $ctlist = PVE::LXC::config_list();
 321
 322     foreach my $vmid (sort keys %$ctlist) {
 323         my $cgpath = "$cpuset_base/lxc/$vmid";
 324         if (-d "$cgpath/ns") {
 325             $ctinfo->{$vmid} = $cgpath;
 326         } else {
 327             next; # old style container
 328         }
 329
 330         my ($conf, $cpuset) = eval {(
 331             PVE::LXC::Config->load_config($vmid),
 332             PVE::CpuSet->new_from_path($cgpath),
 333         )};
 334         if (my $err = $@) {
 335             warn $err;
 336             next;
 337         }
 338
 339         my @cpuset_members = $cpuset->members();
 340
 341         if (!PVE::LXC::Config->has_lxc_entry($conf, 'lxc.cgroup.cpuset.cpus')
 342             && !PVE::LXC::Config->has_lxc_entry($conf, 'lxc.cgroup2.cpuset.cpus')
 343         ) {
 344             my $cores = $conf->{cores} || $cpucount;
 345             $cores = $cpucount if $cores > $cpucount;
 346
 347             # see if the number of cores was hot-reduced or hasn't been enacted at all yet
 348             my $newset = PVE::CpuSet->new();
 349             if ($cores <  scalar(@cpuset_members)) {
 350                 for (my $i = 0; $i < $cores; $i++) {
 351                     $newset->insert($cpuset_members[$i]);
 352                 }
 353             } elsif ($cores > scalar(@cpuset_members)) {
 354                 my $count = $newset->insert(@cpuset_members);
 355                 foreach my $cpu (@allowed_cpus) {
 356                     $count += $newset->insert($cpu);
 357                     last if $count >= $cores;
 358                 }
 359             } else {
 360                 $newset->insert(@cpuset_members);
 361             }
 362
 363             # Apply hot-plugged changes if any:
 364             if (!$newset->is_equal($cpuset)) {
 365                 @cpuset_members = $newset->members();
 366                 $modify_cpuset->($vmid, $cpuset, $newset);
 367             }
 368
 369             # Note: no need to rebalance if we already use all cores
 370             push @balanced_cts, [$vmid, $cores, $newset]
 371                 if defined($conf->{cores}) && ($cores != $cpucount);
 372         }
 373
 374         foreach my $cpu (@cpuset_members) {
 375             $cpu_ctcount[$cpu]++ if $cpu <= $max_cpuid;
 376         }
 377     }
 378
 379     my $find_best_cpu = sub {
 380         my ($cpulist, $cpu) = @_;
 381
 382         my $cur_cost = $cpu_ctcount[$cpu];
 383         my $cur_cpu = $cpu;
 384
 385         foreach my $candidate (@$cpulist) {
 386             my $cost = $cpu_ctcount[$candidate];
 387             if ($cost < ($cur_cost - 1)) {
 388                 $cur_cost = $cost;
 389                 $cur_cpu = $candidate;
 390             }
 391         }
 392
 393         return $cur_cpu;
 394     };
 395
 396     foreach my $bct (@balanced_cts) {
 397         my ($vmid, $cores, $cpuset) = @$bct;
 398
 399         my $rest = [ grep { !$cpuset->has($_) } @allowed_cpus ];
 400
 401         my $newset = PVE::CpuSet->new();
 402         for my $cpu ($cpuset->members()) {
 403             my $best = $find_best_cpu->($rest, $cpu);
 404             if ($best != $cpu) {
 405                 $cpu_ctcount[$best]++;
 406                 $cpu_ctcount[$cpu]--;
 407             }
 408             $newset->insert($best);
 409         }
 410
 411         if (!$newset->is_equal($cpuset)) {
 412             $modify_cpuset->($vmid, $cpuset, $newset);
 413         }
 414     }
 415 }
 416
 417 sub update_lxc_status {
 418     my ($status_cfg) = @_;
 419
 420     my $ctime = time();
 421     my $vmstatus = PVE::LXC::vmstatus();
 422
 423     my $transactions = PVE::ExtMetric::transactions_start($status_cfg);
 424
 425     foreach my $vmid (keys %$vmstatus) {
 426         my $d = $vmstatus->{$vmid};
 427         my $template = $d->{template} ? $d->{template} : "0";
 428         my $data;
 429         if ($d->{status} eq 'running') { # running
 430             $data = $generate_rrd_string->(
 431                 [$d->{uptime}, $d->{name}, $d->{status}, $template,
 432                  $ctime, $d->{cpus}, $d->{cpu},
 433                  $d->{maxmem}, $d->{mem},
 434                  $d->{maxdisk}, $d->{disk},
 435                  $d->{netin}, $d->{netout},
 436                  $d->{diskread}, $d->{diskwrite}]);
 437         } else {
 438             $data = $generate_rrd_string->(
 439                 [0, $d->{name}, $d->{status}, $template, $ctime, $d->{cpus}, undef,
 440                  $d->{maxmem}, undef, $d->{maxdisk}, $d->{disk}, undef, undef, undef, undef]);
 441         }
 442         PVE::Cluster::broadcast_rrd("pve2.3-vm/$vmid", $data);
 443
 444         PVE::ExtMetric::update_all($transactions, 'lxc', $vmid, $d, $ctime, $nodename);
 445     }
 446     PVE::ExtMetric::transactions_finish($transactions);
 447 }
 448
 449 sub update_storage_status {
 450     my ($status_cfg) = @_;
 451
 452     my $cfg = PVE::Storage::config();
 453     my $ctime = time();
 454     my $info = PVE::Storage::storage_info($cfg);
 455
 456     my $transactions = PVE::ExtMetric::transactions_start($status_cfg);
 457
 458     foreach my $storeid (keys %$info) {
 459         my $d = $info->{$storeid};
 460         next if !$d->{active};
 461
 462         my $data = $generate_rrd_string->([$ctime, $d->{total}, $d->{used}]);
 463
 464         my $key = "pve2-storage/${nodename}/$storeid";
 465         PVE::Cluster::broadcast_rrd($key, $data);
 466
 467         PVE::ExtMetric::update_all($transactions, 'storage', $nodename, $storeid, $d, $ctime);
 468     }
 469     PVE::ExtMetric::transactions_finish($transactions);
 470 }
 471
 472 sub rotate_authkeys {
 473     PVE::AccessControl::rotate_authkey() if !PVE::AccessControl::check_authkey(1);
 474 }
 475
 476 sub update_ceph_metadata {
 477     return if !PVE::Ceph::Tools::check_ceph_inited(1); # nothing to do
 478
 479     PVE::Ceph::Services::broadcast_ceph_services();
 480
 481     PVE::Ceph::Services::broadcast_ceph_versions();
 482 }
 483
 484 sub update_sdn_status {
 485
 486     if($have_sdn) {
 487         my ($transport_status, $vnet_status) = PVE::Network::SDN::status();
 488
 489         my $status = $transport_status ? encode_json($transport_status) : undef;
 490         PVE::Cluster::broadcast_node_kv("sdn", $status);
 491     }
 492 }
 493
 494 my $broadcast_version_info_done = 0;
 495 my sub broadcast_version_info : prototype() {
 496     if (!$broadcast_version_info_done) {
 497         PVE::Cluster::broadcast_node_kv(
 498             'version-info',
 499             encode_json(PVE::pvecfg::version_info()),
 500         );
 501         $broadcast_version_info_done = 1;
 502     }
 503 }
 504
 505 sub update_status {
 506     # update worker list. This is not really required, but we want to make sure that we also have a
 507     # correct list in case of an unexpected crash.
 508     my $rpcenv = PVE::RPCEnvironment::get();
 509
 510     eval {
 511         my $tlist = $rpcenv->active_workers();
 512         PVE::Cluster::broadcast_tasklist($tlist);
 513     };
 514     my $err = $@;
 515     syslog('err', $err) if $err;
 516
 517     my $status_cfg = PVE::Cluster::cfs_read_file('status.cfg');
 518
 519     eval {
 520         update_node_status($status_cfg);
 521     };
 522     $err = $@;
 523     syslog('err', "node status update error: $err") if $err;
 524
 525     eval {
 526         update_qemu_status($status_cfg);
 527     };
 528     $err = $@;
 529     syslog('err', "qemu status update error: $err") if $err;
 530
 531     eval {
 532         update_lxc_status($status_cfg);
 533     };
 534     $err = $@;
 535     syslog('err', "lxc status update error: $err") if $err;
 536
 537     eval {
 538         rebalance_lxc_containers();
 539     };
 540     $err = $@;
 541     syslog('err', "lxc cpuset rebalance error: $err") if $err;
 542
 543     eval {
 544         update_storage_status($status_cfg);
 545     };
 546     $err = $@;
 547     syslog('err', "storage status update error: $err") if $err;
 548
 549     eval {
 550         remove_stale_lxc_consoles();
 551     };
 552     $err = $@;
 553     syslog('err', "lxc console cleanup error: $err") if $err;
 554
 555     eval {
 556         rotate_authkeys();
 557     };
 558     $err = $@;
 559     syslog('err', "authkey rotation error: $err") if $err;
 560
 561     eval {
 562         update_ceph_metadata();
 563     };
 564     $err = $@;
 565     syslog('err', "ceph metadata update error: $err") if $err;
 566
 567     eval {
 568         update_sdn_status();
 569     };
 570     $err = $@;
 571     syslog('err', "sdn status update error: $err") if $err;
 572
 573     eval {
 574         broadcast_version_info();
 575     };
 576     $err = $@;
 577     syslog('err', "version info update error: $err") if $err;
 578 }
 579
 580 my $next_update = 0;
 581
 582 # do not update directly after startup, because install scripts
 583 # have a problem with that
 584 my $cycle = 0;
 585 my $updatetime = 10;
 586
 587 my $initial_memory_usage;
 588
 589 sub run {
 590     my ($self) = @_;
 591
 592     for (;;) { # forever
 593
 594         $next_update = time() + $updatetime;
 595
 596         if ($cycle) {
 597             my ($ccsec, $cusec) = gettimeofday ();
 598             eval {
 599                 # syslog('info', "start status update");
 600                 PVE::Cluster::cfs_update();
 601                 update_status();
 602             };
 603             my $err = $@;
 604
 605             if ($err) {
 606                 syslog('err', "status update error: $err");
 607             }
 608
 609             my ($ccsec_end, $cusec_end) = gettimeofday ();
 610             my $cptime = ($ccsec_end-$ccsec) + ($cusec_end - $cusec)/1000000;
 611
 612             syslog('info', sprintf("status update time (%.3f seconds)", $cptime))
 613                 if ($cptime > 5);
 614         }
 615
 616         $cycle++;
 617
 618         my $mem = PVE::ProcFSTools::read_memory_usage();
 619         my $resident_kb = $mem->{resident} / 1024;
 620
 621         if (!defined($initial_memory_usage) || ($cycle < 10)) {
 622             $initial_memory_usage = $resident_kb;
 623         } else {
 624             my $diff = $resident_kb - $initial_memory_usage;
 625             if ($diff > 15 * 1024) {
 626                 syslog ('info', "restarting server after $cycle cycles to " .
 627                         "reduce memory usage (free $resident_kb ($diff) KB)");
 628                 $self->restart_daemon();
 629             }
 630         }
 631
 632         my $wcount = 0;
 633         while ((time() < $next_update) &&
 634                ($wcount < $updatetime) && # protect against time wrap
 635                !$restart_request) { $wcount++; sleep (1); };
 636
 637         $self->restart_daemon() if $restart_request;
 638     }
 639 }
 640
 641 $daemon->register_start_command();
 642 $daemon->register_restart_command(1);
 643 $daemon->register_stop_command();
 644 $daemon->register_status_command();
 645
 646 our $cmddef = {
 647     start => [ __PACKAGE__, 'start', []],
 648     restart => [ __PACKAGE__, 'restart', []],
 649     stop => [ __PACKAGE__, 'stop', []],
 650     status => [ __PACKAGE__, 'status', [], undef, sub { print shift . "\n";} ],
 651 };
 652
 653 1;
 654
 655
 656
 657
 658