# src/PVE/HA/Env/PVE2.pm - HA manager environment implementation for Proxmox VE
1 package PVE::HA::Env::PVE2;
2
3 use strict;
4 use warnings;
5 use POSIX qw(:errno_h :fcntl_h);
6 use IO::File;
7 use IO::Socket::UNIX;
8 use JSON;
9
10 use PVE::SafeSyslog;
11 use PVE::Tools;
12 use PVE::Cluster qw(cfs_register_file cfs_read_file cfs_write_file cfs_lock_file);
13 use PVE::DataCenterConfig;
14 use PVE::INotify;
15 use PVE::RPCEnvironment;
16
17 use PVE::HA::Tools ':exit_codes';
18 use PVE::HA::Env;
19 use PVE::HA::Config;
20 use PVE::HA::FenceConfig;
21 use PVE::HA::Resources;
22 use PVE::HA::Resources::PVEVM;
23 use PVE::HA::Resources::PVECT;
24
# Register the built-in HA resource types (QEMU VMs and LXC containers) with
# the resource plugin registry, then finalize plugin initialization.
PVE::HA::Resources::PVEVM->register();
PVE::HA::Resources::PVECT->register();

PVE::HA::Resources->init();

# Lock directory on the clustered configuration filesystem (pmxcfs); a
# directory entry below this path acts as a cluster-wide lock primitive.
my $lockdir = "/etc/pve/priv/lock";
31
# Construct the HA environment object for the given local node name.
# Dies when no node name is supplied.
sub new {
    my ($this, $nodename) = @_;

    die "missing nodename" if !$nodename;

    my $class = ref($this) || $this;

    # initialize state directly at bless time
    my $self = bless { nodename => $nodename }, $class;

    return $self;
}
45
# Accessor: name of the local node this environment was built for.
sub nodename {
    my $self = shift;
    return $self->{nodename};
}
51
# Simulation/testing hook; the real PVE2 environment has no simulated
# hardware, so calling this is always a fatal error.
sub hardware {
    my $self = shift;
    die "hardware is for testing and simulation only";
}
57
# Fetch the cluster-wide CRM manager status from the cluster filesystem.
sub read_manager_status {
    my $self = shift;
    return PVE::HA::Config::read_manager_status();
}
63
# Persist the CRM manager status object to the cluster filesystem.
sub write_manager_status {
    my $self = shift;
    my ($status_obj) = @_;

    PVE::HA::Config::write_manager_status($status_obj);
}
69
# Read the LRM status of $node; defaults to the local node when omitted.
sub read_lrm_status {
    my ($self, $node) = @_;

    $node //= $self->{nodename};

    return PVE::HA::Config::read_lrm_status($node);
}
77
# Persist the LRM status object for the local node.
sub write_lrm_status {
    my ($self, $status_obj) = @_;

    PVE::HA::Config::write_lrm_status($self->{nodename}, $status_obj);
}
85
# Query systemd for queued shutdown/reboot jobs on the local node.
# Returns the pair ($shutdown, $reboot) of 0/1 flags; a pending reboot also
# implies a pending shutdown in systemd, but both are reported separately.
sub is_node_shutdown {
    my $self = shift;

    my ($shutdown, $reboot) = (0, 0);

    my $parse_line = sub {
        my ($line) = @_;

        # ensure we match the full unit name by matching /^JOB_ID UNIT /
        # see: man systemd.special
        $shutdown = 1 if ($line =~ m/^\d+\s+shutdown\.target\s+/);
        $reboot = 1 if ($line =~ m/^\d+\s+reboot\.target\s+/);
    };

    eval {
        PVE::Tools::run_command(
            ['/bin/systemctl', '--full', 'list-jobs'],
            outfunc => $parse_line,
            noerr => 1,
        );
    };

    return ($shutdown, $reboot);
}
106
# Append a command to the cluster-wide CRM command queue.
sub queue_crm_commands {
    my $self = shift;
    my ($cmd) = @_;

    return PVE::HA::Config::queue_crm_commands($cmd);
}
112
# Drain and return the queued CRM commands.
sub read_crm_commands {
    my $self = shift;
    return PVE::HA::Config::read_crm_commands();
}
118
# Read and validate the HA resources (service) configuration.
sub read_service_config {
    my $self = shift;
    return PVE::HA::Config::read_and_check_resources_config();
}
124
# Update the configuration of service $sid with the properties in $param.
sub update_service_config {
    my $self = shift;
    my ($sid, $param) = @_;

    return PVE::HA::Config::update_resources_config($sid, $param);
}
130
# Split a service id into its components (sid, type, name).
sub parse_sid {
    my $self = shift;
    my ($sid) = @_;

    return PVE::HA::Config::parse_sid($sid);
}
136
# Read the fence device configuration.
sub read_fence_config {
    my $self = shift;
    return PVE::HA::Config::read_fence_config();
}
142
# Return the cluster-wide fencing mode configured in datacenter.cfg;
# falls back to 'watchdog' when no (truthy) value is set.
sub fencing_mode {
    my $self = shift;

    my $dc_cfg = cfs_read_file('datacenter.cfg');

    return $dc_cfg->{fencing} || 'watchdog';
}
152
# Run a hardware fence agent, replacing the current (forked) process image.
# Never returns on success; exits with -1 only if exec() itself fails.
sub exec_fence_agent {
    my ($self, $agent, $node, @param) = @_;

    # setup execution environment
    $ENV{'PATH'} = '/sbin:/bin:/usr/sbin:/usr/bin';

    # NOTE(review): single-string exec() goes through the shell - this relies
    # on gen_arg_str() producing safely quoted arguments; confirm upstream.
    my $cmd = "$agent " . PVE::HA::FenceConfig::gen_arg_str(@param);

    exec($cmd);
    exit -1; # reached only when exec() failed
}
164
# this is only allowed by the master to recover a _fenced_ service
sub steal_service {
    my ($self, $sid, $current_node, $new_node) = @_;

    my (undef, $type, $name) = PVE::HA::Config::parse_sid($sid);

    my $plugin = PVE::HA::Resources->lookup($type);
    die "implement me" if !$plugin;

    # move the guest config file to the new owner node on pmxcfs
    my $old_path = $plugin->config_file($name, $current_node);
    my $new_path = $plugin->config_file($name, $new_node);
    rename($old_path, $new_path)
        or die "rename '$old_path' to '$new_path' failed - $!\n";

    # Necessary for (at least) static usage plugin to always be able to read service config from new
    # node right away.
    $self->cluster_state_update();
}
184
# Read the HA group configuration.
sub read_group_config {
    my $self = shift;
    return PVE::HA::Config::read_group_config();
}
190
# Report cluster membership state: returns ($node_info, $quorate), where
# $node_info maps each member node name to { online => 0|1 } and $quorate
# tells whether this node is part of the quorate partition.
sub get_node_info {
    my $self = shift;

    my $node_info = {};

    my $quorate = PVE::Cluster::check_cfs_quorum(1) || 0;

    my $members = PVE::Cluster::get_members();
    for my $node (keys %$members) {
        $node_info->{$node}->{online} = $members->{$node}->{online};
    }

    # the local node is always considered online
    $node_info->{$self->{nodename}}->{online} = 1;

    return ($node_info, $quorate);
}
213
# Log $msg at syslog priority $level (e.g. 'info', 'warn', 'err');
# strips the trailing newline so syslog entries stay single-line.
sub log {
    my $self = shift;
    my ($level, $msg) = @_;

    chomp $msg;
    syslog($level, $msg);
}
221
# Send a notification mail to the local root account.
sub sendmail {
    my ($self, $subject, $text) = @_;

    # Both stay 'root': postfix appends the correct hostname to the sender,
    # and /root/.forward makes pvemailforward redirect the mail to the
    # address configured in the datacenter.
    my ($mailfrom, $mailto) = ('root', 'root');

    PVE::Tools::sendmail($mailto, $subject, $text, undef, $mailfrom);
}
233
# Per-lock bookkeeping: remembers for each lock id when we last successfully
# acquired/refreshed it and whether we held it, so that acquire/lose state
# transitions get logged exactly once instead of on every poll.
my $last_lock_status_hash = {};

# Acquire or refresh the cluster-wide lock $lockid, implemented as a
# directory on the cluster filesystem (pmxcfs): a fresh lock is taken with
# mkdir(), an already-held lock is refreshed via utime() while still inside
# the pmxcfs lock lifetime window. Returns 1 when the lock is held after
# this call, 0 otherwise.
sub get_pve_lock {
    my ($self, $lockid) = @_;

    my $got_lock = 0;

    my $filename = "$lockdir/$lockid";

    $last_lock_status_hash->{$lockid} //= { lock_time => 0, got_lock => 0};
    my $last = $last_lock_status_hash->{$lockid};

    my $ctime = time();
    my $last_lock_time = $last->{lock_time} // 0;
    my $last_got_lock = $last->{got_lock};

    my $retry_timeout = 120; # hardcoded lock lifetime limit from pmxcfs

    eval {

        mkdir $lockdir;

        # pve cluster filesystem not online
        die "can't create '$lockdir' (pmxcfs not mounted?)\n" if ! -d $lockdir;

        if (($ctime - $last_lock_time) < $retry_timeout) {
            # try cfs lock update request (utime)
            if (utime(0, $ctime, $filename)) {
                $got_lock = 1;
                return;
            }
            die "cfs lock update failed - $!\n";
        }

        if (mkdir $filename) {
            $got_lock = 1;
            return;
        }

        utime 0, 0, $filename; # cfs unlock request
        die "can't get cfs lock\n";
    };

    my $err = $@;

    #$self->log('err', $err) if $err; # for debugging

    $last->{got_lock} = $got_lock;
    $last->{lock_time} = $ctime if $got_lock;

    # log only when the held/lost state actually changed
    if (!!$got_lock != !!$last_got_lock) {
        if ($got_lock) {
            $self->log('info', "successfully acquired lock '$lockid'");
        } else {
            # fixed: message previously lacked the closing quote after $lockid
            my $msg = "lost lock '$lockid'";
            $msg .= " - $err" if $err;
            $self->log('err', $msg);
        }
    }

    return $got_lock;
}
296
# Acquire (or refresh) the cluster-wide CRM master lock.
sub get_ha_manager_lock {
    my $self = shift;
    return $self->get_pve_lock("ha_manager_lock");
}
302
# release the cluster wide manager lock.
# when released another CRM may step up and get the lock, thus this should only
# get called when shutting down/deactivating the current master
sub release_ha_manager_lock {
    my $self = shift;

    # removing the lock directory on pmxcfs releases the lock
    return rmdir("$lockdir/ha_manager_lock");
}
311
# Acquire (or refresh) the LRM agent lock of $node (local node by default).
sub get_ha_agent_lock {
    my ($self, $node) = @_;

    $node //= $self->nodename();

    return $self->get_pve_lock("ha_agent_${node}_lock");
}
319
# release the respective node agent lock.
# this should only get called if the nodes LRM gracefully shuts down with
# all services already cleanly stopped!
sub release_ha_agent_lock {
    my $self = shift;

    my $node = $self->nodename();

    # removing the lock directory on pmxcfs releases the lock
    return rmdir("$lockdir/ha_agent_${node}_lock");
}
330
# Return 1 when the local node is part of the quorate cluster partition,
# 0 otherwise (check_cfs_quorum dies when quorum is lost).
sub quorate {
    my $self = shift;

    my $have_quorum = 0;
    eval { $have_quorum = PVE::Cluster::check_cfs_quorum(); };

    return $have_quorum;
}
341
# Current wall-clock time in whole seconds (epoch).
sub get_time {
    my $self = shift;
    return time();
}
347
# Sleep for $delay seconds; CORE:: prefix avoids recursing into this sub.
sub sleep {
    my $self = shift;
    my ($delay) = @_;

    CORE::sleep($delay);
}
353
# Sleep in one-second steps until the wall clock reaches $end_time (epoch
# seconds); returns immediately when $end_time is already in the past.
sub sleep_until {
    my ($self, $end_time) = @_;

    while (time() < $end_time) {
        $self->sleep(1);
    }
}
365
# Record the start timestamp of the current CRM/LRM work loop iteration.
sub loop_start_hook {
    my $self = shift;

    $self->{loop_start} = $self->get_time();
}
372
# Warn when a work loop iteration took suspiciously long (> 30 seconds).
sub loop_end_hook {
    my $self = shift;

    my $delay = $self->get_time() - $self->{loop_start};

    warn "loop take too long ($delay seconds)\n" if $delay > 30;
}
380
# Force a refresh of the local pmxcfs state cache.
# Returns 1 on success, 0 on failure (the error is logged, not fatal).
sub cluster_state_update {
    my $self = shift;

    eval { PVE::Cluster::cfs_update(1) };

    if (my $err = $@) {
        $self->log('warn', "cluster file system update failed - $err");
        return 0;
    }
    return 1;
}
392
# File-scoped handle for the connection to the watchdog multiplexer daemon;
# only a single watchdog connection may exist per process.
my $watchdog_fh;

# Connect to the watchdog-mux daemon via its unix socket, arming the
# watchdog. Dies when already open or when the connect fails.
sub watchdog_open {
    my ($self) = @_;

    die "watchdog already open\n" if defined($watchdog_fh);

    $watchdog_fh = IO::Socket::UNIX->new(
        Type => SOCK_STREAM(),
        Peer => "/run/watchdog-mux.sock") ||
        die "unable to open watchdog socket - $!\n";

    $self->log('info', "watchdog active");
}
407
# Reset the watchdog timer by writing a single NUL byte to watchdog-mux.
# The $wfh argument is accepted for interface compatibility but unused;
# the file-scoped $watchdog_fh is used instead.
# Returns 1 on success, 0 on failure (the failure is logged, not fatal).
sub watchdog_update {
    my ($self, $wfh) = @_;

    my $res = $watchdog_fh->syswrite("\0", 1);
    if (!defined($res)) {
        $self->log('err', "watchdog update failed - $!\n");
        return 0;
    }
    if ($res != 1) {
        # partial write: the keep-alive byte did not go through
        $self->log('err', "watchdog update failed - write $res bytes\n");
        return 0;
    }

    return 1;
}
423
# Disarm and close the watchdog connection. Writing the magic 'V' byte
# tells watchdog-mux to disable the watchdog cleanly; closing without it
# would be treated as a failure and eventually fence the node.
# The $wfh argument is unused (interface compatibility).
sub watchdog_close {
    my ($self, $wfh) = @_;

    $watchdog_fh->syswrite("V", 1); # magic watchdog close
    if (!$watchdog_fh->close()) {
        $self->log('err', "watchdog close failed - $!");
    } else {
        # only forget the handle on successful close, so a failed close
        # remains visible via the still-defined $watchdog_fh
        $watchdog_fh = undef;
        $self->log('info', "watchdog closed (disabled)");
    }
}
435
# Re-initialize per-process state after fork(): the inotify FD and the
# pmxcfs state cache are not safe to share with the parent process.
sub after_fork {
    my ($self) = @_;

    # close inherited inotify FD from parent and reopen our own
    PVE::INotify::inotify_close();
    PVE::INotify::inotify_init();

    PVE::Cluster::cfs_update();
}
445
# Maximum number of concurrent LRM worker processes, taken from
# datacenter.cfg; defaults to 4 when unset (or zero).
sub get_max_workers {
    my $self = shift;

    my $dc_cfg = cfs_read_file('datacenter.cfg');

    return $dc_cfg->{max_workers} || 4;
}
453
# return cluster wide enforced HA settings
sub get_ha_settings {
    my $self = shift;

    my $dc_cfg = eval { cfs_read_file('datacenter.cfg') };
    if (my $err = $@) {
        $self->log('err', "unable to get HA settings from datacenter.cfg - $err");
        return {};
    }

    return $dc_cfg->{ha};
}
466
# Collect the per-node static resource info (published as JSON under the
# 'static-info' cluster key/value entry) and return it as a hash of decoded
# structures keyed by node name; undecodable entries are logged and left undef.
sub get_static_node_stats {
    my $self = shift;

    my $stats = PVE::Cluster::get_node_kv('static-info');
    for my $node (keys %$stats) {
        $stats->{$node} = eval { decode_json($stats->{$node}) };
        $self->log('err', "unable to decode static node info for '$node' - $@") if $@;
    }

    return $stats;
}
478
479 1;