]> git.proxmox.com Git - pve-ha-manager.git/blob - src/PVE/HA/Env/PVE2.pm
use PVE::DataCenterConfig
[pve-ha-manager.git] / src / PVE / HA / Env / PVE2.pm
1 package PVE::HA::Env::PVE2;
2
3 use strict;
4 use warnings;
5 use POSIX qw(:errno_h :fcntl_h);
6 use IO::File;
7 use IO::Socket::UNIX;
8
9 use PVE::SafeSyslog;
10 use PVE::Tools;
11 use PVE::Cluster qw(cfs_register_file cfs_read_file cfs_write_file cfs_lock_file);
12 use PVE::DataCenterConfig;
13 use PVE::INotify;
14 use PVE::RPCEnvironment;
15
16 use PVE::HA::Tools ':exit_codes';
17 use PVE::HA::Env;
18 use PVE::HA::Config;
19 use PVE::HA::FenceConfig;
20 use PVE::HA::Resources;
21 use PVE::HA::Resources::PVEVM;
22 use PVE::HA::Resources::PVECT;
23
# Register the HA resource plugins shipped with PVE (QEMU VMs and LXC
# containers) with the resource plugin system, then initialize it.
PVE::HA::Resources::PVEVM->register();
PVE::HA::Resources::PVECT->register();

PVE::HA::Resources->init();

# Cluster-wide lock directory on the pmxcfs; locks below are pmxcfs
# directories created/refreshed via mkdir/utime (see get_pve_lock).
my $lockdir = "/etc/pve/priv/lock";
# Constructor: build an HA environment object bound to a cluster node.
# Dies when no node name is supplied.
sub new {
    my ($this, $nodename) = @_;

    die "missing nodename" if !$nodename;

    my $class = ref($this) || $this;

    # store the node name right at construction time
    my $self = bless { nodename => $nodename }, $class;

    return $self;
}
44
# Accessor: the node name this environment was created for.
sub nodename {
    my $self = shift;

    return $self->{nodename};
}
50
# The simulated hardware layer only exists in the test/sim environments;
# the production environment must never be asked for it.
sub hardware {
    die "hardware is for testing and simulation only";
}
56
# Fetch the CRM manager status object from the cluster filesystem.
sub read_manager_status {
    my $self = shift;

    return PVE::HA::Config::read_manager_status();
}
62
# Persist the CRM manager status object to the cluster filesystem.
sub write_manager_status {
    my (undef, $status_obj) = @_; # $self unused

    PVE::HA::Config::write_manager_status($status_obj);
}
68
# Read the LRM status for $node, defaulting to the local node.
sub read_lrm_status {
    my ($self, $node) = @_;

    $node //= $self->{nodename};

    return PVE::HA::Config::read_lrm_status($node);
}
76
# Persist the local node's LRM status object.
sub write_lrm_status {
    my ($self, $status_obj) = @_;

    PVE::HA::Config::write_lrm_status($self->{nodename}, $status_obj);
}
84
# Ask systemd whether a shutdown and/or reboot job is queued.
# Returns the two flags ($shutdown, $reboot); errors from systemctl are
# swallowed and simply leave both flags at 0.
sub is_node_shutdown {
    my $self = shift;

    my ($shutdown, $reboot) = (0, 0);

    my $parse_job_line = sub {
        my ($line) = @_;

        # job listing lines start with "JOB_ID UNIT ..."; anchoring on the
        # numeric job id ensures we match the full unit name
        # see: man systemd.special
        $shutdown = 1 if $line =~ m/^\d+\s+shutdown\.target\s+/;
        $reboot = 1 if $line =~ m/^\d+\s+reboot\.target\s+/;
    };

    eval {
        PVE::Tools::run_command(
            ['/bin/systemctl', '--full', 'list-jobs'],
            outfunc => $parse_job_line,
            noerr => 1,
        );
    };

    return ($shutdown, $reboot);
}
105
# Append a command to the CRM command queue on the cluster filesystem.
sub queue_crm_commands {
    my (undef, $cmd) = @_; # $self unused

    return PVE::HA::Config::queue_crm_commands($cmd);
}
111
# Drain and return the queued CRM commands.
sub read_crm_commands {
    my $self = shift;

    return PVE::HA::Config::read_crm_commands();
}
117
# Read the HA resources configuration, with sanity checks applied.
sub read_service_config {
    my $self = shift;

    return PVE::HA::Config::read_and_check_resources_config();
}
123
# Update the configuration of service $sid with the values in $param.
sub update_service_config {
    my (undef, $sid, $param) = @_; # $self unused

    return PVE::HA::Config::update_resources_config($sid, $param);
}
129
# Split a service ID into its components (delegates to the HA config).
sub parse_sid {
    my (undef, $sid) = @_; # $self unused

    return PVE::HA::Config::parse_sid($sid);
}
135
# Read the fence device configuration.
sub read_fence_config {
    my $self = shift;

    return PVE::HA::Config::read_fence_config();
}
141
# Return the cluster-wide fencing mode from datacenter.cfg, falling back
# to 'watchdog' when nothing (or a falsy value) is configured.
sub fencing_mode {
    my $self = shift;

    my $dc_cfg = cfs_read_file('datacenter.cfg');

    return $dc_cfg->{fencing} || 'watchdog';
}
151
# Replace the current (already forked) process with the given fence agent.
# Never returns on success; exits with -1 when exec() itself fails.
sub exec_fence_agent {
    my ($self, $agent, $node, @param) = @_;

    # minimal, predictable execution environment for the agent
    $ENV{'PATH'} = '/sbin:/bin:/usr/sbin:/usr/bin';

    # NOTE(review): single-string exec() may go through the shell when the
    # command contains metacharacters - this relies on gen_arg_str()
    # producing safely quoted arguments; verify against FenceConfig.
    my $cmdline = "$agent " . PVE::HA::FenceConfig::gen_arg_str(@param);

    exec($cmdline);
    exit -1; # only reached if exec() failed
}
163
# Move a service's config file from $current_node to $new_node by renaming
# it on the cluster filesystem. Only the master may do this, and only to
# recover a service from an already _fenced_ node.
sub steal_service {
    my ($self, $sid, $current_node, $new_node) = @_;

    my (undef, $type, $name) = PVE::HA::Config::parse_sid($sid);

    my $plugin = PVE::HA::Resources->lookup($type)
        or die "implement me"; # no plugin registered for this service type

    my $old = $plugin->config_file($name, $current_node);
    my $new = $plugin->config_file($name, $new_node);

    rename($old, $new)
        or die "rename '$old' to '$new' failed - $!\n";
}
179
# Read the HA group configuration.
sub read_group_config {
    my $self = shift;

    return PVE::HA::Config::read_group_config();
}
185
# Collect cluster membership information.
# Returns ($node_info, $quorate): $node_info maps node name to
# { online => 0|1 }, $quorate is the pmxcfs quorum state (0|1).
sub get_node_info {
    my $self = shift;

    my $quorate = PVE::Cluster::check_cfs_quorum(1) || 0;

    my $members = PVE::Cluster::get_members();

    my $node_info = {};
    for my $node (keys %$members) {
        $node_info->{$node}->{online} = $members->{$node}->{online};
    }

    # the local node is always considered up
    $node_info->{$self->{nodename}}->{online} = 1;

    return ($node_info, $quorate);
}
208
# Send a message to syslog at the given level; trailing newlines are
# stripped since syslog records must not carry them.
sub log {
    my $self = shift;
    my ($level, $msg) = @_;

    chomp $msg;

    syslog($level, $msg);
}
216
# Send a notification mail to the local root account.
# Using 'root' on both ends: postfix appends the correct hostname to the
# sender, and /root/.forward (pvemailforward) redirects the mail to the
# address configured in the datacenter.
sub sendmail {
    my ($self, $subject, $text) = @_;

    PVE::Tools::sendmail('root', $subject, $text, undef, 'root');
}
228
# Per-lockid cache of the last successful acquisition time and result,
# used to decide between a cheap refresh (utime) and re-acquisition (mkdir).
my $last_lock_status_hash = {};

# Acquire or refresh a cluster-wide lock backed by a pmxcfs directory
# under $lockdir. Returns 1 when the lock is held, 0 otherwise.
# Transitions between held and lost are logged; steady state is silent.
sub get_pve_lock {
    my ($self, $lockid) = @_;

    my $got_lock = 0;

    my $filename = "$lockdir/$lockid";

    $last_lock_status_hash->{$lockid} //= { lock_time => 0, got_lock => 0};
    my $last = $last_lock_status_hash->{$lockid};

    my $ctime = time();
    my $last_lock_time = $last->{lock_time} // 0;
    my $last_got_lock = $last->{got_lock};

    my $retry_timeout = 120; # hardcoded lock lifetime limit from pmxcfs

    eval {

	mkdir $lockdir;

	# pve cluster filesystem not online
	die "can't create '$lockdir' (pmxcfs not mounted?)\n" if ! -d $lockdir;

	if (($ctime - $last_lock_time) < $retry_timeout) {
	    # we held the lock recently, so a cfs lock update request
	    # (utime) is enough to keep holding it
	    if (utime(0, $ctime, $filename)) {
		$got_lock = 1;
		return;
	    }
	    die "cfs lock update failed - $!\n";
	}

	# (re-)acquire: creating the lock directory takes the lock
	if (mkdir $filename) {
	    $got_lock = 1;
	    return;
	}

	utime 0, 0, $filename; # cfs unlock request
	die "can't get cfs lock\n";
    };

    my $err = $@;

    #$self->log('err', $err) if $err; # for debugging

    $last->{got_lock} = $got_lock;
    $last->{lock_time} = $ctime if $got_lock;

    # log only on state transitions (acquired <-> lost)
    if (!!$got_lock != !!$last_got_lock) {
	if ($got_lock) {
	    $self->log('info', "successfully acquired lock '$lockid'");
	} else {
	    # FIX: message previously lacked the closing quote after $lockid
	    my $msg = "lost lock '$lockid'";
	    $msg .= " - $err" if $err;
	    $self->log('err', $msg);
	}
    }

    return $got_lock;
}
291
# Acquire (or refresh) the cluster-wide CRM master lock.
sub get_ha_manager_lock {
    my $self = shift;

    return $self->get_pve_lock("ha_manager_lock");
}
297
# Release the cluster-wide manager lock.
# When released, another CRM may step up and grab the lock, so only call
# this when shutting down/deactivating the current master.
sub release_ha_manager_lock {
    my $self = shift;

    return rmdir("$lockdir/ha_manager_lock");
}
306
# Acquire (or refresh) the LRM agent lock for $node (default: local node).
sub get_ha_agent_lock {
    my ($self, $node) = @_;

    $node //= $self->nodename();

    return $self->get_pve_lock("ha_agent_${node}_lock");
}
314
# Release the local node's agent lock.
# Only call this when the LRM gracefully shuts down with all services
# already cleanly stopped!
sub release_ha_agent_lock {
    my $self = shift;

    my $node = $self->nodename();

    return rmdir("$lockdir/ha_agent_${node}_lock");
}
325
# Report whether the cluster filesystem currently has quorum.
# A failing quorum check is treated the same as no quorum (returns 0).
sub quorate {
    my $self = shift;

    my $quorate = 0;
    eval {
	$quorate = PVE::Cluster::check_cfs_quorum();
    };

    return $quorate;
}
336
# Current wallclock time in epoch seconds (simulation envs override this).
sub get_time {
    my $self = shift;

    return time();
}
342
# Block for $seconds using the real system sleep (sim envs override this).
sub sleep {
    my ($self, $seconds) = @_;

    CORE::sleep($seconds);
}
348
# Sleep in one-second steps until the wallclock reaches $end_time
# (epoch seconds). Returns immediately when the deadline already passed.
# Uses $self->sleep() so simulation environments can override the waiting.
sub sleep_until {
    my ($self, $end_time) = @_;

    while (time() < $end_time) {
	$self->sleep(1);
    }
}
360
# Record the work-loop start time so loop_end_hook() can measure duration.
sub loop_start_hook {
    my $self = shift;

    $self->{loop_start} = $self->get_time();
}
367
# Warn when a CRM/LRM work-loop iteration took suspiciously long (>30s).
sub loop_end_hook {
    my $self = shift;

    my $elapsed = $self->get_time() - $self->{loop_start};

    warn "loop take too long ($elapsed seconds)\n" if $elapsed > 30;
}
375
# Refresh the local view of the cluster filesystem state.
# Returns 1 on success; logs a warning and returns 0 on failure.
sub cluster_state_update {
    my $self = shift;

    eval { PVE::Cluster::cfs_update(1) };

    if (my $err = $@) {
	$self->log('warn', "cluster file system update failed - $err");
	return 0;
    }

    return 1;
}
387
# Connection to the watchdog multiplexer; shared by the watchdog_* methods.
my $watchdog_fh;

# Connect to the watchdog-mux socket, arming the watchdog for this node.
# Dies when the watchdog is already open or the socket is unreachable.
sub watchdog_open {
    my $self = shift;

    die "watchdog already open\n" if defined($watchdog_fh);

    $watchdog_fh = IO::Socket::UNIX->new(
	Type => SOCK_STREAM(),
	Peer => "/run/watchdog-mux.sock",
    ) || die "unable to open watchdog socket - $!\n";

    $self->log('info', "watchdog active");
}
402
# Reset the watchdog timer by writing a single NUL byte to the mux socket.
# $wfh is unused (kept for interface compatibility); the module-level
# handle is used instead. Returns 1 on success, 0 on a (logged) failure.
sub watchdog_update {
    my ($self, $wfh) = @_;

    my $written = $watchdog_fh->syswrite("\0", 1);

    if (!defined($written)) {
	$self->log('err', "watchdog update failed - $!\n");
	return 0;
    }

    if ($written != 1) {
	$self->log('err', "watchdog update failed - write $written bytes\n");
	return 0;
    }

    return 1;
}
418
# Cleanly disable the watchdog: send the magic 'V' byte, then close the
# socket. On close failure the handle is kept (only logged), so state
# stays consistent with the mux still watching us.
sub watchdog_close {
    my ($self, $wfh) = @_;

    $watchdog_fh->syswrite("V", 1); # magic watchdog close

    if ($watchdog_fh->close()) {
	$watchdog_fh = undef;
	$self->log('info', "watchdog closed (disabled)");
    } else {
	$self->log('err', "watchdog close failed - $!");
    }
}
430
# Re-initialize per-process state after fork(): the inotify FD must not
# be shared with the parent, and the cfs view needs a refresh.
sub after_fork {
    my $self = shift;

    PVE::INotify::inotify_close();
    PVE::INotify::inotify_init();

    PVE::Cluster::cfs_update();
}
440
# Maximum number of parallel LRM workers from datacenter.cfg (default: 4).
sub get_max_workers {
    my $self = shift;

    my $dc_cfg = cfs_read_file('datacenter.cfg');

    return $dc_cfg->{max_workers} || 4;
}
448
# Return the cluster-wide enforced HA settings from datacenter.cfg, or an
# empty hash (with an error logged) when the config cannot be read.
sub get_ha_settings {
    my $self = shift;

    my $dc_cfg = eval { cfs_read_file('datacenter.cfg') };

    if (my $err = $@) {
	$self->log('err', "unable to get HA settings from datacenter.cfg - $err");
	return {};
    }

    return $dc_cfg->{ha};
}
461
462 1;