git.proxmox.com Git - pve-ha-manager.git/blob - src/PVE/HA/Env/PVE2.pm (blob 8baf2d01ef6f899996ea17bca8197d7544919193)
package PVE::HA::Env::PVE2;

use strict;
use warnings;
use POSIX qw(:errno_h :fcntl_h);
use IO::File;
use IO::Socket::UNIX;

use PVE::SafeSyslog;
use PVE::Tools;
use PVE::Cluster qw(cfs_register_file cfs_read_file cfs_write_file cfs_lock_file);
use PVE::INotify;
use PVE::RPCEnvironment;

use PVE::HA::Tools ':exit_codes';
use PVE::HA::Env;
use PVE::HA::Config;
use PVE::HA::FenceConfig;
use PVE::HA::Resources;
use PVE::HA::Resources::PVEVM;
use PVE::HA::Resources::PVECT;

PVE::HA::Resources::PVEVM->register();
PVE::HA::Resources::PVECT->register();

PVE::HA::Resources->init();

my $lockdir = "/etc/pve/priv/lock";

sub new {
    my ($this, $nodename) = @_;

    die "missing nodename" if !$nodename;

    my $class = ref($this) || $this;

    my $self = bless {}, $class;

    $self->{nodename} = $nodename;

    return $self;
}
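
# Usage sketch (assumption, not taken from this file): the CRM/LRM daemons are
# expected to construct this environment with the local node name, e.g.:
#
#   my $haenv = PVE::HA::Env::PVE2->new(PVE::INotify::nodename());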

sub nodename {
    my ($self) = @_;

    return $self->{nodename};
}

sub hardware {
    my ($self) = @_;

    die "hardware is for testing and simulation only";
}

sub read_manager_status {
    my ($self) = @_;

    return PVE::HA::Config::read_manager_status();
}

sub write_manager_status {
    my ($self, $status_obj) = @_;

    PVE::HA::Config::write_manager_status($status_obj);
}

sub read_lrm_status {
    my ($self, $node) = @_;

    $node = $self->{nodename} if !defined($node);

    return PVE::HA::Config::read_lrm_status($node);
}

sub write_lrm_status {
    my ($self, $status_obj) = @_;

    my $node = $self->{nodename};

    PVE::HA::Config::write_lrm_status($node, $status_obj);
}

sub is_node_shutdown {
    my ($self) = @_;

    my $shutdown = 0;
    my $reboot = 0;

    my $code = sub {
        my $line = shift;

        # ensure we match the full unit name by matching /^JOB_ID UNIT /
        # see: man systemd.special
        $shutdown = 1 if ($line =~ m/^\d+\s+shutdown\.target\s+/);
        $reboot = 1 if ($line =~ m/^\d+\s+reboot\.target\s+/);
    };

    my $cmd = ['/bin/systemctl', '--full', 'list-jobs'];
    eval { PVE::Tools::run_command($cmd, outfunc => $code, noerr => 1); };

    return ($shutdown, $reboot);
}
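
# Illustration (assumption, not taken from this file): the output lines of
# '/bin/systemctl --full list-jobs' that the regexes above are meant to match
# look roughly like
#
#   1234 shutdown.target start waiting
#   1235 reboot.target   start waiting
#
# i.e. JOB_ID followed by the full unit name, which is why the match is
# anchored as /^\d+\s+<unit>\s+/ instead of just searching for the unit name.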

sub queue_crm_commands {
    my ($self, $cmd) = @_;

    return PVE::HA::Config::queue_crm_commands($cmd);
}

sub read_crm_commands {
    my ($self) = @_;

    return PVE::HA::Config::read_crm_commands();
}

sub read_service_config {
    my ($self) = @_;

    return PVE::HA::Config::read_and_check_resources_config();
}

sub read_fence_config {
    my ($self) = @_;

    return PVE::HA::Config::read_fence_config();
}

sub fencing_mode {
    my ($self) = @_;

    my $datacenterconfig = cfs_read_file('datacenter.cfg');

    return 'watchdog' if !$datacenterconfig->{fencing};

    return $datacenterconfig->{fencing};
}

sub exec_fence_agent {
    my ($self, $agent, $node, @param) = @_;

    # setup execution environment
    $ENV{'PATH'} = '/sbin:/bin:/usr/sbin:/usr/bin';

    my $cmd = "$agent " . PVE::HA::FenceConfig::gen_arg_str(@param);

    exec($cmd);
    exit -1;
}

# this may only be used by the master to recover a _fenced_ service
sub steal_service {
    my ($self, $sid, $current_node, $new_node) = @_;

    my (undef, $type, $name) = PVE::HA::Tools::parse_sid($sid);

    if (my $plugin = PVE::HA::Resources->lookup($type)) {
        my $old = $plugin->config_file($name, $current_node);
        my $new = $plugin->config_file($name, $new_node);
        rename($old, $new) ||
            die "rename '$old' to '$new' failed - $!\n";
    } else {
        die "implement me";
    }
}
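
# Illustration (assumption, not taken from this file): for the built-in
# plugins the rename above effectively moves the guest config between the
# per-node directories on pmxcfs, roughly:
#
#   /etc/pve/nodes/$current_node/qemu-server/$name.conf
#     -> /etc/pve/nodes/$new_node/qemu-server/$name.conf   (PVEVM)
#   /etc/pve/nodes/$current_node/lxc/$name.conf
#     -> /etc/pve/nodes/$new_node/lxc/$name.conf           (PVECT)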

sub read_group_config {
    my ($self) = @_;

    return PVE::HA::Config::read_group_config();
}

# this should return a hash containing info about
# which nodes are members and online.
sub get_node_info {
    my ($self) = @_;

    my ($node_info, $quorate) = ({}, 0);

    my $nodename = $self->{nodename};

    $quorate = PVE::Cluster::check_cfs_quorum(1) || 0;

    my $members = PVE::Cluster::get_members();

    foreach my $node (keys %$members) {
        my $d = $members->{$node};
        $node_info->{$node}->{online} = $d->{online};
    }

    $node_info->{$nodename}->{online} = 1; # local node is always up

    return ($node_info, $quorate);
}
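
# Example of the returned structure (illustration only):
#
#   ({ node1 => { online => 1 }, node2 => { online => 0 } }, 1)
#
# i.e. a hash of member nodes with their online state, plus the quorum flag.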

sub log {
    my ($self, $level, $msg) = @_;

    chomp $msg;

    syslog($level, $msg);
}

sub sendmail {
    my ($self, $subject, $text) = @_;

    # Leave it to postfix to append the correct hostname
    my $mailfrom = 'root';
    # /root/.forward makes pvemailforward redirect the
    # mail to the address configured in the datacenter
    my $mailto = 'root';

    PVE::Tools::sendmail($mailto, $subject, $text, undef, $mailfrom);
}

my $last_lock_status_hash = {};

sub get_pve_lock {
    my ($self, $lockid) = @_;

    my $got_lock = 0;

    my $filename = "$lockdir/$lockid";

    $last_lock_status_hash->{$lockid} //= { lock_time => 0, got_lock => 0 };
    my $last = $last_lock_status_hash->{$lockid};

    my $ctime = time();
    my $last_lock_time = $last->{lock_time} // 0;
    my $last_got_lock = $last->{got_lock};

    my $retry_timeout = 120; # hardcoded lock lifetime limit from pmxcfs

    eval {

        mkdir $lockdir;

        # pve cluster filesystem not online
        die "can't create '$lockdir' (pmxcfs not mounted?)\n" if ! -d $lockdir;

        if (($ctime - $last_lock_time) < $retry_timeout) {
            # try cfs lock update request (utime)
            if (utime(0, $ctime, $filename)) {
                $got_lock = 1;
                return;
            }
            die "cfs lock update failed - $!\n";
        }

        if (mkdir $filename) {
            $got_lock = 1;
            return;
        }

        utime 0, 0, $filename; # cfs unlock request
        die "can't get cfs lock\n";
    };

    my $err = $@;

    #$self->log('err', $err) if $err; # for debugging

    $last->{got_lock} = $got_lock;
    $last->{lock_time} = $ctime if $got_lock;

    if (!!$got_lock != !!$last_got_lock) {
        if ($got_lock) {
            $self->log('info', "successfully acquired lock '$lockid'");
        } else {
            my $msg = "lost lock '$lockid'";
            $msg .= " - $err" if $err;
            $self->log('err', $msg);
        }
    }

    return $got_lock;
}
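
# Sketch of the pmxcfs lock handling used above (assumption derived from the
# code, not an authoritative description): the lock is a directory below
# /etc/pve/priv/lock; mkdir acquires it, utime() with the current time renews
# the lease (pmxcfs expires it after ~120 seconds), utime(0, 0, ...) asks
# pmxcfs to drop the lease, and rmdir releases the lock for good:
#
#   my $lock = "$lockdir/ha_manager_lock";
#   mkdir $lock or die "acquire failed - $!\n";              # initial acquisition
#   utime(0, time(), $lock) or die "renew failed - $!\n";    # lease renewal
#   utime(0, 0, $lock);                                      # cfs unlock request
#   rmdir $lock;                                             # final release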

sub get_ha_manager_lock {
    my ($self) = @_;

    return $self->get_pve_lock("ha_manager_lock");
}

# release the cluster wide manager lock.
# when released, another CRM may step up and get the lock, thus this should
# only get called when shutting down/deactivating the current master
sub release_ha_manager_lock {
    my ($self) = @_;

    return rmdir("$lockdir/ha_manager_lock");
}

sub get_ha_agent_lock {
    my ($self, $node) = @_;

    $node = $self->nodename() if !defined($node);

    return $self->get_pve_lock("ha_agent_${node}_lock");
}

# release the respective node agent lock.
# this should only get called if the node's LRM gracefully shuts down with
# all services already cleanly stopped!
sub release_ha_agent_lock {
    my ($self) = @_;

    my $node = $self->nodename();

    return rmdir("$lockdir/ha_agent_${node}_lock");
}

sub quorate {
    my ($self) = @_;

    my $quorate = 0;
    eval {
        $quorate = PVE::Cluster::check_cfs_quorum();
    };

    return $quorate;
}

sub get_time {
    my ($self) = @_;

    return time();
}

sub sleep {
    my ($self, $delay) = @_;

    CORE::sleep($delay);
}

sub sleep_until {
    my ($self, $end_time) = @_;

    for (;;) {
        my $cur_time = time();

        last if $cur_time >= $end_time;

        $self->sleep(1);
    }
}

sub loop_start_hook {
    my ($self) = @_;

    PVE::Cluster::cfs_update();

    $self->{loop_start} = $self->get_time();
}

sub loop_end_hook {
    my ($self) = @_;

    my $delay = $self->get_time() - $self->{loop_start};

    warn "loop took too long ($delay seconds)\n" if $delay > 30;
}

my $watchdog_fh;

sub watchdog_open {
    my ($self) = @_;

    die "watchdog already open\n" if defined($watchdog_fh);

    $watchdog_fh = IO::Socket::UNIX->new(
        Type => SOCK_STREAM(),
        Peer => "/run/watchdog-mux.sock") ||
        die "unable to open watchdog socket - $!\n";

    $self->log('info', "watchdog active");
}

sub watchdog_update {
    my ($self, $wfh) = @_;

    my $res = $watchdog_fh->syswrite("\0", 1);
    if (!defined($res)) {
        $self->log('err', "watchdog update failed - $!\n");
        return 0;
    }
    if ($res != 1) {
        $self->log('err', "watchdog update failed - wrote $res bytes\n");
        return 0;
    }

    return 1;
}

sub watchdog_close {
    my ($self, $wfh) = @_;

    $watchdog_fh->syswrite("V", 1); # magic watchdog close
    if (!$watchdog_fh->close()) {
        $self->log('err', "watchdog close failed - $!");
    } else {
        $watchdog_fh = undef;
        $self->log('info', "watchdog closed (disabled)");
    }
}
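
# Sketch of the watchdog-mux protocol as used above (assumption derived from
# the code): a client connects to /run/watchdog-mux.sock, writes a single
# "\0" byte at regular intervals to keep the watchdog armed, and writes the
# magic "V" byte right before closing the socket to disarm it cleanly:
#
#   my $wd = IO::Socket::UNIX->new(
#       Type => SOCK_STREAM(), Peer => "/run/watchdog-mux.sock");
#   $wd->syswrite("\0", 1);   # keep-alive
#   $wd->syswrite("V", 1);    # disarm before close
#   $wd->close();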

sub after_fork {
    my ($self) = @_;

    # close inherited inotify FD from parent and reopen our own
    PVE::INotify::inotify_close();
    PVE::INotify::inotify_init();

    PVE::Cluster::cfs_update();
}

sub get_max_workers {
    my ($self) = @_;

    my $datacenterconfig = cfs_read_file('datacenter.cfg');

    return $datacenterconfig->{max_workers} || 4;
}

1;