use strict;
use warnings;
-use Data::Dumper;
use POSIX qw(:sys_wait_h);
use PVE::SafeSyslog;
shutdown_errors => 0,
# mode can be: active, reboot, shutdown, restart
mode => 'active',
+ cluster_state_update => 0,
}, $class;
$self->set_local_status({ state => 'wait_for_agent_lock' });
my $nodename = $haenv->nodename();
- my $shutdown = $haenv->is_node_shutdown();
+ my ($shutdown, $reboot) = $haenv->is_node_shutdown();
- if ($shutdown) {
- $haenv->log('info', "shutdown LRM, stop all services");
- $self->{mode} = 'shutdown';
+ my $dc_ha_cfg = $haenv->get_ha_settings();
+ my $shutdown_policy = $dc_ha_cfg->{shutdown_policy} // 'conditional';
+
+ if ($shutdown) { # don't log this on service restart, only on node shutdown
+ $haenv->log('info', "got shutdown request with shutdown policy '$shutdown_policy'");
+ }
- # queue stop jobs for all services
+ my $freeze_all;
+ if ($shutdown_policy eq 'conditional') {
+ $freeze_all = $reboot;
+ } elsif ($shutdown_policy eq 'freeze') {
+ $freeze_all = 1;
+ } elsif ($shutdown_policy eq 'failover') {
+ $freeze_all = 0;
+ } else {
+ $haenv->log('err', "unknown shutdown policy '$shutdown_policy', fall back to conditional");
+ $freeze_all = $reboot;
+ }
+ if ($shutdown) {
+ # *always* queue stop jobs for all services if the node shuts down,
+ # regardless of whether it's a reboot or a poweroff, else we may corrupt
+ # services or hinder node shutdown
my $ss = $self->{service_status};
foreach my $sid (keys %$ss) {
# Note: use undef uid to mark shutdown/stop jobs
$self->queue_resource_command($sid, undef, 'request_stop');
}
+ }
+ if ($shutdown) {
+ if ($freeze_all) {
+ if ($reboot) {
+ $haenv->log('info', "reboot LRM, stop and freeze all services");
+ } else {
+ $haenv->log('info', "shutdown LRM, stop and freeze all services");
+ }
+ $self->{mode} = 'restart';
+ } else {
+ $haenv->log('info', "shutdown LRM, stop all services");
+ $self->{mode} = 'shutdown';
+ }
} else {
$haenv->log('info', "restart LRM, freeze all services");
$self->{mode} = 'restart';
return 1;
}
+sub update_service_status { # refresh the cached service status from the manager status
+ my ($self) = @_;
+ # returns 1 on success, undef (after logging an error) if reading the manager status died
+ my $haenv = $self->{haenv};
+ # the read is wrapped in eval so an exception becomes a logged error instead of propagating
+ my $ms = eval { $haenv->read_manager_status(); };
+ if (my $err = $@) {
+ $haenv->log('err', "updating service status from manager failed: $err");
+ return undef; # $self->{service_status} is left untouched in this case
+ } else {
+ $self->{service_status} = $ms->{service_status} || {}; # fall back to an empty hash if the entry is missing or false
+ return 1;
+ }
+}
+
sub get_protected_ha_agent_lock {
my ($self) = @_;
next if !defined($req_state);
next if $req_state eq 'stopped';
next if $req_state eq 'freeze';
+ # erroneous services are not managed by HA, don't count them as active
+ next if $req_state eq 'error';
$count++;
}
my $haenv = $self->{haenv};
+ $haenv->loop_start_hook();
+
+ $self->{cluster_state_update} = $haenv->cluster_state_update();
+
+ my $res = $self->work();
+
+ $haenv->loop_end_hook();
+
+ return $res;
+}
+
+sub work {
+ my ($self) = @_;
+
+ my $haenv = $self->{haenv};
+
if (!$wrote_lrm_status_at_startup) {
if ($self->update_lrm_status()) {
$wrote_lrm_status_at_startup = 1;
my $status = $self->get_local_status();
my $state = $status->{state};
- my $ms = $haenv->read_manager_status();
- $self->{service_status} = $ms->{service_status} || {};
+ $self->update_service_status();
my $fence_request = PVE::HA::Tools::count_fenced_services($self->{service_status}, $haenv->nodename());
eval {
# fixme: set alert timer
+ # if we could not get the current service status there's no point
+ # in doing anything, try again next round.
+ return if !$self->update_service_status();
+
if ($self->{shutdown_request}) {
if ($self->{mode} eq 'restart') {
}
}
} else {
+ if (!$self->{cluster_state_update}) {
+ # update failed but we could still renew our lock (cfs restart?),
+ # safely skip manage and expect to update just fine next round
+ $haenv->log('notice', "temporary inconsistent cluster state " .
+ "(cfs restart?), skip round");
+ return;
+ }
$self->manage_resources();
while (($haenv->get_time() - $starttime) < 5) {
my $count = $self->check_active_workers();
- foreach my $sid (keys %{$self->{workers}}) {
+ foreach my $sid (sort keys %{$self->{workers}}) {
last if $count >= $max_workers && $max_workers > 0;
my $w = $self->{workers}->{$sid};
my $ss = $self->{service_status};
+ foreach my $sid (keys %{$self->{restart_tries}}) {
+ delete $self->{restart_tries}->{$sid} if !$ss->{$sid};
+ }
+
foreach my $sid (keys %$ss) {
my $sd = $ss->{$sid};
next if !$sd->{node};
my $nodename = $haenv->nodename();
- my (undef, $service_type, $service_name) = PVE::HA::Tools::parse_sid($sid);
+ my (undef, $service_type, $service_name) = $haenv->parse_sid($sid);
my $plugin = PVE::HA::Resources->lookup($service_type);
if (!$plugin) {