request_stop => 1,
started => 1,
fence => 1,
+ recovery => 1,
migrate => 1,
relocate => 1,
freeze => 1,
my $sd = $self->{ss}->{$sid};
my $state = $sd->{state};
if (defined($online_node_usage->{$sd->{node}})) {
- if (($state eq 'started') || ($state eq 'request_stop') ||
- ($state eq 'fence') || ($state eq 'freeze') || ($state eq 'error')) {
+ if (
+ $state eq 'started' || $state eq 'request_stop' || $state eq 'fence' ||
+ $state eq 'freeze' || $state eq 'error' || $state eq 'recovery'
+ ) {
$online_node_usage->{$sd->{node}}++;
} elsif (($state eq 'migrate') || ($state eq 'relocate')) {
# count it for both, source and target as load is put on both
}
};
-# after a node was fenced this recovers the service to a new node
-my $recover_fenced_service = sub {
- my ($self, $sid, $cd) = @_;
-
- my ($haenv, $ss) = ($self->{haenv}, $self->{ss});
-
- my $sd = $ss->{$sid};
-
- if ($sd->{state} ne 'fence') { # should not happen
- $haenv->log('err', "cannot recover service '$sid' from fencing, wrong state '$sd->{state}'");
- return;
- }
-
- my $fenced_node = $sd->{node}; # for logging purpose
-
- $self->recompute_online_node_usage(); # we want the most current node state
-
- my $recovery_node = select_service_node(
- $self->{groups},
- $self->{online_node_usage},
- $cd,
- $sd->{node},
- );
-
- if ($recovery_node) {
- $haenv->log('info', "recover service '$sid' from fenced node '$fenced_node' to node '$recovery_node'");
-
- &$fence_recovery_cleanup($self, $sid, $fenced_node);
-
- $haenv->steal_service($sid, $sd->{node}, $recovery_node);
- $self->{online_node_usage}->{$recovery_node}++;
-
- # $sd *is normally read-only*, fencing is the exception
- $cd->{node} = $sd->{node} = $recovery_node;
- my $new_state = ($cd->{state} eq 'started') ? 'started' : 'request_stop';
- &$change_service_state($self, $sid, $new_state, node => $recovery_node);
- } else {
- # no possible node found, cannot recover
- $haenv->log('err', "recovering service '$sid' from fenced node '$fenced_node' failed, no recovery node found");
- &$change_service_state($self, $sid, 'error');
- }
-};
-
# read LRM status for all nodes
sub read_lrm_status {
my ($self) = @_;
# do nothing here - wait until fenced
+ } elsif ($last_state eq 'recovery') {
+
+ $self->next_state_recovery($sid, $cd, $sd, $lrm_res);
+
} elsif ($last_state eq 'request_stop') {
$self->next_state_request_stop($sid, $cd, $sd, $lrm_res);
next if !$fenced_nodes->{$sd->{node}};
# node fence was successful - recover service
- &$recover_fenced_service($self, $sid, $sc->{$sid});
+ $change_service_state->($self, $sid, 'recovery');
+ $repeat = 1; # for faster execution
}
last if !$repeat;
}
+# After a node was fenced, this recovers the service to a new node.
+#
+# Invoked by the service state machine for a service whose last state is
+# 'recovery'. A recovery node is chosen with select_service_node(); when one is
+# found the service is cleaned up, stolen from the fenced node and moved to
+# 'started' or 'request_stop'. When none is found the service is moved to
+# 'error'.
+#
+# Parameters:
+#   $sid     - ID of the service to recover
+#   $cd      - presumably the service's configuration entry; its 'state' is
+#              read and its 'node' is updated on successful recovery
+#   $sd      - the service's entry from $self->{ss}; 'state' and 'node' are
+#              read, and 'node' is updated on successful recovery
+#   $lrm_res - accepted for parity with the other next_state_* handlers, not
+#              used here
+sub next_state_recovery {
+    my ($self, $sid, $cd, $sd, $lrm_res) = @_;
+
+    my $haenv = $self->{haenv};
+
+    if ($sd->{state} ne 'recovery') { # should not happen
+        $haenv->log('err', "cannot recover service '$sid' from fencing, wrong state '$sd->{state}'");
+        return;
+    }
+
+    my $fenced_node = $sd->{node}; # for logging purpose
+
+    $self->recompute_online_node_usage(); # we want the most current node state
+
+    my $recovery_node = select_service_node(
+        $self->{groups},
+        $self->{online_node_usage},
+        $cd,
+        $sd->{node},
+    );
+
+    if ($recovery_node) {
+        $haenv->log('info', "recover service '$sid' from fenced node '$fenced_node' to node '$recovery_node'");
+
+        $fence_recovery_cleanup->($self, $sid, $fenced_node);
+
+        $haenv->steal_service($sid, $sd->{node}, $recovery_node);
+        # count the recovered service towards the usage of its new node
+        $self->{online_node_usage}->{$recovery_node}++;
+
+        # NOTE: $sd *is normally read-only*, fencing is the exception
+        $cd->{node} = $sd->{node} = $recovery_node;
+        # only services configured as 'started' are started again on the new
+        # node, everything else goes through 'request_stop'
+        my $new_state = ($cd->{state} eq 'started') ? 'started' : 'request_stop';
+        $change_service_state->($self, $sid, $new_state, node => $recovery_node);
+    } else {
+        # no possible node found, cannot recover - the service is moved to
+        # 'error', recovery is not retried from here
+        $haenv->log('err', "recovering service '$sid' from fenced node '$fenced_node' failed, no recovery node found");
+        $change_service_state->($self, $sid, 'error');
+    }
+}
+
1;
info 240 node1/crm: fencing: acknowledged - got agent lock for node 'node3'
info 240 node1/crm: node 'node3': state changed from 'fence' => 'unknown'
emai 240 node1/crm: SUCCEED: fencing: acknowledged - got agent lock for node 'node3'
+info 240 node1/crm: service 'vm:103': state changed from 'fence' to 'recovery'
info 240 node1/crm: recover service 'vm:103' from fenced node 'node3' to node 'node2'
-info 240 node1/crm: service 'vm:103': state changed from 'fence' to 'started' (node = node2)
+info 240 node1/crm: service 'vm:103': state changed from 'recovery' to 'started' (node = node2)
info 243 node2/lrm: starting service vm:103
info 243 node2/lrm: service status vm:103 started
info 720 hardware: exit simulation - done
info 22 node3/crm: fencing: acknowledged - got agent lock for node 'node1'
info 22 node3/crm: node 'node1': state changed from 'fence' => 'unknown'
emai 22 node3/crm: SUCCEED: fencing: acknowledged - got agent lock for node 'node1'
+info 22 node3/crm: service 'vm:101': state changed from 'fence' to 'recovery'
info 22 node3/crm: recover service 'vm:101' from fenced node 'node1' to node 'node3'
-info 22 node3/crm: service 'vm:101': state changed from 'fence' to 'started' (node = node3)
+info 22 node3/crm: service 'vm:101': state changed from 'recovery' to 'started' (node = node3)
info 23 node3/lrm: got lock 'ha_agent_node3_lock'
info 23 node3/lrm: status change wait_for_agent_lock => active
info 23 node3/lrm: starting service vm:101
info 282 node3/crm: fencing: acknowledged - got agent lock for node 'node1'
info 282 node3/crm: node 'node1': state changed from 'fence' => 'unknown'
emai 282 node3/crm: SUCCEED: fencing: acknowledged - got agent lock for node 'node1'
+info 282 node3/crm: service 'vm:101': state changed from 'fence' to 'recovery'
info 282 node3/crm: recover service 'vm:101' from fenced node 'node1' to node 'node2'
-info 282 node3/crm: service 'vm:101': state changed from 'fence' to 'started' (node = node2)
+info 282 node3/crm: service 'vm:101': state changed from 'recovery' to 'started' (node = node2)
info 301 node2/lrm: starting service vm:101
info 301 node2/lrm: service status vm:101 started
info 500 cmdlist: execute power node1 on
info 340 node1/crm: fencing: acknowledged - got agent lock for node 'node3'
info 340 node1/crm: node 'node3': state changed from 'fence' => 'unknown'
emai 340 node1/crm: SUCCEED: fencing: acknowledged - got agent lock for node 'node3'
+info 340 node1/crm: service 'vm:103': state changed from 'fence' to 'recovery'
info 340 node1/crm: recover service 'vm:103' from fenced node 'node3' to node 'node1'
warn 340 node1/crm: removed leftover lock 'backup' from recovered service 'vm:103' to allow its start.
-info 340 node1/crm: service 'vm:103': state changed from 'fence' to 'started' (node = node1)
+info 340 node1/crm: service 'vm:103': state changed from 'recovery' to 'started' (node = node1)
info 341 node1/lrm: got lock 'ha_agent_node1_lock'
info 341 node1/lrm: status change wait_for_agent_lock => active
info 341 node1/lrm: starting service vm:103
info 340 node1/crm: fencing: acknowledged - got agent lock for node 'node3'
info 340 node1/crm: node 'node3': state changed from 'fence' => 'unknown'
emai 340 node1/crm: SUCCEED: fencing: acknowledged - got agent lock for node 'node3'
+info 340 node1/crm: service 'vm:103': state changed from 'fence' to 'recovery'
info 340 node1/crm: recover service 'vm:103' from fenced node 'node3' to node 'node1'
-info 340 node1/crm: service 'vm:103': state changed from 'fence' to 'started' (node = node1)
+info 340 node1/crm: service 'vm:103': state changed from 'recovery' to 'started' (node = node1)
info 341 node1/lrm: got lock 'ha_agent_node1_lock'
info 341 node1/lrm: status change wait_for_agent_lock => active
info 341 node1/lrm: starting service vm:103
info 240 node1/crm: fencing: acknowledged - got agent lock for node 'node2'
info 240 node1/crm: node 'node2': state changed from 'fence' => 'unknown'
emai 240 node1/crm: SUCCEED: fencing: acknowledged - got agent lock for node 'node2'
+info 240 node1/crm: service 'vm:102': state changed from 'fence' to 'recovery'
err 240 node1/crm: recovering service 'vm:102' from fenced node 'node2' failed, no recovery node found
-info 240 node1/crm: service 'vm:102': state changed from 'fence' to 'error'
+info 240 node1/crm: service 'vm:102': state changed from 'recovery' to 'error'
info 720 hardware: exit simulation - done
info 340 node1/crm: fencing: acknowledged - got agent lock for node 'node3'
info 340 node1/crm: node 'node3': state changed from 'fence' => 'unknown'
emai 340 node1/crm: SUCCEED: fencing: acknowledged - got agent lock for node 'node3'
+info 340 node1/crm: service 'vm:103': state changed from 'fence' to 'recovery'
info 340 node1/crm: recover service 'vm:103' from fenced node 'node3' to node 'node1'
-info 340 node1/crm: service 'vm:103': state changed from 'fence' to 'started' (node = node1)
+info 340 node1/crm: service 'vm:103': state changed from 'recovery' to 'started' (node = node1)
info 341 node1/lrm: got lock 'ha_agent_node1_lock'
info 341 node1/lrm: status change wait_for_agent_lock => active
info 341 node1/lrm: starting service vm:103
info 340 node1/crm: fencing: acknowledged - got agent lock for node 'node3'
info 340 node1/crm: node 'node3': state changed from 'fence' => 'unknown'
emai 340 node1/crm: SUCCEED: fencing: acknowledged - got agent lock for node 'node3'
+info 340 node1/crm: service 'fa:1501': state changed from 'fence' to 'recovery'
info 340 node1/crm: recover service 'fa:1501' from fenced node 'node3' to node 'node1'
-info 340 node1/crm: service 'fa:1501': state changed from 'fence' to 'request_stop' (node = node1)
+info 340 node1/crm: service 'fa:1501': state changed from 'recovery' to 'request_stop' (node = node1)
info 341 node1/lrm: got lock 'ha_agent_node1_lock'
info 341 node1/lrm: status change wait_for_agent_lock => active
info 360 node1/crm: service 'fa:1501': state changed from 'request_stop' to 'stopped'
info 220 node1/crm: fencing: acknowledged - got agent lock for node 'node3'
info 220 node1/crm: node 'node3': state changed from 'fence' => 'unknown'
emai 220 node1/crm: SUCCEED: fencing: acknowledged - got agent lock for node 'node3'
+info 220 node1/crm: service 'ct:105': state changed from 'fence' to 'recovery'
info 220 node1/crm: recover service 'ct:105' from fenced node 'node3' to node 'node2'
-info 220 node1/crm: service 'ct:105': state changed from 'fence' to 'started' (node = node2)
+info 220 node1/crm: service 'ct:105': state changed from 'recovery' to 'started' (node = node2)
info 223 node2/lrm: got lock 'ha_agent_node2_lock'
info 223 node2/lrm: status change wait_for_agent_lock => active
info 223 node2/lrm: starting service ct:105
info 200 node1/crm: fencing: acknowledged - got agent lock for node 'node3'
info 200 node1/crm: node 'node3': state changed from 'fence' => 'unknown'
emai 200 node1/crm: SUCCEED: fencing: acknowledged - got agent lock for node 'node3'
+info 200 node1/crm: service 'vm:103': state changed from 'fence' to 'recovery'
info 200 node1/crm: recover service 'vm:103' from fenced node 'node3' to node 'node1'
-info 200 node1/crm: service 'vm:103': state changed from 'fence' to 'started' (node = node1)
+info 200 node1/crm: service 'vm:103': state changed from 'recovery' to 'started' (node = node1)
info 201 node1/lrm: got lock 'ha_agent_node1_lock'
info 201 node1/lrm: status change wait_for_agent_lock => active
info 201 node1/lrm: starting service vm:103
info 200 node1/crm: fencing: acknowledged - got agent lock for node 'node3'
info 200 node1/crm: node 'node3': state changed from 'fence' => 'unknown'
emai 200 node1/crm: SUCCEED: fencing: acknowledged - got agent lock for node 'node3'
+info 200 node1/crm: service 'vm:103': state changed from 'fence' to 'recovery'
info 200 node1/crm: recover service 'vm:103' from fenced node 'node3' to node 'node1'
-info 200 node1/crm: service 'vm:103': state changed from 'fence' to 'started' (node = node1)
+info 200 node1/crm: service 'vm:103': state changed from 'recovery' to 'started' (node = node1)
info 201 node1/lrm: got lock 'ha_agent_node1_lock'
info 201 node1/lrm: status change wait_for_agent_lock => active
info 201 node1/lrm: starting service vm:103
info 200 node1/crm: fencing: acknowledged - got agent lock for node 'node3'
info 200 node1/crm: node 'node3': state changed from 'fence' => 'unknown'
emai 200 node1/crm: SUCCEED: fencing: acknowledged - got agent lock for node 'node3'
+info 200 node1/crm: service 'ct:103': state changed from 'fence' to 'recovery'
info 200 node1/crm: recover service 'ct:103' from fenced node 'node3' to node 'node1'
-info 200 node1/crm: service 'ct:103': state changed from 'fence' to 'started' (node = node1)
+info 200 node1/crm: service 'ct:103': state changed from 'recovery' to 'started' (node = node1)
info 201 node1/lrm: got lock 'ha_agent_node1_lock'
info 201 node1/lrm: status change wait_for_agent_lock => active
info 201 node1/lrm: starting service ct:103
info 220 node2/crm: fencing: acknowledged - got agent lock for node 'node1'
info 220 node2/crm: node 'node1': state changed from 'fence' => 'unknown'
emai 220 node2/crm: SUCCEED: fencing: acknowledged - got agent lock for node 'node1'
+info 220 node2/crm: service 'vm:100': state changed from 'fence' to 'recovery'
info 220 node2/crm: recover service 'vm:100' from fenced node 'node1' to node 'node2'
-info 220 node2/crm: service 'vm:100': state changed from 'fence' to 'started' (node = node2)
+info 220 node2/crm: service 'vm:100': state changed from 'recovery' to 'started' (node = node2)
info 221 node2/lrm: got lock 'ha_agent_node2_lock'
info 221 node2/lrm: status change wait_for_agent_lock => active
info 221 node2/lrm: starting service vm:100