return $self->{plug}->read_service_config();
}
-sub change_service_location {
+# this is normally only allowed by the master to recover a _fenced_ service
+sub steal_service {
my ($self, $sid, $current_node, $new_node) = @_;
- return $self->{plug}->change_service_location($sid, $current_node, $new_node);
+ return $self->{plug}->steal_service($sid, $current_node, $new_node);
}
sub read_group_config {
return $conf;
}
-sub change_service_location {
+# this is only allowed by the master to recover a _fenced_ service
+sub steal_service {
my ($self, $sid, $current_node, $new_node) = @_;
my (undef, $type, $name) = PVE::HA::Tools::parse_sid($sid);
$haenv->log('info', "service '$sid': state changed from '${old_state}' to '${new_state}' $text_state");
};
+# after a node was fenced, this recovers service $sid onto a newly selected node
+my $recover_fenced_service = sub {
+ my ($self, $sid, $cd) = @_; # $cd: presumably the service's config entry - confirm with caller
+
+ my ($haenv, $ss) = ($self->{haenv}, $self->{ss});
+
+ my $sd = $ss->{$sid}; # manager-side record of this service (state, node)
+
+ if ($sd->{state} ne 'fence') { # should not happen
+ $haenv->log('err', "cannot recover service '$sid' from fencing," .
+ " wrong state '$sd->{state}'");
+ return; # nothing is modified in this case
+ }
+
+ my $fenced_node = $sd->{node}; # kept for logging, $sd->{node} gets overwritten below
+
+ $self->recompute_online_node_usage(); # we want the most current node state
+
+ my $recovery_node = select_service_node($self->{groups},
+ $self->{online_node_usage},
+ $cd, $sd->{node});
+
+ if ($recovery_node) {
+ $haenv->log('info', "recover service '$sid' from fenced node " .
+ "'$fenced_node' to node '$recovery_node'");
+
+ $haenv->steal_service($sid, $sd->{node}, $recovery_node); # take the service over from the fenced node
+
+ # $sd *is normally read-only*, fencing is the exception
+ $cd->{node} = $sd->{node} = $recovery_node;
+ &$change_service_state($self, $sid, 'started', node => $recovery_node);
+ } else {
+ # no node found, leave the service in 'fence' state and try again
+ $haenv->log('err', "recovering service '$sid' from fenced node " .
+ "'$fenced_node' failed, no recovery node found");
+ }
+};
+
# read LRM status for all nodes
sub read_lrm_status {
my ($self) = @_;
next if !$fenced_nodes->{$sd->{node}};
- # node fence was successful - mark service as stopped
- &$change_service_state($self, $sid, 'stopped');
+ # node fence was successful - recover service
+ &$recover_fenced_service($self, $sid, $sc->{$sid});
}
last if !$repeat;
} elsif ($sd->{node} eq $target) {
$haenv->log('info', "ignore service '$sid' $cmd request - service already on node '$target'");
} else {
- eval {
- $haenv->change_service_location($sid, $sd->{node}, $target);
- $cd->{node} = $sd->{node} = $target; # fixme: $sd is read-only??!!
- $haenv->log('info', "$cmd service '$sid' to node '$target' (stopped)");
- };
- if (my $err = $@) {
- $haenv->log('err', "$cmd service '$sid' to node '$target' failed - $err");
- }
+ &$change_service_state($self, $sid, $cmd, node => $target);
+ return;
}
} else {
$haenv->log('err', "unknown command '$cmd' for service '$sid'");
}
if ($cd->{state} eq 'enabled') {
- if (my $node = select_service_node($self->{groups}, $self->{online_node_usage}, $cd, $sd->{node})) {
- if ($node && ($sd->{node} ne $node)) {
- eval {
- $haenv->change_service_location($sid, $sd->{node}, $node);
- $cd->{node} = $sd->{node} = $node; # fixme: $sd is read-only??!!
- };
- if (my $err = $@) {
- $haenv->log('err', "move service '$sid' to node '$node' failed - $err");
- } else {
- &$change_service_state($self, $sid, 'started', node => $node);
- }
- } else {
- &$change_service_state($self, $sid, 'started', node => $node);
- }
- } else {
- # fixme: warn
- }
-
+ # simply mark it started; if it's on the wrong node,
+ # next_state_started will fix that for us
+ &$change_service_state($self, $sid, 'started', node => $sd->{node});
return;
}
return $self->{hardware}->read_group_config();
}
-sub change_service_location {
+# this is normally only allowed by the master to recover a _fenced_ service
+sub steal_service {
my ($self, $sid, $current_node, $new_node) = @_;
return $self->{hardware}->change_service_location($sid, $current_node, $new_node);
$haenv->sleep(2); # (live) migration time
}
- $haenv->change_service_location($sid, $nodename, $target);
+ $hardware->change_service_location($sid, $nodename, $target);
$haenv->log("info", "service $sid - end $cmd to node '$target'");
# ensure that the old node doesn't have the service anymore
delete $ss->{$sid};
info 240 node1/crm: got lock 'ha_agent_node3_lock'
info 240 node1/crm: fencing: acknowleged - got agent lock for node 'node3'
info 240 node1/crm: node 'node3': state changed from 'fence' => 'unknown'
-info 240 node1/crm: service 'vm:103': state changed from 'fence' to 'stopped'
-info 260 node1/crm: service 'vm:103': state changed from 'stopped' to 'started' (node = node2)
-info 263 node2/lrm: starting service vm:103
-info 263 node2/lrm: service status vm:103 started
+info 240 node1/crm: recover service 'vm:103' from fenced node 'node3' to node 'node2'
+info 240 node1/crm: service 'vm:103': state changed from 'fence' to 'started' (node = node2)
+info 243 node2/lrm: starting service vm:103
+info 243 node2/lrm: service status vm:103 started
info 720 hardware: exit simulation - done
info 22 node3/crm: got lock 'ha_agent_node1_lock'
info 22 node3/crm: fencing: acknowleged - got agent lock for node 'node1'
info 22 node3/crm: node 'node1': state changed from 'fence' => 'unknown'
-info 22 node3/crm: service 'vm:101': state changed from 'fence' to 'stopped'
+info 22 node3/crm: recover service 'vm:101' from fenced node 'node1' to node 'node3'
+info 22 node3/crm: service 'vm:101': state changed from 'fence' to 'started' (node = node3)
+info 23 node3/lrm: got lock 'ha_agent_node3_lock'
+info 23 node3/lrm: status change wait_for_agent_lock => active
+info 23 node3/lrm: starting service vm:101
+info 23 node3/lrm: service status vm:101 started
info 40 node1/crm: status change wait_for_quorum => slave
info 42 node3/crm: node 'node1': state changed from 'unknown' => 'online'
-info 42 node3/crm: service 'vm:101': state changed from 'stopped' to 'started' (node = node1)
-info 161 node1/lrm: got lock 'ha_agent_node1_lock'
-info 161 node1/lrm: status change wait_for_agent_lock => active
-info 161 node1/lrm: starting service vm:101
-info 161 node1/lrm: service status vm:101 started
info 620 hardware: exit simulation - done
info 282 node3/crm: got lock 'ha_agent_node1_lock'
info 282 node3/crm: fencing: acknowleged - got agent lock for node 'node1'
info 282 node3/crm: node 'node1': state changed from 'fence' => 'unknown'
-info 282 node3/crm: service 'vm:101': state changed from 'fence' to 'stopped'
-info 282 node3/crm: service 'vm:101': state changed from 'stopped' to 'started' (node = node2)
+info 282 node3/crm: recover service 'vm:101' from fenced node 'node1' to node 'node2'
+info 282 node3/crm: service 'vm:101': state changed from 'fence' to 'started' (node = node2)
info 301 node2/lrm: starting service vm:101
info 301 node2/lrm: service status vm:101 started
info 500 cmdlist: execute power node1 on
info 200 node1/crm: got lock 'ha_agent_node3_lock'
info 200 node1/crm: fencing: acknowleged - got agent lock for node 'node3'
info 200 node1/crm: node 'node3': state changed from 'fence' => 'unknown'
-info 200 node1/crm: service 'vm:103': state changed from 'fence' to 'stopped'
-info 200 node1/crm: service 'vm:103': state changed from 'stopped' to 'started' (node = node1)
+info 200 node1/crm: recover service 'vm:103' from fenced node 'node3' to node 'node1'
+info 200 node1/crm: service 'vm:103': state changed from 'fence' to 'started' (node = node1)
info 201 node1/lrm: got lock 'ha_agent_node1_lock'
info 201 node1/lrm: status change wait_for_agent_lock => active
info 201 node1/lrm: starting service vm:103
info 200 node1/crm: got lock 'ha_agent_node3_lock'
info 200 node1/crm: fencing: acknowleged - got agent lock for node 'node3'
info 200 node1/crm: node 'node3': state changed from 'fence' => 'unknown'
-info 200 node1/crm: service 'vm:103': state changed from 'fence' to 'stopped'
-info 200 node1/crm: service 'vm:103': state changed from 'stopped' to 'started' (node = node1)
+info 200 node1/crm: recover service 'vm:103' from fenced node 'node3' to node 'node1'
+info 200 node1/crm: service 'vm:103': state changed from 'fence' to 'started' (node = node1)
info 201 node1/lrm: got lock 'ha_agent_node1_lock'
info 201 node1/lrm: status change wait_for_agent_lock => active
info 201 node1/lrm: starting service vm:103
info 200 node1/crm: got lock 'ha_agent_node3_lock'
info 200 node1/crm: fencing: acknowleged - got agent lock for node 'node3'
info 200 node1/crm: node 'node3': state changed from 'fence' => 'unknown'
-info 200 node1/crm: service 'ct:103': state changed from 'fence' to 'stopped'
-info 200 node1/crm: service 'ct:103': state changed from 'stopped' to 'started' (node = node1)
+info 200 node1/crm: recover service 'ct:103' from fenced node 'node3' to node 'node1'
+info 200 node1/crm: service 'ct:103': state changed from 'fence' to 'started' (node = node1)
info 201 node1/lrm: got lock 'ha_agent_node1_lock'
info 201 node1/lrm: status change wait_for_agent_lock => active
info 201 node1/lrm: starting service ct:103
info 220 node2/crm: got lock 'ha_agent_node1_lock'
info 220 node2/crm: fencing: acknowleged - got agent lock for node 'node1'
info 220 node2/crm: node 'node1': state changed from 'fence' => 'unknown'
-info 220 node2/crm: service 'vm:100': state changed from 'fence' to 'stopped'
-info 220 node2/crm: service 'vm:100': state changed from 'stopped' to 'started' (node = node2)
+info 220 node2/crm: recover service 'vm:100' from fenced node 'node1' to node 'node2'
+info 220 node2/crm: service 'vm:100': state changed from 'fence' to 'started' (node = node2)
info 221 node2/lrm: got lock 'ha_agent_node2_lock'
info 221 node2/lrm: status change wait_for_agent_lock => active
info 221 node2/lrm: starting service vm:100