sub queue_resource_command {
my ($self, $sid, $uid, $state, $target) = @_;
+ # do not queue the exact same command twice as this may lead to
+ # an inconsistent HA state when the first command fails but the CRM
+ # does not process its failure right away and the LRM starts a second
+ # try, without the CRM knowing of it (race condition)
+ # The 'stopped' command is an exception as we do not process its result
+ # in the CRM and we want to execute it always (even with no active CRM)
+ return if $state ne 'stopped' && $uid && defined($self->{results}->{$uid});
+
if (my $w = $self->{workers}->{$sid}) {
return if $w->{pid}; # already started
# else, delete and overwrite queue entry with new command
} else {
$haenv->log('err', "unknown command '$cmd' for service '$sid'");
}
- }
+ }
if ($cd->{state} eq 'disabled') {
- # do nothing
+ # NOTE: do nothing here, the stop state is an exception as we do not
+ # process the LRM result here, thus the LRM always tries to stop the
+ # service (protection for the case no CRM is active)
return;
- }
+ }
if ($cd->{state} eq 'enabled') {
# simply mark it started, if it's on the wrong node
" (exit code $ec))");
# we have no save way out (yet) for other errors
&$change_service_state($self, $sid, 'error');
+ return;
}
}
&$change_service_state($self, $sid, 'relocate', node => $sd->{node}, target => $node);
}
} else {
- # do nothing
+ # ensure the service gets started again if it went down unexpectedly
+ $sd->{uid} = compute_new_uuid($sd->{state});
}
}
return;
- }
+ }
$haenv->log('err', "service '$sid' - unknown state '$cd->{state}' in service configuration");
}
err 160 node1/crm: recovery policy for service fa:130 failed, entering error state!
info 160 node1/crm: service 'fa:130': state changed from 'started' to 'error'
warn 163 node2/lrm: service fa:130 is not running and in an error state
-warn 183 node2/lrm: service fa:130 is not running and in an error state
-warn 203 node2/lrm: service fa:130 is not running and in an error state
info 220 cmdlist: execute service fa:130 disabled
info 220 node1/crm: service 'fa:130': state changed from 'error' to 'stopped'
info 820 hardware: exit simulation - done