my ($conf, $sid, $has_state) = @_;
if (my $d = $conf->{ids}->{$sid}) {
- return 1 if !defined($has_state);
+ if (!defined($has_state)) {
+ # ignored services behave as if they were not managed by HA
+ return 0 if defined($d->{state}) && $d->{state} eq 'ignored';
+ return 1;
+ }
# backward compatibility
$has_state = 'started' if $has_state eq 'enabled';
foreach my $sid (sort keys %$sc) {
next if $ss->{$sid}; # already there
my $cd = $sc->{$sid};
+ next if $cd->{state} eq 'ignored';
+
$haenv->log('info', "adding new service '$sid' on node '$cd->{node}'");
# assume we are running to avoid relocate running service at add
my $state = ($cd->{state} eq 'started') ? 'started' : 'request_stop';
uid => compute_new_uuid('started') };
}
- # remove stale service from manager state
+ # remove stale or ignored services from manager state
foreach my $sid (keys %$ss) {
- next if $sc->{$sid};
- $haenv->log('info', "removing stale service '$sid' (no config)");
+ next if $sc->{$sid} && $sc->{$sid}->{state} ne 'ignored';
+
+ my $reason = defined($sc->{$sid}) ? 'ignored state requested' : 'no config';
+ $haenv->log('info', "removing stale service '$sid' ($reason)");
+
# remove all service related state information
delete $ss->{$sid};
}
{ completion => \&PVE::HA::Tools::complete_sid }),
state => {
type => 'string',
- enum => ['started', 'stopped', 'enabled', 'disabled'],
+ enum => ['started', 'stopped', 'enabled', 'disabled', 'ignored'],
optional => 1,
default => 'started',
description => "Requested resource state.",
state is error recovery, because it is the only way to move a resource out
of the `error` state.
+`ignored`;;
+
+The resource gets removed from the manager status and so the CRM and the LRM do
+not touch the resource anymore. All {pve} API calls affecting this resource
+will be executed, directly bypassing the HA stack. CRM commands will be thrown
+away while the resource is in this state. The resource will not get relocated
+on node failures.
+
EODESC
},
group => get_standard_option('pve-ha-group-id',
# reboot <node>
# shutdown <node>
# restart-lrm <node>
-# service <sid> <started|disabled|stopped>
+# service <sid> <started|disabled|stopped|ignored>
# service <sid> <migrate|relocate> <target>
# service <sid> lock/unlock [lockname]
}
} elsif ($cmd eq 'service') {
- if ($action eq 'started' || $action eq 'disabled' || $action eq 'stopped') {
+ if ($action eq 'started' || $action eq 'disabled' ||
+ $action eq 'stopped' || $action eq 'ignored') {
$self->set_service_state($sid, $action);
$sgrid->attach($w, 1, $row, 1, 1);
my $count = 0;
- foreach my $state (qw(started stopped disabled)) {
+ foreach my $state (qw(started stopped disabled ignored)) {
$w->append_text($state);
$w->set_active($count) if $d->{state} eq $state;
$count++;
--- /dev/null
+Test setting the requested state of a service to 'ignored'. The service should
+be removed from the manager status, so the HA stack no longer touches it.
--- /dev/null
+[
+ [ "power node1 on", "power node2 on", "power node3 on"],
+ [ "service vm:103 ignored" ]
+]
--- /dev/null
+{
+ "node1": { "power": "off", "network": "off" },
+ "node2": { "power": "off", "network": "off" },
+ "node3": { "power": "off", "network": "off" }
+}
--- /dev/null
+info 0 hardware: starting simulation
+info 20 cmdlist: execute power node1 on
+info 20 node1/crm: status change startup => wait_for_quorum
+info 20 node1/lrm: status change startup => wait_for_agent_lock
+info 20 cmdlist: execute power node2 on
+info 20 node2/crm: status change startup => wait_for_quorum
+info 20 node2/lrm: status change startup => wait_for_agent_lock
+info 20 cmdlist: execute power node3 on
+info 20 node3/crm: status change startup => wait_for_quorum
+info 20 node3/lrm: status change startup => wait_for_agent_lock
+info 20 node1/crm: got lock 'ha_manager_lock'
+info 20 node1/crm: status change wait_for_quorum => master
+info 20 node1/crm: node 'node1': state changed from 'unknown' => 'online'
+info 20 node1/crm: node 'node2': state changed from 'unknown' => 'online'
+info 20 node1/crm: node 'node3': state changed from 'unknown' => 'online'
+info 20 node1/crm: adding new service 'vm:103' on node 'node3'
+info 22 node2/crm: status change wait_for_quorum => slave
+info 24 node3/crm: status change wait_for_quorum => slave
+info 25 node3/lrm: got lock 'ha_agent_node3_lock'
+info 25 node3/lrm: status change wait_for_agent_lock => active
+info 25 node3/lrm: starting service vm:103
+info 25 node3/lrm: service status vm:103 started
+info 120 cmdlist: execute service vm:103 ignored
+info 120 node1/crm: removing stale service 'vm:103' (ignored state requested)
+info 720 hardware: exit simulation - done
--- /dev/null
+{
+ "vm:103": { "node": "node3", "state": "enabled" }
+}
--- /dev/null
+Set the requested state of a service to 'ignored'. Then simulate a node failure
+through network outage. The HA stack should not touch the 'ignored' service.
+
+Set the service to 'started' again; now the service should be fenced.
--- /dev/null
+[
+ [ "power node1 on", "power node2 on", "power node3 on"],
+ [ "service vm:103 ignored" ],
+ [ "network node3 off" ],
+ [ "service vm:103 started" ]
+]
--- /dev/null
+{
+ "node1": { "power": "off", "network": "off" },
+ "node2": { "power": "off", "network": "off" },
+ "node3": { "power": "off", "network": "off" }
+}
--- /dev/null
+info 0 hardware: starting simulation
+info 20 cmdlist: execute power node1 on
+info 20 node1/crm: status change startup => wait_for_quorum
+info 20 node1/lrm: status change startup => wait_for_agent_lock
+info 20 cmdlist: execute power node2 on
+info 20 node2/crm: status change startup => wait_for_quorum
+info 20 node2/lrm: status change startup => wait_for_agent_lock
+info 20 cmdlist: execute power node3 on
+info 20 node3/crm: status change startup => wait_for_quorum
+info 20 node3/lrm: status change startup => wait_for_agent_lock
+info 20 node1/crm: got lock 'ha_manager_lock'
+info 20 node1/crm: status change wait_for_quorum => master
+info 20 node1/crm: node 'node1': state changed from 'unknown' => 'online'
+info 20 node1/crm: node 'node2': state changed from 'unknown' => 'online'
+info 20 node1/crm: node 'node3': state changed from 'unknown' => 'online'
+info 20 node1/crm: adding new service 'vm:103' on node 'node3'
+info 22 node2/crm: status change wait_for_quorum => slave
+info 24 node3/crm: status change wait_for_quorum => slave
+info 25 node3/lrm: got lock 'ha_agent_node3_lock'
+info 25 node3/lrm: status change wait_for_agent_lock => active
+info 25 node3/lrm: starting service vm:103
+info 25 node3/lrm: service status vm:103 started
+info 120 cmdlist: execute service vm:103 ignored
+info 120 node1/crm: removing stale service 'vm:103' (ignored state requested)
+info 220 cmdlist: execute network node3 off
+info 220 node1/crm: node 'node3': state changed from 'online' => 'unknown'
+info 224 node3/crm: status change slave => wait_for_quorum
+info 225 node3/lrm: status change active => lost_agent_lock
+info 266 watchdog: execute power node3 off
+info 265 node3/crm: killed by poweroff
+info 266 node3/lrm: killed by poweroff
+info 266 hardware: server 'node3' stopped by poweroff (watchdog)
+info 320 cmdlist: execute service vm:103 started
+info 320 node1/crm: adding new service 'vm:103' on node 'node3'
+info 320 node1/crm: service 'vm:103': state changed from 'started' to 'fence'
+info 320 node1/crm: node 'node3': state changed from 'unknown' => 'fence'
+info 340 node1/crm: got lock 'ha_agent_node3_lock'
+info 340 node1/crm: fencing: acknowledged - got agent lock for node 'node3'
+info 340 node1/crm: node 'node3': state changed from 'fence' => 'unknown'
+info 340 node1/crm: recover service 'vm:103' from fenced node 'node3' to node 'node1'
+info 340 node1/crm: service 'vm:103': state changed from 'fence' to 'started' (node = node1)
+info 341 node1/lrm: got lock 'ha_agent_node1_lock'
+info 341 node1/lrm: status change wait_for_agent_lock => active
+info 341 node1/lrm: starting service vm:103
+info 341 node1/lrm: service status vm:103 started
+info 920 hardware: exit simulation - done
--- /dev/null
+{
+ "vm:103": { "node": "node3", "state": "enabled" }
+}