lost_agent_lock => "lost agent_lock",
};
+# We sleep ~10 s per 'active' round, so if no service is configured for this node for >= 10 minutes
+# we go back into the 'wait_for_agent_lock' state, voluntarily giving up the watchdog and the
+# acquired LRM lock, ensuring the watchdog can do no harm.
+my $max_active_idle_rounds = 60;
+
sub new {
my ($this, $haenv) = @_;
# mode can be: active, reboot, shutdown, restart
mode => 'active',
cluster_state_update => 0,
+ active_idle_rounds => 0,
}, $class;
$self->set_local_status({ state => 'wait_for_agent_lock' });
return 0;
}
+# Only checks whether any service has the local node set as its node, independent of its requested state.
+sub has_configured_service_on_local_node {
+ my ($self) = @_;
+
+ my $haenv = $self->{haenv};
+ my $nodename = $haenv->nodename();
+
+ my $ss = $self->{service_status};
+ foreach my $sid (keys %$ss) {
+ my $sd = $ss->{$sid};
+ next if !$sd->{node} || $sd->{node} ne $nodename;
+
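+	# at least one configured service references the local node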
+ return 1;
+ }
+ return 0;
+}
+
sub active_service_count {
my ($self) = @_;
$self->set_local_status({ state => 'lost_agent_lock'});
} elsif ($self->{mode} eq 'maintenance') {
$self->set_local_status({ state => 'maintenance'});
+ } else {
+ if (!$self->has_configured_service_on_local_node() && !$self->run_workers()) {
+ # no active service configured for this node and all (old) workers are done
+ $self->{active_idle_rounds}++;
+ if ($self->{active_idle_rounds} > $max_active_idle_rounds) {
+ $haenv->log('info', "node had no service configured for $max_active_idle_rounds rounds, going idle.\n");
+ # safety: no active service & no running worker for quite some time -> OK
+ $haenv->release_ha_agent_lock();
+ give_up_watchdog_protection($self);
+ $self->set_local_status({ state => 'wait_for_agent_lock'});
+ $self->{active_idle_rounds} = 0;
+ }
+ } elsif ($self->{active_idle_rounds}) {
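+            # services are configured here again or workers are still running, reset the idle counter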
+ $self->{active_idle_rounds} = 0;
+ }
}
} elsif ($state eq 'maintenance') {
--- /dev/null
+Test a user-triggered service removal from a previously active LRM, which
+should make that LRM go idle and drop its agent lock once enough rounds have
+passed without any service configured for it.
+
+We use some delays to stall execution, as otherwise the test simulation would
+exit earlier than the 60 idle rounds required before the LRM gives up its lock
+and watchdog protection.
--- /dev/null
+[
+ [ "power node1 on", "power node2 on", "power node3 on"],
+ [ "service vm:103 delete" ],
+ [ "delay 0" ],
+ [ "delay 0" ],
+ [ "delay 0" ],
+ [ "delay 0" ],
+ [ "delay 0" ],
+ [ "delay 0" ],
+ [ "delay 0" ]
+]
--- /dev/null
+{
+ "node1": { "power": "off", "network": "off" },
+ "node2": { "power": "off", "network": "off" },
+ "node3": { "power": "off", "network": "off" }
+}
--- /dev/null
+info 0 hardware: starting simulation
+info 20 cmdlist: execute power node1 on
+info 20 node1/crm: status change startup => wait_for_quorum
+info 20 node1/lrm: status change startup => wait_for_agent_lock
+info 20 cmdlist: execute power node2 on
+info 20 node2/crm: status change startup => wait_for_quorum
+info 20 node2/lrm: status change startup => wait_for_agent_lock
+info 20 cmdlist: execute power node3 on
+info 20 node3/crm: status change startup => wait_for_quorum
+info 20 node3/lrm: status change startup => wait_for_agent_lock
+info 20 node1/crm: got lock 'ha_manager_lock'
+info 20 node1/crm: status change wait_for_quorum => master
+info 20 node1/crm: node 'node1': state changed from 'unknown' => 'online'
+info 20 node1/crm: node 'node2': state changed from 'unknown' => 'online'
+info 20 node1/crm: node 'node3': state changed from 'unknown' => 'online'
+info 20 node1/crm: adding new service 'vm:103' on node 'node3'
+info 22 node2/crm: status change wait_for_quorum => slave
+info 24 node3/crm: status change wait_for_quorum => slave
+info 25 node3/lrm: got lock 'ha_agent_node3_lock'
+info 25 node3/lrm: status change wait_for_agent_lock => active
+info 25 node3/lrm: starting service vm:103
+info 25 node3/lrm: service status vm:103 started
+info 120 cmdlist: execute service vm:103 delete
+info 120 node1/crm: removing stale service 'vm:103' (no config)
+info 122 node2/crm: status change slave => wait_for_quorum
+info 124 node3/crm: status change slave => wait_for_quorum
+info 1325 node3/lrm: node had no service configured for 60 rounds, going idle.
+info 1325 node3/lrm: status change active => wait_for_agent_lock
+info 1420 hardware: exit simulation - done
--- /dev/null
+{
+ "vm:103": { "node": "node3", "state": "enabled" }
+}