git.proxmox.com Git - pve-ha-manager.git/commitdiff
LRM: release lock and close watchdog if no service configured for >10min
author    Thomas Lamprecht <t.lamprecht@proxmox.com>
          Thu, 1 Jul 2021 13:55:43 +0000 (15:55 +0200)
committer Thomas Lamprecht <t.lamprecht@proxmox.com>
          Fri, 2 Jul 2021 18:08:12 +0000 (20:08 +0200)
Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
src/PVE/HA/LRM.pm
src/test/test-lrm-going-idle1/README [new file with mode: 0644]
src/test/test-lrm-going-idle1/cmdlist [new file with mode: 0644]
src/test/test-lrm-going-idle1/hardware_status [new file with mode: 0644]
src/test/test-lrm-going-idle1/log.expect [new file with mode: 0644]
src/test/test-lrm-going-idle1/manager_status [new file with mode: 0644]
src/test/test-lrm-going-idle1/service_config [new file with mode: 0644]

index 82f78ca472f1345e38cb6be1fce6e6ea3a45ef3d..97aa1e0cc7c12202dbded2afb8b523426de47ac3 100644 (file)
@@ -20,6 +20,10 @@ my $valid_states = {
     lost_agent_lock => "lost agent_lock",
 };
 
+# we sleep ~10s per 'active' round, so if no service is configured for >= 10 min we go into the
+# wait state, voluntarily giving up the watchdog and the LRM lock, ensuring the WD can do no harm
+my $max_active_idle_rounds = 60;
+
 sub new {
     my ($this, $haenv) = @_;
 
@@ -36,6 +40,7 @@ sub new {
        # mode can be: active, reboot, shutdown, restart
        mode => 'active',
        cluster_state_update => 0,
+       active_idle_rounds => 0,
     }, $class;
 
    $self->set_local_status({ state => 'wait_for_agent_lock' });
@@ -216,6 +221,23 @@ sub get_protected_ha_agent_lock {
     return 0;
 }
 
+# only checks whether any service has the local node set as its node, independent of its req. state
+sub has_configured_service_on_local_node {
+    my ($self) = @_;
+
+    my $haenv = $self->{haenv};
+    my $nodename = $haenv->nodename();
+
+    my $ss = $self->{service_status};
+    foreach my $sid (keys %$ss) {
+       my $sd = $ss->{$sid};
+       next if !$sd->{node} || $sd->{node} ne $nodename;
+
+       return 1;
+    }
+    return 0;
+}
+
 sub active_service_count {
     my ($self) = @_;
 
@@ -326,6 +348,21 @@ sub work {
            $self->set_local_status({ state => 'lost_agent_lock'});
        } elsif ($self->{mode} eq 'maintenance') {
            $self->set_local_status({ state => 'maintenance'});
+       } else {
+           if (!$self->has_configured_service_on_local_node() && !$self->run_workers()) {
+               # no active service configured for this node and all (old) workers are done
+               $self->{active_idle_rounds}++;
+               if ($self->{active_idle_rounds} > $max_active_idle_rounds) {
+                   $haenv->log('info', "node had no service configured for $max_active_idle_rounds rounds, going idle.\n");
+                   # safety: no active service & no running worker for quite some time -> OK
+                   $haenv->release_ha_agent_lock();
+                   give_up_watchdog_protection($self);
+                   $self->set_local_status({ state => 'wait_for_agent_lock'});
+                   $self->{active_idle_rounds} = 0;
+               }
+           } elsif ($self->{active_idle_rounds}) {
+               $self->{active_idle_rounds} = 0;
+           }
        }
     } elsif ($state eq 'maintenance') {
 
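For quick reference, below is a minimal standalone condensation of the idle accounting this
commit adds, pieced together from the hunks above; simulate_round() and its arguments are
hypothetical stand-ins for illustration only, and the real LRM additionally releases the agent
lock and gives up watchdog protection when the threshold trips:

#!/usr/bin/perl
use strict;
use warnings;

# sketch only: condensed from the committed work() hunk, not LRM code itself
my $max_active_idle_rounds = 60; # ~10s per 'active' round => ~10 min
my $active_idle_rounds = 0;

sub simulate_round {
    my ($has_local_service, $has_running_worker) = @_;

    if (!$has_local_service && !$has_running_worker) {
        # no service configured for this node and all (old) workers are done
        $active_idle_rounds++;
        if ($active_idle_rounds > $max_active_idle_rounds) {
            # here the real LRM calls release_ha_agent_lock() and
            # give_up_watchdog_protection() before changing state
            $active_idle_rounds = 0;
            return 'wait_for_agent_lock';
        }
    } elsif ($active_idle_rounds) {
        # any configured local service resets the counter immediately
        $active_idle_rounds = 0;
    }
    return 'active';
}

my $state = 'active';
$state = simulate_round(0, 0) for 1 .. $max_active_idle_rounds + 1;
print "$state\n"; # prints 'wait_for_agent_lock' after 61 fully idle rounds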
diff --git a/src/test/test-lrm-going-idle1/README b/src/test/test-lrm-going-idle1/README
new file mode 100644 (file)
index 0000000..ed44463
--- /dev/null
@@ -0,0 +1,7 @@
+Test a user-triggered service removal from a previously active LRM, which
+should make said LRM go idle and drop the lock once enough cycles have passed
+without any new service.
+
+We use some delays to stall execution, as otherwise we'd exit the test
+simulation earlier than the 60 idle rounds required before the LRM gives up
+its lock and watchdog.
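For scale: the real LRM sleeps ~10 s per 'active' round, so the 60-round threshold
corresponds to roughly 600 s, matching the ">10min" from the commit subject.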
diff --git a/src/test/test-lrm-going-idle1/cmdlist b/src/test/test-lrm-going-idle1/cmdlist
new file mode 100644 (file)
index 0000000..8567d6a
--- /dev/null
@@ -0,0 +1,11 @@
+[
+    [ "power node1 on", "power node2 on", "power node3 on"],
+    [ "service vm:103 delete" ],
+    [ "delay 0" ],
+    [ "delay 0" ],
+    [ "delay 0" ],
+    [ "delay 0" ],
+    [ "delay 0" ],
+    [ "delay 0" ],
+    [ "delay 0" ]
+]
diff --git a/src/test/test-lrm-going-idle1/hardware_status b/src/test/test-lrm-going-idle1/hardware_status
new file mode 100644 (file)
index 0000000..451beb1
--- /dev/null
@@ -0,0 +1,5 @@
+{
+  "node1": { "power": "off", "network": "off" },
+  "node2": { "power": "off", "network": "off" },
+  "node3": { "power": "off", "network": "off" }
+}
diff --git a/src/test/test-lrm-going-idle1/log.expect b/src/test/test-lrm-going-idle1/log.expect
new file mode 100644 (file)
index 0000000..2dce3e9
--- /dev/null
@@ -0,0 +1,29 @@
+info      0     hardware: starting simulation
+info     20      cmdlist: execute power node1 on
+info     20    node1/crm: status change startup => wait_for_quorum
+info     20    node1/lrm: status change startup => wait_for_agent_lock
+info     20      cmdlist: execute power node2 on
+info     20    node2/crm: status change startup => wait_for_quorum
+info     20    node2/lrm: status change startup => wait_for_agent_lock
+info     20      cmdlist: execute power node3 on
+info     20    node3/crm: status change startup => wait_for_quorum
+info     20    node3/lrm: status change startup => wait_for_agent_lock
+info     20    node1/crm: got lock 'ha_manager_lock'
+info     20    node1/crm: status change wait_for_quorum => master
+info     20    node1/crm: node 'node1': state changed from 'unknown' => 'online'
+info     20    node1/crm: node 'node2': state changed from 'unknown' => 'online'
+info     20    node1/crm: node 'node3': state changed from 'unknown' => 'online'
+info     20    node1/crm: adding new service 'vm:103' on node 'node3'
+info     22    node2/crm: status change wait_for_quorum => slave
+info     24    node3/crm: status change wait_for_quorum => slave
+info     25    node3/lrm: got lock 'ha_agent_node3_lock'
+info     25    node3/lrm: status change wait_for_agent_lock => active
+info     25    node3/lrm: starting service vm:103
+info     25    node3/lrm: service status vm:103 started
+info    120      cmdlist: execute service vm:103 delete
+info    120    node1/crm: removing stale service 'vm:103' (no config)
+info    122    node2/crm: status change slave => wait_for_quorum
+info    124    node3/crm: status change slave => wait_for_quorum
+info   1325    node3/lrm: node had no service configured for 60 rounds, going idle.
+info   1325    node3/lrm: status change active => wait_for_agent_lock
+info   1420     hardware: exit simulation - done
diff --git a/src/test/test-lrm-going-idle1/manager_status b/src/test/test-lrm-going-idle1/manager_status
new file mode 100644 (file)
index 0000000..0967ef4
--- /dev/null
@@ -0,0 +1 @@
+{}
diff --git a/src/test/test-lrm-going-idle1/service_config b/src/test/test-lrm-going-idle1/service_config
new file mode 100644 (file)
index 0000000..c6860e7
--- /dev/null
@@ -0,0 +1,3 @@
+{
+    "vm:103": { "node": "node3", "state": "enabled" }
+}