From 724bd3f3110c1178443c9bdb44801607371cd10b Mon Sep 17 00:00:00 2001
From: Thomas Lamprecht
Date: Wed, 22 Nov 2017 11:53:12 +0100
Subject: [PATCH] do not do active work if cfs update failed

We ignored whether the cluster state update failed and happily worked
with an empty state, resulting in strange actions, e.g., the removal
of all (not so) "stale" services or setting the state of every node
but the master to unknown.

Check the update result and, if it failed, either do not become
active or, if already active, skip the current round. We know we only
got here because the update failed while our lock renew worked, so
the cfs is already back in a working and quorate state again
(probably just a restart).

Signed-off-by: Thomas Lamprecht
Reviewed-by: Dominik Csapak
Tested-by: Dominik Csapak
---
 src/PVE/HA/CRM.pm                         | 10 +++++
 src/PVE/HA/LRM.pm                         |  7 ++++
 src/test/test-cfs-unavailable2/log.expect | 49 +++++++++++++++++++++++
 3 files changed, 66 insertions(+)
 create mode 100644 src/test/test-cfs-unavailable2/log.expect

diff --git a/src/PVE/HA/CRM.pm b/src/PVE/HA/CRM.pm
index d149c58..a6557d3 100644
--- a/src/PVE/HA/CRM.pm
+++ b/src/PVE/HA/CRM.pm
@@ -120,6 +120,9 @@ sub can_get_active {
 
     return 0 if !$haenv->quorate();
 
+    # we may not do any active work with an inconsistent cluster state
+    return 0 if !$self->{cluster_state_update};
+
     my $manager_status = eval { $haenv->read_manager_status() };
     if (my $err = $@) {
         $haenv->log('err', "could not read manager status: $err");
@@ -246,6 +249,13 @@ sub work {
             $shutdown = 1;
 
         } else {
+            if (!$self->{cluster_state_update}) {
+                # update failed but we could still renew our lock (cfs restart?),
+                # safely skip manage and expect to update just fine next round
+                $haenv->log('notice', "temporary inconsistent cluster state " .
+                    "(cfs restart?), skip round");
+                return;
+            }
 
             $manager->manage();
         }
diff --git a/src/PVE/HA/LRM.pm b/src/PVE/HA/LRM.pm
index afca084..af7ad08 100644
--- a/src/PVE/HA/LRM.pm
+++ b/src/PVE/HA/LRM.pm
@@ -349,6 +349,13 @@ sub work {
                 }
             }
         } else {
+            if (!$self->{cluster_state_update}) {
+                # update failed but we could still renew our lock (cfs restart?),
+                # safely skip manage and expect to update just fine next round
+                $haenv->log('notice', "temporary inconsistent cluster state " .
+                    "(cfs restart?), skip round");
+                return;
+            }
 
             $self->manage_resources();
 
diff --git a/src/test/test-cfs-unavailable2/log.expect b/src/test/test-cfs-unavailable2/log.expect
new file mode 100644
index 0000000..f1bbdb0
--- /dev/null
+++ b/src/test/test-cfs-unavailable2/log.expect
@@ -0,0 +1,49 @@
+info 0 hardware: starting simulation
+info 20 cmdlist: execute power node1 on
+info 20 node1/crm: status change startup => wait_for_quorum
+info 20 node1/lrm: status change startup => wait_for_agent_lock
+info 20 cmdlist: execute power node2 on
+info 20 node2/crm: status change startup => wait_for_quorum
+info 20 node2/lrm: status change startup => wait_for_agent_lock
+info 20 cmdlist: execute power node3 on
+info 20 node3/crm: status change startup => wait_for_quorum
+info 20 node3/lrm: status change startup => wait_for_agent_lock
+info 20 node1/crm: got lock 'ha_manager_lock'
+info 20 node1/crm: status change wait_for_quorum => master
+info 20 node1/crm: node 'node1': state changed from 'unknown' => 'online'
+info 20 node1/crm: node 'node2': state changed from 'unknown' => 'online'
+info 20 node1/crm: node 'node3': state changed from 'unknown' => 'online'
+info 20 node1/crm: adding new service 'vm:101' on node 'node1'
+info 20 node1/crm: adding new service 'vm:102' on node 'node2'
+info 20 node1/crm: adding new service 'vm:103' on node 'node3'
+info 21 node1/lrm: got lock 'ha_agent_node1_lock'
+info 21 node1/lrm: status change wait_for_agent_lock => active
+info 21 node1/lrm: starting service vm:101
+info 21 node1/lrm: service status vm:101 started
+info 22 node2/crm: status change wait_for_quorum => slave
+info 23 node2/lrm: got lock 'ha_agent_node2_lock'
+info 23 node2/lrm: status change wait_for_agent_lock => active
+info 24 node3/crm: status change wait_for_quorum => slave
+info 25 node3/lrm: got lock 'ha_agent_node3_lock'
+info 25 node3/lrm: status change wait_for_agent_lock => active
+info 25 node3/lrm: starting service vm:103
+info 25 node3/lrm: service status vm:103 started
+info 40 node1/crm: service 'vm:102': state changed from 'request_stop' to 'stopped'
+info 120 cmdlist: execute cfs node1 update fail
+info 120 cmdlist: execute service vm:101 stopped
+noti 120 node1/crm: temporary inconsistent cluster state (cfs restart?), skip round
+noti 121 node1/lrm: temporary inconsistent cluster state (cfs restart?), skip round
+noti 140 node1/crm: temporary inconsistent cluster state (cfs restart?), skip round
+noti 141 node1/lrm: temporary inconsistent cluster state (cfs restart?), skip round
+noti 160 node1/crm: temporary inconsistent cluster state (cfs restart?), skip round
+noti 161 node1/lrm: temporary inconsistent cluster state (cfs restart?), skip round
+noti 180 node1/crm: temporary inconsistent cluster state (cfs restart?), skip round
+noti 181 node1/lrm: temporary inconsistent cluster state (cfs restart?), skip round
+noti 200 node1/crm: temporary inconsistent cluster state (cfs restart?), skip round
+noti 201 node1/lrm: temporary inconsistent cluster state (cfs restart?), skip round
+info 220 cmdlist: execute cfs node1 update work
+info 220 node1/crm: service 'vm:101': state changed from 'started' to 'request_stop'
+info 221 node1/lrm: stopping service vm:101
+info 221 node1/lrm: service status vm:101 stopped
+info 240 node1/crm: service 'vm:101': state changed from 'request_stop' to 'stopped'
+info 820 hardware: exit simulation - done
-- 
2.39.2