do not do active work if cfs update failed

author Thomas Lamprecht <t.lamprecht@proxmox.com>

Wed, 22 Nov 2017 10:53:12 +0000 (11:53 +0100)

committer Thomas Lamprecht <t.lamprecht@proxmox.com>

Tue, 30 Jan 2018 08:33:16 +0000 (09:33 +0100)
author Thomas Lamprecht <t.lamprecht@proxmox.com>
Wed, 22 Nov 2017 10:53:12 +0000 (11:53 +0100)
committer Thomas Lamprecht <t.lamprecht@proxmox.com>
Tue, 30 Jan 2018 08:33:16 +0000 (09:33 +0100)
diff --git a/src/PVE/HA/CRM.pm b/src/PVE/HA/CRM.pm

index d149c58af69d84c8ab6c2909ac0bda03daec32fd..a6557d354deb78e15b56d70f2de17ebec111253e 100644 (file)
--- a/src/PVE/HA/CRM.pm
+++ b/src/PVE/HA/CRM.pm
@@ -120,6 +120,9 @@ sub can_get_active {
  
      return 0 if !$haenv->quorate();
  
+    # we may not do any active work with an inconsistent cluster state
+    return 0 if !$self->{cluster_state_update};
+
      my $manager_status = eval { $haenv->read_manager_status() };
      if (my $err = $@) {
         $haenv->log('err', "could not read manager status: $err");
@@ -246,6 +249,13 @@ sub work {
                 $shutdown = 1;
  
             } else {
+               if (!$self->{cluster_state_update}) {
+                   # update failed but we could still renew our lock (cfs restart?),
+                   # safely skip manage and expect to update just fine next round
+                   $haenv->log('notice', "temporary inconsistent cluster state " .
+                               "(cfs restart?), skip round");
+                   return;
+               }
  
                 $manager->manage();
             }
diff --git a/src/PVE/HA/LRM.pm b/src/PVE/HA/LRM.pm

index afca084b7b38e1edf8915036e5b32d51ea84def7..af7ad081965d25cb4a1dd939b1c449c30573f3ee 100644 (file)
--- a/src/PVE/HA/LRM.pm
+++ b/src/PVE/HA/LRM.pm
@@ -349,6 +349,13 @@ sub work {
                     }
                 }
             } else {
+               if (!$self->{cluster_state_update}) {
+                   # update failed but we could still renew our lock (cfs restart?),
+                   # safely skip manage and expect to update just fine next round
+                   $haenv->log('notice', "temporary inconsistent cluster state " .
+                               "(cfs restart?), skip round");
+                   return;
+               }
  
                 $self->manage_resources();
  
diff --git a/src/test/test-cfs-unavailable2/log.expect b/src/test/test-cfs-unavailable2/log.expect

new file mode 100644 (file)

index 0000000..f1bbdb0
--- /dev/null
+++ b/src/test/test-cfs-unavailable2/log.expect
@@ -0,0 +1,49 @@
+info      0     hardware: starting simulation
+info     20      cmdlist: execute power node1 on
+info     20    node1/crm: status change startup => wait_for_quorum
+info     20    node1/lrm: status change startup => wait_for_agent_lock
+info     20      cmdlist: execute power node2 on
+info     20    node2/crm: status change startup => wait_for_quorum
+info     20    node2/lrm: status change startup => wait_for_agent_lock
+info     20      cmdlist: execute power node3 on
+info     20    node3/crm: status change startup => wait_for_quorum
+info     20    node3/lrm: status change startup => wait_for_agent_lock
+info     20    node1/crm: got lock 'ha_manager_lock'
+info     20    node1/crm: status change wait_for_quorum => master
+info     20    node1/crm: node 'node1': state changed from 'unknown' => 'online'
+info     20    node1/crm: node 'node2': state changed from 'unknown' => 'online'
+info     20    node1/crm: node 'node3': state changed from 'unknown' => 'online'
+info     20    node1/crm: adding new service 'vm:101' on node 'node1'
+info     20    node1/crm: adding new service 'vm:102' on node 'node2'
+info     20    node1/crm: adding new service 'vm:103' on node 'node3'
+info     21    node1/lrm: got lock 'ha_agent_node1_lock'
+info     21    node1/lrm: status change wait_for_agent_lock => active
+info     21    node1/lrm: starting service vm:101
+info     21    node1/lrm: service status vm:101 started
+info     22    node2/crm: status change wait_for_quorum => slave
+info     23    node2/lrm: got lock 'ha_agent_node2_lock'
+info     23    node2/lrm: status change wait_for_agent_lock => active
+info     24    node3/crm: status change wait_for_quorum => slave
+info     25    node3/lrm: got lock 'ha_agent_node3_lock'
+info     25    node3/lrm: status change wait_for_agent_lock => active
+info     25    node3/lrm: starting service vm:103
+info     25    node3/lrm: service status vm:103 started
+info     40    node1/crm: service 'vm:102': state changed from 'request_stop' to 'stopped'
+info    120      cmdlist: execute cfs node1 update fail
+info    120      cmdlist: execute service vm:101 stopped
+noti    120    node1/crm: temporary inconsistent cluster state (cfs restart?), skip round
+noti    121    node1/lrm: temporary inconsistent cluster state (cfs restart?), skip round
+noti    140    node1/crm: temporary inconsistent cluster state (cfs restart?), skip round
+noti    141    node1/lrm: temporary inconsistent cluster state (cfs restart?), skip round
+noti    160    node1/crm: temporary inconsistent cluster state (cfs restart?), skip round
+noti    161    node1/lrm: temporary inconsistent cluster state (cfs restart?), skip round
+noti    180    node1/crm: temporary inconsistent cluster state (cfs restart?), skip round
+noti    181    node1/lrm: temporary inconsistent cluster state (cfs restart?), skip round
+noti    200    node1/crm: temporary inconsistent cluster state (cfs restart?), skip round
+noti    201    node1/lrm: temporary inconsistent cluster state (cfs restart?), skip round
+info    220      cmdlist: execute cfs node1 update work
+info    220    node1/crm: service 'vm:101': state changed from 'started' to 'request_stop'
+info    221    node1/lrm: stopping service vm:101
+info    221    node1/lrm: service status vm:101 stopped
+info    240    node1/crm: service 'vm:101': state changed from 'request_stop' to 'stopped'
+info    820     hardware: exit simulation - done
author	Thomas Lamprecht <t.lamprecht@proxmox.com>
	Wed, 22 Nov 2017 10:53:12 +0000 (11:53 +0100)
committer	Thomas Lamprecht <t.lamprecht@proxmox.com>
	Tue, 30 Jan 2018 08:33:16 +0000 (09:33 +0100)
src/PVE/HA/CRM.pm		patch \| blob \| blame \| history
src/PVE/HA/LRM.pm		patch \| blob \| blame \| history
src/test/test-cfs-unavailable2/log.expect	[new file with mode: 0644]	patch \| blob