From 724bd3f3110c1178443c9bdb44801607371cd10b Mon Sep 17 00:00:00 2001
From: Thomas Lamprecht
Date: Wed, 22 Nov 2017 11:53:12 +0100
Subject: [PATCH] do not do active work if cfs update failed

We ignored whether the cluster state update failed and happily worked
with an empty state, resulting in strange actions, e.g., the removal
of all (not so) "stale" services or setting the state of every node
but the master to unknown.

Check the update result and, if it failed, either do not become
active or, if already active, skip the current round. We know we only
got here because the update failed while our lock renew worked, so
the cfs is already back in a working and quorate state again
(probably just a restart).

Signed-off-by: Thomas Lamprecht
Reviewed-by: Dominik Csapak
Tested-by: Dominik Csapak
---
 src/PVE/HA/CRM.pm                         | 10 +++++
 src/PVE/HA/LRM.pm                         |  7 ++++
 src/test/test-cfs-unavailable2/log.expect | 49 +++++++++++++++++++++++
 3 files changed, 66 insertions(+)
 create mode 100644 src/test/test-cfs-unavailable2/log.expect

diff --git a/src/PVE/HA/CRM.pm b/src/PVE/HA/CRM.pm
index d149c58..a6557d3 100644
--- a/src/PVE/HA/CRM.pm
+++ b/src/PVE/HA/CRM.pm
@@ -120,6 +120,9 @@ sub can_get_active {
 
     return 0 if !$haenv->quorate();
 
+    # we may not do any active work with an inconsistent cluster state
+    return 0 if !$self->{cluster_state_update};
+
     my $manager_status = eval { $haenv->read_manager_status() };
     if (my $err = $@) {
         $haenv->log('err', "could not read manager status: $err");
@@ -246,6 +249,13 @@ sub work {
             $shutdown = 1;
 
         } else {
+            if (!$self->{cluster_state_update}) {
+                # update failed but we could still renew our lock (cfs restart?),
+                # safely skip manage and expect to update just fine next round
+                $haenv->log('notice', "temporary inconsistent cluster state " .
+                    "(cfs restart?), skip round");
+                return;
+            }
 
             $manager->manage();
         }
diff --git a/src/PVE/HA/LRM.pm b/src/PVE/HA/LRM.pm
index afca084..af7ad08 100644
--- a/src/PVE/HA/LRM.pm
+++ b/src/PVE/HA/LRM.pm
@@ -349,6 +349,13 @@ sub work {
                 }
             }
         } else {
+            if (!$self->{cluster_state_update}) {
+                # update failed but we could still renew our lock (cfs restart?),
+                # safely skip manage and expect to update just fine next round
+                $haenv->log('notice', "temporary inconsistent cluster state " .
+                    "(cfs restart?), skip round");
+                return;
+            }
 
             $self->manage_resources();
 
diff --git a/src/test/test-cfs-unavailable2/log.expect b/src/test/test-cfs-unavailable2/log.expect
new file mode 100644
index 0000000..f1bbdb0
--- /dev/null
+++ b/src/test/test-cfs-unavailable2/log.expect
@@ -0,0 +1,49 @@
+info 0 hardware: starting simulation
+info 20 cmdlist: execute power node1 on
+info 20 node1/crm: status change startup => wait_for_quorum
+info 20 node1/lrm: status change startup => wait_for_agent_lock
+info 20 cmdlist: execute power node2 on
+info 20 node2/crm: status change startup => wait_for_quorum
+info 20 node2/lrm: status change startup => wait_for_agent_lock
+info 20 cmdlist: execute power node3 on
+info 20 node3/crm: status change startup => wait_for_quorum
+info 20 node3/lrm: status change startup => wait_for_agent_lock
+info 20 node1/crm: got lock 'ha_manager_lock'
+info 20 node1/crm: status change wait_for_quorum => master
+info 20 node1/crm: node 'node1': state changed from 'unknown' => 'online'
+info 20 node1/crm: node 'node2': state changed from 'unknown' => 'online'
+info 20 node1/crm: node 'node3': state changed from 'unknown' => 'online'
+info 20 node1/crm: adding new service 'vm:101' on node 'node1'
+info 20 node1/crm: adding new service 'vm:102' on node 'node2'
+info 20 node1/crm: adding new service 'vm:103' on node 'node3'
+info 21 node1/lrm: got lock 'ha_agent_node1_lock'
+info 21 node1/lrm: status change wait_for_agent_lock => active
+info 21 node1/lrm: starting service vm:101
+info 21 node1/lrm: service status vm:101 started
+info 22 node2/crm: status change wait_for_quorum => slave
+info 23 node2/lrm: got lock 'ha_agent_node2_lock'
+info 23 node2/lrm: status change wait_for_agent_lock => active
+info 24 node3/crm: status change wait_for_quorum => slave
+info 25 node3/lrm: got lock 'ha_agent_node3_lock'
+info 25 node3/lrm: status change wait_for_agent_lock => active
+info 25 node3/lrm: starting service vm:103
+info 25 node3/lrm: service status vm:103 started
+info 40 node1/crm: service 'vm:102': state changed from 'request_stop' to 'stopped'
+info 120 cmdlist: execute cfs node1 update fail
+info 120 cmdlist: execute service vm:101 stopped
+noti 120 node1/crm: temporary inconsistent cluster state (cfs restart?), skip round
+noti 121 node1/lrm: temporary inconsistent cluster state (cfs restart?), skip round
+noti 140 node1/crm: temporary inconsistent cluster state (cfs restart?), skip round
+noti 141 node1/lrm: temporary inconsistent cluster state (cfs restart?), skip round
+noti 160 node1/crm: temporary inconsistent cluster state (cfs restart?), skip round
+noti 161 node1/lrm: temporary inconsistent cluster state (cfs restart?), skip round
+noti 180 node1/crm: temporary inconsistent cluster state (cfs restart?), skip round
+noti 181 node1/lrm: temporary inconsistent cluster state (cfs restart?), skip round
+noti 200 node1/crm: temporary inconsistent cluster state (cfs restart?), skip round
+noti 201 node1/lrm: temporary inconsistent cluster state (cfs restart?), skip round
+info 220 cmdlist: execute cfs node1 update work
+info 220 node1/crm: service 'vm:101': state changed from 'started' to 'request_stop'
+info 221 node1/lrm: stopping service vm:101
+info 221 node1/lrm: service status vm:101 stopped
+info 240 node1/crm: service 'vm:101': state changed from 'request_stop' to 'stopped'
+info 820 hardware: exit simulation - done
-- 
2.39.2