From 30fc7ceedb7f3047659f22d063cc16c94c20dd7a Mon Sep 17 00:00:00 2001
From: Thomas Lamprecht
Date: Mon, 17 Jan 2022 15:52:53 +0100
Subject: [PATCH] lrm: also check CRM node-status for determining fence-request

This fixes point 2 of commit 3addeeb - avoiding that an LRM goes
active as long as the CRM still has it in (pending) `fence` state,
which can happen after a watchdog reset + fast boot.

This avoids interfering with the CRM acquiring the lock, which is all
the more important once a future commit gets added that ensures a node
isn't stuck in `fence` state if there's no service configured (anymore)
due to an admin manually removing them during fencing.

We explicitly fix the startup first to better show how it works in the
test framework, but as the test/sim hardware can now delay the CRM
while keeping the LRM running, the second test (i.e.,
test-service-command9) should still trigger after the next commit if
this one is reverted or otherwise broken.

Signed-off-by: Thomas Lamprecht
---
 src/PVE/HA/LRM.pm                         | 5 ++++-
 src/test/test-service-command8/log.expect | 6 ------
 src/test/test-service-command9/log.expect | 4 ----
 3 files changed, 4 insertions(+), 11 deletions(-)

diff --git a/src/PVE/HA/LRM.pm b/src/PVE/HA/LRM.pm
index 1ba2038..86d0a34 100644
--- a/src/PVE/HA/LRM.pm
+++ b/src/PVE/HA/LRM.pm
@@ -186,6 +186,8 @@ sub update_service_status {
         return undef;
     } else {
         $self->{service_status} = $ms->{service_status} || {};
+        my $nodename = $haenv->nodename();
+        $self->{node_status} = $ms->{node_status}->{$nodename} || 'unknown';
         return 1;
     }
 }
@@ -242,12 +244,13 @@ sub is_fence_requested {
     my ($self) = @_;
 
     my $haenv = $self->{haenv};
+    my $nodename = $haenv->nodename();
 
     my $ss = $self->{service_status};
 
     my $fenced_services = PVE::HA::Tools::count_fenced_services($ss, $nodename);
 
-    return $fenced_services;
+    return $fenced_services || $self->{node_status} eq 'fence';
 }
 
 sub active_service_count {
diff --git a/src/test/test-service-command8/log.expect b/src/test/test-service-command8/log.expect
index 572e2f2..72eb369 100644
--- a/src/test/test-service-command8/log.expect
+++ b/src/test/test-service-command8/log.expect
@@ -18,11 +18,5 @@ info     22    node2/crm: status change wait_for_quorum => slave
 info     24    node3/crm: status change wait_for_quorum => slave
 info    120      cmdlist: execute service vm:103 add node3 stopped
 info    120    node1/crm: adding new service 'vm:103' on node 'node3'
-info    125    node3/lrm: got lock 'ha_agent_node3_lock'
-info    125    node3/lrm: status change wait_for_agent_lock => active
-info    140    node1/crm: service 'vm:103': state changed from 'request_stop' to 'stopped'
 info    220      cmdlist: execute service vm:103 started
-info    220    node1/crm: service 'vm:103': state changed from 'stopped' to 'started' (node = node3)
-info    225    node3/lrm: starting service vm:103
-info    225    node3/lrm: service status vm:103 started
 info    820     hardware: exit simulation - done
diff --git a/src/test/test-service-command9/log.expect b/src/test/test-service-command9/log.expect
index 7981305..40de86b 100644
--- a/src/test/test-service-command9/log.expect
+++ b/src/test/test-service-command9/log.expect
@@ -15,10 +15,6 @@ info     20    node1/lrm: got lock 'ha_agent_node1_lock'
 info     20    node1/lrm: status change wait_for_agent_lock => active
 info     20    node1/lrm: starting service vm:101
 info     20    node1/lrm: service status vm:101 started
-info     22    node3/lrm: got lock 'ha_agent_node3_lock'
-info     22    node3/lrm: status change wait_for_agent_lock => active
-info     22    node3/lrm: starting service vm:103
-info     22    node3/lrm: service status vm:103 started
 info     40    run-loop: skipping CRM round
 info     60    node1/crm: got lock 'ha_manager_lock'
 info     60    node1/crm: status change wait_for_quorum => master
-- 
2.39.2
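For illustration only (not part of the patch): a minimal, self-contained Perl
sketch of the behaviour intended by the patched is_fence_requested() check.
The data layout used here ($self->{service_status} entries with 'node' and
'state' keys, $self->{node_status}) only mirrors the fields touched above and
is an assumption; count_fenced_services() below is a simplified stand-in for
PVE::HA::Tools::count_fenced_services(), not the real helper.

#!/usr/bin/perl
use strict;
use warnings;

# Simplified stand-in for PVE::HA::Tools::count_fenced_services(): count the
# services on $nodename that the manager status still holds in 'fence' state.
sub count_fenced_services {
    my ($ss, $nodename) = @_;
    my $count = 0;
    for my $sid (keys %$ss) {
        my $sd = $ss->{$sid};
        next if !$sd->{node} || $sd->{node} ne $nodename;
        $count++ if ($sd->{state} // '') eq 'fence';
    }
    return $count;
}

# Mirrors the patched logic: fencing is requested if any of the node's
# services are still in 'fence' state OR the CRM node status is 'fence'.
sub is_fence_requested {
    my ($self, $nodename) = @_;
    my $fenced_services = count_fenced_services($self->{service_status}, $nodename);
    return $fenced_services || $self->{node_status} eq 'fence';
}

# Scenario from the commit message: the admin removed all services during
# fencing, but the CRM still has the node in 'fence' state after a watchdog
# reset + fast boot, so the LRM must not grab its agent lock and go active.
my $self = {
    service_status => {},       # no HA services left for this node
    node_status    => 'fence',  # CRM node status as read from the manager status
};
my $requested = is_fence_requested($self, 'node3');
print $requested ? "fence requested - stay idle\n" : "may go active\n";

With only the old service-based check, the empty service_status hash would let
the LRM go active right away; the additional node-status check keeps it idle
until the CRM has finished (or given up on) fencing the node.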