From 30fc7ceedb7f3047659f22d063cc16c94c20dd7a Mon Sep 17 00:00:00 2001
From: Thomas Lamprecht
Date: Mon, 17 Jan 2022 15:52:53 +0100
Subject: [PATCH] lrm: also check CRM node-status for determining fence-request

This fixes point 2 of commit 3addeeb - avoiding that an LRM goes
active as long as the CRM still has it in (pending) `fence` state,
which can happen after a watchdog reset + fast boot.

This avoids interfering with the CRM acquiring the lock, which is all
the more important once a future commit gets added that ensures a node
isn't stuck in `fence` state if there's no service configured (anymore)
due to an admin manually removing them during fencing.

We explicitly fix the startup first to better show how it works in the
test framework, but as the test/sim hardware can now delay the CRM
while keeping the LRM running, the second test (i.e.,
test-service-command9) should still trigger after the next commit if
this one is reverted or otherwise broken.

Signed-off-by: Thomas Lamprecht
---
 src/PVE/HA/LRM.pm                         | 5 ++++-
 src/test/test-service-command8/log.expect | 6 ------
 src/test/test-service-command9/log.expect | 4 ----
 3 files changed, 4 insertions(+), 11 deletions(-)

diff --git a/src/PVE/HA/LRM.pm b/src/PVE/HA/LRM.pm
index 1ba2038..86d0a34 100644
--- a/src/PVE/HA/LRM.pm
+++ b/src/PVE/HA/LRM.pm
@@ -186,6 +186,8 @@ sub update_service_status {
         return undef;
     } else {
         $self->{service_status} = $ms->{service_status} || {};
+        my $nodename = $haenv->nodename();
+        $self->{node_status} = $ms->{node_status}->{$nodename} || 'unknown';
         return 1;
     }
 }
@@ -242,12 +244,13 @@ sub is_fence_requested {
     my ($self) = @_;
 
     my $haenv = $self->{haenv};
+    my $nodename = $haenv->nodename();
 
     my $ss = $self->{service_status};
 
     my $fenced_services = PVE::HA::Tools::count_fenced_services($ss, $nodename);
 
-    return $fenced_services;
+    return $fenced_services || $self->{node_status} eq 'fence';
 }
 
 sub active_service_count {
diff --git a/src/test/test-service-command8/log.expect b/src/test/test-service-command8/log.expect
index 572e2f2..72eb369 100644
--- a/src/test/test-service-command8/log.expect
+++ b/src/test/test-service-command8/log.expect
@@ -18,11 +18,5 @@ info     22    node2/crm: status change wait_for_quorum => slave
 info     24    node3/crm: status change wait_for_quorum => slave
 info    120      cmdlist: execute service vm:103 add node3 stopped
 info    120    node1/crm: adding new service 'vm:103' on node 'node3'
-info    125    node3/lrm: got lock 'ha_agent_node3_lock'
-info    125    node3/lrm: status change wait_for_agent_lock => active
-info    140    node1/crm: service 'vm:103': state changed from 'request_stop' to 'stopped'
 info    220      cmdlist: execute service vm:103 started
-info    220    node1/crm: service 'vm:103': state changed from 'stopped' to 'started' (node = node3)
-info    225    node3/lrm: starting service vm:103
-info    225    node3/lrm: service status vm:103 started
 info    820     hardware: exit simulation - done
diff --git a/src/test/test-service-command9/log.expect b/src/test/test-service-command9/log.expect
index 7981305..40de86b 100644
--- a/src/test/test-service-command9/log.expect
+++ b/src/test/test-service-command9/log.expect
@@ -15,10 +15,6 @@ info     20    node1/lrm: got lock 'ha_agent_node1_lock'
 info     20    node1/lrm: status change wait_for_agent_lock => active
 info     20    node1/lrm: starting service vm:101
 info     20    node1/lrm: service status vm:101 started
-info     22    node3/lrm: got lock 'ha_agent_node3_lock'
-info     22    node3/lrm: status change wait_for_agent_lock => active
-info     22    node3/lrm: starting service vm:103
-info     22    node3/lrm: service status vm:103 started
 info     40    run-loop: skipping CRM round
 info     60    node1/crm: got lock 'ha_manager_lock'
 info     60    node1/crm: status change wait_for_quorum => master
-- 
2.39.2
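For illustration only (not part of the patch): a minimal, self-contained Perl
sketch of the behaviour intended by the patched is_fence_requested() check.
The data layout used here ($self->{service_status} entries with 'node' and
'state' keys, $self->{node_status}) only mirrors the fields touched above and
is an assumption; count_fenced_services() below is a simplified stand-in for
PVE::HA::Tools::count_fenced_services(), not the real helper.

#!/usr/bin/perl
use strict;
use warnings;

# Simplified stand-in for PVE::HA::Tools::count_fenced_services(): count the
# services on $nodename that the manager status still holds in 'fence' state.
sub count_fenced_services {
    my ($ss, $nodename) = @_;
    my $count = 0;
    for my $sid (keys %$ss) {
        my $sd = $ss->{$sid};
        next if !$sd->{node} || $sd->{node} ne $nodename;
        $count++ if ($sd->{state} // '') eq 'fence';
    }
    return $count;
}

# Mirrors the patched logic: fencing is requested if any of the node's
# services are still in 'fence' state OR the CRM node status is 'fence'.
sub is_fence_requested {
    my ($self, $nodename) = @_;
    my $fenced_services = count_fenced_services($self->{service_status}, $nodename);
    return $fenced_services || $self->{node_status} eq 'fence';
}

# Scenario from the commit message: the admin removed all services during
# fencing, but the CRM still has the node in 'fence' state after a watchdog
# reset + fast boot, so the LRM must not grab its agent lock and go active.
my $self = {
    service_status => {},       # no HA services left for this node
    node_status    => 'fence',  # CRM node status as read from the manager status
};
my $requested = is_fence_requested($self, 'node3');
print $requested ? "fence requested - stay idle\n" : "may go active\n";

With only the old service-based check, the empty service_status hash would let
the LRM go active right away; the additional node-status check keeps it idle
until the CRM has finished (or given up on) fencing the node.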