From 35cbb764afaa1b48a8977a8ce4619cbd4901ea9c Mon Sep 17 00:00:00 2001
From: Thomas Lamprecht
Date: Wed, 24 Feb 2016 10:06:52 +0100
Subject: [PATCH] avoid out of sync command execution in LRM

We are only allowed to execute any command once, as otherwise we may
disturb the synchrony between CRM and LRM. The following scenario could
happen:

schedule CRM: deploy task 'migrate' for service vm:100 with UID 1234
schedule LRM: fork task with UID 1234
schedule CRM: idle as no result is available yet
schedule LRM: collect finished task UID and write result for CRM
              The result was an error
schedule LRM: fork task UID _AGAIN_
schedule CRM: processes the _previous_ erroneous result from the LRM and
              places vm:100 in started state on the source node
schedule LRM: collect finished task UID and write result for CRM
              This time the task was successful and the service is on the
              target node, but the CRM knows _nothing_ of it!
schedule LRM: try to schedule the task, but the service is not on this
              node! => error

To fix that we _never_ execute exactly the same command twice in a row,
where "exactly the same" means that the UID of the command to queue
already has a recorded result. This enforces the originally intended
SYN-ACK behaviour between the CRM and the LRMs.

We now generate a new UID for a service which does not change state
whenever the following evaluates to true:

* enabled AND in started state

This ensures that the state from the CRM holds in the LRM and thus, for
example, a killed VM gets restarted.

Note that the 'stopped' command is an exception, as we do not check its
result in the CRM (thus no race here) and we always want to execute it
(even when no CRM is active).
---
 src/PVE/HA/LRM.pm                          |  8 ++++++++
 src/PVE/HA/Manager.pm                      | 14 +++++++++-----
 src/test/test-resource-failure5/log.expect |  2 --
 3 files changed, 17 insertions(+), 7 deletions(-)

diff --git a/src/PVE/HA/LRM.pm b/src/PVE/HA/LRM.pm
index 7bbfe46..b2d7d37 100644
--- a/src/PVE/HA/LRM.pm
+++ b/src/PVE/HA/LRM.pm
@@ -455,6 +455,14 @@ sub manage_resources {
 sub queue_resource_command {
     my ($self, $sid, $uid, $state, $target) = @_;
 
+    # do not queue the exact same command twice, as this may lead to
+    # an inconsistent HA state when the first command fails but the CRM
+    # does not process its failure right away and the LRM starts a second
+    # try without the CRM knowing of it (race condition).
+    # The 'stopped' command is an exception, as we do not process its result
+    # in the CRM and we always want to execute it (even with no active CRM).
+    return if $state ne 'stopped' && $uid && defined($self->{results}->{$uid});
+
     if (my $w = $self->{workers}->{$sid}) {
         return if $w->{pid}; # already started
         # else, delete and overwrite queue entry with new command
diff --git a/src/PVE/HA/Manager.pm b/src/PVE/HA/Manager.pm
index 32b0ad7..1d11084 100644
--- a/src/PVE/HA/Manager.pm
+++ b/src/PVE/HA/Manager.pm
@@ -517,12 +517,14 @@ sub next_state_stopped {
         } else {
             $haenv->log('err', "unknown command '$cmd' for service '$sid'");
         }
-    } 
+    }
 
     if ($cd->{state} eq 'disabled') {
-        # do nothing
+        # NOTE: do nothing here; the stop state is an exception, as we do not
+        # process the LRM result here, thus the LRM always tries to stop the
+        # service (protection for the case that no CRM is active)
         return;
-    } 
+    }
 
     if ($cd->{state} eq 'enabled') {
         # simply mark it started, if it's on the wrong node
@@ -608,6 +610,7 @@ sub next_state_started {
                                " (exit code $ec))");
                 # we have no save way out (yet) for other errors
                 &$change_service_state($self, $sid, 'error');
+                return;
             }
         }
 
@@ -623,12 +626,13 @@ sub next_state_started {
                     &$change_service_state($self, $sid, 'relocate', node => $sd->{node}, target => $node);
                 }
             } else {
-                # do nothing
+                # ensure the service gets started again if it went down unexpectedly
+                $sd->{uid} = compute_new_uuid($sd->{state});
             }
         }
 
         return;
-    } 
+    }
 
     $haenv->log('err', "service '$sid' - unknown state '$cd->{state}' in service configuration");
 }
diff --git a/src/test/test-resource-failure5/log.expect b/src/test/test-resource-failure5/log.expect
index b6e7807..283ca8c 100644
--- a/src/test/test-resource-failure5/log.expect
+++ b/src/test/test-resource-failure5/log.expect
@@ -31,8 +31,6 @@ err 143 node2/lrm: unable to start service fa:130 on local node after 1 r
 err 160 node1/crm: recovery policy for service fa:130 failed, entering error state!
 info 160 node1/crm: service 'fa:130': state changed from 'started' to 'error'
 warn 163 node2/lrm: service fa:130 is not running and in an error state
-warn 183 node2/lrm: service fa:130 is not running and in an error state
-warn 203 node2/lrm: service fa:130 is not running and in an error state
 info 220 cmdlist: execute service fa:130 disabled
 info 220 node1/crm: service 'fa:130': state changed from 'error' to 'stopped'
 info 820 hardware: exit simulation - done
-- 
2.39.2
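
As a standalone illustration of the guard added in queue_resource_command()
above, here is a minimal Perl sketch. The %results and %workers hashes and the
queue_command() helper are hypothetical stand-ins for the LRM object's
$self->{results} and $self->{workers} fields; only the guard line mirrors the
actual patch.

    #!/usr/bin/perl
    use strict;
    use warnings;

    # Hypothetical stand-ins for the LRM's per-node bookkeeping: results of
    # finished workers keyed by UID, and queued/running workers keyed by SID.
    my %results = ( 'UID-1234' => { exit_code => 1 } );   # first try already failed
    my %workers;

    sub queue_command {
        my ($sid, $uid, $state) = @_;

        # the guard from queue_resource_command(): never queue a command whose
        # UID already has a recorded result, except 'stopped', which always runs
        return if $state ne 'stopped' && $uid && defined($results{$uid});

        $workers{$sid} = { uid => $uid, state => $state };
        print "queued '$state' for $sid (UID $uid)\n";
        return 1;
    }

    # A repeat of the failed 'migrate' with the same UID is refused ...
    queue_command('vm:100', 'UID-1234', 'migrate') or print "refused duplicate UID-1234\n";
    # ... and only runs again once the CRM hands out a fresh UID.
    queue_command('vm:100', 'UID-5678', 'migrate');
    # 'stopped' bypasses the guard and is queued even for a known UID.
    queue_command('vm:100', 'UID-1234', 'stopped');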