From 68da707062f1062556926b299bec84de1082e5e6 Mon Sep 17 00:00:00 2001 From: Thomas Lamprecht Date: Fri, 22 Dec 2017 14:34:32 +0100 Subject: [PATCH] add pmxcfs restart detection heuristic for IPCC Allow clean pmxcfs restarts to be fully transparent for the IPCC using perl stack above. A restart of pmxcfs invalidates the connection cache, we only set the cached connection to NULL for this case and the next call to ipcc_send_rec would connect anew. Further, such a restart may need quite a bit of time (seconds). Thus write a status file to flag a possible restart when terminating from pmxcfs. Delete this flag file once we're up and ready again. Error case handling is described further below. If a new connection fails and this flag file exists then retry connecting for a certain period (for now five seconds). If a cached connection fails always retry once, as every pmxcfs restart makes the cached connection invalid, even if IPCC would be fully up and ready again, and then also follow the connection polling heuristic if the restart flag exists, as new connections do. We use the monotonic clock to avoid problems if the (system) time changes and to keep things as easy as possible. We delete the flag file if an IPCC call could not connect in the grace period, but only if the file is still the same, i.e., no one else has deleted and recreated it in the meantime (e.g. a second cfs restart). This guarantees that IPCC calls try this heuristic only for a limited time (5 seconds until the first failed one) if the cfs does not start again. Further, as the flag resides in /run/... - which is always a tmpfs (thus in memory and thus cleaned upon reboot) we will not run into leftover flag files on a node reset, e.g. done by the HA watchdog for self-fencing. 
Signed-off-by: Thomas Lamprecht --- data/PVE/IPCC.xs | 62 +++++++++++++++++++++++++++++++++++++++++--- data/src/cfs-utils.h | 1 + data/src/pmxcfs.c | 6 +++++ 3 files changed, 66 insertions(+), 3 deletions(-) diff --git a/data/PVE/IPCC.xs b/data/PVE/IPCC.xs index ded9472..decc60b 100644 --- a/data/PVE/IPCC.xs +++ b/data/PVE/IPCC.xs @@ -14,6 +14,8 @@ #include #include #include +#include +#include #include #ifndef SCM_RIGHTS @@ -27,6 +29,9 @@ #include #include +#define RESTART_FLAG_FILE "/run/pve-cluster/cfs-restart-flag" +#define RESTART_GRACE_PERIOD 5 + #define PCS_SOCKET_NAME "pve2" #define PCS_SERVICE1 1 @@ -37,6 +42,52 @@ static pid_t conn_pid; static char ipcbuffer[MAX_MSG_SIZE]; +static qb_ipcc_connection_t *init_connection() { + + static qb_ipcc_connection_t *connection = NULL; + struct timespec retry_timeout, now; + int cfs_restart_flag_fd = -1; + + // check if pmxcfs is currently restarting + if ((cfs_restart_flag_fd = open(RESTART_FLAG_FILE, 0)) > 0) { + clock_gettime(CLOCK_MONOTONIC, &retry_timeout); + retry_timeout.tv_sec += RESTART_GRACE_PERIOD; + } + + qb_log_init("IPCC.xs", LOG_USER, LOG_EMERG); + qb_log_ctl(QB_LOG_SYSLOG, QB_LOG_CONF_ENABLED, QB_TRUE); + +retry_connection: + connection = qb_ipcc_connect(PCS_SOCKET_NAME, MAX_MSG_SIZE); + + if (!connection) { + if (cfs_restart_flag_fd >= 0) { + // cfs restarting and hopefully back soon, poll + clock_gettime(CLOCK_MONOTONIC, &now); + + if (now.tv_sec < retry_timeout.tv_sec || + (now.tv_sec == retry_timeout.tv_sec && + now.tv_nsec < retry_timeout.tv_nsec)) { + + usleep(100 * 1000); + goto retry_connection; + + } else { + // timeout: cleanup flag file if still the same + struct stat s; + fstat(cfs_restart_flag_fd, &s); + if (s.st_nlink > 0) + unlink(RESTART_FLAG_FILE); + } + } + } + + if (cfs_restart_flag_fd >= 0) close(cfs_restart_flag_fd); + + return connection; +} + + MODULE = PVE::IPCC PACKAGE = PVE::IPCC SV * @@ -46,6 +97,7 @@ SV * data; PROTOTYPE: $;$ CODE: { + uint8_t retried_cache_connection = 0; 
pid_t cpid = getpid(); /* Each process needs its own ipcc connection, @@ -56,9 +108,8 @@ CODE: } if (conn == NULL) { - qb_log_init("IPCC.xs", LOG_USER, LOG_EMERG); - qb_log_ctl(QB_LOG_SYSLOG, QB_LOG_CONF_ENABLED, QB_TRUE); - conn = qb_ipcc_connect(PCS_SOCKET_NAME, MAX_MSG_SIZE); +recache_connection: + conn = init_connection(); if (!conn) XSRETURN_UNDEF; @@ -89,6 +140,11 @@ CODE: if (res < 0) { qb_ipcc_disconnect(conn); conn = NULL; + // requests during cfs restart and the first thereafter will fail, retry + if (!retried_cache_connection) { + retried_cache_connection = 1; + goto recache_connection; + } errno = -res; XSRETURN_UNDEF; } diff --git a/data/src/cfs-utils.h b/data/src/cfs-utils.h index 0271566..6b22d45 100644 --- a/data/src/cfs-utils.h +++ b/data/src/cfs-utils.h @@ -32,6 +32,7 @@ #define HOST_CLUSTER_CONF_FN "/etc/corosync/corosync.conf" #define CFS_PID_FN "/var/run/pve-cluster.pid" #define VARLIBDIR "/var/lib/pve-cluster" +#define RUNDIR "/run/pve-cluster" #define CFS_MAX(a, b) (((a) > (b)) ? (a) : (b)) #define CFS_MIN(a, b) (((a) < (b)) ? (a) : (b)) diff --git a/data/src/pmxcfs.c b/data/src/pmxcfs.c index bb5b4ad..6047ad0 100644 --- a/data/src/pmxcfs.c +++ b/data/src/pmxcfs.c @@ -58,6 +58,7 @@ #define DBFILENAME VARLIBDIR "/config.db" #define LOCKFILE VARLIBDIR "/.pmxcfs.lockfile" +#define RESTART_FLAG_FILE RUNDIR "/cfs-restart-flag" #define CFSDIR "/etc/pve" @@ -860,6 +861,7 @@ int main(int argc, char *argv[]) umask(027); mkdir(VARLIBDIR, 0755); + mkdir(RUNDIR, 0755); if ((lockfd = open(LOCKFILE, O_RDWR|O_CREAT|O_APPEND, 0600)) == -1) { cfs_critical("unable to create lock '%s': %s", LOCKFILE, strerror (errno)); @@ -1018,8 +1020,12 @@ int main(int argc, char *argv[]) server_start(memdb); + unlink(RESTART_FLAG_FILE); + ret = fuse_loop_mt(fuse); + open(RESTART_FLAG_FILE, O_CREAT|O_NOCTTY|O_NONBLOCK); + cfs_message("teardown filesystem"); server_stop(); -- 2.39.2