From 9dd866204c76fab850d5939018e7d5aa79b1df3c Mon Sep 17 00:00:00 2001 From: =?utf8?q?Fabian=20Gr=C3=BCnbichler?= Date: Wed, 30 Sep 2020 13:21:31 +0200 Subject: [PATCH] pmxcfs: protect CPG operations with mutex MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit cpg_mcast_joined (and transitively, cpg_join/leave) are not thread-safe. pmxcfs triggers such operations via FUSE and CPG dispatch callbacks, which are running in concurrent threads. accordingly, we need to protect these operations with a mutex, otherwise they might return CS_OK without actually doing what they were supposed to do (which in turn can lead to the dfsm taking a wrong turn and getting stuck in a supposedly short-lived state, blocking access via FUSE and getting whole clusters fenced). huge thanks to Alexandre Derumier for providing the initial bug report and quite a lot of test runs while debugging this issue. Signed-off-by: Fabian Grünbichler --- data/src/dfsm.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/data/src/dfsm.c b/data/src/dfsm.c index 172d877..17a3ba4 100644 --- a/data/src/dfsm.c +++ b/data/src/dfsm.c @@ -107,6 +107,7 @@ struct dfsm { cpg_callbacks_t *cpg_callbacks; dfsm_callbacks_t *dfsm_callbacks; cpg_handle_t cpg_handle; + GMutex cpg_mutex; struct cpg_name cpg_group_name; uint32_t nodeid; uint32_t pid; @@ -204,7 +205,9 @@ dfsm_send_message_full( cs_error_t result; int retries = 0; loop: + g_mutex_lock (&dfsm->cpg_mutex); result = cpg_mcast_joined(dfsm->cpg_handle, CPG_TYPE_AGREED, iov, len); + g_mutex_unlock (&dfsm->cpg_mutex); if (retry && result == CS_ERR_TRY_AGAIN) { nanosleep(&tvreq, NULL); ++retries; @@ -1250,7 +1253,9 @@ dfsm_new( if (!(dfsm->msg_queue = g_sequence_new(NULL))) goto err; - + + g_mutex_init(&dfsm->cpg_mutex); + dfsm->log_domain = log_domain; dfsm->data = data; dfsm->mode = DFSM_MODE_START; @@ -1424,7 +1429,9 @@ dfsm_join(dfsm_t *dfsm) struct timespec tvreq = { .tv_sec = 0, .tv_nsec = 100000000 }; int retries = 0; loop: + g_mutex_lock (&dfsm->cpg_mutex); result = cpg_join(dfsm->cpg_handle, &dfsm->cpg_group_name); + g_mutex_unlock (&dfsm->cpg_mutex); if (result == CS_ERR_TRY_AGAIN) { nanosleep(&tvreq, NULL); ++retries; @@ -1453,7 +1460,9 @@ dfsm_leave (dfsm_t *dfsm) struct timespec tvreq = { .tv_sec = 0, .tv_nsec = 100000000 }; int retries = 0; loop: + g_mutex_lock (&dfsm->cpg_mutex); result = cpg_leave(dfsm->cpg_handle, &dfsm->cpg_group_name); + g_mutex_unlock (&dfsm->cpg_mutex); if (result == CS_ERR_TRY_AGAIN) { nanosleep(&tvreq, NULL); ++retries; @@ -1509,6 +1518,8 @@ dfsm_destroy(dfsm_t *dfsm) g_mutex_clear (&dfsm->sync_mutex); g_cond_clear (&dfsm->sync_cond); + + g_mutex_clear (&dfsm->cpg_mutex); if (dfsm->results) g_hash_table_destroy(dfsm->results); -- 2.39.2