From 6a2849ec511e6b82721d3e6aa91ea1d26639ac47 Mon Sep 17 00:00:00 2001 From: Thomas Lamprecht Date: Wed, 25 Apr 2018 09:09:24 +0200 Subject: [PATCH] cpg: fix issue when nodes with low id hang The cpg config change callbacks were not made correctly for nodes with a low member id (lowest id == master node) after corosync on said node hung, due to IO, artificial suspend or other scheduling related hangs. This is related to a heuristic for choosing/syncing the CPG member downlist added in 6bbbfcb6b4af72cf35ab9fdb4412fa6c6bdacc12 (corosync repository). See whole issue thread: https://lists.clusterlabs.org/pipermail/users/2018-March/014594.html Upstream pull-request: https://github.com/corosync/corosync/pull/347 --- ...lients-about-left-nodes-during-pause.patch | 322 ++++++++++++++++++ patches/series | 1 + 2 files changed, 323 insertions(+) create mode 100644 patches/0010-cpg-Inform-clients-about-left-nodes-during-pause.patch diff --git a/patches/0010-cpg-Inform-clients-about-left-nodes-during-pause.patch b/patches/0010-cpg-Inform-clients-about-left-nodes-during-pause.patch new file mode 100644 index 0000000..2e745c7 --- /dev/null +++ b/patches/0010-cpg-Inform-clients-about-left-nodes-during-pause.patch @@ -0,0 +1,322 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Jan Friesse +Date: Tue, 24 Apr 2018 17:44:48 +0200 +Subject: [PATCH] cpg: Inform clients about left nodes during pause + +Patch tries to fix incorrect behaviour during following test-case: +- 3 nodes +- Node 1 is paused +- Node 2 and 3 detects node 1 as failed and informs CPG clients +- Node 1 is unpaused +- Node 1 clients are informed about new membership, but not about Node 1 + being paused, so from Node 1 point-of-view, Node 2 and 3 failure + +Solution is to: +- Remove downlist master choose and always choose local node downlist. + For Node 1 in example above, downlist contains Node 2 and 3. 
+- Keep code which informs clients about left nodes +- Use joinlist as a authoritative source of nodes/clients which exists + in membership + +This patch doesn't break backwards compatibility. + +I've walked thru all the patches which changed behavior of cpg to ensure +patch does not break CPG behavior. Most important were: +- 058f50314cd20abe67f5e8fb3c029a63b0e10cdc - Base. Code was significantly + changed to handle double free by split group_info into two structures + cpg_pd (local node clients) and process_info (all clients). Joinlist + was +- 97c28ea756cdf59316b2f609103122cc678329bd - This patch removed + confchg_fn and made CPG sync correct +- feff0e8542463773207a3b2c1f6004afba1f58d5 - I've tested described + behavior without any issues +- 6bbbfcb6b4af72cf35ab9fdb4412fa6c6bdacc12 - Added idea of using + heuristics to choose same downlist on all nodes. Sadly this idea + was beginning of the problems described in + 040fda8872a4a20340d73fa1c240b86afb2489f8, + ac1d79ea7c14997353427e962865781d0836d9fa, + 559d4083ed8355fe83f275e53b9c8f52a91694b2, + 02c5dffa5bb8579c223006fa1587de9ba7409a3d, + 64d0e5ace025cc929e42896c5d6beb3ef75b8244 and + b55f32fe2e1538db33a1ec584b67744c724328c6 +- 02c5dffa5bb8579c223006fa1587de9ba7409a3d - Made joinlist as + authoritative source of nodes/clients but left downlist_master_choose + as a source of information about left nodes + +Long story made short. This patch basically reverts +idea of using heuristics to choose same downlist on all nodes. 
+ +Signed-off-by: Jan Friesse +Signed-off-by: Thomas Lamprecht +--- + exec/cpg.c | 164 +++++-------------------------------------------------------- + 1 file changed, 11 insertions(+), 153 deletions(-) + +diff --git a/exec/cpg.c b/exec/cpg.c +index 78ac1e9e..b851cba3 100644 +--- a/exec/cpg.c ++++ b/exec/cpg.c +@@ -139,13 +139,6 @@ enum cpg_sync_state { + CPGSYNC_JOINLIST + }; + +-enum cpg_downlist_state_e { +- CPG_DOWNLIST_NONE, +- CPG_DOWNLIST_WAITING_FOR_MESSAGES, +- CPG_DOWNLIST_APPLYING, +-}; +-static enum cpg_downlist_state_e downlist_state; +-static struct list_head downlist_messages_head; + static struct list_head joinlist_messages_head; + + struct cpg_pd { +@@ -295,9 +288,7 @@ static int cpg_exec_send_downlist(void); + + static int cpg_exec_send_joinlist(void); + +-static void downlist_messages_delete (void); +- +-static void downlist_master_choose_and_send (void); ++static void downlist_inform_clients (void); + + static void joinlist_inform_clients (void); + +@@ -499,14 +490,6 @@ struct req_exec_cpg_downlist { + mar_uint32_t nodeids[PROCESSOR_COUNT_MAX] __attribute__((aligned(8))); + }; + +-struct downlist_msg { +- mar_uint32_t sender_nodeid; +- mar_uint32_t old_members __attribute__((aligned(8))); +- mar_uint32_t left_nodes __attribute__((aligned(8))); +- mar_uint32_t nodeids[PROCESSOR_COUNT_MAX] __attribute__((aligned(8))); +- struct list_head list; +-}; +- + struct joinlist_msg { + mar_uint32_t sender_nodeid; + uint32_t pid; +@@ -566,8 +549,6 @@ static void cpg_sync_init ( + last_sync_ring_id.nodeid = ring_id->rep.nodeid; + last_sync_ring_id.seq = ring_id->seq; + +- downlist_state = CPG_DOWNLIST_WAITING_FOR_MESSAGES; +- + entries = 0; + /* + * Determine list of nodeids for downlist message +@@ -611,14 +592,10 @@ static void cpg_sync_activate (void) + my_member_list_entries * sizeof (unsigned int)); + my_old_member_list_entries = my_member_list_entries; + +- if (downlist_state == CPG_DOWNLIST_WAITING_FOR_MESSAGES) { +- downlist_master_choose_and_send (); 
+- } ++ downlist_inform_clients (); + + joinlist_inform_clients (); + +- downlist_messages_delete (); +- downlist_state = CPG_DOWNLIST_NONE; + joinlist_messages_delete (); + + notify_lib_totem_membership (NULL, my_member_list_entries, my_member_list); +@@ -626,8 +603,7 @@ static void cpg_sync_activate (void) + + static void cpg_sync_abort (void) + { +- downlist_state = CPG_DOWNLIST_NONE; +- downlist_messages_delete (); ++ + joinlist_messages_delete (); + } + +@@ -800,76 +776,17 @@ static int notify_lib_joinlist( + return CS_OK; + } + +-static void downlist_log(const char *msg, struct downlist_msg* dl) ++static void downlist_log(const char *msg, struct req_exec_cpg_downlist *dl) + { + log_printf (LOG_DEBUG, +- "%s: sender %s; members(old:%d left:%d)", ++ "%s: members(old:%d left:%d)", + msg, +- api->totem_ifaces_print(dl->sender_nodeid), + dl->old_members, + dl->left_nodes); + } + +-static struct downlist_msg* downlist_master_choose (void) ++static void downlist_inform_clients (void) + { +- struct downlist_msg *cmp; +- struct downlist_msg *best = NULL; +- struct list_head *iter; +- uint32_t cmp_members; +- uint32_t best_members; +- uint32_t i; +- int ignore_msg; +- +- for (iter = downlist_messages_head.next; +- iter != &downlist_messages_head; +- iter = iter->next) { +- +- cmp = list_entry(iter, struct downlist_msg, list); +- downlist_log("comparing", cmp); +- +- ignore_msg = 0; +- for (i = 0; i < cmp->left_nodes; i++) { +- if (cmp->nodeids[i] == api->totem_nodeid_get()) { +- log_printf (LOG_DEBUG, "Ignoring this entry because I'm in the left list\n"); +- +- ignore_msg = 1; +- break; +- } +- } +- +- if (ignore_msg) { +- continue ; +- } +- +- if (best == NULL) { +- best = cmp; +- continue; +- } +- +- best_members = best->old_members - best->left_nodes; +- cmp_members = cmp->old_members - cmp->left_nodes; +- +- if (cmp_members > best_members) { +- best = cmp; +- } else if (cmp_members == best_members) { +- if (cmp->old_members > best->old_members) { +- best = cmp; +- 
} else if (cmp->old_members == best->old_members) { +- if (cmp->sender_nodeid < best->sender_nodeid) { +- best = cmp; +- } +- } +- } +- } +- +- assert (best != NULL); +- +- return best; +-} +- +-static void downlist_master_choose_and_send (void) +-{ +- struct downlist_msg *stored_msg; + struct list_head *iter; + struct process_info *left_pi; + qb_map_t *group_map; +@@ -884,14 +801,7 @@ static void downlist_master_choose_and_send (void) + qb_map_iter_t *miter; + int i, size; + +- downlist_state = CPG_DOWNLIST_APPLYING; +- +- stored_msg = downlist_master_choose (); +- if (!stored_msg) { +- log_printf (LOGSYS_LEVEL_DEBUG, "NO chosen downlist"); +- return; +- } +- downlist_log("chosen downlist", stored_msg); ++ downlist_log("my downlist", &g_req_exec_cpg_downlist); + + group_map = qb_skiplist_create(); + +@@ -905,9 +815,9 @@ static void downlist_master_choose_and_send (void) + iter = iter->next; + + left_pi = NULL; +- for (i = 0; i < stored_msg->left_nodes; i++) { ++ for (i = 0; i < g_req_exec_cpg_downlist.left_nodes; i++) { + +- if (pi->nodeid == stored_msg->nodeids[i]) { ++ if (pi->nodeid == g_req_exec_cpg_downlist.nodeids[i]) { + left_pi = pi; + break; + } +@@ -1039,23 +949,6 @@ static void joinlist_inform_clients (void) + joinlist_remove_zombie_pi_entries (); + } + +-static void downlist_messages_delete (void) +-{ +- struct downlist_msg *stored_msg; +- struct list_head *iter, *iter_next; +- +- for (iter = downlist_messages_head.next; +- iter != &downlist_messages_head; +- iter = iter_next) { +- +- iter_next = iter->next; +- +- stored_msg = list_entry(iter, struct downlist_msg, list); +- list_del (&stored_msg->list); +- free (stored_msg); +- } +-} +- + static void joinlist_messages_delete (void) + { + struct joinlist_msg *stored_msg; +@@ -1076,7 +969,6 @@ static void joinlist_messages_delete (void) + + static char *cpg_exec_init_fn (struct corosync_api_v1 *corosync_api) + { +- list_init (&downlist_messages_head); + list_init (&joinlist_messages_head); + api = 
corosync_api; + return (NULL); +@@ -1338,43 +1230,9 @@ static void message_handler_req_exec_cpg_downlist( + unsigned int nodeid) + { + const struct req_exec_cpg_downlist *req_exec_cpg_downlist = message; +- int i; +- struct list_head *iter; +- struct downlist_msg *stored_msg; +- int found; + +- if (downlist_state != CPG_DOWNLIST_WAITING_FOR_MESSAGES) { +- log_printf (LOGSYS_LEVEL_WARNING, "downlist left_list: %d received in state %d", +- req_exec_cpg_downlist->left_nodes, downlist_state); +- return; +- } +- +- stored_msg = malloc (sizeof (struct downlist_msg)); +- stored_msg->sender_nodeid = nodeid; +- stored_msg->old_members = req_exec_cpg_downlist->old_members; +- stored_msg->left_nodes = req_exec_cpg_downlist->left_nodes; +- memcpy (stored_msg->nodeids, req_exec_cpg_downlist->nodeids, +- req_exec_cpg_downlist->left_nodes * sizeof (mar_uint32_t)); +- list_init (&stored_msg->list); +- list_add (&stored_msg->list, &downlist_messages_head); +- +- for (i = 0; i < my_member_list_entries; i++) { +- found = 0; +- for (iter = downlist_messages_head.next; +- iter != &downlist_messages_head; +- iter = iter->next) { +- +- stored_msg = list_entry(iter, struct downlist_msg, list); +- if (my_member_list[i] == stored_msg->sender_nodeid) { +- found = 1; +- } +- } +- if (!found) { +- return; +- } +- } +- +- downlist_master_choose_and_send (); ++ log_printf (LOGSYS_LEVEL_WARNING, "downlist left_list: %d received", ++ req_exec_cpg_downlist->left_nodes); + } + + +-- +2.14.2 + diff --git a/patches/series b/patches/series index 501b4f4..7931482 100644 --- a/patches/series +++ b/patches/series @@ -7,3 +7,4 @@ 0007-only-start-corosync.service-if-conf-exists.patch 0008-remove-unecessary-and-problematic-corosync-qdevice.i.patch 0009-totemcrypto-Check-length-of-the-packet.patch +0010-cpg-Inform-clients-about-left-nodes-during-pause.patch -- 2.39.2