]> git.proxmox.com Git - corosync-pve.git/commitdiff
cpg: fix issue when nodes with low id hang
authorThomas Lamprecht <t.lamprecht@proxmox.com>
Wed, 25 Apr 2018 07:09:24 +0000 (09:09 +0200)
committerThomas Lamprecht <t.lamprecht@proxmox.com>
Wed, 25 Apr 2018 07:09:29 +0000 (09:09 +0200)
The cpg config change callback where not made correctly for nodes
with a low member id (lowest id == master node) after corosync on
said node hung, due to IO, artifical suspend or other scheduling
related hangs.

This is releated to an heuristic for choosing/syncing the CPG member
downlist added in 6bbbfcb6b4af72cf35ab9fdb4412fa6c6bdacc12 (corosync
repository).

See whole issue thread:
https://lists.clusterlabs.org/pipermail/users/2018-March/014594.html

Upstream pull-request:
https://github.com/corosync/corosync/pull/347

patches/0010-cpg-Inform-clients-about-left-nodes-during-pause.patch [new file with mode: 0644]
patches/series

diff --git a/patches/0010-cpg-Inform-clients-about-left-nodes-during-pause.patch b/patches/0010-cpg-Inform-clients-about-left-nodes-during-pause.patch
new file mode 100644 (file)
index 0000000..2e745c7
--- /dev/null
@@ -0,0 +1,322 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Jan Friesse <jfriesse@redhat.com>
+Date: Tue, 24 Apr 2018 17:44:48 +0200
+Subject: [PATCH] cpg: Inform clients about left nodes during pause
+
+Patch tries to fix incorrect behaviour during following test-case:
+- 3 nodes
+- Node 1 is paused
+- Node 2 and 3 detects node 1 as failed and informs CPG clients
+- Node 1 is unpaused
+- Node 1 clients are informed about new membership, but not about Node 1
+  being paused, so from Node 1 point-of-view, Node 2 and 3 failure
+
+Solution is to:
+- Remove downlist master choose and always choose local node downlist.
+  For Node 1 in example above, downlist contains Node 2 and 3.
+- Keep code which informs clients about left nodes
+- Use joinlist as a authoritative source of nodes/clients which exists
+  in membership
+
+This patch doesn't break backwards compatibility.
+
+I've walked thru all the patches which changed behavior of cpg to ensure
+patch does not break CPG behavior. Most important were:
+- 058f50314cd20abe67f5e8fb3c029a63b0e10cdc - Base. Code was significantly
+  changed to handle double free by split group_info into two structures
+  cpg_pd (local node clients) and process_info (all clients). Joinlist
+  was
+- 97c28ea756cdf59316b2f609103122cc678329bd - This patch removed
+  confchg_fn and made CPG sync correct
+- feff0e8542463773207a3b2c1f6004afba1f58d5 - I've tested described
+  behavior without any issues
+- 6bbbfcb6b4af72cf35ab9fdb4412fa6c6bdacc12 - Added idea of using
+  heuristics to choose same downlist on all nodes. Sadly this idea
+  was beginning of the problems described in
+  040fda8872a4a20340d73fa1c240b86afb2489f8,
+  ac1d79ea7c14997353427e962865781d0836d9fa,
+  559d4083ed8355fe83f275e53b9c8f52a91694b2,
+  02c5dffa5bb8579c223006fa1587de9ba7409a3d,
+  64d0e5ace025cc929e42896c5d6beb3ef75b8244 and
+  b55f32fe2e1538db33a1ec584b67744c724328c6
+- 02c5dffa5bb8579c223006fa1587de9ba7409a3d - Made joinlist as
+  authoritative source of nodes/clients but left downlist_master_choose
+  as a source of information about left nodes
+
+Long story made short. This patch basically reverts
+idea of using heuristics to choose same downlist on all nodes.
+
+Signed-off-by: Jan Friesse <jfriesse@redhat.com>
+Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
+---
+ exec/cpg.c | 164 +++++--------------------------------------------------------
+ 1 file changed, 11 insertions(+), 153 deletions(-)
+
+diff --git a/exec/cpg.c b/exec/cpg.c
+index 78ac1e9e..b851cba3 100644
+--- a/exec/cpg.c
++++ b/exec/cpg.c
+@@ -139,13 +139,6 @@ enum cpg_sync_state {
+       CPGSYNC_JOINLIST
+ };
+-enum cpg_downlist_state_e {
+-       CPG_DOWNLIST_NONE,
+-       CPG_DOWNLIST_WAITING_FOR_MESSAGES,
+-       CPG_DOWNLIST_APPLYING,
+-};
+-static enum cpg_downlist_state_e downlist_state;
+-static struct list_head downlist_messages_head;
+ static struct list_head joinlist_messages_head;
+ struct cpg_pd {
+@@ -295,9 +288,7 @@ static int cpg_exec_send_downlist(void);
+ static int cpg_exec_send_joinlist(void);
+-static void downlist_messages_delete (void);
+-
+-static void downlist_master_choose_and_send (void);
++static void downlist_inform_clients (void);
+ static void joinlist_inform_clients (void);
+@@ -499,14 +490,6 @@ struct req_exec_cpg_downlist {
+       mar_uint32_t nodeids[PROCESSOR_COUNT_MAX]  __attribute__((aligned(8)));
+ };
+-struct downlist_msg {
+-      mar_uint32_t sender_nodeid;
+-      mar_uint32_t old_members __attribute__((aligned(8)));
+-      mar_uint32_t left_nodes __attribute__((aligned(8)));
+-      mar_uint32_t nodeids[PROCESSOR_COUNT_MAX]  __attribute__((aligned(8)));
+-      struct list_head list;
+-};
+-
+ struct joinlist_msg {
+       mar_uint32_t sender_nodeid;
+       uint32_t pid;
+@@ -566,8 +549,6 @@ static void cpg_sync_init (
+       last_sync_ring_id.nodeid = ring_id->rep.nodeid;
+       last_sync_ring_id.seq = ring_id->seq;
+-      downlist_state = CPG_DOWNLIST_WAITING_FOR_MESSAGES;
+-
+       entries = 0;
+       /*
+        * Determine list of nodeids for downlist message
+@@ -611,14 +592,10 @@ static void cpg_sync_activate (void)
+               my_member_list_entries * sizeof (unsigned int));
+       my_old_member_list_entries = my_member_list_entries;
+-      if (downlist_state == CPG_DOWNLIST_WAITING_FOR_MESSAGES) {
+-              downlist_master_choose_and_send ();
+-      }
++      downlist_inform_clients ();
+       joinlist_inform_clients ();
+-      downlist_messages_delete ();
+-      downlist_state = CPG_DOWNLIST_NONE;
+       joinlist_messages_delete ();
+       notify_lib_totem_membership (NULL, my_member_list_entries, my_member_list);
+@@ -626,8 +603,7 @@ static void cpg_sync_activate (void)
+ static void cpg_sync_abort (void)
+ {
+-      downlist_state = CPG_DOWNLIST_NONE;
+-      downlist_messages_delete ();
++
+       joinlist_messages_delete ();
+ }
+@@ -800,76 +776,17 @@ static int notify_lib_joinlist(
+       return CS_OK;
+ }
+-static void downlist_log(const char *msg, struct downlist_msg* dl)
++static void downlist_log(const char *msg, struct req_exec_cpg_downlist *dl)
+ {
+       log_printf (LOG_DEBUG,
+-                  "%s: sender %s; members(old:%d left:%d)",
++                  "%s: members(old:%d left:%d)",
+                   msg,
+-                  api->totem_ifaces_print(dl->sender_nodeid),
+                   dl->old_members,
+                   dl->left_nodes);
+ }
+-static struct downlist_msg* downlist_master_choose (void)
++static void downlist_inform_clients (void)
+ {
+-      struct downlist_msg *cmp;
+-      struct downlist_msg *best = NULL;
+-      struct list_head *iter;
+-      uint32_t cmp_members;
+-      uint32_t best_members;
+-      uint32_t i;
+-      int ignore_msg;
+-
+-      for (iter = downlist_messages_head.next;
+-              iter != &downlist_messages_head;
+-              iter = iter->next) {
+-
+-              cmp = list_entry(iter, struct downlist_msg, list);
+-              downlist_log("comparing", cmp);
+-
+-              ignore_msg = 0;
+-              for (i = 0; i < cmp->left_nodes; i++) {
+-                      if (cmp->nodeids[i] == api->totem_nodeid_get()) {
+-                              log_printf (LOG_DEBUG, "Ignoring this entry because I'm in the left list\n");
+-
+-                              ignore_msg = 1;
+-                              break;
+-                      }
+-              }
+-
+-              if (ignore_msg) {
+-                      continue ;
+-              }
+-
+-              if (best == NULL) {
+-                      best = cmp;
+-                      continue;
+-              }
+-
+-              best_members = best->old_members - best->left_nodes;
+-              cmp_members = cmp->old_members - cmp->left_nodes;
+-
+-              if (cmp_members > best_members) {
+-                      best = cmp;
+-              } else if (cmp_members == best_members) {
+-                      if (cmp->old_members > best->old_members) {
+-                              best = cmp;
+-                      } else if (cmp->old_members == best->old_members) {
+-                              if (cmp->sender_nodeid < best->sender_nodeid) {
+-                                      best = cmp;
+-                              }
+-                      }
+-              }
+-      }
+-
+-      assert (best != NULL);
+-
+-      return best;
+-}
+-
+-static void downlist_master_choose_and_send (void)
+-{
+-      struct downlist_msg *stored_msg;
+       struct list_head *iter;
+       struct process_info *left_pi;
+       qb_map_t *group_map;
+@@ -884,14 +801,7 @@ static void downlist_master_choose_and_send (void)
+       qb_map_iter_t *miter;
+       int i, size;
+-      downlist_state = CPG_DOWNLIST_APPLYING;
+-
+-      stored_msg = downlist_master_choose ();
+-      if (!stored_msg) {
+-              log_printf (LOGSYS_LEVEL_DEBUG, "NO chosen downlist");
+-              return;
+-      }
+-      downlist_log("chosen downlist", stored_msg);
++      downlist_log("my downlist", &g_req_exec_cpg_downlist);
+       group_map = qb_skiplist_create();
+@@ -905,9 +815,9 @@ static void downlist_master_choose_and_send (void)
+               iter = iter->next;
+               left_pi = NULL;
+-              for (i = 0; i < stored_msg->left_nodes; i++) {
++              for (i = 0; i < g_req_exec_cpg_downlist.left_nodes; i++) {
+-                      if (pi->nodeid == stored_msg->nodeids[i]) {
++                      if (pi->nodeid == g_req_exec_cpg_downlist.nodeids[i]) {
+                               left_pi = pi;
+                               break;
+                       }
+@@ -1039,23 +949,6 @@ static void joinlist_inform_clients (void)
+       joinlist_remove_zombie_pi_entries ();
+ }
+-static void downlist_messages_delete (void)
+-{
+-      struct downlist_msg *stored_msg;
+-      struct list_head *iter, *iter_next;
+-
+-      for (iter = downlist_messages_head.next;
+-              iter != &downlist_messages_head;
+-              iter = iter_next) {
+-
+-              iter_next = iter->next;
+-
+-              stored_msg = list_entry(iter, struct downlist_msg, list);
+-              list_del (&stored_msg->list);
+-              free (stored_msg);
+-      }
+-}
+-
+ static void joinlist_messages_delete (void)
+ {
+       struct joinlist_msg *stored_msg;
+@@ -1076,7 +969,6 @@ static void joinlist_messages_delete (void)
+ static char *cpg_exec_init_fn (struct corosync_api_v1 *corosync_api)
+ {
+-      list_init (&downlist_messages_head);
+       list_init (&joinlist_messages_head);
+       api = corosync_api;
+       return (NULL);
+@@ -1338,43 +1230,9 @@ static void message_handler_req_exec_cpg_downlist(
+       unsigned int nodeid)
+ {
+       const struct req_exec_cpg_downlist *req_exec_cpg_downlist = message;
+-      int i;
+-      struct list_head *iter;
+-      struct downlist_msg *stored_msg;
+-      int found;
+-      if (downlist_state != CPG_DOWNLIST_WAITING_FOR_MESSAGES) {
+-              log_printf (LOGSYS_LEVEL_WARNING, "downlist left_list: %d received in state %d",
+-                      req_exec_cpg_downlist->left_nodes, downlist_state);
+-              return;
+-      }
+-
+-      stored_msg = malloc (sizeof (struct downlist_msg));
+-      stored_msg->sender_nodeid = nodeid;
+-      stored_msg->old_members = req_exec_cpg_downlist->old_members;
+-      stored_msg->left_nodes = req_exec_cpg_downlist->left_nodes;
+-      memcpy (stored_msg->nodeids, req_exec_cpg_downlist->nodeids,
+-              req_exec_cpg_downlist->left_nodes * sizeof (mar_uint32_t));
+-      list_init (&stored_msg->list);
+-      list_add (&stored_msg->list, &downlist_messages_head);
+-
+-      for (i = 0; i < my_member_list_entries; i++) {
+-              found = 0;
+-              for (iter = downlist_messages_head.next;
+-                      iter != &downlist_messages_head;
+-                      iter = iter->next) {
+-
+-                      stored_msg = list_entry(iter, struct downlist_msg, list);
+-                      if (my_member_list[i] == stored_msg->sender_nodeid) {
+-                              found = 1;
+-                      }
+-              }
+-              if (!found) {
+-                      return;
+-              }
+-      }
+-
+-      downlist_master_choose_and_send ();
++      log_printf (LOGSYS_LEVEL_WARNING, "downlist left_list: %d received",
++                      req_exec_cpg_downlist->left_nodes);
+ }
+-- 
+2.14.2
+
index 501b4f469beb39c3b1a8f3c23b690e8c7964e569..79314826dff75a28fbf5a6434d12b72c48f4d06c 100644 (file)
@@ -7,3 +7,4 @@
 0007-only-start-corosync.service-if-conf-exists.patch
 0008-remove-unecessary-and-problematic-corosync-qdevice.i.patch
 0009-totemcrypto-Check-length-of-the-packet.patch
+0010-cpg-Inform-clients-about-left-nodes-during-pause.patch