]> git.proxmox.com Git - corosync-pve.git/blob - patches/0010-cpg-Inform-clients-about-left-nodes-during-pause.patch
build: move some patches into debian/
[corosync-pve.git] / patches / 0010-cpg-Inform-clients-about-left-nodes-during-pause.patch
1 From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
2 From: =?UTF-8?q?Fabian=20Gr=C3=BCnbichler?= <f.gruenbichler@proxmox.com>
3 Date: Wed, 24 Oct 2018 15:11:47 +0200
4 Subject: [PATCH] cpg: Inform clients about left nodes during pause
5 MIME-Version: 1.0
6 Content-Type: text/plain; charset=UTF-8
7 Content-Transfer-Encoding: 8bit
8
9 Patch tries to fix incorrect behaviour during following test-case:
10 - 3 nodes
11 - Node 1 is paused
12 - Node 2 and 3 detect node 1 as failed and inform CPG clients
13 - Node 1 is unpaused
14 - Node 1 clients are informed about new membership, but not about Node 1
15   being paused, so from Node 1's point of view, Node 2 and 3 appear failed
16
17 Solution is to:
18 - Remove downlist master choose and always choose local node downlist.
19 For Node 1 in example above, downlist contains Node 2 and 3.
20 - Keep code which informs clients about left nodes
21 - Use joinlist as an authoritative source of nodes/clients which exist
22 in membership
23
24 This patch doesn't break backwards compatibility.
25
26 I've walked through all the patches which changed behavior of cpg to ensure
27 patch does not break CPG behavior. Most important were:
28 - 058f50314cd20abe67f5e8fb3c029a63b0e10cdc - Base. Code was significantly
29 changed to handle double free by split group_info into two structures
30 cpg_pd (local node clients) and process_info (all clients). Joinlist
31 was
32 - 97c28ea756cdf59316b2f609103122cc678329bd - This patch removed
33 confchg_fn and made CPG sync correct
34 - feff0e8542463773207a3b2c1f6004afba1f58d5 - I've tested described
35 behavior without any issues
36 - 6bbbfcb6b4af72cf35ab9fdb4412fa6c6bdacc12 - Added idea of using
37 heuristics to choose same downlist on all nodes. Sadly this idea
38 was beginning of the problems described in
39 040fda8872a4a20340d73fa1c240b86afb2489f8,
40 ac1d79ea7c14997353427e962865781d0836d9fa,
41 559d4083ed8355fe83f275e53b9c8f52a91694b2,
42 02c5dffa5bb8579c223006fa1587de9ba7409a3d,
43 64d0e5ace025cc929e42896c5d6beb3ef75b8244 and
44 b55f32fe2e1538db33a1ec584b67744c724328c6
45 - 02c5dffa5bb8579c223006fa1587de9ba7409a3d - Made joinlist as
46 authoritative source of nodes/clients but left downlist_master_choose
47 as a source of information about left nodes
48
49 Long story made short. This patch basically reverts
50 idea of using heuristics to choose same downlist on all nodes.
51
52 Signed-off-by: Jan Friesse <jfriesse@redhat.com>
53 Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
54 Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
55 ---
56 ...lients-about-left-nodes-during-pause.patch | 326 ++++++++++++++++++
57 debian/patches/series | 1 +
58 2 files changed, 327 insertions(+)
59 create mode 100644 debian/patches/pve__cpg-Inform-clients-about-left-nodes-during-pause.patch
60
61 diff --git a/debian/patches/pve__cpg-Inform-clients-about-left-nodes-during-pause.patch b/debian/patches/pve__cpg-Inform-clients-about-left-nodes-during-pause.patch
62 new file mode 100644
63 index 00000000..a8c9a50c
64 --- /dev/null
65 +++ b/debian/patches/pve__cpg-Inform-clients-about-left-nodes-during-pause.patch
66 @@ -0,0 +1,326 @@
67 +From 60b7c9ebfca5cc55c2cb222264a36e597ec5a29f Mon Sep 17 00:00:00 2001
68 +From: Jan Friesse <jfriesse@redhat.com>
69 +Date: Tue, 24 Apr 2018 17:44:48 +0200
70 +Subject: [PATCH] cpg: Inform clients about left nodes during pause
71 +MIME-Version: 1.0
72 +Content-Type: text/plain; charset=UTF-8
73 +Content-Transfer-Encoding: 8bit
74 +
75 +Patch tries to fix incorrect behaviour during following test-case:
76 +- 3 nodes
77 +- Node 1 is paused
78 +- Node 2 and 3 detects node 1 as failed and informs CPG clients
79 +- Node 1 is unpaused
80 +- Node 1 clients are informed about new membership, but not about Node 1
81 + being paused, so from Node 1 point-of-view, Node 2 and 3 failure
82 +
83 +Solution is to:
84 +- Remove downlist master choose and always choose local node downlist.
85 + For Node 1 in example above, downlist contains Node 2 and 3.
86 +- Keep code which informs clients about left nodes
87 +- Use joinlist as a authoritative source of nodes/clients which exists
88 + in membership
89 +
90 +This patch doesn't break backwards compatibility.
91 +
92 +I've walked thru all the patches which changed behavior of cpg to ensure
93 +patch does not break CPG behavior. Most important were:
94 +- 058f50314cd20abe67f5e8fb3c029a63b0e10cdc - Base. Code was significantly
95 + changed to handle double free by split group_info into two structures
96 + cpg_pd (local node clients) and process_info (all clients). Joinlist
97 + was
98 +- 97c28ea756cdf59316b2f609103122cc678329bd - This patch removed
99 + confchg_fn and made CPG sync correct
100 +- feff0e8542463773207a3b2c1f6004afba1f58d5 - I've tested described
101 + behavior without any issues
102 +- 6bbbfcb6b4af72cf35ab9fdb4412fa6c6bdacc12 - Added idea of using
103 + heuristics to choose same downlist on all nodes. Sadly this idea
104 + was beginning of the problems described in
105 + 040fda8872a4a20340d73fa1c240b86afb2489f8,
106 + ac1d79ea7c14997353427e962865781d0836d9fa,
107 + 559d4083ed8355fe83f275e53b9c8f52a91694b2,
108 + 02c5dffa5bb8579c223006fa1587de9ba7409a3d,
109 + 64d0e5ace025cc929e42896c5d6beb3ef75b8244 and
110 + b55f32fe2e1538db33a1ec584b67744c724328c6
111 +- 02c5dffa5bb8579c223006fa1587de9ba7409a3d - Made joinlist as
112 + authoritative source of nodes/clients but left downlist_master_choose
113 + as a source of information about left nodes
114 +
115 +Long story made short. This patch basically reverts
116 +idea of using heuristics to choose same downlist on all nodes.
117 +
118 +Signed-off-by: Jan Friesse <jfriesse@redhat.com>
119 +Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
120 +Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
121 +---
122 + exec/cpg.c | 164 ++++-------------------------------------------------
123 + 1 file changed, 11 insertions(+), 153 deletions(-)
124 +
125 +diff --git a/exec/cpg.c b/exec/cpg.c
126 +index 78ac1e9e..b851cba3 100644
127 +--- a/exec/cpg.c
128 ++++ b/exec/cpg.c
129 +@@ -139,13 +139,6 @@ enum cpg_sync_state {
130 + CPGSYNC_JOINLIST
131 + };
132 +
133 +-enum cpg_downlist_state_e {
134 +- CPG_DOWNLIST_NONE,
135 +- CPG_DOWNLIST_WAITING_FOR_MESSAGES,
136 +- CPG_DOWNLIST_APPLYING,
137 +-};
138 +-static enum cpg_downlist_state_e downlist_state;
139 +-static struct list_head downlist_messages_head;
140 + static struct list_head joinlist_messages_head;
141 +
142 + struct cpg_pd {
143 +@@ -295,9 +288,7 @@ static int cpg_exec_send_downlist(void);
144 +
145 + static int cpg_exec_send_joinlist(void);
146 +
147 +-static void downlist_messages_delete (void);
148 +-
149 +-static void downlist_master_choose_and_send (void);
150 ++static void downlist_inform_clients (void);
151 +
152 + static void joinlist_inform_clients (void);
153 +
154 +@@ -499,14 +490,6 @@ struct req_exec_cpg_downlist {
155 + mar_uint32_t nodeids[PROCESSOR_COUNT_MAX] __attribute__((aligned(8)));
156 + };
157 +
158 +-struct downlist_msg {
159 +- mar_uint32_t sender_nodeid;
160 +- mar_uint32_t old_members __attribute__((aligned(8)));
161 +- mar_uint32_t left_nodes __attribute__((aligned(8)));
162 +- mar_uint32_t nodeids[PROCESSOR_COUNT_MAX] __attribute__((aligned(8)));
163 +- struct list_head list;
164 +-};
165 +-
166 + struct joinlist_msg {
167 + mar_uint32_t sender_nodeid;
168 + uint32_t pid;
169 +@@ -566,8 +549,6 @@ static void cpg_sync_init (
170 + last_sync_ring_id.nodeid = ring_id->rep.nodeid;
171 + last_sync_ring_id.seq = ring_id->seq;
172 +
173 +- downlist_state = CPG_DOWNLIST_WAITING_FOR_MESSAGES;
174 +-
175 + entries = 0;
176 + /*
177 + * Determine list of nodeids for downlist message
178 +@@ -611,14 +592,10 @@ static void cpg_sync_activate (void)
179 + my_member_list_entries * sizeof (unsigned int));
180 + my_old_member_list_entries = my_member_list_entries;
181 +
182 +- if (downlist_state == CPG_DOWNLIST_WAITING_FOR_MESSAGES) {
183 +- downlist_master_choose_and_send ();
184 +- }
185 ++ downlist_inform_clients ();
186 +
187 + joinlist_inform_clients ();
188 +
189 +- downlist_messages_delete ();
190 +- downlist_state = CPG_DOWNLIST_NONE;
191 + joinlist_messages_delete ();
192 +
193 + notify_lib_totem_membership (NULL, my_member_list_entries, my_member_list);
194 +@@ -626,8 +603,7 @@ static void cpg_sync_activate (void)
195 +
196 + static void cpg_sync_abort (void)
197 + {
198 +- downlist_state = CPG_DOWNLIST_NONE;
199 +- downlist_messages_delete ();
200 ++
201 + joinlist_messages_delete ();
202 + }
203 +
204 +@@ -800,76 +776,17 @@ static int notify_lib_joinlist(
205 + return CS_OK;
206 + }
207 +
208 +-static void downlist_log(const char *msg, struct downlist_msg* dl)
209 ++static void downlist_log(const char *msg, struct req_exec_cpg_downlist *dl)
210 + {
211 + log_printf (LOG_DEBUG,
212 +- "%s: sender %s; members(old:%d left:%d)",
213 ++ "%s: members(old:%d left:%d)",
214 + msg,
215 +- api->totem_ifaces_print(dl->sender_nodeid),
216 + dl->old_members,
217 + dl->left_nodes);
218 + }
219 +
220 +-static struct downlist_msg* downlist_master_choose (void)
221 ++static void downlist_inform_clients (void)
222 + {
223 +- struct downlist_msg *cmp;
224 +- struct downlist_msg *best = NULL;
225 +- struct list_head *iter;
226 +- uint32_t cmp_members;
227 +- uint32_t best_members;
228 +- uint32_t i;
229 +- int ignore_msg;
230 +-
231 +- for (iter = downlist_messages_head.next;
232 +- iter != &downlist_messages_head;
233 +- iter = iter->next) {
234 +-
235 +- cmp = list_entry(iter, struct downlist_msg, list);
236 +- downlist_log("comparing", cmp);
237 +-
238 +- ignore_msg = 0;
239 +- for (i = 0; i < cmp->left_nodes; i++) {
240 +- if (cmp->nodeids[i] == api->totem_nodeid_get()) {
241 +- log_printf (LOG_DEBUG, "Ignoring this entry because I'm in the left list\n");
242 +-
243 +- ignore_msg = 1;
244 +- break;
245 +- }
246 +- }
247 +-
248 +- if (ignore_msg) {
249 +- continue ;
250 +- }
251 +-
252 +- if (best == NULL) {
253 +- best = cmp;
254 +- continue;
255 +- }
256 +-
257 +- best_members = best->old_members - best->left_nodes;
258 +- cmp_members = cmp->old_members - cmp->left_nodes;
259 +-
260 +- if (cmp_members > best_members) {
261 +- best = cmp;
262 +- } else if (cmp_members == best_members) {
263 +- if (cmp->old_members > best->old_members) {
264 +- best = cmp;
265 +- } else if (cmp->old_members == best->old_members) {
266 +- if (cmp->sender_nodeid < best->sender_nodeid) {
267 +- best = cmp;
268 +- }
269 +- }
270 +- }
271 +- }
272 +-
273 +- assert (best != NULL);
274 +-
275 +- return best;
276 +-}
277 +-
278 +-static void downlist_master_choose_and_send (void)
279 +-{
280 +- struct downlist_msg *stored_msg;
281 + struct list_head *iter;
282 + struct process_info *left_pi;
283 + qb_map_t *group_map;
284 +@@ -884,14 +801,7 @@ static void downlist_master_choose_and_send (void)
285 + qb_map_iter_t *miter;
286 + int i, size;
287 +
288 +- downlist_state = CPG_DOWNLIST_APPLYING;
289 +-
290 +- stored_msg = downlist_master_choose ();
291 +- if (!stored_msg) {
292 +- log_printf (LOGSYS_LEVEL_DEBUG, "NO chosen downlist");
293 +- return;
294 +- }
295 +- downlist_log("chosen downlist", stored_msg);
296 ++ downlist_log("my downlist", &g_req_exec_cpg_downlist);
297 +
298 + group_map = qb_skiplist_create();
299 +
300 +@@ -905,9 +815,9 @@ static void downlist_master_choose_and_send (void)
301 + iter = iter->next;
302 +
303 + left_pi = NULL;
304 +- for (i = 0; i < stored_msg->left_nodes; i++) {
305 ++ for (i = 0; i < g_req_exec_cpg_downlist.left_nodes; i++) {
306 +
307 +- if (pi->nodeid == stored_msg->nodeids[i]) {
308 ++ if (pi->nodeid == g_req_exec_cpg_downlist.nodeids[i]) {
309 + left_pi = pi;
310 + break;
311 + }
312 +@@ -1039,23 +949,6 @@ static void joinlist_inform_clients (void)
313 + joinlist_remove_zombie_pi_entries ();
314 + }
315 +
316 +-static void downlist_messages_delete (void)
317 +-{
318 +- struct downlist_msg *stored_msg;
319 +- struct list_head *iter, *iter_next;
320 +-
321 +- for (iter = downlist_messages_head.next;
322 +- iter != &downlist_messages_head;
323 +- iter = iter_next) {
324 +-
325 +- iter_next = iter->next;
326 +-
327 +- stored_msg = list_entry(iter, struct downlist_msg, list);
328 +- list_del (&stored_msg->list);
329 +- free (stored_msg);
330 +- }
331 +-}
332 +-
333 + static void joinlist_messages_delete (void)
334 + {
335 + struct joinlist_msg *stored_msg;
336 +@@ -1076,7 +969,6 @@ static void joinlist_messages_delete (void)
337 +
338 + static char *cpg_exec_init_fn (struct corosync_api_v1 *corosync_api)
339 + {
340 +- list_init (&downlist_messages_head);
341 + list_init (&joinlist_messages_head);
342 + api = corosync_api;
343 + return (NULL);
344 +@@ -1338,43 +1230,9 @@ static void message_handler_req_exec_cpg_downlist(
345 + unsigned int nodeid)
346 + {
347 + const struct req_exec_cpg_downlist *req_exec_cpg_downlist = message;
348 +- int i;
349 +- struct list_head *iter;
350 +- struct downlist_msg *stored_msg;
351 +- int found;
352 +-
353 +- if (downlist_state != CPG_DOWNLIST_WAITING_FOR_MESSAGES) {
354 +- log_printf (LOGSYS_LEVEL_WARNING, "downlist left_list: %d received in state %d",
355 +- req_exec_cpg_downlist->left_nodes, downlist_state);
356 +- return;
357 +- }
358 +-
359 +- stored_msg = malloc (sizeof (struct downlist_msg));
360 +- stored_msg->sender_nodeid = nodeid;
361 +- stored_msg->old_members = req_exec_cpg_downlist->old_members;
362 +- stored_msg->left_nodes = req_exec_cpg_downlist->left_nodes;
363 +- memcpy (stored_msg->nodeids, req_exec_cpg_downlist->nodeids,
364 +- req_exec_cpg_downlist->left_nodes * sizeof (mar_uint32_t));
365 +- list_init (&stored_msg->list);
366 +- list_add (&stored_msg->list, &downlist_messages_head);
367 +-
368 +- for (i = 0; i < my_member_list_entries; i++) {
369 +- found = 0;
370 +- for (iter = downlist_messages_head.next;
371 +- iter != &downlist_messages_head;
372 +- iter = iter->next) {
373 +-
374 +- stored_msg = list_entry(iter, struct downlist_msg, list);
375 +- if (my_member_list[i] == stored_msg->sender_nodeid) {
376 +- found = 1;
377 +- }
378 +- }
379 +- if (!found) {
380 +- return;
381 +- }
382 +- }
383 +
384 +- downlist_master_choose_and_send ();
385 ++ log_printf (LOGSYS_LEVEL_WARNING, "downlist left_list: %d received",
386 ++ req_exec_cpg_downlist->left_nodes);
387 + }
388 +
389 +
390 +--
391 +2.19.1
392 +
393 diff --git a/debian/patches/series b/debian/patches/series
394 index 78b44c78..090dade1 100644
395 --- a/debian/patches/series
396 +++ b/debian/patches/series
397 @@ -17,3 +17,4 @@ qnetd-stay-with-the-DBM-NSS-DB-format.patch
398 Fix-typo-defualt-default.patch
399 Please-make-the-manpages-reproducible.patch
400 pve__only-start-corosync.service-if-conf-exists.patch
401 +pve__cpg-Inform-clients-about-left-nodes-during-pause.patch