]> git.proxmox.com Git - mirror_corosync-qdevice.git/blame - qdevices/qnetd-algo-lms.c
init: Fix init scripts to work with containers
[mirror_corosync-qdevice.git] / qdevices / qnetd-algo-lms.c
CommitLineData
9a1955a7
JF
1/*
2 * Copyright (c) 2015-2017 Red Hat, Inc.
3 *
4 * All rights reserved.
5 *
6 * Author: Christine Caulfield (ccaulfie@redhat.com)
7 *
8 * This software licensed under BSD license, the text of which follows:
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions are met:
12 *
13 * - Redistributions of source code must retain the above copyright notice,
14 * this list of conditions and the following disclaimer.
15 * - Redistributions in binary form must reproduce the above copyright notice,
16 * this list of conditions and the following disclaimer in the documentation
17 * and/or other materials provided with the distribution.
18 * - Neither the name of the Red Hat, Inc. nor the names of its
19 * contributors may be used to endorse or promote products derived from this
20 * software without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
23 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
26 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
27 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
28 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
29 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
30 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
31 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
32 * THE POSSIBILITY OF SUCH DAMAGE.
33 */
34
35
36/*
37 * This is a 'last man standing' algorithm for 2+ node clusters
38 *
39 * If the node is the only one left in the cluster that can see the
40 * qdevice server then we return a vote.
41 *
42 * If more than one node can see the qdevice server but some nodes can't
43 * see each other then we divide the cluster up into 'partitions' based on
44 * their ring_id and return a vote to nodes in the partition that contains
45 * a nominated nodeid. (lowest, highest, etc)
46 *
47 */
48
49#include <sys/types.h>
50#include <sys/queue.h>
51
52#include <string.h>
53#include <limits.h>
54
55#include "qnetd-algo-lms.h"
56#include "qnetd-log.h"
57#include "qnetd-cluster-list.h"
58#include "qnetd-algo-utils.h"
59#include "qnetd-client-algo-timer.h"
60#include "utils.h"
61
62struct qnetd_algo_lms_info {
63 int num_config_nodes;
64 enum tlv_vote last_result;
65 partitions_list_t partition_list;
66};
67
68static enum tlv_reply_error_code do_lms_algorithm(struct qnetd_client *client, const struct tlv_ring_id *cur_ring_id, enum tlv_vote *result_vote)
69{
70 struct qnetd_client *other_client;
71 struct qnetd_algo_lms_info *info = client->algorithm_data;
72 struct qnetd_algo_partition *cur_partition;
73 struct qnetd_algo_partition *largest_partition;
74 struct qnetd_algo_partition *best_score_partition;
75 const struct tlv_ring_id *ring_id = cur_ring_id;
76 int num_partitions;
77 int joint_leader;
78
79 /* We are running the algorithm, don't do it again unless we say so */
80 qnetd_client_algo_timer_abort(client);
81
82 if (qnetd_algo_all_ring_ids_match(client, ring_id) == -1) {
83 qnetd_log(LOG_DEBUG, "algo-lms: nodeid %d: ring ID (" UTILS_PRI_RING_ID ") not unique in this membership, waiting",
84 client->node_id, ring_id->node_id, ring_id->seq);
85
86 qnetd_client_algo_timer_schedule(client);
87 *result_vote = info->last_result = TLV_VOTE_WAIT_FOR_REPLY;
88 return (TLV_REPLY_ERROR_CODE_NO_ERROR);
89 }
90
91 /* Create and count the number of separate partitions */
92 if ( (num_partitions = qnetd_algo_create_partitions(client, &info->partition_list, ring_id)) == -1) {
93 qnetd_log(LOG_DEBUG, "algo-lms: Error creating partition list");
94 return (TLV_REPLY_ERROR_CODE_INTERNAL_ERROR);
95 }
96
97 /* This can happen if we are first on the block */
98 if (num_partitions == 0) {
99 qnetd_log(LOG_DEBUG, "algo-lms: No partitions found");
100
101 qnetd_client_algo_timer_schedule(client);
102 *result_vote = info->last_result = TLV_VOTE_WAIT_FOR_REPLY;
103 return (TLV_REPLY_ERROR_CODE_NO_ERROR);
104 }
105
106 qnetd_algo_dump_partitions(&info->partition_list);
107
108 /* Only 1 partition - let votequorum sort it out */
109 if (num_partitions == 1) {
110 qnetd_log(LOG_DEBUG, "algo-lms: Only 1 partition. This is votequorum's problem, not ours");
111 qnetd_algo_free_partitions(&info->partition_list);
112 *result_vote = info->last_result = TLV_VOTE_ACK;
113 return (TLV_REPLY_ERROR_CODE_NO_ERROR);
114 }
115
116
117 /* If we're a newcomer and there is another active partition, then we must NACK
118 * to avoid quorum moving to us from already active nodes.
119 */
120 if (info->last_result == 0) {
121 TAILQ_FOREACH(other_client, &client->cluster->client_list, cluster_entries) {
122 struct qnetd_algo_lms_info *other_info = other_client->algorithm_data;
123 if (!tlv_ring_id_eq(ring_id, &other_client->last_ring_id) &&
124 other_info->last_result == TLV_VOTE_ACK) {
125 qnetd_algo_free_partitions(&info->partition_list);
126
127 /* Don't save NACK, we need to know subsequently if we haven't been voting */
128 *result_vote = TLV_VOTE_NACK;
129 qnetd_log(LOG_DEBUG, "algo-lms: we are a new partition and another active partition exists. NACK");
130 return (TLV_REPLY_ERROR_CODE_NO_ERROR);
131 }
132 }
133 }
134
135 /*
136 * Find the partition with highest score
137 */
138 best_score_partition = NULL;
139 TAILQ_FOREACH(cur_partition, &info->partition_list, entries) {
140 if (!best_score_partition ||
141 best_score_partition->score < cur_partition->score) {
142 best_score_partition = cur_partition;
143 }
144 }
145 qnetd_log(LOG_DEBUG, "algo-lms: best score partition is (" UTILS_PRI_RING_ID ") with score %d",
146 best_score_partition->ring_id.node_id, best_score_partition->ring_id.seq, best_score_partition->score);
147
148 /* Now check if it's really the highest score, and not just the joint-highest */
149 joint_leader = 0;
150 TAILQ_FOREACH(cur_partition, &info->partition_list, entries) {
151 if (best_score_partition != cur_partition &&
152 best_score_partition->score == cur_partition->score) {
153 joint_leader = 1;
154 }
155 }
156
157 if (!joint_leader) {
158 /* Partition with highest score is unique, allow us to run if we're in that partition. */
159 if (tlv_ring_id_eq(&best_score_partition->ring_id, ring_id)) {
160 qnetd_log(LOG_DEBUG, "algo-lms: We are in the best score partition. ACK");
161 *result_vote = info->last_result = TLV_VOTE_ACK;
162 }
163 else {
164 qnetd_log(LOG_DEBUG, "algo-lms: We are NOT in the best score partition. NACK");
165 *result_vote = info->last_result = TLV_VOTE_NACK;
166 }
167
168 qnetd_algo_free_partitions(&info->partition_list);
169
170 return (TLV_REPLY_ERROR_CODE_NO_ERROR);
171 }
172
173 /*
174 * There are multiple partitions with same score. Find the largest partition
175 */
176 largest_partition = NULL;
177 TAILQ_FOREACH(cur_partition, &info->partition_list, entries) {
178 if (!largest_partition ||
179 largest_partition->num_nodes < cur_partition->num_nodes) {
180 largest_partition = cur_partition;
181 }
182 }
183
184 qnetd_log(LOG_DEBUG, "algo-lms: largest partition is (" UTILS_PRI_RING_ID ") with %d nodes",
185 largest_partition->ring_id.node_id, largest_partition->ring_id.seq, largest_partition->num_nodes);
186
187 /* Now check if it's really the largest, and not just the joint-largest */
188 joint_leader = 0;
189 TAILQ_FOREACH(cur_partition, &info->partition_list, entries) {
190 if (largest_partition != cur_partition &&
191 largest_partition->num_nodes == cur_partition->num_nodes) {
192 joint_leader = 1;
193 }
194 }
195
196 if (!joint_leader) {
197 /* Largest partition is unique, allow us to run if we're in that partition. */
198 if (tlv_ring_id_eq(&largest_partition->ring_id, ring_id)) {
199 qnetd_log(LOG_DEBUG, "algo-lms: We are in the largest partition. ACK");
200 *result_vote = info->last_result = TLV_VOTE_ACK;
201 }
202 else {
203 qnetd_log(LOG_DEBUG, "algo-lms: We are NOT in the largest partition. NACK");
204 *result_vote = info->last_result = TLV_VOTE_NACK;
205 }
206 }
207 else {
208 uint32_t tb_node_id;
209 struct tlv_ring_id tb_node_ring_id = {0LL, 0};
210
211 /* Look for the tie-breaker node */
212 if (client->tie_breaker.mode == TLV_TIE_BREAKER_MODE_LOWEST) {
213 tb_node_id = INT_MAX;
214 }
215 else if (client->tie_breaker.mode == TLV_TIE_BREAKER_MODE_HIGHEST) {
216 tb_node_id = 0;
217 }
218 else if (client->tie_breaker.mode == TLV_TIE_BREAKER_MODE_NODE_ID) {
219 tb_node_id = client->tie_breaker.node_id;
220 }
221 else {
222 qnetd_log(LOG_DEBUG, "algo-lms: denied vote because tie-breaker option is invalid: %d",
223 client->tie_breaker.mode);
224 tb_node_id = -1;
225 }
226
227 /* Find the tie_breaker node */
228 TAILQ_FOREACH(other_client, &client->cluster->client_list, cluster_entries) {
229 switch (client->tie_breaker.mode) {
230
231 case TLV_TIE_BREAKER_MODE_LOWEST:
232 if (other_client->node_id < tb_node_id) {
233 tb_node_id = other_client->node_id;
234 memcpy(&tb_node_ring_id, &other_client->last_ring_id, sizeof(struct tlv_ring_id));
235 qnetd_log(LOG_DEBUG, "algo-lms: Looking for low node ID. found %d (" UTILS_PRI_RING_ID ")",
236 tb_node_id, tb_node_ring_id.node_id, tb_node_ring_id.seq);
237 }
238 break;
239
240 case TLV_TIE_BREAKER_MODE_HIGHEST:
241 if (other_client->node_id > tb_node_id) {
242 tb_node_id = other_client->node_id;
243 memcpy(&tb_node_ring_id, &other_client->last_ring_id, sizeof(struct tlv_ring_id));
244 qnetd_log(LOG_DEBUG, "algo-lms: Looking for high node ID. found %d (" UTILS_PRI_RING_ID ")",
245 tb_node_id, tb_node_ring_id.node_id, tb_node_ring_id.seq);
246 }
247 break;
248 case TLV_TIE_BREAKER_MODE_NODE_ID:
249 if (client->tie_breaker.node_id == client->node_id) {
250 memcpy(&tb_node_ring_id, &other_client->last_ring_id, sizeof(struct tlv_ring_id));
251 qnetd_log(LOG_DEBUG, "algo-lms: Looking for nominated node ID. found %d (" UTILS_PRI_RING_ID ")",
252 tb_node_id, tb_node_ring_id.node_id, tb_node_ring_id.seq);
253
254 }
255 break;
256 default:
257 qnetd_log(LOG_DEBUG, "algo-lms: denied vote because tie-breaker option is invalid: %d",
258 client->tie_breaker.mode);
259 memset(&tb_node_ring_id, 0, sizeof(struct tlv_ring_id));
260 }
261 }
262
263 if (client->node_id == tb_node_id || tlv_ring_id_eq(&tb_node_ring_id, ring_id)) {
264 qnetd_log(LOG_DEBUG, "algo-lms: We are in the same partition (" UTILS_PRI_RING_ID ") as tie-breaker node id %d. ACK",
265 tb_node_ring_id.node_id, tb_node_ring_id.seq, tb_node_id);
266 *result_vote = info->last_result = TLV_VOTE_ACK;
267 }
268 else {
269 qnetd_log(LOG_DEBUG, "algo-lms: We are NOT in the same partition (" UTILS_PRI_RING_ID ") as tie-breaker node id %d. NACK",
270 tb_node_ring_id.node_id, tb_node_ring_id.seq, tb_node_id);
271 *result_vote = info->last_result = TLV_VOTE_NACK;
272 }
273 }
274
275 qnetd_algo_free_partitions(&info->partition_list);
276 return (TLV_REPLY_ERROR_CODE_NO_ERROR);
277}
278
279enum tlv_reply_error_code
280qnetd_algo_lms_client_init(struct qnetd_client *client)
281{
282 struct qnetd_algo_lms_info *info;
283
284 info = malloc(sizeof(struct qnetd_algo_lms_info));
285 if (!info) {
286 return (TLV_REPLY_ERROR_CODE_INTERNAL_ERROR);
287 }
288
289 memset(info, 0, sizeof(*info));
290 client->algorithm_data = info;
291 info->last_result = 0; /* status unknown, or NEW */
292 TAILQ_INIT(&info->partition_list);
293 return (TLV_REPLY_ERROR_CODE_NO_ERROR);
294}
295
296/*
297 * We got the config node list. Simply count the number of available nodes
298 * and wait for the quorum list.
299 */
300enum tlv_reply_error_code
301qnetd_algo_lms_config_node_list_received(struct qnetd_client *client,
302 uint32_t msg_seq_num, int config_version_set, uint64_t config_version,
303 const struct node_list *nodes, int initial, enum tlv_vote *result_vote)
304{
305 struct node_list_entry *node_info;
306 struct qnetd_algo_lms_info *info = client->algorithm_data;
307 int node_count = 0;
308
309 TAILQ_FOREACH(node_info, nodes, entries) {
310 node_count++;
311 }
312 info->num_config_nodes = node_count;
313 qnetd_log(LOG_DEBUG, "algo-lms: cluster %s config_list has %d nodes", client->cluster_name, node_count);
314
315 *result_vote = TLV_VOTE_NO_CHANGE;
316
317 return (TLV_REPLY_ERROR_CODE_NO_ERROR);
318}
319
320/*
321 * membership node list. This is where we get to work.
322 */
323
324enum tlv_reply_error_code
325qnetd_algo_lms_membership_node_list_received(struct qnetd_client *client,
326 uint32_t msg_seq_num, const struct tlv_ring_id *ring_id,
327 const struct node_list *nodes, enum tlv_heuristics heuristics, enum tlv_vote *result_vote)
328{
329 qnetd_log(LOG_DEBUG, " ");
330 qnetd_log(LOG_DEBUG, "algo-lms: membership list from node %d partition (" UTILS_PRI_RING_ID ")", client->node_id, ring_id->node_id, ring_id->seq);
331
332 return do_lms_algorithm(client, ring_id, result_vote);
333}
334
335/*
336 * The quorum node list is received after corosync has decided which nodes are in the cluster.
337 * We run our algorithm again to be sure that things still match. By this time we will (or should)
338 * all know the current ring_id (not guaranteed when the membership list is received). So this
339 * might be the most reliable return.
340 */
341enum tlv_reply_error_code
342qnetd_algo_lms_quorum_node_list_received(struct qnetd_client *client,
343 uint32_t msg_seq_num, enum tlv_quorate quorate, const struct node_list *nodes, enum tlv_vote *result_vote)
344{
345 qnetd_log(LOG_DEBUG, " ");
346 qnetd_log(LOG_DEBUG, "algo-lms: quorum node list from node %d partition (" UTILS_PRI_RING_ID ")", client->node_id, client->last_ring_id.node_id, client->last_ring_id.seq);
347 return do_lms_algorithm(client, &client->last_ring_id, result_vote);
348}
349
350/*
351 * Called after client disconnect. Client structure is still existing (and it's part
352 * of a client->cluster), but it is destroyed (and removed from cluster) right after
353 * this callback finishes. Callback is used mainly for destroing client->algorithm_data.
354 */
355void
356qnetd_algo_lms_client_disconnect(struct qnetd_client *client, int server_going_down)
357{
358 qnetd_log(LOG_DEBUG, "algo-lms: Client %p (cluster %s, node_id "UTILS_PRI_NODE_ID") "
359 "disconnect", client, client->cluster_name, client->node_id);
360
361 qnetd_log(LOG_INFO, "algo-lms: server going down %u", server_going_down);
362
363 free(client->algorithm_data);
364}
365
366/*
367 * Called after client sent ask for vote message. This is usually happening after server
368 * replied TLV_VOTE_WAIT_FOR_REPLY.
369 */
370enum tlv_reply_error_code
371qnetd_algo_lms_ask_for_vote_received(struct qnetd_client *client, uint32_t msg_seq_num,
372 enum tlv_vote *result_vote)
373{
374 qnetd_log(LOG_DEBUG, " ");
375 qnetd_log(LOG_DEBUG, "algo-lms: Client %p (cluster %s, node_id "UTILS_PRI_NODE_ID") "
376 "asked for a vote", client, client->cluster_name, client->node_id);
377
378 return do_lms_algorithm(client, &client->last_ring_id, result_vote);
379}
380
381enum tlv_reply_error_code
382qnetd_algo_lms_vote_info_reply_received(struct qnetd_client *client, uint32_t msg_seq_num)
383{
384 qnetd_log(LOG_DEBUG, "algo-lms: Client %p (cluster %s, node_id "UTILS_PRI_NODE_ID") "
385 "replied back to vote info message", client, client->cluster_name, client->node_id);
386
387 return (TLV_REPLY_ERROR_CODE_NO_ERROR);
388}
389
390enum tlv_reply_error_code
391qnetd_algo_lms_heuristics_change_received(struct qnetd_client *client, uint32_t msg_seq_num,
392 enum tlv_heuristics heuristics, enum tlv_vote *result_vote)
393{
394
395 qnetd_log(LOG_INFO, "algo-lms: heuristics change is not supported.");
396
397 *result_vote = TLV_VOTE_NO_CHANGE;
398
399 return (TLV_REPLY_ERROR_CODE_NO_ERROR);
400}
401
402enum tlv_reply_error_code
403qnetd_algo_lms_timer_callback(struct qnetd_client *client, int *reschedule_timer,
404 int *send_vote, enum tlv_vote *result_vote)
405{
406 enum tlv_reply_error_code ret;
407
408 qnetd_log(LOG_DEBUG, "algo-lms: Client %p (cluster %s, node_id "UTILS_PRI_NODE_ID") "
409 "Timer callback", client, client->cluster_name, client->node_id);
410
411 ret = do_lms_algorithm(client, &client->last_ring_id, result_vote);
412
413 if (ret == TLV_REPLY_ERROR_CODE_NO_ERROR &&
414 (*result_vote == TLV_VOTE_ACK || *result_vote == TLV_VOTE_NACK)) {
415 *send_vote = 1;
416 }
417
418 if (ret == TLV_REPLY_ERROR_CODE_NO_ERROR &&
419 *result_vote == TLV_VOTE_WAIT_FOR_REPLY) {
420 /*
421 * Reschedule was called in the do_lms_algorithm but algo_timer is
422 * not stack based so there can only be one. So if do_lms aborted
423 * the active timer, and scheduled it again the timer would be aborted
424 * if reschedule_timer was not set.
425 */
426 *reschedule_timer = 1;
427 }
428
429 return ret;
430}
431
432static struct qnetd_algorithm qnetd_algo_lms = {
433 .init = qnetd_algo_lms_client_init,
434 .config_node_list_received = qnetd_algo_lms_config_node_list_received,
435 .membership_node_list_received = qnetd_algo_lms_membership_node_list_received,
436 .quorum_node_list_received = qnetd_algo_lms_quorum_node_list_received,
437 .client_disconnect = qnetd_algo_lms_client_disconnect,
438 .ask_for_vote_received = qnetd_algo_lms_ask_for_vote_received,
439 .vote_info_reply_received = qnetd_algo_lms_vote_info_reply_received,
440 .heuristics_change_received = qnetd_algo_lms_heuristics_change_received,
441 .timer_callback = qnetd_algo_lms_timer_callback,
442};
443
444enum tlv_reply_error_code qnetd_algo_lms_register()
445{
446 return qnetd_algorithm_register(TLV_DECISION_ALGORITHM_TYPE_LMS, &qnetd_algo_lms);
447}