]> git.proxmox.com Git - ceph.git/blame - ceph/src/mon/Elector.h
update sources to v12.1.1
[ceph.git] / ceph / src / mon / Elector.h
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15
16#ifndef CEPH_MON_ELECTOR_H
17#define CEPH_MON_ELECTOR_H
18
19#include <map>
20using namespace std;
21
22#include "include/types.h"
23#include "include/Context.h"
24#include "mon/MonOpRequest.h"
25#include "mon/mon_types.h"
26
27class Monitor;
28
29/**
30 * This class is responsible for maintaining the local state when electing
31 * a new Leader. We may win or we may lose. If we win, it means we became the
32 * Leader; if we lose, it means we are a Peon.
33 */
34class Elector {
35 /**
36 * @defgroup Elector_h_class Elector
37 * @{
38 */
39 private:
40 /**
41 * @defgroup Elector_h_internal_types Internal Types
42 * @{
43 */
44 /**
45 * This struct will hold the features from a given peer.
46 * Features may both be the cluster's (in the form of a uint64_t), or
47 * mon-specific features. Instead of keeping maps to hold them both, or
48 * a pair, which would be weird, a struct to keep them seems appropriate.
49 */
224ce89b 50 struct elector_info_t {
7c673cae
FG
51 uint64_t cluster_features;
52 mon_feature_t mon_features;
224ce89b 53 map<string,string> metadata;
7c673cae
FG
54 };
55
56 /**
57 * @}
58 */
59
60 /**
61 * The Monitor instance associated with this class.
62 */
63 Monitor *mon;
64
65 /**
66 * Event callback responsible for dealing with an expired election once a
67 * timer runs out and fires up.
68 */
69 Context *expire_event = nullptr;
70
71 /**
72 * Resets the expire_event timer, by cancelling any existing one and
73 * scheduling a new one.
74 *
75 * @remarks This function assumes as a default firing value the duration of
76 * the monitor's lease interval, and adds to it the value specified
77 * in @e plus
78 *
79 * @post expire_event is set
80 *
81 * @param plus The amount of time to be added to the default firing value.
82 */
83 void reset_timer(double plus=0.0);
84 /**
85 * Cancel the expire_event timer, if it is defined.
86 *
87 * @post expire_event is not set
88 */
89 void cancel_timer();
90
91 /**
92 * Latest epoch we've seen.
93 *
94 * @remarks if its value is odd, we're electing; if it's even, then we're
95 * stable.
96 */
97 epoch_t epoch;
98
99 /**
100 * Indicates if we are participating in the quorum.
101 *
102 * @remarks By default, we are created as participating. We may stop
103 * participating if the Monitor explicitely calls
104 * Elector::stop_participating though. If that happens, it will
105 * have to call Elector::start_participating for us to resume
106 * participating in the quorum.
107 */
108 bool participating;
109
110 // electing me
111 /**
112 * @defgroup Elector_h_electing_me_vars We are being elected
113 * @{
114 */
115 /**
116 * Indicates if we are the ones being elected.
117 *
118 * We always attempt to be the one being elected if we are the ones starting
119 * the election. If we are not the ones that started it, we will only attempt
120 * to be elected if we think we might have a chance (i.e., the other guy's
121 * rank is lower than ours).
122 */
123 bool electing_me;
124 /**
125 * Holds the time at which we started the election.
126 */
127 utime_t start_stamp;
128 /**
129 * Set containing all those that acked our proposal to become the Leader.
130 *
131 * If we are acked by everyone in the MonMap, we will declare
132 * victory. Also note each peer's feature set.
133 */
224ce89b 134 map<int, elector_info_t> acked_me;
7c673cae
FG
135 /**
136 * @}
137 */
138 /**
139 * @defgroup Elector_h_electing_them_vars We are electing another guy
140 * @{
141 */
142 /**
143 * Indicates who we have acked
144 */
145 int leader_acked;
146 /**
147 * Indicates when we have acked it
148 */
149 utime_t ack_stamp;
150 /**
151 * @}
152 */
153
154 /**
155 * Update our epoch.
156 *
157 * If we come across a higher epoch, we simply update ours, also making
158 * sure we are no longer being elected (even though we could have been,
159 * we no longer are since we no longer are on that old epoch).
160 *
161 * @pre Our epoch is lower than @p e
162 * @post Our epoch equals @p e
163 *
164 * @param e Epoch to which we will update our epoch
165 */
166 void bump_epoch(epoch_t e);
167
168 /**
169 * Start new elections by proposing ourselves as the new Leader.
170 *
171 * Basically, send propose messages to all the monitors in the MonMap and
172 * then reset the expire_event timer so we can limit the amount of time we
173 * will be going at it.
174 *
175 * @pre participating is true
176 * @post epoch is an odd value
177 * @post electing_me is true
178 * @post we sent propose messages to all the monitors in the MonMap
179 * @post we reset the expire_event timer
180 */
181 void start();
182 /**
183 * Defer the current election to some other monitor.
184 *
185 * This means that we will ack some other monitor and drop out from the run
186 * to become the Leader. We will only defer an election if the monitor we
187 * are deferring to outranks us.
188 *
189 * @pre @p who outranks us (i.e., who < our rank)
190 * @pre @p who outranks any other monitor we have deferred to in the past
191 * @post electing_me is false
192 * @post leader_acked equals @p who
193 * @post we sent an ack message to @p who
194 * @post we reset the expire_event timer
195 *
196 * @param who Some other monitor's numeric identifier.
197 */
198 void defer(int who);
199 /**
200 * The election has taken too long and has expired.
201 *
202 * This will happen when no one declared victory or started a new election
203 * during the time span allowed by the expire_event timer.
204 *
205 * When the election expires, we will check if we were the ones who won, and
206 * if so we will declare victory. If that is not the case, then we assume
207 * that the one we defered to didn't declare victory quickly enough (in fact,
208 * as far as we know, we may even be dead); so, just propose ourselves as the
209 * Leader.
210 */
211 void expire();
212 /**
213 * Declare Victory.
214 *
215 * We won. Or at least we believe we won, but for all intentions and purposes
216 * that does not matter. What matters is that we Won.
217 *
218 * That said, we must now bump our epoch to reflect that the election is over
219 * and then we must let everybody in the quorum know we are their brand new
220 * Leader. And we will also cancel our expire_event timer.
221 *
222 * Actually, the quorum will be now defined as the group of monitors that
223 * acked us during the election process.
224 *
225 * @pre Election is on-going
226 * @pre electing_me is true
227 * @post electing_me is false
228 * @post epoch is bumped up into an even value
229 * @post Election is not on-going
230 * @post We have a quorum, composed of the monitors that acked us
231 * @post We sent a message of type OP_VICTORY to each quorum member.
232 */
233 void victory();
234
235 /**
236 * Handle a message from some other node proposing itself to become it
237 * the Leader.
238 *
239 * If the message appears to be old (i.e., its epoch is lower than our epoch),
240 * then we may take one of two actions:
241 *
242 * @li Ignore it because it's nothing more than an old proposal
243 * @li Start new elections if we verify that it was sent by a monitor from
244 * outside the quorum; given its old state, it's fair to assume it just
245 * started, so we should start new elections so it may rejoin
246 *
247 * If we did not ignore the received message, then we know that this message
248 * was sent by some other node proposing itself to become the Leader. So, we
249 * will take one of the following actions:
250 *
251 * @li Ignore it because we already acked another node with higher rank
252 * @li Ignore it and start a new election because we outrank it
253 * @li Defer to it because it outranks us and the node we previously
254 * acked, if any
255 *
256 *
257 * @invariant The received message is an operation of type OP_PROPOSE
258 *
259 * @param m A message sent by another participant in the quorum.
260 */
261 void handle_propose(MonOpRequestRef op);
262 /**
263 * Handle a message from some other participant Acking us as the Leader.
264 *
265 * When we receive such a message, one of three thing may be happening:
266 * @li We received a message with a newer epoch, which means we must have
267 * somehow lost track of what was going on (maybe we rebooted), thus we
268 * will start a new election
269 * @li We consider ourselves in the run for the Leader (i.e., @p electing_me
270 * is true), and we are actually being Acked by someone; thus simply add
271 * the one acking us to the @p acked_me set. If we do now have acks from
272 * all the participants, then we can declare victory
273 * @li We already deferred the election to somebody else, so we will just
274 * ignore this message
275 *
276 * @pre Election is on-going
277 * @post Election is on-going if we deferred to somebody else
278 * @post Election is on-going if we are still waiting for further Acks
279 * @post Election is not on-going if we are victorious
280 * @post Election is not on-going if we must start a new one
281 *
282 * @param m A message with an operation type of OP_ACK
283 */
284 void handle_ack(MonOpRequestRef op);
285 /**
286 * Handle a message from some other participant declaring Victory.
287 *
288 * We just got a message from someone declaring themselves Victorious, thus
289 * the new Leader.
290 *
291 * However, if the message's epoch happens to be different from our epoch+1,
292 * then it means we lost track of something and we must start a new election.
293 *
294 * If that is not the case, then we will simply update our epoch to the one
295 * in the message, cancel our @p expire_event timer and inform our Monitor
296 * that we lost the election and provide it with the new quorum.
297 *
298 * @pre Election in on-going
299 * @post Election is not on-going
300 * @post Updated @p epoch
301 * @post We have a new quorum if we lost the election
302 *
303 * @param m A message with an operation type of OP_VICTORY
304 */
305 void handle_victory(MonOpRequestRef op);
306 /**
307 * Send a nak to a peer who's out of date, containing information about why.
308 *
309 * If we get a message from a peer who can't support the required quorum
310 * features, we have to ignore them. This function will at least send
311 * them a message about *why* they're being ignored -- if they're new
312 * enough to support such a message.
313 *
314 * @param m A message from a monitor not supporting required features. We
315 * take ownership of the reference.
316 */
317 void nak_old_peer(MonOpRequestRef op);
318 /**
319 * Handle a message from some other participant declaring
320 * we cannot join the quorum.
321 *
322 * Apparently the quorum requires some feature that we do not implement. Shut
323 * down gracefully.
324 *
325 * @pre Election is on-going.
326 * @post We've shut down.
327 *
328 * @param m A message with an operation type of OP_NAK
329 */
330 void handle_nak(MonOpRequestRef op);
331
332 public:
333 /**
334 * Create an Elector class
335 *
336 * @param m A Monitor instance
337 */
338 explicit Elector(Monitor *m) : mon(m),
339 epoch(0),
340 participating(true),
341 electing_me(false),
342 leader_acked(-1) { }
343
344 /**
345 * Initiate the Elector class.
346 *
347 * Basically, we will simply read whatever epoch value we have in our stable
348 * storage, or consider it to be 1 if none is read.
349 *
350 * @post @p epoch is set to 1 or higher.
351 */
352 void init();
353 /**
354 * Inform this class it is supposed to shutdown.
355 *
356 * We will simply cancel the @p expire_event if any exists.
357 *
358 * @post @p expire_event is cancelled
359 */
360 void shutdown();
361
362 /**
363 * Obtain our epoch
364 *
365 * @returns Our current epoch number
366 */
367 epoch_t get_epoch() { return epoch; }
368
369 /**
370 * advance_epoch
371 *
372 * increase election epoch by 1
373 */
374 void advance_epoch() {
375 bump_epoch(epoch + 1);
376 }
377
378 /**
379 * Handle received messages.
380 *
381 * We will ignore all messages that are not of type @p MSG_MON_ELECTION
382 * (i.e., messages whose interface is not of type @p MMonElection). All of
383 * those that are will then be dispatched to their operation-specific
384 * functions.
385 *
386 * @param m A received message
387 */
388 void dispatch(MonOpRequestRef op);
389
390 /**
391 * Call an election.
392 *
393 * This function simply calls Elector::start.
394 */
395 void call_election() {
396 start();
397 }
398
399 /**
400 * Stop participating in subsequent Elections.
401 *
402 * @post @p participating is false
403 */
404 void stop_participating() { participating = false; }
405 /**
406 * Start participating in Elections.
407 *
408 * If we are already participating (i.e., @p participating is true), then
409 * calling this function is moot.
410 *
411 * However, if we are not participating (i.e., @p participating is false),
412 * then we will start participating by setting @p participating to true and
413 * we will call for an Election.
414 *
415 * @post @p participating is true
416 */
417 void start_participating();
418
419 /**
420 * @}
421 */
422};
423
424#endif