]> git.proxmox.com Git - ceph.git/blob - ceph/src/mon/Elector.h
2e407d29058258c92b3223a62ed40257b99275e3
[ceph.git] / ceph / src / mon / Elector.h
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15
16 #ifndef CEPH_MON_ELECTOR_H
17 #define CEPH_MON_ELECTOR_H
18
19 #include <map>
20 using namespace std;
21
22 #include "include/types.h"
23 #include "include/Context.h"
24 #include "mon/MonOpRequest.h"
25 #include "mon/mon_types.h"
26
27 class Monitor;
28
29 /**
30 * This class is responsible for maintaining the local state when electing
31 * a new Leader. We may win or we may lose. If we win, it means we became the
32 * Leader; if we lose, it means we are a Peon.
33 */
34 class Elector {
35 /**
36 * @defgroup Elector_h_class Elector
37 * @{
38 */
39 private:
40 /**
41 * @defgroup Elector_h_internal_types Internal Types
42 * @{
43 */
44 /**
45 * This struct will hold the features from a given peer.
46 * Features may both be the cluster's (in the form of a uint64_t), or
47 * mon-specific features. Instead of keeping maps to hold them both, or
48 * a pair, which would be weird, a struct to keep them seems appropriate.
49 */
50 struct elector_features_t {
51 uint64_t cluster_features;
52 mon_feature_t mon_features;
53 };
54
55 /**
56 * @}
57 */
58
59 /**
60 * The Monitor instance associated with this class.
61 */
62 Monitor *mon;
63
64 /**
65 * Event callback responsible for dealing with an expired election once a
66 * timer runs out and fires up.
67 */
68 Context *expire_event = nullptr;
69
70 /**
71 * Resets the expire_event timer, by cancelling any existing one and
72 * scheduling a new one.
73 *
74 * @remarks This function assumes as a default firing value the duration of
75 * the monitor's lease interval, and adds to it the value specified
76 * in @e plus
77 *
78 * @post expire_event is set
79 *
80 * @param plus The amount of time to be added to the default firing value.
81 */
82 void reset_timer(double plus=0.0);
83 /**
84 * Cancel the expire_event timer, if it is defined.
85 *
86 * @post expire_event is not set
87 */
88 void cancel_timer();
89
90 /**
91 * Latest epoch we've seen.
92 *
93 * @remarks if its value is odd, we're electing; if it's even, then we're
94 * stable.
95 */
96 epoch_t epoch;
97
98 /**
99 * Indicates if we are participating in the quorum.
100 *
101 * @remarks By default, we are created as participating. We may stop
102 * participating if the Monitor explicitely calls
103 * Elector::stop_participating though. If that happens, it will
104 * have to call Elector::start_participating for us to resume
105 * participating in the quorum.
106 */
107 bool participating;
108
109 // electing me
110 /**
111 * @defgroup Elector_h_electing_me_vars We are being elected
112 * @{
113 */
114 /**
115 * Indicates if we are the ones being elected.
116 *
117 * We always attempt to be the one being elected if we are the ones starting
118 * the election. If we are not the ones that started it, we will only attempt
119 * to be elected if we think we might have a chance (i.e., the other guy's
120 * rank is lower than ours).
121 */
122 bool electing_me;
123 /**
124 * Holds the time at which we started the election.
125 */
126 utime_t start_stamp;
127 /**
128 * Set containing all those that acked our proposal to become the Leader.
129 *
130 * If we are acked by everyone in the MonMap, we will declare
131 * victory. Also note each peer's feature set.
132 */
133 map<int, elector_features_t> acked_me;
134 /**
135 * @}
136 */
137 /**
138 * @defgroup Elector_h_electing_them_vars We are electing another guy
139 * @{
140 */
141 /**
142 * Indicates who we have acked
143 */
144 int leader_acked;
145 /**
146 * Indicates when we have acked it
147 */
148 utime_t ack_stamp;
149 /**
150 * @}
151 */
152
153 /**
154 * Update our epoch.
155 *
156 * If we come across a higher epoch, we simply update ours, also making
157 * sure we are no longer being elected (even though we could have been,
158 * we no longer are since we no longer are on that old epoch).
159 *
160 * @pre Our epoch is lower than @p e
161 * @post Our epoch equals @p e
162 *
163 * @param e Epoch to which we will update our epoch
164 */
165 void bump_epoch(epoch_t e);
166
167 /**
168 * Start new elections by proposing ourselves as the new Leader.
169 *
170 * Basically, send propose messages to all the monitors in the MonMap and
171 * then reset the expire_event timer so we can limit the amount of time we
172 * will be going at it.
173 *
174 * @pre participating is true
175 * @post epoch is an odd value
176 * @post electing_me is true
177 * @post we sent propose messages to all the monitors in the MonMap
178 * @post we reset the expire_event timer
179 */
180 void start();
181 /**
182 * Defer the current election to some other monitor.
183 *
184 * This means that we will ack some other monitor and drop out from the run
185 * to become the Leader. We will only defer an election if the monitor we
186 * are deferring to outranks us.
187 *
188 * @pre @p who outranks us (i.e., who < our rank)
189 * @pre @p who outranks any other monitor we have deferred to in the past
190 * @post electing_me is false
191 * @post leader_acked equals @p who
192 * @post we sent an ack message to @p who
193 * @post we reset the expire_event timer
194 *
195 * @param who Some other monitor's numeric identifier.
196 */
197 void defer(int who);
198 /**
199 * The election has taken too long and has expired.
200 *
201 * This will happen when no one declared victory or started a new election
202 * during the time span allowed by the expire_event timer.
203 *
204 * When the election expires, we will check if we were the ones who won, and
205 * if so we will declare victory. If that is not the case, then we assume
206 * that the one we defered to didn't declare victory quickly enough (in fact,
207 * as far as we know, we may even be dead); so, just propose ourselves as the
208 * Leader.
209 */
210 void expire();
211 /**
212 * Declare Victory.
213 *
214 * We won. Or at least we believe we won, but for all intentions and purposes
215 * that does not matter. What matters is that we Won.
216 *
217 * That said, we must now bump our epoch to reflect that the election is over
218 * and then we must let everybody in the quorum know we are their brand new
219 * Leader. And we will also cancel our expire_event timer.
220 *
221 * Actually, the quorum will be now defined as the group of monitors that
222 * acked us during the election process.
223 *
224 * @pre Election is on-going
225 * @pre electing_me is true
226 * @post electing_me is false
227 * @post epoch is bumped up into an even value
228 * @post Election is not on-going
229 * @post We have a quorum, composed of the monitors that acked us
230 * @post We sent a message of type OP_VICTORY to each quorum member.
231 */
232 void victory();
233
234 /**
235 * Handle a message from some other node proposing itself to become it
236 * the Leader.
237 *
238 * If the message appears to be old (i.e., its epoch is lower than our epoch),
239 * then we may take one of two actions:
240 *
241 * @li Ignore it because it's nothing more than an old proposal
242 * @li Start new elections if we verify that it was sent by a monitor from
243 * outside the quorum; given its old state, it's fair to assume it just
244 * started, so we should start new elections so it may rejoin
245 *
246 * If we did not ignore the received message, then we know that this message
247 * was sent by some other node proposing itself to become the Leader. So, we
248 * will take one of the following actions:
249 *
250 * @li Ignore it because we already acked another node with higher rank
251 * @li Ignore it and start a new election because we outrank it
252 * @li Defer to it because it outranks us and the node we previously
253 * acked, if any
254 *
255 *
256 * @invariant The received message is an operation of type OP_PROPOSE
257 *
258 * @param m A message sent by another participant in the quorum.
259 */
260 void handle_propose(MonOpRequestRef op);
261 /**
262 * Handle a message from some other participant Acking us as the Leader.
263 *
264 * When we receive such a message, one of three thing may be happening:
265 * @li We received a message with a newer epoch, which means we must have
266 * somehow lost track of what was going on (maybe we rebooted), thus we
267 * will start a new election
268 * @li We consider ourselves in the run for the Leader (i.e., @p electing_me
269 * is true), and we are actually being Acked by someone; thus simply add
270 * the one acking us to the @p acked_me set. If we do now have acks from
271 * all the participants, then we can declare victory
272 * @li We already deferred the election to somebody else, so we will just
273 * ignore this message
274 *
275 * @pre Election is on-going
276 * @post Election is on-going if we deferred to somebody else
277 * @post Election is on-going if we are still waiting for further Acks
278 * @post Election is not on-going if we are victorious
279 * @post Election is not on-going if we must start a new one
280 *
281 * @param m A message with an operation type of OP_ACK
282 */
283 void handle_ack(MonOpRequestRef op);
284 /**
285 * Handle a message from some other participant declaring Victory.
286 *
287 * We just got a message from someone declaring themselves Victorious, thus
288 * the new Leader.
289 *
290 * However, if the message's epoch happens to be different from our epoch+1,
291 * then it means we lost track of something and we must start a new election.
292 *
293 * If that is not the case, then we will simply update our epoch to the one
294 * in the message, cancel our @p expire_event timer and inform our Monitor
295 * that we lost the election and provide it with the new quorum.
296 *
297 * @pre Election in on-going
298 * @post Election is not on-going
299 * @post Updated @p epoch
300 * @post We have a new quorum if we lost the election
301 *
302 * @param m A message with an operation type of OP_VICTORY
303 */
304 void handle_victory(MonOpRequestRef op);
305 /**
306 * Send a nak to a peer who's out of date, containing information about why.
307 *
308 * If we get a message from a peer who can't support the required quorum
309 * features, we have to ignore them. This function will at least send
310 * them a message about *why* they're being ignored -- if they're new
311 * enough to support such a message.
312 *
313 * @param m A message from a monitor not supporting required features. We
314 * take ownership of the reference.
315 */
316 void nak_old_peer(MonOpRequestRef op);
317 /**
318 * Handle a message from some other participant declaring
319 * we cannot join the quorum.
320 *
321 * Apparently the quorum requires some feature that we do not implement. Shut
322 * down gracefully.
323 *
324 * @pre Election is on-going.
325 * @post We've shut down.
326 *
327 * @param m A message with an operation type of OP_NAK
328 */
329 void handle_nak(MonOpRequestRef op);
330
331 public:
332 /**
333 * Create an Elector class
334 *
335 * @param m A Monitor instance
336 */
337 explicit Elector(Monitor *m) : mon(m),
338 epoch(0),
339 participating(true),
340 electing_me(false),
341 leader_acked(-1) { }
342
343 /**
344 * Initiate the Elector class.
345 *
346 * Basically, we will simply read whatever epoch value we have in our stable
347 * storage, or consider it to be 1 if none is read.
348 *
349 * @post @p epoch is set to 1 or higher.
350 */
351 void init();
352 /**
353 * Inform this class it is supposed to shutdown.
354 *
355 * We will simply cancel the @p expire_event if any exists.
356 *
357 * @post @p expire_event is cancelled
358 */
359 void shutdown();
360
361 /**
362 * Obtain our epoch
363 *
364 * @returns Our current epoch number
365 */
366 epoch_t get_epoch() { return epoch; }
367
368 /**
369 * advance_epoch
370 *
371 * increase election epoch by 1
372 */
373 void advance_epoch() {
374 bump_epoch(epoch + 1);
375 }
376
377 /**
378 * Handle received messages.
379 *
380 * We will ignore all messages that are not of type @p MSG_MON_ELECTION
381 * (i.e., messages whose interface is not of type @p MMonElection). All of
382 * those that are will then be dispatched to their operation-specific
383 * functions.
384 *
385 * @param m A received message
386 */
387 void dispatch(MonOpRequestRef op);
388
389 /**
390 * Call an election.
391 *
392 * This function simply calls Elector::start.
393 */
394 void call_election() {
395 start();
396 }
397
398 /**
399 * Stop participating in subsequent Elections.
400 *
401 * @post @p participating is false
402 */
403 void stop_participating() { participating = false; }
404 /**
405 * Start participating in Elections.
406 *
407 * If we are already participating (i.e., @p participating is true), then
408 * calling this function is moot.
409 *
410 * However, if we are not participating (i.e., @p participating is false),
411 * then we will start participating by setting @p participating to true and
412 * we will call for an Election.
413 *
414 * @post @p participating is true
415 */
416 void start_participating();
417
418 /**
419 * @}
420 */
421 };
422
423 #endif