]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- |
2 | // vim: ts=8 sw=2 smarttab | |
3 | /* | |
4 | * Ceph - scalable distributed file system | |
5 | * | |
6 | * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> | |
7 | * | |
8 | * This is free software; you can redistribute it and/or | |
9 | * modify it under the terms of the GNU Lesser General Public | |
10 | * License version 2.1, as published by the Free Software | |
11 | * Foundation. See file COPYING. | |
12 | * | |
13 | */ | |
14 | ||
15 | ||
16 | #ifndef CEPH_MON_ELECTOR_H | |
17 | #define CEPH_MON_ELECTOR_H | |
18 | ||
19 | #include <map> | |
20 | using namespace std; | |
21 | ||
22 | #include "include/types.h" | |
23 | #include "include/Context.h" | |
24 | #include "mon/MonOpRequest.h" | |
25 | #include "mon/mon_types.h" | |
26 | ||
27 | class Monitor; | |
28 | ||
29 | /** | |
30 | * This class is responsible for maintaining the local state when electing | |
31 | * a new Leader. We may win or we may lose. If we win, it means we became the | |
32 | * Leader; if we lose, it means we are a Peon. | |
33 | */ | |
34 | class Elector { | |
35 | /** | |
36 | * @defgroup Elector_h_class Elector | |
37 | * @{ | |
38 | */ | |
39 | private: | |
40 | /** | |
41 | * @defgroup Elector_h_internal_types Internal Types | |
42 | * @{ | |
43 | */ | |
44 | /** | |
45 | * This struct will hold the features from a given peer. | |
46 | * Features may be either the cluster's (in the form of a uint64_t) or | |
47 | * mon-specific features. Instead of keeping maps to hold them both, or | |
48 | * a pair, which would be weird, a struct to keep them seems appropriate. | |
49 | */ | |
50 | struct elector_features_t { | |
51 | uint64_t cluster_features; | |
52 | mon_feature_t mon_features; | |
53 | }; | |
54 | ||
55 | /** | |
56 | * @} | |
57 | */ | |
58 | ||
59 | /** | |
60 | * The Monitor instance associated with this class. | |
61 | */ | |
62 | Monitor *mon; | |
63 | ||
64 | /** | |
65 | * Event callback responsible for dealing with an expired election once a | |
66 | * timer runs out and fires up. | |
67 | */ | |
68 | Context *expire_event = nullptr; | |
69 | ||
70 | /** | |
71 | * Resets the expire_event timer, by cancelling any existing one and | |
72 | * scheduling a new one. | |
73 | * | |
74 | * @remarks This function assumes as a default firing value the duration of | |
75 | * the monitor's lease interval, and adds to it the value specified | |
76 | * in @e plus | |
77 | * | |
78 | * @post expire_event is set | |
79 | * | |
80 | * @param plus The amount of time to be added to the default firing value. | |
81 | */ | |
82 | void reset_timer(double plus=0.0); | |
83 | /** | |
84 | * Cancel the expire_event timer, if it is defined. | |
85 | * | |
86 | * @post expire_event is not set | |
87 | */ | |
88 | void cancel_timer(); | |
89 | ||
90 | /** | |
91 | * Latest epoch we've seen. | |
92 | * | |
93 | * @remarks if its value is odd, we're electing; if it's even, then we're | |
94 | * stable. | |
95 | */ | |
96 | epoch_t epoch; | |
97 | ||
98 | /** | |
99 | * Indicates if we are participating in the quorum. | |
100 | * | |
101 | * @remarks By default, we are created as participating. We may stop | |
102 | * participating if the Monitor explicitly calls | |
103 | * Elector::stop_participating though. If that happens, it will | |
104 | * have to call Elector::start_participating for us to resume | |
105 | * participating in the quorum. | |
106 | */ | |
107 | bool participating; | |
108 | ||
109 | // electing me | |
110 | /** | |
111 | * @defgroup Elector_h_electing_me_vars We are being elected | |
112 | * @{ | |
113 | */ | |
114 | /** | |
115 | * Indicates if we are the ones being elected. | |
116 | * | |
117 | * We always attempt to be the one being elected if we are the ones starting | |
118 | * the election. If we are not the ones that started it, we will only attempt | |
119 | * to be elected if we think we might have a chance (i.e., we outrank the | |
120 | * other guy). | |
121 | */ | |
122 | bool electing_me; | |
123 | /** | |
124 | * Holds the time at which we started the election. | |
125 | */ | |
126 | utime_t start_stamp; | |
127 | /** | |
128 | * Set containing all those that acked our proposal to become the Leader. | |
129 | * | |
130 | * If we are acked by everyone in the MonMap, we will declare | |
131 | * victory. Also note each peer's feature set. | |
132 | */ | |
133 | map<int, elector_features_t> acked_me; | |
134 | /** | |
135 | * @} | |
136 | */ | |
137 | /** | |
138 | * @defgroup Elector_h_electing_them_vars We are electing another guy | |
139 | * @{ | |
140 | */ | |
141 | /** | |
142 | * Indicates who we have acked | |
143 | */ | |
144 | int leader_acked; | |
145 | /** | |
146 | * Indicates when we have acked it | |
147 | */ | |
148 | utime_t ack_stamp; | |
149 | /** | |
150 | * @} | |
151 | */ | |
152 | ||
153 | /** | |
154 | * Update our epoch. | |
155 | * | |
156 | * If we come across a higher epoch, we simply update ours, also making | |
157 | * sure we are no longer being elected (even though we could have been, | |
158 | * we no longer are since we no longer are on that old epoch). | |
159 | * | |
160 | * @pre Our epoch is lower than @p e | |
161 | * @post Our epoch equals @p e | |
162 | * | |
163 | * @param e Epoch to which we will update our epoch | |
164 | */ | |
165 | void bump_epoch(epoch_t e); | |
166 | ||
167 | /** | |
168 | * Start new elections by proposing ourselves as the new Leader. | |
169 | * | |
170 | * Basically, send propose messages to all the monitors in the MonMap and | |
171 | * then reset the expire_event timer so we can limit the amount of time we | |
172 | * will be going at it. | |
173 | * | |
174 | * @pre participating is true | |
175 | * @post epoch is an odd value | |
176 | * @post electing_me is true | |
177 | * @post we sent propose messages to all the monitors in the MonMap | |
178 | * @post we reset the expire_event timer | |
179 | */ | |
180 | void start(); | |
181 | /** | |
182 | * Defer the current election to some other monitor. | |
183 | * | |
184 | * This means that we will ack some other monitor and drop out from the run | |
185 | * to become the Leader. We will only defer an election if the monitor we | |
186 | * are deferring to outranks us. | |
187 | * | |
188 | * @pre @p who outranks us (i.e., who < our rank) | |
189 | * @pre @p who outranks any other monitor we have deferred to in the past | |
190 | * @post electing_me is false | |
191 | * @post leader_acked equals @p who | |
192 | * @post we sent an ack message to @p who | |
193 | * @post we reset the expire_event timer | |
194 | * | |
195 | * @param who Some other monitor's numeric identifier. | |
196 | */ | |
197 | void defer(int who); | |
198 | /** | |
199 | * The election has taken too long and has expired. | |
200 | * | |
201 | * This will happen when no one declared victory or started a new election | |
202 | * during the time span allowed by the expire_event timer. | |
203 | * | |
204 | * When the election expires, we will check if we were the ones who won, and | |
205 | * if so we will declare victory. If that is not the case, then we assume | |
206 | * that the one we deferred to didn't declare victory quickly enough (in fact, | |
207 | * as far as we know, it may even be dead); so, just propose ourselves as the | |
208 | * Leader. | |
209 | */ | |
210 | void expire(); | |
211 | /** | |
212 | * Declare Victory. | |
213 | * | |
214 | * We won. Or at least we believe we won, but for all intents and purposes | |
215 | * that does not matter. What matters is that we Won. | |
216 | * | |
217 | * That said, we must now bump our epoch to reflect that the election is over | |
218 | * and then we must let everybody in the quorum know we are their brand new | |
219 | * Leader. And we will also cancel our expire_event timer. | |
220 | * | |
221 | * Actually, the quorum will be now defined as the group of monitors that | |
222 | * acked us during the election process. | |
223 | * | |
224 | * @pre Election is on-going | |
225 | * @pre electing_me is true | |
226 | * @post electing_me is false | |
227 | * @post epoch is bumped up into an even value | |
228 | * @post Election is not on-going | |
229 | * @post We have a quorum, composed of the monitors that acked us | |
230 | * @post We sent a message of type OP_VICTORY to each quorum member. | |
231 | */ | |
232 | void victory(); | |
233 | ||
234 | /** | |
235 | * Handle a message from some other node proposing itself to become | |
236 | * the Leader. | |
237 | * | |
238 | * If the message appears to be old (i.e., its epoch is lower than our epoch), | |
239 | * then we may take one of two actions: | |
240 | * | |
241 | * @li Ignore it because it's nothing more than an old proposal | |
242 | * @li Start new elections if we verify that it was sent by a monitor from | |
243 | * outside the quorum; given its old state, it's fair to assume it just | |
244 | * started, so we should start new elections so it may rejoin | |
245 | * | |
246 | * If we did not ignore the received message, then we know that this message | |
247 | * was sent by some other node proposing itself to become the Leader. So, we | |
248 | * will take one of the following actions: | |
249 | * | |
250 | * @li Ignore it because we already acked another node with higher rank | |
251 | * @li Ignore it and start a new election because we outrank it | |
252 | * @li Defer to it because it outranks us and the node we previously | |
253 | * acked, if any | |
254 | * | |
255 | * | |
256 | * @invariant The received message is an operation of type OP_PROPOSE | |
257 | * | |
258 | * @param op A message sent by another participant in the quorum. | |
259 | */ | |
260 | void handle_propose(MonOpRequestRef op); | |
261 | /** | |
262 | * Handle a message from some other participant Acking us as the Leader. | |
263 | * | |
264 | * When we receive such a message, one of three things may be happening: | |
265 | * @li We received a message with a newer epoch, which means we must have | |
266 | * somehow lost track of what was going on (maybe we rebooted), thus we | |
267 | * will start a new election | |
268 | * @li We consider ourselves in the run for the Leader (i.e., @p electing_me | |
269 | * is true), and we are actually being Acked by someone; thus simply add | |
270 | * the one acking us to the @p acked_me set. If we do now have acks from | |
271 | * all the participants, then we can declare victory | |
272 | * @li We already deferred the election to somebody else, so we will just | |
273 | * ignore this message | |
274 | * | |
275 | * @pre Election is on-going | |
276 | * @post Election is on-going if we deferred to somebody else | |
277 | * @post Election is on-going if we are still waiting for further Acks | |
278 | * @post Election is not on-going if we are victorious | |
279 | * @post Election is not on-going if we must start a new one | |
280 | * | |
281 | * @param op A message with an operation type of OP_ACK | |
282 | */ | |
283 | void handle_ack(MonOpRequestRef op); | |
284 | /** | |
285 | * Handle a message from some other participant declaring Victory. | |
286 | * | |
287 | * We just got a message from someone declaring themselves Victorious, thus | |
288 | * the new Leader. | |
289 | * | |
290 | * However, if the message's epoch happens to be different from our epoch+1, | |
291 | * then it means we lost track of something and we must start a new election. | |
292 | * | |
293 | * If that is not the case, then we will simply update our epoch to the one | |
294 | * in the message, cancel our @p expire_event timer and inform our Monitor | |
295 | * that we lost the election and provide it with the new quorum. | |
296 | * | |
297 | * @pre Election is on-going | |
298 | * @post Election is not on-going | |
299 | * @post Updated @p epoch | |
300 | * @post We have a new quorum if we lost the election | |
301 | * | |
302 | * @param op A message with an operation type of OP_VICTORY | |
303 | */ | |
304 | void handle_victory(MonOpRequestRef op); | |
305 | /** | |
306 | * Send a nak to a peer who's out of date, containing information about why. | |
307 | * | |
308 | * If we get a message from a peer who can't support the required quorum | |
309 | * features, we have to ignore them. This function will at least send | |
310 | * them a message about *why* they're being ignored -- if they're new | |
311 | * enough to support such a message. | |
312 | * | |
313 | * @param op A message from a monitor not supporting required features. We | |
314 | * take ownership of the reference. | |
315 | */ | |
316 | void nak_old_peer(MonOpRequestRef op); | |
317 | /** | |
318 | * Handle a message from some other participant declaring | |
319 | * we cannot join the quorum. | |
320 | * | |
321 | * Apparently the quorum requires some feature that we do not implement. Shut | |
322 | * down gracefully. | |
323 | * | |
324 | * @pre Election is on-going. | |
325 | * @post We've shut down. | |
326 | * | |
327 | * @param op A message with an operation type of OP_NAK | |
328 | */ | |
329 | void handle_nak(MonOpRequestRef op); | |
330 | ||
331 | public: | |
332 | /** | |
333 | * Create an Elector instance | |
334 | * | |
335 | * @param m A Monitor instance | |
336 | */ | |
337 | explicit Elector(Monitor *m) : mon(m), | |
338 | epoch(0), | |
339 | participating(true), | |
340 | electing_me(false), | |
341 | leader_acked(-1) { } | |
342 | ||
343 | /** | |
344 | * Initialize the Elector class. | |
345 | * | |
346 | * Basically, we will simply read whatever epoch value we have in our stable | |
347 | * storage, or consider it to be 1 if none is read. | |
348 | * | |
349 | * @post @p epoch is set to 1 or higher. | |
350 | */ | |
351 | void init(); | |
352 | /** | |
353 | * Inform this class it is supposed to shut down. | |
354 | * | |
355 | * We will simply cancel the @p expire_event if any exists. | |
356 | * | |
357 | * @post @p expire_event is cancelled | |
358 | */ | |
359 | void shutdown(); | |
360 | ||
361 | /** | |
362 | * Obtain our epoch | |
363 | * | |
364 | * @returns Our current epoch number | |
365 | */ | |
366 | epoch_t get_epoch() { return epoch; } | |
367 | ||
368 | /** | |
369 | * advance_epoch | |
370 | * | |
371 | * Increase the election epoch by 1 (calls bump_epoch with epoch + 1). | |
372 | */ | |
373 | void advance_epoch() { | |
374 | bump_epoch(epoch + 1); | |
375 | } | |
376 | ||
377 | /** | |
378 | * Handle received messages. | |
379 | * | |
380 | * We will ignore all messages that are not of type @p MSG_MON_ELECTION | |
381 | * (i.e., messages whose interface is not of type @p MMonElection). All of | |
382 | * those that are will then be dispatched to their operation-specific | |
383 | * functions. | |
384 | * | |
385 | * @param op A received message | |
386 | */ | |
387 | void dispatch(MonOpRequestRef op); | |
388 | ||
389 | /** | |
390 | * Call an election. | |
391 | * | |
392 | * This function simply calls Elector::start. | |
393 | */ | |
394 | void call_election() { | |
395 | start(); | |
396 | } | |
397 | ||
398 | /** | |
399 | * Stop participating in subsequent Elections. | |
400 | * | |
401 | * @post @p participating is false | |
402 | */ | |
403 | void stop_participating() { participating = false; } | |
404 | /** | |
405 | * Start participating in Elections. | |
406 | * | |
407 | * If we are already participating (i.e., @p participating is true), then | |
408 | * calling this function is moot. | |
409 | * | |
410 | * However, if we are not participating (i.e., @p participating is false), | |
411 | * then we will start participating by setting @p participating to true and | |
412 | * we will call for an Election. | |
413 | * | |
414 | * @post @p participating is true | |
415 | */ | |
416 | void start_participating(); | |
417 | ||
418 | /** | |
419 | * @} | |
420 | */ | |
421 | }; | |
422 | ||
423 | #endif |