]>
Commit | Line | Data |
---|---|---|
1 | // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- | |
2 | // vim: ts=8 sw=2 smarttab | |
3 | /* | |
4 | * Ceph - scalable distributed file system | |
5 | * | |
6 | * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> | |
7 | * | |
8 | * This is free software; you can redistribute it and/or | |
9 | * modify it under the terms of the GNU Lesser General Public | |
10 | * License version 2.1, as published by the Free Software | |
11 | * Foundation. See file COPYING. | |
12 | * | |
13 | */ | |
14 | ||
15 | ||
16 | #ifndef CEPH_MON_ELECTOR_H | |
17 | #define CEPH_MON_ELECTOR_H | |
18 | ||
19 | #include <map> | |
20 | using namespace std; | |
21 | ||
22 | #include "include/types.h" | |
23 | #include "include/Context.h" | |
24 | #include "mon/MonOpRequest.h" | |
25 | #include "mon/mon_types.h" | |
26 | ||
27 | class Monitor; | |
28 | ||
29 | /** | |
30 | * This class is responsible for maintaining the local state when electing | |
31 | * a new Leader. We may win or we may lose. If we win, it means we became the | |
32 | * Leader; if we lose, it means we are a Peon. | |
33 | */ | |
34 | class Elector { | |
35 | /** | |
36 | * @defgroup Elector_h_class Elector | |
37 | * @{ | |
38 | */ | |
39 | private: | |
40 | /** | |
41 | * @defgroup Elector_h_internal_types Internal Types | |
42 | * @{ | |
43 | */ | |
44 | /** | |
45 | * This struct will hold the features from a given peer. | |
46 | * Features may both be the cluster's (in the form of a uint64_t), or | |
47 | * mon-specific features. Instead of keeping maps to hold them both, or | |
48 | * a pair, which would be weird, a struct to keep them seems appropriate. | |
49 | */ | |
50 | struct elector_info_t { | |
51 | uint64_t cluster_features; | |
52 | mon_feature_t mon_features; | |
53 | map<string,string> metadata; | |
54 | }; | |
55 | ||
56 | /** | |
57 | * @} | |
58 | */ | |
59 | ||
60 | /** | |
61 | * The Monitor instance associated with this class. | |
62 | */ | |
63 | Monitor *mon; | |
64 | ||
65 | /** | |
66 | * Event callback responsible for dealing with an expired election once a | |
67 | * timer runs out and fires up. | |
68 | */ | |
69 | Context *expire_event = nullptr; | |
70 | ||
71 | /** | |
72 | * Resets the expire_event timer, by cancelling any existing one and | |
73 | * scheduling a new one. | |
74 | * | |
75 | * @remarks This function assumes as a default firing value the duration of | |
76 | * the monitor's lease interval, and adds to it the value specified | |
77 | * in @e plus | |
78 | * | |
79 | * @post expire_event is set | |
80 | * | |
81 | * @param plus The amount of time to be added to the default firing value. | |
82 | */ | |
83 | void reset_timer(double plus=0.0); | |
84 | /** | |
85 | * Cancel the expire_event timer, if it is defined. | |
86 | * | |
87 | * @post expire_event is not set | |
88 | */ | |
89 | void cancel_timer(); | |
90 | ||
91 | /** | |
92 | * Latest epoch we've seen. | |
93 | * | |
94 | * @remarks if its value is odd, we're electing; if it's even, then we're | |
95 | * stable. | |
96 | */ | |
97 | epoch_t epoch; | |
98 | ||
99 | /** | |
100 | * Indicates if we are participating in the quorum. | |
101 | * | |
102 | * @remarks By default, we are created as participating. We may stop | |
103 | * participating if the Monitor explicitly calls | |
104 | * Elector::stop_participating though. If that happens, it will | |
105 | * have to call Elector::start_participating for us to resume | |
106 | * participating in the quorum. | |
107 | */ | |
108 | bool participating; | |
109 | ||
110 | // electing me | |
111 | /** | |
112 | * @defgroup Elector_h_electing_me_vars We are being elected | |
113 | * @{ | |
114 | */ | |
115 | /** | |
116 | * Indicates if we are the ones being elected. | |
117 | * | |
118 | * We always attempt to be the one being elected if we are the ones starting | |
119 | * the election. If we are not the ones that started it, we will only attempt | |
120 | * to be elected if we think we might have a chance (i.e., the other guy's | |
121 | * rank is lower than ours). | |
122 | */ | |
123 | bool electing_me; | |
124 | /** | |
125 | * Holds the time at which we started the election. | |
126 | */ | |
127 | utime_t start_stamp; | |
128 | /** | |
129 | * Map containing all those that acked our proposal to become the Leader. | |
130 | * | |
131 | * If we are acked by everyone in the MonMap, we will declare | |
132 | * victory. Also note each peer's feature set. | |
133 | */ | |
134 | map<int, elector_info_t> acked_me; | |
135 | /** | |
136 | * @} | |
137 | */ | |
138 | /** | |
139 | * @defgroup Elector_h_electing_them_vars We are electing another guy | |
140 | * @{ | |
141 | */ | |
142 | /** | |
143 | * Indicates who we have acked | |
144 | */ | |
145 | int leader_acked; | |
146 | /** | |
147 | * Indicates when we have acked it | |
148 | */ | |
149 | utime_t ack_stamp; | |
150 | /** | |
151 | * @} | |
152 | */ | |
153 | ||
154 | /** | |
155 | * Update our epoch. | |
156 | * | |
157 | * If we come across a higher epoch, we simply update ours, also making | |
158 | * sure we are no longer being elected (even though we could have been, | |
159 | * we no longer are since we no longer are on that old epoch). | |
160 | * | |
161 | * @pre Our epoch is lower than @p e | |
162 | * @post Our epoch equals @p e | |
163 | * | |
164 | * @param e Epoch to which we will update our epoch | |
165 | */ | |
166 | void bump_epoch(epoch_t e); | |
167 | ||
168 | /** | |
169 | * Start new elections by proposing ourselves as the new Leader. | |
170 | * | |
171 | * Basically, send propose messages to all the monitors in the MonMap and | |
172 | * then reset the expire_event timer so we can limit the amount of time we | |
173 | * will be going at it. | |
174 | * | |
175 | * @pre participating is true | |
176 | * @post epoch is an odd value | |
177 | * @post electing_me is true | |
178 | * @post we sent propose messages to all the monitors in the MonMap | |
179 | * @post we reset the expire_event timer | |
180 | */ | |
181 | void start(); | |
182 | /** | |
183 | * Defer the current election to some other monitor. | |
184 | * | |
185 | * This means that we will ack some other monitor and drop out from the run | |
186 | * to become the Leader. We will only defer an election if the monitor we | |
187 | * are deferring to outranks us. | |
188 | * | |
189 | * @pre @p who outranks us (i.e., who < our rank) | |
190 | * @pre @p who outranks any other monitor we have deferred to in the past | |
191 | * @post electing_me is false | |
192 | * @post leader_acked equals @p who | |
193 | * @post we sent an ack message to @p who | |
194 | * @post we reset the expire_event timer | |
195 | * | |
196 | * @param who Some other monitor's numeric identifier. | |
197 | */ | |
198 | void defer(int who); | |
199 | /** | |
200 | * The election has taken too long and has expired. | |
201 | * | |
202 | * This will happen when no one declared victory or started a new election | |
203 | * during the time span allowed by the expire_event timer. | |
204 | * | |
205 | * When the election expires, we will check if we were the ones who won, and | |
206 | * if so we will declare victory. If that is not the case, then we assume | |
207 | * that the one we deferred to didn't declare victory quickly enough (in fact, | |
208 | * as far as we know, it may even be dead); so, just propose ourselves as the | |
209 | * Leader. | |
210 | */ | |
211 | void expire(); | |
212 | /** | |
213 | * Declare Victory. | |
214 | * | |
215 | * We won. Or at least we believe we won, but for all intents and purposes | |
216 | * that does not matter. What matters is that we Won. | |
217 | * | |
218 | * That said, we must now bump our epoch to reflect that the election is over | |
219 | * and then we must let everybody in the quorum know we are their brand new | |
220 | * Leader. And we will also cancel our expire_event timer. | |
221 | * | |
222 | * Actually, the quorum will be now defined as the group of monitors that | |
223 | * acked us during the election process. | |
224 | * | |
225 | * @pre Election is on-going | |
226 | * @pre electing_me is true | |
227 | * @post electing_me is false | |
228 | * @post epoch is bumped up into an even value | |
229 | * @post Election is not on-going | |
230 | * @post We have a quorum, composed of the monitors that acked us | |
231 | * @post We sent a message of type OP_VICTORY to each quorum member. | |
232 | */ | |
233 | void victory(); | |
234 | ||
235 | /** | |
236 | * Handle a message from some other node proposing itself to become | |
237 | * the Leader. | |
238 | * | |
239 | * If the message appears to be old (i.e., its epoch is lower than our epoch), | |
240 | * then we may take one of two actions: | |
241 | * | |
242 | * @li Ignore it because it's nothing more than an old proposal | |
243 | * @li Start new elections if we verify that it was sent by a monitor from | |
244 | * outside the quorum; given its old state, it's fair to assume it just | |
245 | * started, so we should start new elections so it may rejoin | |
246 | * | |
247 | * If we did not ignore the received message, then we know that this message | |
248 | * was sent by some other node proposing itself to become the Leader. So, we | |
249 | * will take one of the following actions: | |
250 | * | |
251 | * @li Ignore it because we already acked another node with higher rank | |
252 | * @li Ignore it and start a new election because we outrank it | |
253 | * @li Defer to it because it outranks us and the node we previously | |
254 | * acked, if any | |
255 | * | |
256 | * | |
257 | * @invariant The received message is an operation of type OP_PROPOSE | |
258 | * | |
259 | * @param op A message sent by another participant in the quorum. | |
260 | */ | |
261 | void handle_propose(MonOpRequestRef op); | |
262 | /** | |
263 | * Handle a message from some other participant Acking us as the Leader. | |
264 | * | |
265 | * When we receive such a message, one of three things may be happening: | |
266 | * @li We received a message with a newer epoch, which means we must have | |
267 | * somehow lost track of what was going on (maybe we rebooted), thus we | |
268 | * will start a new election | |
269 | * @li We consider ourselves in the run for the Leader (i.e., @p electing_me | |
270 | * is true), and we are actually being Acked by someone; thus simply add | |
271 | * the one acking us to the @p acked_me map. If we do now have acks from | |
272 | * all the participants, then we can declare victory | |
273 | * @li We already deferred the election to somebody else, so we will just | |
274 | * ignore this message | |
275 | * | |
276 | * @pre Election is on-going | |
277 | * @post Election is on-going if we deferred to somebody else | |
278 | * @post Election is on-going if we are still waiting for further Acks | |
279 | * @post Election is not on-going if we are victorious | |
280 | * @post Election is not on-going if we must start a new one | |
281 | * | |
282 | * @param op A message with an operation type of OP_ACK | |
283 | */ | |
284 | void handle_ack(MonOpRequestRef op); | |
285 | /** | |
286 | * Handle a message from some other participant declaring Victory. | |
287 | * | |
288 | * We just got a message from someone declaring themselves Victorious, thus | |
289 | * the new Leader. | |
290 | * | |
291 | * However, if the message's epoch happens to be different from our epoch+1, | |
292 | * then it means we lost track of something and we must start a new election. | |
293 | * | |
294 | * If that is not the case, then we will simply update our epoch to the one | |
295 | * in the message, cancel our @p expire_event timer and inform our Monitor | |
296 | * that we lost the election and provide it with the new quorum. | |
297 | * | |
298 | * @pre Election is on-going | |
299 | * @post Election is not on-going | |
300 | * @post Updated @p epoch | |
301 | * @post We have a new quorum if we lost the election | |
302 | * | |
303 | * @param op A message with an operation type of OP_VICTORY | |
304 | */ | |
305 | void handle_victory(MonOpRequestRef op); | |
306 | /** | |
307 | * Send a nak to a peer who's out of date, containing information about why. | |
308 | * | |
309 | * If we get a message from a peer who can't support the required quorum | |
310 | * features, we have to ignore them. This function will at least send | |
311 | * them a message about *why* they're being ignored -- if they're new | |
312 | * enough to support such a message. | |
313 | * | |
314 | * @param op A message from a monitor not supporting required features. We | |
315 | * take ownership of the reference. | |
316 | */ | |
317 | void nak_old_peer(MonOpRequestRef op); | |
318 | /** | |
319 | * Handle a message from some other participant declaring | |
320 | * we cannot join the quorum. | |
321 | * | |
322 | * Apparently the quorum requires some feature that we do not implement. Shut | |
323 | * down gracefully. | |
324 | * | |
325 | * @pre Election is on-going. | |
326 | * @post We've shut down. | |
327 | * | |
328 | * @param op A message with an operation type of OP_NAK | |
329 | */ | |
330 | void handle_nak(MonOpRequestRef op); | |
331 | ||
332 | public: | |
333 | /** | |
334 | * Create an Elector class | |
335 | * | |
336 | * @param m A Monitor instance | |
337 | */ | |
338 | explicit Elector(Monitor *m) : mon(m), | |
339 | epoch(0), | |
340 | participating(true), | |
341 | electing_me(false), | |
342 | leader_acked(-1) { } | |
343 | ||
344 | /** | |
345 | * Initiate the Elector class. | |
346 | * | |
347 | * Basically, we will simply read whatever epoch value we have in our stable | |
348 | * storage, or consider it to be 1 if none is read. | |
349 | * | |
350 | * @post @p epoch is set to 1 or higher. | |
351 | */ | |
352 | void init(); | |
353 | /** | |
354 | * Inform this class it is supposed to shut down. | |
355 | * | |
356 | * We will simply cancel the @p expire_event if any exists. | |
357 | * | |
358 | * @post @p expire_event is cancelled | |
359 | */ | |
360 | void shutdown(); | |
361 | ||
362 | /** | |
363 | * Obtain our epoch | |
364 | * | |
365 | * @returns Our current epoch number | |
366 | */ | |
367 | epoch_t get_epoch() { return epoch; } | |
368 | ||
369 | /** | |
370 | * advance_epoch | |
371 | * | |
372 | * increase election epoch by 1 | |
373 | */ | |
374 | void advance_epoch() { | |
375 | bump_epoch(epoch + 1); | |
376 | } | |
377 | ||
378 | /** | |
379 | * Handle received messages. | |
380 | * | |
381 | * We will ignore all messages that are not of type @p MSG_MON_ELECTION | |
382 | * (i.e., messages whose interface is not of type @p MMonElection). All of | |
383 | * those that are will then be dispatched to their operation-specific | |
384 | * functions. | |
385 | * | |
386 | * @param op A received message | |
387 | */ | |
388 | void dispatch(MonOpRequestRef op); | |
389 | ||
390 | /** | |
391 | * Call an election. | |
392 | * | |
393 | * This function simply calls Elector::start. | |
394 | */ | |
395 | void call_election() { | |
396 | start(); | |
397 | } | |
398 | ||
399 | /** | |
400 | * Stop participating in subsequent Elections. | |
401 | * | |
402 | * @post @p participating is false | |
403 | */ | |
404 | void stop_participating() { participating = false; } | |
405 | /** | |
406 | * Start participating in Elections. | |
407 | * | |
408 | * If we are already participating (i.e., @p participating is true), then | |
409 | * calling this function is moot. | |
410 | * | |
411 | * However, if we are not participating (i.e., @p participating is false), | |
412 | * then we will start participating by setting @p participating to true and | |
413 | * we will call for an Election. | |
414 | * | |
415 | * @post @p participating is true | |
416 | */ | |
417 | void start_participating(); | |
418 | ||
419 | /** | |
420 | * @} | |
421 | */ | |
422 | }; | |
423 | ||
424 | #endif |