]> git.proxmox.com Git - ceph.git/blob - ceph/src/mon/ElectionLogic.h
import quincy beta 17.1.0
[ceph.git] / ceph / src / mon / ElectionLogic.h
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15
16 #ifndef CEPH_ELECTIONLOGIC_H
17 #define CEPH_ELECTIONLOGIC_H
18
#include <map>
#include <memory>
#include <set>
#include "include/types.h"
#include "ConnectionTracker.h"
23
24 class ElectionOwner {
25 public:
26 /**
27 * Write down the given epoch in persistent storage, such that it
28 * can later be retrieved by read_persisted_epoch even across process
29 * or machine restarts.
30 *
31 * @param e The epoch to write
32 */
33 virtual void persist_epoch(epoch_t e) = 0;
34 /**
35 * Retrieve the most-previously-persisted epoch.
36 *
37 * @returns The latest epoch passed to persist_epoch()
38 */
39 virtual epoch_t read_persisted_epoch() const = 0;
40 /**
41 * Validate that the persistent store is working by committing
42 * to it. (There is no interface for retrieving the value; this
43 * tests local functionality before doing things like triggering
44 * elections to try and join a quorum.)
45 */
46 virtual void validate_store() = 0;
47 /**
48 * Notify the ElectionOwner that ElectionLogic has increased its
49 * election epoch. This resets an election (either on local loss or victory,
50 * or when trying a new election round) and the ElectionOwner
51 * should reset any tracking of its own to match. (The ElectionLogic
52 * will further trigger sending election messages if that is
53 * appropriate.)
54 */
55 virtual void notify_bump_epoch() = 0;
56 /**
57 * Notify the ElectionOwner we must start a new election.
58 */
59 virtual void trigger_new_election() = 0;
60 /**
61 * Retrieve this Paxos instance's rank.
62 */
63 virtual int get_my_rank() const = 0;
64 /**
65 * Send a PROPOSE message to all our peers. This happens when
66 * we have started a new election (which may mean attempting to
67 * override a current one).
68 *
69 * @param e The election epoch of our proposal.
70 * @param bl A bufferlist containing data the logic wishes to share
71 */
72 virtual void propose_to_peers(epoch_t e, bufferlist& bl) = 0;
73 /**
74 * The election has failed and we aren't sure what the state of the
75 * quorum is, so reset the entire system as if from scratch.
76 */
77 virtual void reset_election() = 0;
78 /**
79 * Ask the ElectionOwner if we-the-Monitor have ever participated in the
80 * quorum (including across process restarts!).
81 *
82 * @returns true if we have participated, false otherwise
83 */
84 virtual bool ever_participated() const = 0;
85 /**
86 * Ask the ElectionOwner for the size of the Paxos set. This includes
87 * those monitors which may not be in the current quorum!
88 * The value returned by this function can change between elections,
89 * but not during them. (In practical terms, it can be updated
90 * by making a paxos commit, but not by injecting values while
91 * an election is ongoing.)
92 */
93 virtual unsigned paxos_size() const = 0;
94 /**
95 * Retrieve a set of ranks which are not allowed to become the leader.
96 * Like paxos_size(), This set can change between elections, but not
97 * during them.
98 */
99 virtual const std::set<int>& get_disallowed_leaders() const = 0;
100 /**
101 * Tell the ElectionOwner we have started a new election.
102 *
103 * The ElectionOwner is responsible for timing out the election (by invoking
104 * end_election_period()) if it takes too long (as defined by the ElectionOwner).
105 * This function is the opportunity to do that and to clean up any other external
106 * election state it may be maintaining.
107 */
108 virtual void _start() = 0;
109 /**
110 * Tell the ElectionOwner to defer to the identified peer. Tell that peer
111 * we have deferred to it.
112 *
113 * @post we sent an ack message to @p who
114 */
115 virtual void _defer_to(int who) = 0;
116 /**
117 * We have won an election, so have the ElectionOwner message that to
118 * our new quorum!
119 *
120 * @param quorum The ranks of our peers which deferred to us and
121 * must be told of our victory
122 */
123 virtual void message_victory(const std::set<int>& quorum) = 0;
124 /**
125 * Query the ElectionOwner about if a given rank is in the
126 * currently active quorum.
127 * @param rank the Paxos rank whose status we are checking
128 * @returns true if the rank is in our current quorum, false otherwise.
129 */
130 virtual bool is_current_member(int rank) const = 0;
131 virtual ~ElectionOwner() {}
132 };
133
134 /**
135 * This class maintains local state for running an election
136 * between Paxos instances. It receives input requests
137 * and calls back out to its ElectionOwner to do persistence
138 * and message other entities.
139 */
140
141 class ElectionLogic {
142 ElectionOwner *elector;
143 ConnectionTracker *peer_tracker;
144
145 CephContext *cct;
146 /**
147 * Latest epoch we've seen.
148 *
149 * @remarks if its value is odd, we're electing; if it's even, then we're
150 * stable.
151 */
152 epoch_t epoch = 0;
153 /**
154 * The last rank which won an election we participated in
155 */
156 int last_election_winner = -1;
157 /**
158 * Only used in the connectivity handler.
159 * The rank we voted for in the last election we voted in.
160 */
161 int last_voted_for = -1;
162 double ignore_propose_margin = 0.0001;
163 /**
164 * Only used in the connectivity handler.
165 * Points at a stable copy of the peer_tracker we use to keep scores
166 * throughout an election period.
167 */
168 std::unique_ptr<ConnectionTracker> stable_peer_tracker;
169 std::unique_ptr<ConnectionTracker> leader_peer_tracker;
170 /**
171 * Indicates who we have acked
172 */
173 int leader_acked;
174
175 public:
176 enum election_strategy {
177 // Keep in sync with MonMap.h!
178 CLASSIC = 1, // the original rank-based one
179 DISALLOW = 2, // disallow a set from being leader
180 CONNECTIVITY = 3 // includes DISALLOW, extends to prefer stronger connections
181 };
182 election_strategy strategy;
183
184 /**
185 * Indicates if we are participating in the quorum.
186 *
187 * @remarks By default, we are created as participating. We may stop
188 * participating if something explicitly sets our value
189 * false, though. If that happens, it will
190 * have to set participating=true and invoke start() for us to resume
191 * participating in the quorum.
192 */
193 bool participating;
194 /**
195 * Indicates if we are the ones being elected.
196 *
197 * We always attempt to be the one being elected if we are the ones starting
198 * the election. If we are not the ones that started it, we will only attempt
199 * to be elected if we think we might have a chance (i.e., the other guy's
200 * rank is lower than ours).
201 */
202 bool electing_me;
203 /**
204 * Set containing all those that acked our proposal to become the Leader.
205 *
206 * If we are acked by ElectionOwner::paxos_size() peers, we will declare
207 * victory.
208 */
209 std::set<int> acked_me;
210
211 ElectionLogic(ElectionOwner *e, election_strategy es, ConnectionTracker *t,
212 double ipm,
213 CephContext *c) : elector(e), peer_tracker(t), cct(c),
214 last_election_winner(-1), last_voted_for(-1),
215 ignore_propose_margin(ipm),
216 stable_peer_tracker(),
217 leader_peer_tracker(),
218 leader_acked(-1),
219 strategy(es),
220 participating(true),
221 electing_me(false) {}
222 /**
223 * Set the election strategy to use. If this is not consistent across the
224 * electing cluster, you're going to have a bad time.
225 * Defaults to CLASSIC.
226 */
227 void set_election_strategy(election_strategy es) {
228 strategy = es;
229 }
230 /**
231 * If there are no other peers in this Paxos group, ElectionOwner
232 * can simply declare victory and we will make it so.
233 *
234 * @pre paxos_size() is 1
235 * @pre get_my_rank is 0
236 */
237 void declare_standalone_victory();
238 /**
239 * Start a new election by proposing ourselves as the new Leader.
240 *
241 * Basically, send propose messages to all the peers.
242 *
243 * @pre participating is true
244 * @post epoch is an odd value
245 * @post electing_me is true
246 * @post We have invoked propose_to_peers() on our ElectionOwner
247 * @post We have invoked _start() on our ElectionOwner
248 */
249 void start();
250 /**
251 * ElectionOwner has decided the election has taken too long and expired.
252 *
253 * This will happen when no one declared victory or started a new election
254 * during the allowed time span.
255 *
256 * When the election expires, we will check if we were the ones who won, and
257 * if so we will declare victory. If that is not the case, then we assume
258 * that the one we deferred to didn't declare victory quickly enough (in fact,
259 * as far as we know, it may even be dead); so, just propose ourselves as the
260 * Leader.
261 */
262 void end_election_period();
263 /**
264 * Handle a proposal from some other node proposing asking to become
265 * the Leader.
266 *
267 * If the message appears to be old (i.e., its epoch is lower than our epoch),
268 * then we may take one of two actions:
269 *
270 * @li Ignore it because it's nothing more than an old proposal
271 * @li Start new elections if we verify that it was sent by a monitor from
272 * outside the quorum; given its old state, it's fair to assume it just
273 * started, so we should start new elections so it may rejoin. (Some
274 * handlers may choose to ignore even these, if they think it's flapping.)
275 *
276 * We pass the propose off to a propose_*_handler function based
277 * on the election strategy we're using.
278 * Only the Connectivity strategy cares about the ConnectionTracker; it should
279 * be NULL if other strategies are in use. Otherwise, it will take ownership
280 * of the underlying data and delete it as needed.
281 *
282 * @pre Message epoch is from the current or a newer epoch
283 * @param mepoch The epoch of the proposal
284 * @param from The rank proposing itself as leader
285 * @param ct Any incoming ConnectionTracker data sent with the message.
286 * Callers are responsible for deleting this -- we will copy it if we want
287 * to keep the data.
288 */
289 void receive_propose(int from, epoch_t mepoch, const ConnectionTracker *ct);
290 /**
291 * Handle a message from some other participant Acking us as the Leader.
292 *
293 * When we receive such a message, one of three thing may be happening:
294 * @li We received a message with a newer epoch, which means we must have
295 * somehow lost track of what was going on (maybe we rebooted), thus we
296 * will start a new election
297 * @li We consider ourselves in the run for the Leader (i.e., @p electing_me
298 * is true), and we are actually being Acked by someone; thus simply add
299 * the one acking us to the @p acked_me set. If we do now have acks from
300 * all the participants, then we can declare victory
301 * @li We already deferred the election to somebody else, so we will just
302 * ignore this message
303 *
304 * @pre Message epoch is from the current or a newer epoch
305 * @post Election is on-going if we deferred to somebody else
306 * @post Election is on-going if we are still waiting for further Acks
307 * @post Election is not on-going if we are victorious
308 * @post Election is not on-going if we must start a new one
309 *
310 * @param from The rank which acked us
311 * @param from_epoch The election epoch the ack belongs to
312 */
313 void receive_ack(int from, epoch_t from_epoch);
314 /**
315 * Handle a message from some other participant declaring Victory.
316 *
317 * We just got a message from someone declaring themselves Victorious, thus
318 * the new Leader.
319 *
320 * However, if the message's epoch happens to be different from our epoch+1,
321 * then it means we lost track of something and we must start a new election.
322 *
323 * If that is not the case, then we will simply update our epoch to the one
324 * in the message and invoke start() to reset the quorum.
325 *
326 * @pre from_epoch is the current or a newer epoch
327 * @post Election is not on-going
328 * @post Updated @p epoch
329 * @post We are a peon in a new quorum if we lost the election
330 *
331 * @param from The victory-claiming rank
332 * @param from_epoch The election epoch in which they claim victory
333 */
334 bool receive_victory_claim(int from, epoch_t from_epoch);
335 /**
336 * Obtain our epoch
337 *
338 * @returns Our current epoch number
339 */
340 epoch_t get_epoch() const { return epoch; }
341 int get_election_winner() { return last_election_winner; }
342
343 private:
344 /**
345 * Initiate the ElectionLogic class.
346 *
347 * Basically, we will simply read whatever epoch value we have in our stable
348 * storage, or consider it to be 1 if none is read.
349 *
350 * @post @p epoch is set to 1 or higher.
351 */
352 void init();
353 /**
354 * Update our epoch.
355 *
356 * If we come across a higher epoch, we simply update ours, also making
357 * sure we are no longer being elected (even though we could have been,
358 * we no longer are since we no longer are on that old epoch).
359 *
360 * @pre Our epoch is not larger than @p e
361 * @post Our epoch equals @p e
362 *
363 * @param e Epoch to which we will update our epoch
364 */
365 void bump_epoch(epoch_t e);
366 /**
367 * If the incoming proposal is newer, bump our own epoch; if
368 * it comes from an out-of-quorum peer, trigger a new eleciton.
369 * @returns true if you should drop this proposal, false otherwise.
370 */
371 bool propose_classic_prefix(int from, epoch_t mepoch);
372 /**
373 * Handle a proposal from another rank using the classic strategy.
374 * We will take one of the following actions:
375 *
376 * @li Ignore it because we already acked another node with higher rank
377 * @li Ignore it and start a new election because we outrank it
378 * @li Defer to it because it outranks us and the node we previously
379 * acked, if any
380 */
381 void propose_classic_handler(int from, epoch_t mepoch);
382 /**
383 * Handle a proposal from another rank using our disallow strategy.
384 * This is the same as the classic strategy except we also disallow
385 * certain ranks from becoming the leader.
386 */
387 void propose_disallow_handler(int from, epoch_t mepoch);
388 /**
389 * Handle a proposal from another rank using the connectivity strategy.
390 * We will choose to defer or not based on the ordered criteria:
391 *
392 * @li Whether the other monitor (or ourself) is on the disallow list
393 * @li Whether the other monitor or ourself has the most connectivity to peers
394 * @li Whether the other monitor or ourself has the lower rank
395 */
396 void propose_connectivity_handler(int from, epoch_t mepoch, const ConnectionTracker *ct);
397 /**
398 * Helper function for connectivity handler. Combines the disallowed list
399 * with ConnectionTracker scores.
400 */
401 double connectivity_election_score(int rank);
402 /**
403 * Defer the current election to some other monitor.
404 *
405 * This means that we will ack some other monitor and drop out from the run
406 * to become the Leader. We will only defer an election if the monitor we
407 * are deferring to outranks us.
408 *
409 * @pre @p who outranks us (i.e., who < our rank)
410 * @pre @p who outranks any other monitor we have deferred to in the past
411 * @post electing_me is false
412 * @post leader_acked equals @p who
413 * @post we triggered ElectionOwner's _defer_to() on @p who
414 *
415 * @param who Some other monitor's numeric identifier.
416 */
417 void defer(int who);
418 /**
419 * Declare Victory.
420 *
421 * We won. Or at least we believe we won, but for all intents and purposes
422 * that does not matter. What matters is that we Won.
423 *
424 * That said, we must now bump our epoch to reflect that the election is over
425 * and then we must let everybody in the quorum know we are their brand new
426 * Leader.
427 *
428 * Actually, the quorum will be now defined as the group of monitors that
429 * acked us during the election process.
430 *
431 * @pre Election is on-going
432 * @pre electing_me is true
433 * @post electing_me is false
434 * @post epoch is bumped up into an even value
435 * @post Election is not on-going
436 * @post We have a quorum, composed of the monitors that acked us
437 * @post We invoked message_victory() on the ElectionOwner
438 */
439 void declare_victory();
440 /**
441 * This is just a helper function to validate that the victory claim we
442 * get from another rank makes any sense.
443 */
444 bool victory_makes_sense(int from);
445 /**
446 * Reset some data members which we only care about while we are in an election
447 * or need to be set consistently during stable states.
448 */
449 void clear_live_election_state();
450 void reset_stable_tracker();
451 /**
452 * Only for the connectivity handler, Bump the epoch
453 * when we get a message from a newer one and clear
454 * out leader and stable tracker
455 * data so that we can switch our allegiance.
456 */
457 void connectivity_bump_epoch_in_election(epoch_t mepoch);
458 };
459
460 #endif