1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
16 #ifndef CEPH_ELECTIONLOGIC_H
17 #define CEPH_ELECTIONLOGIC_H
21 #include "include/types.h"
22 #include "ConnectionTracker.h"
27 * Write down the given epoch in persistent storage, such that it
28 * can later be retrieved by read_persisted_epoch even across process
29 * or machine restarts.
31 * @param e The epoch to write
33 virtual void persist_epoch(epoch_t e
) = 0;
35 * Retrieve the most-previously-persisted epoch.
37 * @returns The latest epoch passed to persist_epoch()
39 virtual epoch_t
read_persisted_epoch() const = 0;
41 * Validate that the persistent store is working by committing
42 * to it. (There is no interface for retrieving the value; this
43 * tests local functionality before doing things like triggering
44 * elections to try and join a quorum.)
46 virtual void validate_store() = 0;
48 * Notify the ElectionOwner that ElectionLogic has increased its
49 * election epoch. This resets an election (either on local loss or victory,
50 * or when trying a new election round) and the ElectionOwner
51 * should reset any tracking of its own to match. (The ElectionLogic
52 * will further trigger sending election messages if that is
55 virtual void notify_bump_epoch() = 0;
57 * Notify the ElectionOwner we must start a new election.
59 virtual void trigger_new_election() = 0;
61 * Retrieve this Paxos instance's rank.
63 virtual int get_my_rank() const = 0;
65 * Send a PROPOSE message to all our peers. This happens when
66 * we have started a new election (which may mean attempting to
67 * override a current one).
69 * @param e The election epoch of our proposal.
70 * @param bl A bufferlist containing data the logic wishes to share
72 virtual void propose_to_peers(epoch_t e
, bufferlist
& bl
) = 0;
74 * The election has failed and we aren't sure what the state of the
75 * quorum is, so reset the entire system as if from scratch.
77 virtual void reset_election() = 0;
79 * Ask the ElectionOwner if we-the-Monitor have ever participated in the
80 * quorum (including across process restarts!).
82 * @returns true if we have participated, false otherwise
84 virtual bool ever_participated() const = 0;
86 * Ask the ElectionOwner for the size of the Paxos set. This includes
87 * those monitors which may not be in the current quorum!
88 * The value returned by this function can change between elections,
89 * but not during them. (In practical terms, it can be updated
90 * by making a paxos commit, but not by injecting values while
91 * an election is ongoing.)
93 virtual unsigned paxos_size() const = 0;
95 * Retrieve a set of ranks which are not allowed to become the leader.
96 * Like paxos_size(), This set can change between elections, but not
99 virtual const std::set
<int>& get_disallowed_leaders() const = 0;
101 * Tell the ElectionOwner we have started a new election.
103 * The ElectionOwner is responsible for timing out the election (by invoking
104 * end_election_period()) if it takes too long (as defined by the ElectionOwner).
105 * This function is the opportunity to do that and to clean up any other external
106 * election state it may be maintaining.
108 virtual void _start() = 0;
110 * Tell the ElectionOwner to defer to the identified peer. Tell that peer
111 * we have deferred to it.
113 * @post we sent an ack message to @p who
115 virtual void _defer_to(int who
) = 0;
117 * We have won an election, so have the ElectionOwner message that to
120 * @param quorum The ranks of our peers which deferred to us and
121 * must be told of our victory
123 virtual void message_victory(const std::set
<int>& quorum
) = 0;
125 * Query the ElectionOwner about if a given rank is in the
126 * currently active quorum.
127 * @param rank the Paxos rank whose status we are checking
128 * @returns true if the rank is in our current quorum, false otherwise.
130 virtual bool is_current_member(int rank
) const = 0;
131 virtual ~ElectionOwner() {}
135 * This class maintains local state for running an election
136 * between Paxos instances. It receives input requests
137 * and calls back out to its ElectionOwner to do persistence
138 * and message other entities.
141 class ElectionLogic
{
142 ElectionOwner
*elector
;
143 ConnectionTracker
*peer_tracker
;
147 * Latest epoch we've seen.
149 * @remarks if its value is odd, we're electing; if it's even, then we're
/**
 * Rank of the most recent winner of an election we took part in.
 * Starts out as -1.
 */
int last_election_winner = -1;
/**
 * Only used in the connectivity handler.
 * Rank we voted for in the most recent election in which we voted.
 * Starts out as -1.
 */
int last_voted_for = -1;
// In-class default of 0.0001.
// NOTE(review): what this margin is compared against is not visible
// here -- confirm against the implementation.
double ignore_propose_margin = 0.0001;
164 * Only used in the connectivity handler.
165 * Points at a stable copy of the peer_tracker we use to keep scores
166 * throughout an election period.
168 std::unique_ptr
<ConnectionTracker
> stable_peer_tracker
;
169 std::unique_ptr
<ConnectionTracker
> leader_peer_tracker
;
171 * Indicates who we have acked
enum election_strategy {
  // These values must be kept in sync with MonMap.h!
  CLASSIC = 1,      // the original, rank-based strategy
  DISALLOW = 2,     // bars a set of ranks from being leader
  CONNECTIVITY = 3  // DISALLOW, extended to prefer stronger connections
};
182 election_strategy strategy
;
185 * Indicates if we are participating in the quorum.
187 * @remarks By default, we are created as participating. We may stop
188 * participating if something explicitly sets our value
189 * false, though. If that happens, it will
190 * have to set participating=true and invoke start() for us to resume
191 * participating in the quorum.
195 * Indicates if we are the ones being elected.
197 * We always attempt to be the one being elected if we are the ones starting
198 * the election. If we are not the ones that started it, we will only attempt
199 * to be elected if we think we might have a chance (i.e., the other guy's
200 * rank is lower than ours).
/**
 * Everyone who has acked our proposal to become the Leader.
 *
 * Once ElectionOwner::paxos_size() peers have acked us, we will declare
 * victory.
 */
std::set<int> acked_me;
211 ElectionLogic(ElectionOwner
*e
, election_strategy es
, ConnectionTracker
*t
,
213 CephContext
*c
) : elector(e
), peer_tracker(t
), cct(c
),
214 last_election_winner(-1), last_voted_for(-1),
215 ignore_propose_margin(ipm
),
216 stable_peer_tracker(),
217 leader_peer_tracker(),
221 electing_me(false) {}
223 * Set the election strategy to use. If this is not consistent across the
224 * electing cluster, you're going to have a bad time.
225 * Defaults to CLASSIC.
227 void set_election_strategy(election_strategy es
) {
/**
 * When this Paxos group has no other peers, the ElectionOwner can
 * simply declare victory and we will make it so.
 *
 * @pre paxos_size() is 1
 * @pre get_my_rank is 0
 */
void declare_standalone_victory();
239 * Start a new election by proposing ourselves as the new Leader.
241 * Basically, send propose messages to all the peers.
243 * @pre participating is true
244 * @post epoch is an odd value
245 * @post electing_me is true
246 * @post We have invoked propose_to_peers() on our ElectionOwner
247 * @post We have invoked _start() on our ElectionOwner
/**
 * The ElectionOwner has decided the election ran too long and expired.
 *
 * This happens when, within the allowed time span, nobody declared
 * victory or started a new election.
 *
 * On expiry we check whether we were the ones who won and, if so,
 * declare victory. Otherwise we assume the one we deferred to did not
 * declare victory quickly enough (as far as we know, it may even be
 * dead), so we just propose ourselves as the Leader.
 */
void end_election_period();
264 * Handle a proposal from some other node proposing asking to become
267 * If the message appears to be old (i.e., its epoch is lower than our epoch),
268 * then we may take one of two actions:
270 * @li Ignore it because it's nothing more than an old proposal
271 * @li Start new elections if we verify that it was sent by a monitor from
272 * outside the quorum; given its old state, it's fair to assume it just
273 * started, so we should start new elections so it may rejoin. (Some
274 * handlers may choose to ignore even these, if they think it's flapping.)
276 * We pass the propose off to a propose_*_handler function based
277 * on the election strategy we're using.
278 * Only the Connectivity strategy cares about the ConnectionTracker; it should
279 * be NULL if other strategies are in use. Otherwise, it will take ownership
280 * of the underlying data and delete it as needed.
282 * @pre Message epoch is from the current or a newer epoch
283 * @param mepoch The epoch of the proposal
284 * @param from The rank proposing itself as leader
285 * @param ct Any incoming ConnectionTracker data sent with the message.
286 * Callers are responsible for deleting this -- we will copy it if we want
289 void receive_propose(int from
, epoch_t mepoch
, const ConnectionTracker
*ct
);
291 * Handle a message from some other participant Acking us as the Leader.
293 * When we receive such a message, one of three thing may be happening:
294 * @li We received a message with a newer epoch, which means we must have
295 * somehow lost track of what was going on (maybe we rebooted), thus we
296 * will start a new election
297 * @li We consider ourselves in the run for the Leader (i.e., @p electing_me
298 * is true), and we are actually being Acked by someone; thus simply add
299 * the one acking us to the @p acked_me set. If we do now have acks from
300 * all the participants, then we can declare victory
301 * @li We already deferred the election to somebody else, so we will just
302 * ignore this message
304 * @pre Message epoch is from the current or a newer epoch
305 * @post Election is on-going if we deferred to somebody else
306 * @post Election is on-going if we are still waiting for further Acks
307 * @post Election is not on-going if we are victorious
308 * @post Election is not on-going if we must start a new one
310 * @param from The rank which acked us
311 * @param from_epoch The election epoch the ack belongs to
313 void receive_ack(int from
, epoch_t from_epoch
);
315 * Handle a message from some other participant declaring Victory.
317 * We just got a message from someone declaring themselves Victorious, thus
320 * However, if the message's epoch happens to be different from our epoch+1,
321 * then it means we lost track of something and we must start a new election.
323 * If that is not the case, then we will simply update our epoch to the one
324 * in the message and invoke start() to reset the quorum.
326 * @pre from_epoch is the current or a newer epoch
327 * @post Election is not on-going
328 * @post Updated @p epoch
329 * @post We are a peon in a new quorum if we lost the election
331 * @param from The victory-claiming rank
332 * @param from_epoch The election epoch in which they claim victory
334 bool receive_victory_claim(int from
, epoch_t from_epoch
);
338 * @returns Our current epoch number
340 epoch_t
get_epoch() const { return epoch
; }
341 int get_election_winner() { return last_election_winner
; }
345 * Initiate the ElectionLogic class.
347 * Basically, we will simply read whatever epoch value we have in our stable
348 * storage, or consider it to be 1 if none is read.
350 * @post @p epoch is set to 1 or higher.
356 * If we come across a higher epoch, we simply update ours, also making
357 * sure we are no longer being elected (even though we could have been,
358 * we no longer are since we no longer are on that old epoch).
360 * @pre Our epoch is not larger than @p e
361 * @post Our epoch equals @p e
363 * @param e Epoch to which we will update our epoch
365 void bump_epoch(epoch_t e
);
367 * If the incoming proposal is newer, bump our own epoch; if
368 * it comes from an out-of-quorum peer, trigger a new eleciton.
369 * @returns true if you should drop this proposal, false otherwise.
371 bool propose_classic_prefix(int from
, epoch_t mepoch
);
373 * Handle a proposal from another rank using the classic strategy.
374 * We will take one of the following actions:
376 * @li Ignore it because we already acked another node with higher rank
377 * @li Ignore it and start a new election because we outrank it
378 * @li Defer to it because it outranks us and the node we previously
381 void propose_classic_handler(int from
, epoch_t mepoch
);
383 * Handle a proposal from another rank using our disallow strategy.
384 * This is the same as the classic strategy except we also disallow
385 * certain ranks from becoming the leader.
387 void propose_disallow_handler(int from
, epoch_t mepoch
);
389 * Handle a proposal from another rank using the connectivity strategy.
390 * We will choose to defer or not based on the ordered criteria:
392 * @li Whether the other monitor (or ourself) is on the disallow list
393 * @li Whether the other monitor or ourself has the most connectivity to peers
394 * @li Whether the other monitor or ourself has the lower rank
396 void propose_connectivity_handler(int from
, epoch_t mepoch
, const ConnectionTracker
*ct
);
/**
 * Helper for the connectivity handler: combines the disallowed list
 * with ConnectionTracker scores.
 */
double connectivity_election_score(int rank);
403 * Defer the current election to some other monitor.
405 * This means that we will ack some other monitor and drop out from the run
406 * to become the Leader. We will only defer an election if the monitor we
407 * are deferring to outranks us.
409 * @pre @p who outranks us (i.e., who < our rank)
410 * @pre @p who outranks any other monitor we have deferred to in the past
411 * @post electing_me is false
412 * @post leader_acked equals @p who
413 * @post we triggered ElectionOwner's _defer_to() on @p who
415 * @param who Some other monitor's numeric identifier.
/**
 * We won. Or at least we believe we won; for all intents and purposes
 * the difference does not matter. What matters is that we Won.
 *
 * We must now bump our epoch to reflect that the election is over, and
 * then let everybody in the quorum know we are their brand new Leader.
 *
 * The quorum is now defined as the group of monitors that acked us
 * during the election process.
 *
 * @pre Election is on-going
 * @pre electing_me is true
 * @post electing_me is false
 * @post epoch is bumped up into an even value
 * @post Election is not on-going
 * @post We have a quorum, composed of the monitors that acked us
 * @post We invoked message_victory() on the ElectionOwner
 */
void declare_victory();
/**
 * Helper that validates whether a victory claim received from another
 * rank makes any sense.
 */
bool victory_makes_sense(int from);
/**
 * Reset those data members that only matter while an election is in
 * progress, or that must be set consistently during stable states.
 */
void clear_live_election_state();
// NOTE(review): no separate description accompanies this declaration;
// presumably it resets stable_peer_tracker -- confirm.
void reset_stable_tracker();
452 * Only for the connectivity handler, Bump the epoch
453 * when we get a message from a newer one and clear
454 * out leader and stable tracker
455 * data so that we can switch our allegiance.
457 void connectivity_bump_epoch_in_election(epoch_t mepoch
);