]>
git.proxmox.com Git - ceph.git/blob - ceph/src/mon/ElectionLogic.h
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
16 #ifndef CEPH_ELECTIONLOGIC_H
17 #define CEPH_ELECTIONLOGIC_H
20 #include "include/types.h"
25 * Write down the given epoch in persistent storage, such that it
26 * can later be retrieved by read_persisted_epoch even across process
27 * or machine restarts.
29 * @param e The epoch to write
31 virtual void persist_epoch(epoch_t e
) = 0;
33 * Retrieve the most-recently-persisted epoch.
35 * @returns The latest epoch passed to persist_epoch()
37 virtual epoch_t
read_persisted_epoch() const = 0;
39 * Validate that the persistent store is working by committing
40 * to it. (There is no interface for retrieving the value; this
41 * tests local functionality before doing things like triggering
42 * elections to try and join a quorum.)
44 virtual void validate_store() = 0;
46 * Notify the ElectionOwner that ElectionLogic has increased its
47 * election epoch. This resets an election (either on local loss or victory,
48 * or when trying a new election round) and the ElectionOwner
49 * should reset any tracking of its own to match. (The ElectionLogic
50 * will further trigger sending election messages if that is
53 virtual void notify_bump_epoch() = 0;
55 * Notify the ElectionOwner we must start a new election.
57 virtual void trigger_new_election() = 0;
59 * Retrieve this Paxos instance's rank.
61 virtual int get_my_rank() const = 0;
63 * Send a PROPOSE message to all our peers. This happens when
64 * we have started a new election (which may mean attempting to
65 * override a current one).
67 * @param e The election epoch of our proposal.
69 virtual void propose_to_peers(epoch_t e
) = 0;
71 * The election has failed and we aren't sure what the state of the
72 * quorum is, so reset the entire system as if from scratch.
74 virtual void reset_election() = 0;
76 * Ask the ElectionOwner if we-the-Monitor have ever participated in the
77 * quorum (including across process restarts!).
79 * @returns true if we have participated, false otherwise
81 virtual bool ever_participated() const = 0;
83 * Ask the ElectionOwner for the size of the Paxos set. This includes
84 * those monitors which may not be in the current quorum!
86 virtual unsigned paxos_size() const = 0;
88 * Tell the ElectionOwner we have started a new election.
90 * The ElectionOwner is responsible for timing out the election (by invoking
91 * end_election_period()) if it takes too long (as defined by the ElectionOwner).
92 * This function is the opportunity to do that and to clean up any other external
93 * election state it may be maintaining.
95 virtual void _start() = 0;
97 * Tell the ElectionOwner to defer to the identified peer. Tell that peer
98 * we have deferred to it.
100 * @post we sent an ack message to @p who
102 virtual void _defer_to(int who
) = 0;
104 * We have won an election, so have the ElectionOwner message that to
107 * @param quorum The ranks of our peers which deferred to us and
108 * must be told of our victory
110 virtual void message_victory(const std::set
<int>& quorum
) = 0;
112 * Ask the ElectionOwner whether a given rank is in the
113 * currently active quorum.
114 * @param rank the Paxos rank whose status we are checking
115 * @returns true if the rank is in our current quorum, false otherwise.
117 virtual bool is_current_member(int rank
) const = 0;
118 virtual ~ElectionOwner() {}
122 * This class maintains local state for running an election
123 * between Paxos instances. It receives input requests
124 * and calls back out to its ElectionOwner to do persistence
125 * and message other entities.
128 class ElectionLogic
{
129 ElectionOwner
*elector
;
132 * Latest epoch we've seen.
134 * @remarks if its value is odd, we're electing; if it's even, then we're
139 * Indicates who we have acked
144 * Indicates if we are participating in the quorum.
146 * @remarks By default, we are created as participating. We may stop
147 * participating if something explicitly sets our value
148 * false, though. If that happens, it will
149 * have to set participating=true and invoke start() for us to resume
150 * participating in the quorum.
154 * Indicates if we are the ones being elected.
156 * We always attempt to be the one being elected if we are the ones starting
157 * the election. If we are not the ones that started it, we will only attempt
158 * to be elected if we think we might have a chance (i.e., we outrank the
159 * other candidate).
163 * Set containing all those that acked our proposal to become the Leader.
165 * If we are acked by ElectionOwner::paxos_size() peers, we will declare
168 std::set
<int> acked_me
;
170 ElectionLogic(ElectionOwner
*e
, CephContext
*c
) : elector(e
), cct(c
),
173 electing_me(false) {}
175 * If there are no other peers in this Paxos group, ElectionOwner
176 * can simply declare victory and we will make it so.
178 * @pre paxos_size() is 1
179 * @pre get_my_rank() is 0
181 void declare_standalone_victory();
183 * Start a new election by proposing ourselves as the new Leader.
185 * Basically, send propose messages to all the peers.
187 * @pre participating is true
188 * @post epoch is an odd value
189 * @post electing_me is true
190 * @post We have invoked propose_to_peers() on our ElectionOwner
191 * @post We have invoked _start() on our ElectionOwner
195 * ElectionOwner has decided the election has taken too long and expired.
197 * This will happen when no one declared victory or started a new election
198 * during the allowed time span.
200 * When the election expires, we will check if we were the ones who won, and
201 * if so we will declare victory. If that is not the case, then we assume
202 * that the one we deferred to didn't declare victory quickly enough (in fact,
203 * as far as we know, it may even be dead); so, just propose ourselves as the
206 void end_election_period();
208 * Handle a proposal from some other node asking to become
211 * If the message appears to be old (i.e., its epoch is lower than our epoch),
212 * then we may take one of two actions:
214 * @li Ignore it because it's nothing more than an old proposal
215 * @li Start new elections if we verify that it was sent by a monitor from
216 * outside the quorum; given its old state, it's fair to assume it just
217 * started, so we should start new elections so it may rejoin
219 * If we did not ignore the received message, then we know that this message
220 * was sent by some other node proposing itself to become the Leader. So, we
221 * will take one of the following actions:
223 * @li Ignore it because we already acked another node with higher rank
224 * @li Ignore it and start a new election because we outrank it
225 * @li Defer to it because it outranks us and the node we previously
228 * @pre Message epoch is from the current or a newer epoch
229 * @param mepoch The epoch of the proposal
230 * @param from The rank proposing itself as leader
232 void receive_propose(int from
, epoch_t mepoch
);
234 * Handle a message from some other participant Acking us as the Leader.
236 * When we receive such a message, one of three things may be happening:
237 * @li We received a message with a newer epoch, which means we must have
238 * somehow lost track of what was going on (maybe we rebooted), thus we
239 * will start a new election
240 * @li We consider ourselves in the run for the Leader (i.e., @p electing_me
241 * is true), and we are actually being Acked by someone; thus simply add
242 * the one acking us to the @p acked_me set. If we do now have acks from
243 * all the participants, then we can declare victory
244 * @li We already deferred the election to somebody else, so we will just
245 * ignore this message
247 * @pre Message epoch is from the current or a newer epoch
248 * @post Election is on-going if we deferred to somebody else
249 * @post Election is on-going if we are still waiting for further Acks
250 * @post Election is not on-going if we are victorious
251 * @post Election is not on-going if we must start a new one
253 * @param from The rank which acked us
254 * @param from_epoch The election epoch the ack belongs to
256 void receive_ack(int from
, epoch_t from_epoch
);
258 * Handle a message from some other participant declaring Victory.
260 * We just got a message from someone declaring themselves Victorious, thus
263 * However, if the message's epoch happens to be different from our epoch+1,
264 * then it means we lost track of something and we must start a new election.
266 * If that is not the case, then we will simply update our epoch to the one
267 * in the message and invoke start() to reset the quorum.
269 * @pre from_epoch is the current or a newer epoch
270 * @post Election is not on-going
271 * @post Updated @p epoch
272 * @post We are a peon in a new quorum if we lost the election
274 * @param from The victory-claiming rank
275 * @param from_epoch The election epoch in which they claim victory
277 bool receive_victory_claim(int from
, epoch_t from_epoch
);
281 * @returns Our current epoch number
283 epoch_t
get_epoch() const { return epoch
; }
284 int get_acked_leader() { return leader_acked
; }
288 * Initialize the ElectionLogic class.
290 * Basically, we will simply read whatever epoch value we have in our stable
291 * storage, or consider it to be 1 if none is read.
293 * @post @p epoch is set to 1 or higher.
299 * If we come across a higher epoch, we simply update ours, also making
300 * sure we are no longer being elected (even though we could have been,
301 * we no longer are since we no longer are on that old epoch).
303 * @pre Our epoch is not larger than @p e
304 * @post Our epoch equals @p e
306 * @param e Epoch to which we will update our epoch
308 void bump_epoch(epoch_t e
);
310 * Defer the current election to some other monitor.
312 * This means that we will ack some other monitor and drop out from the run
313 * to become the Leader. We will only defer an election if the monitor we
314 * are deferring to outranks us.
316 * @pre @p who outranks us (i.e., who < our rank)
317 * @pre @p who outranks any other monitor we have deferred to in the past
318 * @post electing_me is false
319 * @post leader_acked equals @p who
320 * @post we triggered ElectionOwner's _defer_to() on @p who
322 * @param who Some other monitor's numeric identifier.
328 * We won. Or at least we believe we won, but for all intents and purposes
329 * that does not matter. What matters is that we Won.
331 * That said, we must now bump our epoch to reflect that the election is over
332 * and then we must let everybody in the quorum know we are their brand new
335 * Actually, the quorum will be now defined as the group of monitors that
336 * acked us during the election process.
338 * @pre Election is on-going
339 * @pre electing_me is true
340 * @post electing_me is false
341 * @post epoch is bumped up into an even value
342 * @post Election is not on-going
343 * @post We have a quorum, composed of the monitors that acked us
344 * @post We invoked message_victory() on the ElectionOwner
346 void declare_victory();