]> git.proxmox.com Git - ceph.git/blob - ceph/src/mon/ElectionLogic.h
import 15.2.0 Octopus source
[ceph.git] / ceph / src / mon / ElectionLogic.h
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15
16 #ifndef CEPH_ELECTIONLOGIC_H
17 #define CEPH_ELECTIONLOGIC_H
18
#include <map>
#include <set>

#include "include/types.h"
21
22 class ElectionOwner {
23 public:
24 /**
25 * Write down the given epoch in persistent storage, such that it
26 * can later be retrieved by read_persisted_epoch even across process
27 * or machine restarts.
28 *
29 * @param e The epoch to write
30 */
31 virtual void persist_epoch(epoch_t e) = 0;
32 /**
33 * Retrieve the most-previously-persisted epoch.
34 *
35 * @returns The latest epoch passed to persist_epoch()
36 */
37 virtual epoch_t read_persisted_epoch() const = 0;
38 /**
39 * Validate that the persistent store is working by committing
40 * to it. (There is no interface for retrieving the value; this
41 * tests local functionality before doing things like triggering
42 * elections to try and join a quorum.)
43 */
44 virtual void validate_store() = 0;
45 /**
46 * Notify the ElectionOwner that ElectionLogic has increased its
47 * election epoch. This resets an election (either on local loss or victory,
48 * or when trying a new election round) and the ElectionOwner
49 * should reset any tracking of its own to match. (The ElectionLogic
50 * will further trigger sending election messages if that is
51 * appropriate.)
52 */
53 virtual void notify_bump_epoch() = 0;
54 /**
55 * Notify the ElectionOwner we must start a new election.
56 */
57 virtual void trigger_new_election() = 0;
58 /**
59 * Retrieve this Paxos instance's rank.
60 */
61 virtual int get_my_rank() const = 0;
62 /**
63 * Send a PROPOSE message to all our peers. This happens when
64 * we have started a new election (which may mean attempting to
65 * override a current one).
66 *
67 * @param e The election epoch of our proposal.
68 */
69 virtual void propose_to_peers(epoch_t e) = 0;
70 /**
71 * The election has failed and we aren't sure what the state of the
72 * quorum is, so reset the entire system as if from scratch.
73 */
74 virtual void reset_election() = 0;
75 /**
76 * Ask the ElectionOwner if we-the-Monitor have ever participated in the
77 * quorum (including across process restarts!).
78 *
79 * @returns true if we have participated, false otherwise
80 */
81 virtual bool ever_participated() const = 0;
82 /**
83 * Ask the ElectionOwner for the size of the Paxos set. This includes
84 * those monitors which may not be in the current quorum!
85 */
86 virtual unsigned paxos_size() const = 0;
87 /**
88 * Tell the ElectionOwner we have started a new election.
89 *
90 * The ElectionOwner is responsible for timing out the election (by invoking
91 * end_election_period()) if it takes too long (as defined by the ElectionOwner).
92 * This function is the opportunity to do that and to clean up any other external
93 * election state it may be maintaining.
94 */
95 virtual void _start() = 0;
96 /**
97 * Tell the ElectionOwner to defer to the identified peer. Tell that peer
98 * we have deferred to it.
99 *
100 * @post we sent an ack message to @p who
101 */
102 virtual void _defer_to(int who) = 0;
103 /**
104 * We have won an election, so have the ElectionOwner message that to
105 * our new quorum!
106 *
107 * @param quorum The ranks of our peers which deferred to us and
108 * must be told of our victory
109 */
110 virtual void message_victory(const std::set<int>& quorum) = 0;
111 /**
112 * Query the ElectionOwner about if a given rank is in the
113 * currently active quorum.
114 * @param rank the Paxos rank whose status we are checking
115 * @returns true if the rank is in our current quorum, false otherwise.
116 */
117 virtual bool is_current_member(int rank) const = 0;
118 virtual ~ElectionOwner() {}
119 };
120
121 /**
122 * This class maintains local state for running an election
123 * between Paxos instances. It receives input requests
124 * and calls back out to its ElectionOwner to do persistence
125 * and message other entities.
126 */
127
128 class ElectionLogic {
129 ElectionOwner *elector;
130 CephContext *cct;
131 /**
132 * Latest epoch we've seen.
133 *
134 * @remarks if its value is odd, we're electing; if it's even, then we're
135 * stable.
136 */
137 epoch_t epoch = 0;
138 /**
139 * Indicates who we have acked
140 */
141 int leader_acked;
142 public:
143 /**
144 * Indicates if we are participating in the quorum.
145 *
146 * @remarks By default, we are created as participating. We may stop
147 * participating if something explicitly sets our value
148 * false, though. If that happens, it will
149 * have to set participating=true and invoke start() for us to resume
150 * participating in the quorum.
151 */
152 bool participating;
153 /**
154 * Indicates if we are the ones being elected.
155 *
156 * We always attempt to be the one being elected if we are the ones starting
157 * the election. If we are not the ones that started it, we will only attempt
158 * to be elected if we think we might have a chance (i.e., the other guy's
159 * rank is lower than ours).
160 */
161 bool electing_me;
162 /**
163 * Set containing all those that acked our proposal to become the Leader.
164 *
165 * If we are acked by ElectionOwner::paxos_size() peers, we will declare
166 * victory.
167 */
168 std::set<int> acked_me;
169
170 ElectionLogic(ElectionOwner *e, CephContext *c) : elector(e), cct(c),
171 leader_acked(-1),
172 participating(true),
173 electing_me(false) {}
174 /**
175 * If there are no other peers in this Paxos group, ElectionOwner
176 * can simply declare victory and we will make it so.
177 *
178 * @pre paxos_size() is 1
179 * @pre get_my_rank is 0
180 */
181 void declare_standalone_victory();
182 /**
183 * Start a new election by proposing ourselves as the new Leader.
184 *
185 * Basically, send propose messages to all the peers.
186 *
187 * @pre participating is true
188 * @post epoch is an odd value
189 * @post electing_me is true
190 * @post We have invoked propose_to_peers() on our ElectionOwner
191 * @post We have invoked _start() on our ElectionOwner
192 */
193 void start();
194 /**
195 * ElectionOwner has decided the election has taken too long and expired.
196 *
197 * This will happen when no one declared victory or started a new election
198 * during the allowed time span.
199 *
200 * When the election expires, we will check if we were the ones who won, and
201 * if so we will declare victory. If that is not the case, then we assume
202 * that the one we deferred to didn't declare victory quickly enough (in fact,
203 * as far as we know, it may even be dead); so, just propose ourselves as the
204 * Leader.
205 */
206 void end_election_period();
207 /**
208 * Handle a proposal from some other node proposing asking to become
209 * the Leader.
210 *
211 * If the message appears to be old (i.e., its epoch is lower than our epoch),
212 * then we may take one of two actions:
213 *
214 * @li Ignore it because it's nothing more than an old proposal
215 * @li Start new elections if we verify that it was sent by a monitor from
216 * outside the quorum; given its old state, it's fair to assume it just
217 * started, so we should start new elections so it may rejoin
218 *
219 * If we did not ignore the received message, then we know that this message
220 * was sent by some other node proposing itself to become the Leader. So, we
221 * will take one of the following actions:
222 *
223 * @li Ignore it because we already acked another node with higher rank
224 * @li Ignore it and start a new election because we outrank it
225 * @li Defer to it because it outranks us and the node we previously
226 * acked, if any
227 *
228 * @pre Message epoch is from the current or a newer epoch
229 * @param mepoch The epoch of the proposal
230 * @param from The rank proposing itself as leader
231 */
232 void receive_propose(int from, epoch_t mepoch);
233 /**
234 * Handle a message from some other participant Acking us as the Leader.
235 *
236 * When we receive such a message, one of three thing may be happening:
237 * @li We received a message with a newer epoch, which means we must have
238 * somehow lost track of what was going on (maybe we rebooted), thus we
239 * will start a new election
240 * @li We consider ourselves in the run for the Leader (i.e., @p electing_me
241 * is true), and we are actually being Acked by someone; thus simply add
242 * the one acking us to the @p acked_me set. If we do now have acks from
243 * all the participants, then we can declare victory
244 * @li We already deferred the election to somebody else, so we will just
245 * ignore this message
246 *
247 * @pre Message epoch is from the current or a newer epoch
248 * @post Election is on-going if we deferred to somebody else
249 * @post Election is on-going if we are still waiting for further Acks
250 * @post Election is not on-going if we are victorious
251 * @post Election is not on-going if we must start a new one
252 *
253 * @param from The rank which acked us
254 * @param from_epoch The election epoch the ack belongs to
255 */
256 void receive_ack(int from, epoch_t from_epoch);
257 /**
258 * Handle a message from some other participant declaring Victory.
259 *
260 * We just got a message from someone declaring themselves Victorious, thus
261 * the new Leader.
262 *
263 * However, if the message's epoch happens to be different from our epoch+1,
264 * then it means we lost track of something and we must start a new election.
265 *
266 * If that is not the case, then we will simply update our epoch to the one
267 * in the message and invoke start() to reset the quorum.
268 *
269 * @pre from_epoch is the current or a newer epoch
270 * @post Election is not on-going
271 * @post Updated @p epoch
272 * @post We are a peon in a new quorum if we lost the election
273 *
274 * @param from The victory-claiming rank
275 * @param from_epoch The election epoch in which they claim victory
276 */
277 bool receive_victory_claim(int from, epoch_t from_epoch);
278 /**
279 * Obtain our epoch
280 *
281 * @returns Our current epoch number
282 */
283 epoch_t get_epoch() const { return epoch; }
284 int get_acked_leader() { return leader_acked; }
285
286 private:
287 /**
288 * Initiate the ElectionLogic class.
289 *
290 * Basically, we will simply read whatever epoch value we have in our stable
291 * storage, or consider it to be 1 if none is read.
292 *
293 * @post @p epoch is set to 1 or higher.
294 */
295 void init();
296 /**
297 * Update our epoch.
298 *
299 * If we come across a higher epoch, we simply update ours, also making
300 * sure we are no longer being elected (even though we could have been,
301 * we no longer are since we no longer are on that old epoch).
302 *
303 * @pre Our epoch is not larger than @p e
304 * @post Our epoch equals @p e
305 *
306 * @param e Epoch to which we will update our epoch
307 */
308 void bump_epoch(epoch_t e);
309 /**
310 * Defer the current election to some other monitor.
311 *
312 * This means that we will ack some other monitor and drop out from the run
313 * to become the Leader. We will only defer an election if the monitor we
314 * are deferring to outranks us.
315 *
316 * @pre @p who outranks us (i.e., who < our rank)
317 * @pre @p who outranks any other monitor we have deferred to in the past
318 * @post electing_me is false
319 * @post leader_acked equals @p who
320 * @post we triggered ElectionOwner's _defer_to() on @p who
321 *
322 * @param who Some other monitor's numeric identifier.
323 */
324 void defer(int who);
325 /**
326 * Declare Victory.
327 *
328 * We won. Or at least we believe we won, but for all intents and purposes
329 * that does not matter. What matters is that we Won.
330 *
331 * That said, we must now bump our epoch to reflect that the election is over
332 * and then we must let everybody in the quorum know we are their brand new
333 * Leader.
334 *
335 * Actually, the quorum will be now defined as the group of monitors that
336 * acked us during the election process.
337 *
338 * @pre Election is on-going
339 * @pre electing_me is true
340 * @post electing_me is false
341 * @post epoch is bumped up into an even value
342 * @post Election is not on-going
343 * @post We have a quorum, composed of the monitors that acked us
344 * @post We invoked message_victory() on the ElectionOwner
345 */
346 void declare_victory();
347 };
348
349 #endif