git.proxmox.com Git - ceph.git/blob - ceph/src/mon/Elector.h
a581daa7ff5cc882ab8b99b1f0630eaf05acecec
[ceph.git] / ceph / src / mon / Elector.h
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15
16 #ifndef CEPH_MON_ELECTOR_H
17 #define CEPH_MON_ELECTOR_H
18
19 #include <map>
20
21 #include "include/types.h"
22 #include "include/Context.h"
23 #include "mon/MonOpRequest.h"
24 #include "mon/mon_types.h"
25 #include "mon/ElectionLogic.h"
26 #include "mon/ConnectionTracker.h"
27
28 class Monitor;
29
30
31 /**
32 * This class is responsible for handling messages and maintaining
33 * an ElectionLogic which holds the local state when electing
34 * a new Leader. We may win or we may lose. If we win, it means we became the
35 * Leader; if we lose, it means we are a Peon.
36 */
37 class Elector : public ElectionOwner, RankProvider {
38 /**
39 * @defgroup Elector_h_class Elector
40 * @{
41 */
42 ElectionLogic logic;
43 // connectivity validation and scoring
44 ConnectionTracker peer_tracker;
45 std::map<int, utime_t> peer_acked_ping; // rank -> last ping stamp they acked
46 std::map<int, utime_t> peer_sent_ping; // rank -> last ping stamp we sent
47 std::set<int> live_pinging; // ranks which we are currently pinging
48 std::set<int> dead_pinging; // ranks which didn't answer (degrading scores)
49 double ping_timeout; // the timeout after which we consider a ping to be dead
50 int PING_DIVISOR = 2; // we time out pings
51
52 /**
53 * @defgroup Elector_h_internal_types Internal Types
54 * @{
55 */
56 /**
57 * This struct will hold the features from a given peer.
58 * Features may both be the cluster's (in the form of a uint64_t), or
59 * mon-specific features. Instead of keeping maps to hold them both, or
60 * a pair, which would be weird, a struct to keep them seems appropriate.
61 */
62 struct elector_info_t {
63 uint64_t cluster_features = 0;
64 mon_feature_t mon_features;
65 ceph_release_t mon_release{0};
66 std::map<std::string,std::string> metadata;
67 };
68
69 /**
70 * @}
71 */
72
73 /**
74 * The Monitor instance associated with this class.
75 */
76 Monitor *mon;
77
78 /**
79 * Event callback responsible for dealing with an expired election once a
80 * timer runs out and fires up.
81 */
82 Context *expire_event = nullptr;
83
84 /**
85 * Resets the expire_event timer, by cancelling any existing one and
86 * scheduling a new one.
87 *
88 * @remarks This function assumes as a default firing value the duration of
89 * the monitor's lease interval, and adds to it the value specified
90 * in @e plus
91 *
92 * @post expire_event is set
93 *
94 * @param plus The amount of time to be added to the default firing value.
95 */
96 void reset_timer(double plus=0.0);
97 /**
98 * Cancel the expire_event timer, if it is defined.
99 *
100 * @post expire_event is not set
101 */
102 void cancel_timer();
103
104 // electing me
105 /**
106 * @defgroup Elector_h_electing_me_vars We are being elected
107 * @{
108 */
109 /**
110 * Map containing info of all those that acked our proposal to become the Leader.
111 * Note each peer's info.
112 */
113 std::map<int, elector_info_t> peer_info;
114 /**
115 * @}
116 */
117
118 /**
119 * Handle a message from some other node proposing itself to become it
120 * the Leader.
121 *
122 * We validate that the sending Monitor is allowed to participate based on
123 * its supported features, then pass the request to our ElectionLogic.
124 *
125 * @invariant The received message is an operation of type OP_PROPOSE
126 *
127 * @pre Message epoch is from the current or a newer epoch
128 *
129 * @param m A message sent by another participant in the quorum.
130 */
131 void handle_propose(MonOpRequestRef op);
132 /**
133 * Handle a message from some other participant Acking us as the Leader.
134 *
135 * We validate that the sending Monitor is allowed to participate based on
136 * its supported features, add it to peer_info, and pass the ack to our
137 * ElectionLogic.
138 *
139 * @pre Message epoch is from the current or a newer epoch
140 *
141 * @param m A message with an operation type of OP_ACK
142 */
143 void handle_ack(MonOpRequestRef op);
144 /**
145 * Handle a message from some other participant declaring Victory.
146 *
147 * We just got a message from someone declaring themselves Victorious, thus
148 * the new Leader.
149 *
150 * We pass the Victory to our ElectionLogic, and if it confirms the
151 * victory we lose the election and start following this Leader. Otherwise,
152 * drop the message.
153 *
154 * @pre Message epoch is from the current or a newer epoch
155 * @post Election is not on-going
156 * @post Updated @p epoch
157 * @post We have a new quorum if we lost the election
158 *
159 * @param m A message with an operation type of OP_VICTORY
160 */
161 void handle_victory(MonOpRequestRef op);
162 /**
163 * Send a nak to a peer who's out of date, containing information about why.
164 *
165 * If we get a message from a peer who can't support the required quorum
166 * features, we have to ignore them. This function will at least send
167 * them a message about *why* they're being ignored -- if they're new
168 * enough to support such a message.
169 *
170 * @param m A message from a monitor not supporting required features. We
171 * take ownership of the reference.
172 */
173 void nak_old_peer(MonOpRequestRef op);
174 /**
175 * Handle a message from some other participant declaring
176 * we cannot join the quorum.
177 *
178 * Apparently the quorum requires some feature that we do not implement. Shut
179 * down gracefully.
180 *
181 * @pre Election is on-going.
182 * @post We've shut down.
183 *
184 * @param m A message with an operation type of OP_NAK
185 */
186 void handle_nak(MonOpRequestRef op);
187 /**
188 * Send a ping to the specified peer.
189 * @n optional time that we will use instead of calling ceph_clock_now()
190 */
191 void send_peer_ping(int peer, const utime_t *n=NULL);
192 /**
193 * Check the state of pinging the specified peer. This is our
194 * "tick" for heartbeating; scheduled by itself and begin_peer_ping().
195 */
196 void ping_check(int peer);
197 /**
198 * Move the peer out of live_pinging into dead_pinging set
199 * and schedule dead_ping()ing on it.
200 */
201 void begin_dead_ping(int peer);
202 /**
203 * Checks that the peer is still marked for dead pinging,
204 * and then marks it as dead for the appropriate interval.
205 */
206 void dead_ping(int peer);
207 /**
208 * Handle a ping from another monitor and assimilate the data it contains.
209 */
210 void handle_ping(MonOpRequestRef op);
211 /**
212 * Update our view of everybody else's connectivity based on the provided
213 * tracker bufferlist
214 */
215 void assimilate_connection_reports(const bufferlist& bl);
216
217 public:
218 /**
219 * @defgroup Elector_h_ElectionOwner Functions from the ElectionOwner interface
220 * @{
221 */
222 /* Commit the given epoch to our MonStore.
223 * We also take the opportunity to persist our peer_tracker.
224 */
225 void persist_epoch(epoch_t e);
226 /* Read the epoch out of our MonStore */
227 epoch_t read_persisted_epoch() const;
228 /* Write a nonsense key "election_writeable_test" to our MonStore */
229 void validate_store();
230 /* Reset my tracking. Currently, just call Monitor::join_election() */
231 void notify_bump_epoch();
232 /* Call a new election: Invoke Monitor::start_election() */
233 void trigger_new_election();
234 /* Retrieve rank from the Monitor */
235 int get_my_rank() const;
236 /* Send MMonElection OP_PROPOSE to every monitor in the map. */
237 void propose_to_peers(epoch_t e, bufferlist &bl);
238 /* bootstrap() the Monitor */
239 void reset_election();
240 /* Retrieve the Monitor::has_ever_joined member */
241 bool ever_participated() const;
242 /* Retrieve monmap->size() */
243 unsigned paxos_size() const;
244 /* Right now we don't disallow anybody */
245 std::set<int> disallowed_leaders;
246 const std::set<int>& get_disallowed_leaders() const { return disallowed_leaders; }
247 /**
248 * Reset the expire_event timer so we can limit the amount of time we
249 * will be electing. Clean up our peer_info.
250 *
251 * @post we reset the expire_event timer
252 */
253 void _start();
254 /**
255 * Send an MMonElection message deferring to the identified monitor. We
256 * also increase the election timeout so the monitor we defer to
257 * has some time to gather deferrals and actually win. (FIXME: necessary to protocol?)
258 *
259 * @post we sent an ack message to @p who
260 * @post we reset the expire_event timer
261 *
262 * @param who Some other monitor's numeric identifier.
263 */
264 void _defer_to(int who);
265 /**
266 * Our ElectionLogic told us we won an election! Identify the quorum
267 * features, tell our new peons we've won, and invoke Monitor::win_election().
268 */
269 void message_victory(const std::set<int>& quorum);
270 /* Check if rank is in mon->quorum */
271 bool is_current_member(int rank) const;
272 /*
273 * @}
274 */
275 /**
276 * Persist our peer_tracker to disk.
277 */
278 void persist_connectivity_scores();
279
280 Elector *elector;
281
282 /**
283 * Create an Elector class
284 *
285 * @param m A Monitor instance
286 * @param strategy The election strategy to use, defined in MonMap/ElectionLogic
287 */
288 explicit Elector(Monitor *m, int strategy);
289 virtual ~Elector() {}
290
291 /**
292 * Inform this class it is supposed to shutdown.
293 *
294 * We will simply cancel the @p expire_event if any exists.
295 *
296 * @post @p expire_event is cancelled
297 */
298 void shutdown();
299
300 /**
301 * Obtain our epoch from ElectionLogic.
302 *
303 * @returns Our current epoch number
304 */
305 epoch_t get_epoch() { return logic.get_epoch(); }
306
307 /**
308 * If the Monitor knows there are no Paxos peers (so
309 * we are rank 0 and there are no others) we can declare victory.
310 */
311 void declare_standalone_victory() {
312 logic.declare_standalone_victory();
313 }
314 /**
315 * Tell the Elector to start pinging a given peer.
316 * Do this when you discover a peer and it has a rank assigned.
317 * We do it ourselves on receipt of pings and when receiving other messages.
318 */
319 void begin_peer_ping(int peer);
320 /**
321 * Handle received messages.
322 *
323 * We will ignore all messages that are not of type @p MSG_MON_ELECTION
324 * (i.e., messages whose interface is not of type @p MMonElection). All of
325 * those that are will then be dispatched to their operation-specific
326 * functions.
327 *
328 * @param m A received message
329 */
330 void dispatch(MonOpRequestRef op);
331
332 /**
333 * Call an election.
334 *
335 * This function simply calls ElectionLogic::start.
336 */
337 void call_election() {
338 logic.start();
339 }
340
341 /**
342 * Stop participating in subsequent Elections.
343 *
344 * @post @p participating is false
345 */
346 void stop_participating() { logic.participating = false; }
347 /**
348 * Start participating in Elections.
349 *
350 * If we are already participating (i.e., @p participating is true), then
351 * calling this function is moot.
352 *
353 * However, if we are not participating (i.e., @p participating is false),
354 * then we will start participating by setting @p participating to true and
355 * we will call for an Election.
356 *
357 * @post @p participating is true
358 */
359 void start_participating();
360 /**
361 * Forget everything about our peers. :(
362 */
363 void notify_clear_peer_state();
364 /**
365 * Notify that our local rank has changed
366 * and we may need to update internal data structures.
367 */
368 void notify_rank_changed(int new_rank);
369 /**
370 * A peer has been removed so we should clean up state related to it.
371 * This is safe to call even if we haven't joined or are currently
372 * in a quorum.
373 */
374 void notify_rank_removed(int rank_removed);
375 void notify_strategy_maybe_changed(int strategy);
376 /**
377 * Set the disallowed leaders.
378 *
379 * If you call this and the new disallowed set
380 * contains your current leader, you are
381 * responsible for calling an election!
382 *
383 * @returns false if the set is unchanged,
384 * true if the set changed
385 */
386 bool set_disallowed_leaders(const std::set<int>& dl) {
387 if (dl == disallowed_leaders) return false;
388 disallowed_leaders = dl;
389 return true;
390 }
391 void dump_connection_scores(Formatter *f) {
392 f->open_object_section("connection scores");
393 peer_tracker.dump(f);
394 f->close_section();
395 }
396 /**
397 * @}
398 */
399 };
400
401 #endif