]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- |
2 | // vim: ts=8 sw=2 smarttab | |
3 | /* | |
4 | * Ceph - scalable distributed file system | |
5 | * | |
6 | * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> | |
7 | * | |
8 | * This is free software; you can redistribute it and/or | |
9 | * modify it under the terms of the GNU Lesser General Public | |
10 | * License version 2.1, as published by the Free Software | |
11 | * Foundation. See file COPYING. | |
12 | * | |
13 | */ | |
14 | ||
15 | ||
16 | #ifndef CEPH_MON_ELECTOR_H | |
17 | #define CEPH_MON_ELECTOR_H | |
18 | ||
19 | #include <map> | |
7c673cae FG |
20 | |
21 | #include "include/types.h" | |
22 | #include "include/Context.h" | |
23 | #include "mon/MonOpRequest.h" | |
24 | #include "mon/mon_types.h" | |
9f95a23c | 25 | #include "mon/ElectionLogic.h" |
f67539c2 | 26 | #include "mon/ConnectionTracker.h" |
7c673cae FG |
27 | |
28 | class Monitor; | |
29 | ||
f67539c2 | 30 | |
7c673cae | 31 | /** |
9f95a23c TL |
32 | * This class is responsible for handling messages and maintaining |
33 | * an ElectionLogic which holds the local state when electing | |
7c673cae FG |
34 | * a new Leader. We may win or we may lose. If we win, it means we became the |
35 | * Leader; if we lose, it means we are a Peon. | |
36 | */ | |
f67539c2 | 37 | class Elector : public ElectionOwner, RankProvider { |
7c673cae FG |
38 | /** |
39 | * @defgroup Elector_h_class Elector | |
40 | * @{ | |
41 | */ | |
9f95a23c | 42 | ElectionLogic logic; |
f67539c2 TL |
43 | // connectivity validation and scoring |
44 | ConnectionTracker peer_tracker; | |
20effc67 TL |
45 | std::map<int, utime_t> peer_acked_ping; // rank -> last ping stamp they acked |
46 | std::map<int, utime_t> peer_sent_ping; // rank -> last ping stamp we sent | |
47 | std::set<int> live_pinging; // ranks which we are currently pinging | |
48 | std::set<int> dead_pinging; // ranks which didn't answer (degrading scores) | |
f67539c2 TL |
49 | double ping_timeout; // the timeout after which we consider a ping to be dead |
50 | int PING_DIVISOR = 2; // we time out pings | |
9f95a23c | 51 | |
7c673cae FG |
52 | /** |
53 | * @defgroup Elector_h_internal_types Internal Types | |
54 | * @{ | |
55 | */ | |
56 | /** | |
57 | * This struct will hold the features from a given peer. | |
58 | * Features may be either the cluster's (in the form of a uint64_t), or |
59 | * mon-specific features. Instead of keeping maps to hold them both, or | |
60 | * a pair, which would be weird, a struct to keep them seems appropriate. | |
61 | */ | |
224ce89b | 62 | struct elector_info_t { |
11fdf7f2 | 63 | uint64_t cluster_features = 0; |
7c673cae | 64 | mon_feature_t mon_features; |
9f95a23c | 65 | ceph_release_t mon_release{0}; |
f67539c2 | 66 | std::map<std::string,std::string> metadata; |
7c673cae FG |
67 | }; |
68 | ||
69 | /** | |
70 | * @} | |
71 | */ | |
72 | ||
73 | /** | |
74 | * The Monitor instance associated with this class. | |
75 | */ | |
76 | Monitor *mon; | |
77 | ||
78 | /** | |
79 | * Event callback responsible for dealing with an expired election once a | |
80 | * timer runs out and fires up. | |
81 | */ | |
82 | Context *expire_event = nullptr; | |
83 | ||
84 | /** | |
85 | * Resets the expire_event timer, by cancelling any existing one and | |
86 | * scheduling a new one. | |
87 | * | |
88 | * @remarks This function assumes as a default firing value the duration of | |
89 | * the monitor's lease interval, and adds to it the value specified | |
90 | * in @e plus | |
91 | * | |
92 | * @post expire_event is set | |
93 | * | |
94 | * @param plus The amount of time to be added to the default firing value. | |
95 | */ | |
96 | void reset_timer(double plus=0.0); | |
97 | /** | |
98 | * Cancel the expire_event timer, if it is defined. | |
99 | * | |
100 | * @post expire_event is not set | |
101 | */ | |
102 | void cancel_timer(); | |
103 | ||
7c673cae FG |
104 | // electing me |
105 | /** | |
106 | * @defgroup Elector_h_electing_me_vars We are being elected | |
107 | * @{ | |
108 | */ | |
109 | /** | |
9f95a23c TL |
110 | * Map containing info of all those that acked our proposal to become the Leader. |
111 | * Note each peer's info. | |
7c673cae | 112 | */ |
f67539c2 | 113 | std::map<int, elector_info_t> peer_info; |
7c673cae FG |
114 | /** |
115 | * @} | |
116 | */ | |
117 | ||
7c673cae FG |
118 | /** |
119 | * Handle a message from some other node proposing itself to become |
120 | * the Leader. | |
121 | * | |
9f95a23c TL |
122 | * We validate that the sending Monitor is allowed to participate based on |
123 | * its supported features, then pass the request to our ElectionLogic. | |
7c673cae FG |
124 | * |
125 | * @invariant The received message is an operation of type OP_PROPOSE | |
126 | * | |
9f95a23c TL |
127 | * @pre Message epoch is from the current or a newer epoch |
128 | * | |
7c673cae FG |
129 | * @param op A message sent by another participant in the quorum. |
130 | */ | |
131 | void handle_propose(MonOpRequestRef op); | |
132 | /** | |
133 | * Handle a message from some other participant Acking us as the Leader. | |
134 | * | |
9f95a23c TL |
135 | * We validate that the sending Monitor is allowed to participate based on |
136 | * its supported features, add it to peer_info, and pass the ack to our | |
137 | * ElectionLogic. | |
138 | * | |
139 | * @pre Message epoch is from the current or a newer epoch | |
7c673cae FG |
140 | * |
141 | * @param op A message with an operation type of OP_ACK |
142 | */ | |
143 | void handle_ack(MonOpRequestRef op); | |
144 | /** | |
145 | * Handle a message from some other participant declaring Victory. | |
146 | * | |
147 | * We just got a message from someone declaring themselves Victorious, thus | |
148 | * the new Leader. | |
149 | * | |
9f95a23c TL |
150 | * We pass the Victory to our ElectionLogic, and if it confirms the |
151 | * victory we lose the election and start following this Leader. Otherwise, | |
152 | * drop the message. | |
7c673cae | 153 | * |
9f95a23c | 154 | * @pre Message epoch is from the current or a newer epoch |
7c673cae FG |
155 | * @post Election is not on-going |
156 | * @post Updated @p epoch | |
157 | * @post We have a new quorum if we lost the election | |
158 | * | |
159 | * @param op A message with an operation type of OP_VICTORY |
160 | */ | |
161 | void handle_victory(MonOpRequestRef op); | |
162 | /** | |
163 | * Send a nak to a peer who's out of date, containing information about why. | |
164 | * | |
165 | * If we get a message from a peer who can't support the required quorum | |
166 | * features, we have to ignore them. This function will at least send | |
167 | * them a message about *why* they're being ignored -- if they're new | |
168 | * enough to support such a message. | |
169 | * | |
170 | * @param op A message from a monitor not supporting required features. We |
171 | * take ownership of the reference. | |
172 | */ | |
173 | void nak_old_peer(MonOpRequestRef op); | |
174 | /** | |
175 | * Handle a message from some other participant declaring | |
176 | * we cannot join the quorum. | |
177 | * | |
178 | * Apparently the quorum requires some feature that we do not implement. Shut | |
179 | * down gracefully. | |
180 | * | |
181 | * @pre Election is on-going. | |
182 | * @post We've shut down. | |
183 | * | |
184 | * @param op A message with an operation type of OP_NAK |
185 | */ | |
186 | void handle_nak(MonOpRequestRef op); | |
f67539c2 TL |
187 | /** |
188 | * Send a ping to the specified peer. |
189 | * @param n Optional time that we will use instead of calling ceph_clock_now() |
190 | */ |
39ae355f | 191 | bool send_peer_ping(int peer, const utime_t *n=nullptr); |
f67539c2 TL |
192 | /** |
193 | * Check the state of pinging the specified peer. This is our | |
194 | * "tick" for heartbeating; scheduled by itself and begin_peer_ping(). | |
195 | */ | |
196 | void ping_check(int peer); | |
197 | /** | |
198 | * Move the peer out of live_pinging into dead_pinging set | |
199 | * and schedule dead_ping()ing on it. | |
200 | */ | |
201 | void begin_dead_ping(int peer); | |
202 | /** | |
203 | * Checks that the peer is still marked for dead pinging, | |
204 | * and then marks it as dead for the appropriate interval. | |
205 | */ | |
206 | void dead_ping(int peer); | |
207 | /** | |
208 | * Handle a ping from another monitor and assimilate the data it contains. | |
209 | */ | |
210 | void handle_ping(MonOpRequestRef op); | |
211 | /** | |
212 | * Update our view of everybody else's connectivity based on the provided | |
213 | * tracker bufferlist | |
214 | */ | |
215 | void assimilate_connection_reports(const bufferlist& bl); | |
7c673cae FG |
216 | |
217 | public: | |
218 | /** | |
9f95a23c TL |
219 | * @defgroup Elector_h_ElectionOwner Functions from the ElectionOwner interface |
220 | * @{ | |
221 | */ | |
f67539c2 TL |
222 | /* Commit the given epoch to our MonStore. |
223 | * We also take the opportunity to persist our peer_tracker. | |
224 | */ | |
9f95a23c TL |
225 | void persist_epoch(epoch_t e); |
226 | /* Read the epoch out of our MonStore */ | |
227 | epoch_t read_persisted_epoch() const; | |
228 | /* Write a nonsense key "election_writeable_test" to our MonStore */ | |
229 | void validate_store(); | |
230 | /* Reset my tracking. Currently, just call Monitor::join_election() */ | |
231 | void notify_bump_epoch(); | |
232 | /* Call a new election: Invoke Monitor::start_election() */ | |
233 | void trigger_new_election(); | |
234 | /* Retrieve rank from the Monitor */ | |
235 | int get_my_rank() const; | |
236 | /* Send MMonElection OP_PROPOSE to every monitor in the map. */ | |
f67539c2 | 237 | void propose_to_peers(epoch_t e, bufferlist &bl); |
9f95a23c TL |
238 | /* bootstrap() the Monitor */ |
239 | void reset_election(); | |
240 | /* Retrieve the Monitor::has_ever_joined member */ | |
241 | bool ever_participated() const; | |
242 | /* Retrieve monmap->size() */ | |
243 | unsigned paxos_size() const; | |
f67539c2 | 244 | /* Right now we don't disallow anybody */ |
20effc67 TL |
245 | std::set<int> disallowed_leaders; |
246 | const std::set<int>& get_disallowed_leaders() const { return disallowed_leaders; } | |
9f95a23c TL |
247 | /** |
248 | * Reset the expire_event timer so we can limit the amount of time we | |
249 | * will be electing. Clean up our peer_info. | |
7c673cae | 250 | * |
9f95a23c | 251 | * @post we reset the expire_event timer |
7c673cae | 252 | */ |
9f95a23c | 253 | void _start(); |
7c673cae | 254 | /** |
9f95a23c TL |
255 | * Send an MMonElection message deferring to the identified monitor. We |
256 | * also increase the election timeout so the monitor we defer to | |
257 | * has some time to gather deferrals and actually win. (FIXME: necessary to protocol?) | |
258 | * | |
259 | * @post we sent an ack message to @p who | |
260 | * @post we reset the expire_event timer | |
7c673cae | 261 | * |
9f95a23c TL |
262 | * @param who Some other monitor's numeric identifier. |
263 | */ | |
264 | void _defer_to(int who); | |
265 | /** | |
266 | * Our ElectionLogic told us we won an election! Identify the quorum | |
267 | * features, tell our new peons we've won, and invoke Monitor::win_election(). | |
268 | */ | |
269 | void message_victory(const std::set<int>& quorum); | |
270 | /* Check if rank is in mon->quorum */ | |
271 | bool is_current_member(int rank) const; | |
272 | /** |
273 | * @} | |
274 | */ | |
f67539c2 TL |
275 | /** |
276 | * Persist our peer_tracker to disk. | |
277 | */ | |
278 | void persist_connectivity_scores(); | |
9f95a23c TL |
279 | |
280 | Elector *elector; | |
281 | ||
282 | /** | |
283 | * Create an Elector class | |
7c673cae | 284 | * |
9f95a23c | 285 | * @param m A Monitor instance |
f67539c2 | 286 | * @param strategy The election strategy to use, defined in MonMap/ElectionLogic |
7c673cae | 287 | */ |
f67539c2 | 288 | explicit Elector(Monitor *m, int strategy); |
9f95a23c TL |
289 | virtual ~Elector() {} |
290 | ||
7c673cae FG |
291 | /** |
292 | * Inform this class it is supposed to shutdown. | |
293 | * | |
294 | * We will simply cancel the @p expire_event if any exists. | |
295 | * | |
296 | * @post @p expire_event is cancelled | |
297 | */ | |
298 | void shutdown(); | |
299 | ||
300 | /** | |
9f95a23c | 301 | * Obtain our epoch from ElectionLogic. |
7c673cae FG |
302 | * |
303 | * @returns Our current epoch number | |
304 | */ | |
9f95a23c | 305 | epoch_t get_epoch() { return logic.get_epoch(); } |
7c673cae FG |
306 | |
307 | /** | |
9f95a23c TL |
308 | * If the Monitor knows there are no Paxos peers (so |
309 | * we are rank 0 and there are no others) we can declare victory. | |
7c673cae | 310 | */ |
9f95a23c TL |
311 | void declare_standalone_victory() { |
312 | logic.declare_standalone_victory(); | |
7c673cae | 313 | } |
f67539c2 TL |
314 | /** |
315 | * Tell the Elector to start pinging a given peer. | |
316 | * Do this when you discover a peer and it has a rank assigned. | |
317 | * We do it ourselves on receipt of pings and when receiving other messages. | |
318 | */ | |
319 | void begin_peer_ping(int peer); | |
7c673cae FG |
320 | /** |
321 | * Handle received messages. | |
322 | * | |
323 | * We will ignore all messages that are not of type @p MSG_MON_ELECTION | |
324 | * (i.e., messages whose interface is not of type @p MMonElection). All of | |
325 | * those that are will then be dispatched to their operation-specific | |
326 | * functions. | |
327 | * | |
328 | * @param op A received message |
329 | */ | |
330 | void dispatch(MonOpRequestRef op); | |
331 | ||
332 | /** | |
333 | * Call an election. | |
334 | * | |
9f95a23c | 335 | * This function simply calls ElectionLogic::start. |
7c673cae FG |
336 | */ |
337 | void call_election() { | |
9f95a23c | 338 | logic.start(); |
7c673cae FG |
339 | } |
340 | ||
341 | /** | |
342 | * Stop participating in subsequent Elections. | |
343 | * | |
344 | * @post @p participating is false | |
345 | */ | |
9f95a23c | 346 | void stop_participating() { logic.participating = false; } |
7c673cae FG |
347 | /** |
348 | * Start participating in Elections. | |
349 | * | |
350 | * If we are already participating (i.e., @p participating is true), then | |
351 | * calling this function is moot. | |
352 | * | |
353 | * However, if we are not participating (i.e., @p participating is false), | |
354 | * then we will start participating by setting @p participating to true and | |
355 | * we will call for an Election. | |
356 | * | |
357 | * @post @p participating is true | |
358 | */ | |
359 | void start_participating(); | |
39ae355f TL |
360 | /** |
361 | * Check if our peer_tracker is self-consistent, not suffering from | |
362 | * https://tracker.ceph.com/issues/58049 | |
363 | */ | |
364 | bool peer_tracker_is_clean(); | |
f67539c2 TL |
365 | /** |
366 | * Forget everything about our peers. :( | |
367 | */ | |
368 | void notify_clear_peer_state(); | |
369 | /** | |
370 | * Notify that our local rank has changed | |
371 | * and we may need to update internal data structures. | |
372 | */ | |
373 | void notify_rank_changed(int new_rank); | |
374 | /** | |
375 | * A peer has been removed so we should clean up state related to it. | |
b3b6e05e TL |
376 | * This is safe to call even if we haven't joined or are currently |
377 | * in a quorum. | |
f67539c2 | 378 | */ |
1e59de90 | 379 | void notify_rank_removed(unsigned rank_removed, unsigned new_rank); |
f67539c2 TL |
380 | void notify_strategy_maybe_changed(int strategy); |
381 | /** | |
382 | * Set the disallowed leaders. | |
383 | * | |
384 | * If you call this and the new disallowed set | |
385 | * contains your current leader, you are | |
386 | * responsible for calling an election! | |
387 | * | |
388 | * @returns false if the set is unchanged, | |
389 | * true if the set changed | |
390 | */ | |
20effc67 | 391 | bool set_disallowed_leaders(const std::set<int>& dl) { |
f67539c2 TL |
392 | if (dl == disallowed_leaders) return false; |
393 | disallowed_leaders = dl; | |
394 | return true; | |
395 | } | |
396 | void dump_connection_scores(Formatter *f) { | |
397 | f->open_object_section("connection scores"); | |
398 | peer_tracker.dump(f); | |
399 | f->close_section(); | |
400 | } | |
7c673cae FG |
401 | /** |
402 | * @} | |
403 | */ | |
404 | }; | |
405 | ||
406 | #endif |