1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
16 #ifndef CEPH_MON_ELECTOR_H
17 #define CEPH_MON_ELECTOR_H
21 #include "include/types.h"
22 #include "include/Context.h"
23 #include "mon/MonOpRequest.h"
24 #include "mon/mon_types.h"
25 #include "mon/ElectionLogic.h"
26 #include "mon/ConnectionTracker.h"
32 * This class is responsible for handling messages and maintaining
33 * an ElectionLogic which holds the local state when electing
34 * a new Leader. We may win or we may lose. If we win, it means we became the
35 * Leader; if we lose, it means we are a Peon.
37 class Elector
: public ElectionOwner
, RankProvider
{
39 * @defgroup Elector_h_class Elector
43 // connectivity validation and scoring
44 ConnectionTracker peer_tracker
;
45 std::map
<int, utime_t
> peer_acked_ping
; // rank -> last ping stamp they acked
46 std::map
<int, utime_t
> peer_sent_ping
; // rank -> last ping stamp we sent
47 std::set
<int> live_pinging
; // ranks which we are currently pinging
48 std::set
<int> dead_pinging
; // ranks which didn't answer (degrading scores)
49 double ping_timeout
; // the timeout after which we consider a ping to be dead
50 int PING_DIVISOR
= 2; // we time out pings
53 * @defgroup Elector_h_internal_types Internal Types
57 * This struct will hold the features from a given peer.
58 * Features may both be the cluster's (in the form of a uint64_t), or
59 * mon-specific features. Instead of keeping maps to hold them both, or
60 * a pair, which would be weird, a struct to keep them seems appropriate.
62 struct elector_info_t
{
63 uint64_t cluster_features
= 0;
64 mon_feature_t mon_features
;
65 ceph_release_t mon_release
{0};
66 std::map
<std::string
,std::string
> metadata
;
74 * The Monitor instance associated with this class.
79 * Event callback responsible for dealing with an expired election once a
80 * timer runs out and fires up.
82 Context
*expire_event
= nullptr;
/**
 * Reset the expire_event timer: cancel any existing one and schedule a
 * new one.
 *
 * @remarks The default firing value is the duration of the monitor's lease
 *          interval; the value given in @p plus is added to it.
 *
 * @post expire_event is set
 *
 * @param plus The amount of time to be added to the default firing value.
 */
void reset_timer(double plus = 0.0);
98 * Cancel the expire_event timer, if it is defined.
100 * @post expire_event is not set
106 * @defgroup Elector_h_electing_me_vars We are being elected
110 * Map containing info of all those that acked our proposal to become the Leader.
111 * Note each peer's info.
113 std::map
<int, elector_info_t
> peer_info
;
119 * Handle a message from some other node proposing itself to become it
122 * We validate that the sending Monitor is allowed to participate based on
123 * its supported features, then pass the request to our ElectionLogic.
125 * @invariant The received message is an operation of type OP_PROPOSE
127 * @pre Message epoch is from the current or a newer epoch
129 * @param m A message sent by another participant in the quorum.
131 void handle_propose(MonOpRequestRef op
);
133 * Handle a message from some other participant Acking us as the Leader.
135 * We validate that the sending Monitor is allowed to participate based on
136 * its supported features, add it to peer_info, and pass the ack to our
139 * @pre Message epoch is from the current or a newer epoch
141 * @param m A message with an operation type of OP_ACK
143 void handle_ack(MonOpRequestRef op
);
145 * Handle a message from some other participant declaring Victory.
147 * We just got a message from someone declaring themselves Victorious, thus
150 * We pass the Victory to our ElectionLogic, and if it confirms the
151 * victory we lose the election and start following this Leader. Otherwise,
154 * @pre Message epoch is from the current or a newer epoch
155 * @post Election is not on-going
156 * @post Updated @p epoch
157 * @post We have a new quorum if we lost the election
159 * @param m A message with an operation type of OP_VICTORY
161 void handle_victory(MonOpRequestRef op
);
163 * Send a nak to a peer who's out of date, containing information about why.
165 * If we get a message from a peer who can't support the required quorum
166 * features, we have to ignore them. This function will at least send
167 * them a message about *why* they're being ignored -- if they're new
168 * enough to support such a message.
170 * @param m A message from a monitor not supporting required features. We
171 * take ownership of the reference.
173 void nak_old_peer(MonOpRequestRef op
);
175 * Handle a message from some other participant declaring
176 * we cannot join the quorum.
178 * Apparently the quorum requires some feature that we do not implement. Shut
181 * @pre Election is on-going.
182 * @post We've shut down.
184 * @param m A message with an operation type of OP_NAK
186 void handle_nak(MonOpRequestRef op
);
188 * Send a ping to the specified peer.
189 * @n optional time that we will use instead of calling ceph_clock_now()
191 void send_peer_ping(int peer
, const utime_t
*n
=NULL
);
/**
 * Check the state of pinging the specified peer. This is our "tick" for
 * heartbeating; it is scheduled by itself and by begin_peer_ping().
 *
 * @param peer The peer whose ping state is checked.
 */
void ping_check(int peer);
/**
 * Move the peer out of the live_pinging set into the dead_pinging set
 * and schedule dead_ping()ing on it.
 *
 * @param peer The peer that stopped answering.
 */
void begin_dead_ping(int peer);
/**
 * Check that the peer is still marked for dead pinging, and then mark it
 * as dead for the appropriate interval.
 *
 * @param peer The peer being dead-pinged.
 */
void dead_ping(int peer);
208 * Handle a ping from another monitor and assimilate the data it contains.
210 void handle_ping(MonOpRequestRef op
);
212 * Update our view of everybody else's connectivity based on the provided
215 void assimilate_connection_reports(const bufferlist
& bl
);
219 * @defgroup Elector_h_ElectionOwner Functions from the ElectionOwner interface
222 /* Commit the given epoch to our MonStore.
223 * We also take the opportunity to persist our peer_tracker.
225 void persist_epoch(epoch_t e
);
226 /* Read the epoch out of our MonStore */
227 epoch_t
read_persisted_epoch() const;
/**
 * Write a nonsense key "election_writeable_test" to our MonStore.
 */
void validate_store();
/**
 * Reset my tracking. Currently this just calls Monitor::join_election().
 */
void notify_bump_epoch();
/**
 * Call a new election by invoking Monitor::start_election().
 */
void trigger_new_election();
234 /* Retrieve rank from the Monitor */
235 int get_my_rank() const;
236 /* Send MMonElection OP_PROPOSE to every monitor in the map. */
237 void propose_to_peers(epoch_t e
, bufferlist
&bl
);
/**
 * bootstrap() the Monitor.
 */
void reset_election();
240 /* Retrieve the Monitor::has_ever_joined member */
241 bool ever_participated() const;
242 /* Retrieve monmap->size() */
243 unsigned paxos_size() const;
/**
 * Ranks that are not allowed to become Leader. Starts out empty: right
 * now we don't disallow anybody.
 */
std::set<int> disallowed_leaders;
246 const std::set
<int>& get_disallowed_leaders() const { return disallowed_leaders
; }
248 * Reset the expire_event timer so we can limit the amount of time we
249 * will be electing. Clean up our peer_info.
251 * @post we reset the expire_event timer
/**
 * Send an MMonElection message deferring to the identified monitor. The
 * election timeout is also increased so the monitor we defer to has some
 * time to gather deferrals and actually win.
 * (FIXME: necessary to protocol?)
 *
 * @post we sent an ack message to @p who
 * @post we reset the expire_event timer
 *
 * @param who Some other monitor's numeric identifier.
 */
void _defer_to(int who);
/**
 * Our ElectionLogic told us we won an election! Identify the quorum
 * features, tell our new peons we've won, and invoke
 * Monitor::win_election().
 *
 * @param quorum NOTE(review): presumably the ranks forming the new quorum.
 */
void message_victory(const std::set<int>& quorum);
270 /* Check if rank is in mon->quorum */
271 bool is_current_member(int rank
) const;
/**
 * Persist our peer_tracker to disk.
 */
void persist_connectivity_scores();
283 * Create an Elector class
285 * @param m A Monitor instance
286 * @param strategy The election strategy to use, defined in MonMap/ElectionLogic
288 explicit Elector(Monitor
*m
, int strategy
);
289 virtual ~Elector() {}
292 * Inform this class it is supposed to shutdown.
294 * We will simply cancel the @p expire_event if any exists.
296 * @post @p expire_event is cancelled
301 * Obtain our epoch from ElectionLogic.
303 * @returns Our current epoch number
305 epoch_t
get_epoch() { return logic
.get_epoch(); }
308 * If the Monitor knows there are no Paxos peers (so
309 * we are rank 0 and there are no others) we can declare victory.
311 void declare_standalone_victory() {
312 logic
.declare_standalone_victory();
/**
 * Tell the Elector to start pinging a given peer.
 *
 * Do this when you discover a peer and it has a rank assigned. The Elector
 * does it by itself on receipt of pings and when receiving other messages.
 *
 * @param peer The peer to start pinging.
 */
void begin_peer_ping(int peer);
321 * Handle received messages.
323 * We will ignore all messages that are not of type @p MSG_MON_ELECTION
324 * (i.e., messages whose interface is not of type @p MMonElection). All of
325 * those that are will then be dispatched to their operation-specific
328 * @param m A received message
330 void dispatch(MonOpRequestRef op
);
335 * This function simply calls ElectionLogic::start.
337 void call_election() {
342 * Stop participating in subsequent Elections.
344 * @post @p participating is false
346 void stop_participating() { logic
.participating
= false; }
/**
 * Start participating in Elections.
 *
 * When we are already participating (i.e., @p participating is true),
 * calling this function is moot.
 *
 * When we are not participating (i.e., @p participating is false), we
 * start participating by setting @p participating to true and then call
 * for an Election.
 *
 * @post @p participating is true
 */
void start_participating();
/**
 * Forget everything about our peers. :(
 */
void notify_clear_peer_state();
/**
 * Notify that our local rank has changed and that internal data
 * structures may need to be updated.
 *
 * @param new_rank Our new rank.
 */
void notify_rank_changed(int new_rank);
/**
 * A peer has been removed, so state related to it should be cleaned up.
 * This is safe to call even if we haven't joined.
 * NOTE(review): the original comment lists a second safe situation but is
 * cut off ("or are currently ...") -- confirm in the implementation.
 *
 * @param rank_removed The rank of the removed peer.
 */
void notify_rank_removed(int rank_removed);
/**
 * NOTE(review): undocumented in the original. Presumably tells the Elector
 * that the election strategy may have changed -- confirm against the
 * implementation.
 *
 * @param strategy The (possibly new) election strategy.
 */
void notify_strategy_maybe_changed(int strategy);
377 * Set the disallowed leaders.
379 * If you call this and the new disallowed set
380 * contains your current leader, you are
381 * responsible for calling an election!
383 * @returns false if the set is unchanged,
384 * true if the set changed
386 bool set_disallowed_leaders(const std::set
<int>& dl
) {
387 if (dl
== disallowed_leaders
) return false;
388 disallowed_leaders
= dl
;
391 void dump_connection_scores(Formatter
*f
) {
392 f
->open_object_section("connection scores");
393 peer_tracker
.dump(f
);