]>
git.proxmox.com Git - ceph.git/blob - ceph/src/mon/ElectionLogic.cc
757c86165f5a1d375834791f7dd51077d056e3c5
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
15 #include "ElectionLogic.h"
17 #include "include/ceph_assert.h"
18 #include "common/dout.h"
20 #define dout_subsys ceph_subsys_mon
22 #define dout_prefix _prefix(_dout, epoch, elector)
31 using std::ostringstream
;
36 using std::stringstream
;
39 using std::unique_ptr
;
41 using ceph::bufferlist
;
44 using ceph::Formatter
;
45 using ceph::JSONFormatter
;
46 using ceph::mono_clock
;
47 using ceph::mono_time
;
48 using ceph::timespan_str
;
// Log-prefix helper referenced by the dout_prefix macro.
// Streams "paxos.", the elector's rank, ").electionLogic(", the epoch and
// ") " into *_dout and returns the resulting stream reference.
49 static ostream
& _prefix(std::ostream
*_dout
, epoch_t epoch
, ElectionOwner
* elector
) {
50 return *_dout
<< "paxos." << elector
->get_my_rank()
51 << ").electionLogic(" << epoch
<< ") ";
// Loads the election epoch from the elector's persisted value and logs which
// of three situations applies: first boot, an odd last-seen epoch (reported
// as "mid-election, bumping"), or a plain last-seen epoch.
// NOTE(review): the condition guarding the first-boot message and the
// statement that changes the epoch are not visible in this extract.
53 void ElectionLogic::init()
55 epoch
= elector
->read_persisted_epoch();
57 ldout(cct
, 1) << "init, first boot, initializing epoch at 1 " << dendl
;
// An odd epoch is treated as "mid-election" (see the log text below).
59 } else if (epoch
% 2) {
60 ldout(cct
, 1) << "init, last seen epoch " << epoch
61 << ", mid-election, bumping" << dendl
;
// Write the epoch back through the elector.
63 elector
->persist_epoch(epoch
);
65 ldout(cct
, 1) << "init, last seen epoch " << epoch
<< dendl
;
// Advances election bookkeeping to epoch e.
// Asserts that e is not older than the current epoch, passes e to
// peer_tracker->increase_epoch(), persists `epoch` through the elector and
// then calls elector->notify_bump_epoch().
// NOTE(review): the log statement streams __func__ immediately followed by
// `epoch` with no separator between them — confirm the intended message.
// NOTE(review): the assignment to `epoch` and the statements under
// "clear up some state" are not visible in this extract.
69 void ElectionLogic::bump_epoch(epoch_t e
)
71 ldout(cct
, 10) << __func__
<< epoch
<< " to " << e
<< dendl
;
// The epoch may stay the same or grow, never shrink.
72 ceph_assert(epoch
<= e
);
74 peer_tracker
->increase_epoch(e
);
75 elector
->persist_epoch(epoch
);
76 // clear up some state
79 elector
->notify_bump_epoch();
// Precondition check for a standalone win: elector->paxos_size() must be 1
// and this monitor's rank must be 0.
// NOTE(review): this is the only place in the file that uses plain assert()
// rather than ceph_assert() — confirm that is intentional.
// NOTE(review): the remainder of the body is not visible in this extract.
82 void ElectionLogic::declare_standalone_victory()
84 assert(elector
->paxos_size() == 1 && elector
->get_my_rank() == 0);
// Discards state tied to an in-progress election. The visible steps rebuild
// the stable tracker via reset_stable_tracker() and release
// leader_peer_tracker.
// NOTE(review): earlier statements of the body are not visible in this extract.
89 void ElectionLogic::clear_live_election_state()
93 reset_stable_tracker();
94 leader_peer_tracker
.reset();
// Replaces stable_peer_tracker with a newly allocated ConnectionTracker
// copy-constructed from the current *peer_tracker.
97 void ElectionLogic::reset_stable_tracker()
99 stable_peer_tracker
.reset(new ConnectionTracker(*peer_tracker
));
// Reacts to a newer epoch (mepoch) observed while electing.
// Requires mepoch > epoch. Refreshes the stable tracker, then compares this
// monitor's connectivity score against the score of leader_acked; when our
// score is strictly higher, leader_peer_tracker is released.
// NOTE(review): the statement that actually bumps the epoch and any other
// work inside the `if` are not visible in this extract.
102 void ElectionLogic::connectivity_bump_epoch_in_election(epoch_t mepoch
)
104 ceph_assert(mepoch
> epoch
);
106 reset_stable_tracker();
107 double lscore
, my_score
;
108 my_score
= connectivity_election_score(elector
->get_my_rank());
109 lscore
= connectivity_election_score(leader_acked
);
// Only a strictly better score for ourselves triggers the reset below.
110 if (my_score
> lscore
) {
112 leader_peer_tracker
.reset();
// Begins an election in which this monitor proposes itself.
// When `participating` is false only a log message is emitted (per the
// message text, no election is started). Otherwise: an even epoch is bumped
// to the next odd value, our own rank is added to acked_me, live election
// state and the stable tracker are reset, and the proposal is sent with
// elector->propose_to_peers(epoch, bl). Under the CONNECTIVITY strategy the
// stable tracker is encoded into bl first.
// NOTE(review): the declaration of `bl`, the early return and the branch
// structure around validate_store() are not visible in this extract.
116 void ElectionLogic::start()
118 if (!participating
) {
119 ldout(cct
, 0) << "not starting new election -- not participating" << dendl
;
122 ldout(cct
, 5) << "start -- can i be leader?" << dendl
;
127 // start by trying to elect me
128 if (epoch
% 2 == 0) {
129 bump_epoch(epoch
+1); // odd == election cycle
131 elector
->validate_store();
// We always count our own vote.
133 acked_me
.insert(elector
->get_my_rank());
134 clear_live_election_state();
135 reset_stable_tracker();
// Connectivity elections ship our tracker snapshot with the proposal.
139 if (strategy
== CONNECTIVITY
) {
140 stable_peer_tracker
->encode(bl
);
142 elector
->propose_to_peers(epoch
, bl
);
// Defers to monitor rank `who` by calling elector->_defer_to(who).
// Under the CLASSIC strategy it first asserts that `who` has a lower rank
// than ours. The second logging/assert pair (presumably the non-CLASSIC
// branch — the `else` line is not visible here) asserts that `who` is not in
// the elector's disallowed-leaders set.
// NOTE(review): statements between the asserts and _defer_to() are not
// visible in this extract.
146 void ElectionLogic::defer(int who
)
148 if (strategy
== CLASSIC
) {
149 ldout(cct
, 5) << "defer to " << who
<< dendl
;
150 ceph_assert(who
< elector
->get_my_rank());
152 ldout(cct
, 5) << "defer to " << who
<< ", disallowed_leaders=" << elector
->get_disallowed_leaders() << dendl
;
153 ceph_assert(!elector
->get_disallowed_leaders().count(who
));
164 elector
->_defer_to(who
);
// Called when the election period is over (see the log message).
// The visible part of the condition tests whether more than half of
// paxos_size() monitors have acked us; the later lines handle the case where
// the monitor we deferred to did not declare victory in time, consulting
// elector->ever_participated() and calling elector->reset_election().
// NOTE(review): the start of the `if`, its body and the branch layout around
// ever_participated()/reset_election() are not visible in this extract.
167 void ElectionLogic::end_election_period()
169 ldout(cct
, 5) << "election period ended" << dendl
;
// Strict majority test: size must exceed half of the monitor count.
173 acked_me
.size() > (elector
->paxos_size() / 2)) {
177 // whoever i deferred to didn't declare victory quickly enough.
178 if (elector
->ever_participated())
181 elector
->reset_election();
// Declares this monitor the election winner.
// Records our rank as both last_election_winner and last_voted_for, clears
// live election state, moves the set of ackers into new_quorum, asserts that
// the epoch is odd (an election epoch), bumps it by one and finally
// announces the result through elector->message_victory(new_quorum).
// NOTE(review): the declaration of new_quorum is not visible in this extract.
186 void ElectionLogic::declare_victory()
188 ldout(cct
, 5) << "I win! acked_me=" << acked_me
<< dendl
;
189 last_election_winner
= elector
->get_my_rank();
190 last_voted_for
= last_election_winner
;
191 clear_live_election_state();
// swap() hands the acker set to new_quorum without copying.
194 new_quorum
.swap(acked_me
);
196 ceph_assert(epoch
% 2 == 1); // election
197 bump_epoch(epoch
+1); // is over!
199 elector
->message_victory(new_quorum
);
// Common epoch check performed before handling a propose from rank `from`.
// Compares the proposal epoch (mepoch) with ours. For an older proposal that
// arrives in an even (non-election) epoch from a monitor that is not a
// current member, a new election is triggered so that it can rejoin;
// other old proposals are logged as ignored.
// NOTE(review): the handling of mepoch > epoch and the returned bool values
// are not visible in this extract.
202 bool ElectionLogic::propose_classic_prefix(int from
, epoch_t mepoch
)
204 if (mepoch
> epoch
) {
206 } else if (mepoch
< epoch
) {
207 // got an "old" propose,
208 if (epoch
% 2 == 0 && // in a non-election cycle
209 !elector
->is_current_member(from
)) { // from someone outside the quorum
210 // a mon just started up, call a new election so they can rejoin!
211 ldout(cct
, 5) << " got propose from old epoch, "
212 << from
<< " must have just started" << dendl
;
213 // we may be active; make sure we reset things in the monitor appropriately.
214 elector
->trigger_new_election();
216 ldout(cct
, 5) << " ignoring old propose" << dendl
;
// Entry point for an incoming propose from rank `from` at epoch `mepoch`;
// `ct` is forwarded only to the connectivity handler.
// A propose that carries our own rank is logged as an error (the message
// says it is dropped). Otherwise the call is forwarded to one of the
// classic / disallow / connectivity handlers, and an invalid strategy value
// hits the final assertion.
// NOTE(review): the selection construct (presumably a switch on `strategy`)
// is not visible in this extract.
223 void ElectionLogic::receive_propose(int from
, epoch_t mepoch
,
224 const ConnectionTracker
*ct
)
226 if (from
== elector
->get_my_rank()) {
227 lderr(cct
) << "I got a propose from my own rank, hopefully this is startup weirdness,dropping" << dendl
;
232 propose_classic_handler(from
, mepoch
);
235 propose_disallow_handler(from
, mepoch
);
238 propose_connectivity_handler(from
, mepoch
, ct
);
241 ceph_assert(0 == "how did election strategy become an invalid value?");
// Propose handler that honours the elector's disallowed-leaders set.
// After the shared epoch check (propose_classic_prefix) it derives two flags:
//   my_win    - we are not disallowed and either out-rank `from` (lower rank
//               number) or `from` is disallowed;
//   their_win - `from` is not disallowed, beats us by rank or because we are
//               disallowed, and is at least as good as whoever we already
//               acked (leader_acked < 0 means nobody has been acked).
// NOTE(review): the branch conditions and the defer/start calls that use
// these flags are not visible in this extract; only the logging, one assert
// and trigger_new_election() are.
245 void ElectionLogic::propose_disallow_handler(int from
, epoch_t mepoch
)
247 if (propose_classic_prefix(from
, mepoch
)) {
250 const set
<int>& disallowed_leaders
= elector
->get_disallowed_leaders();
251 int my_rank
= elector
->get_my_rank();
252 bool me_disallowed
= disallowed_leaders
.count(my_rank
);
253 bool from_disallowed
= disallowed_leaders
.count(from
);
254 bool my_win
= !me_disallowed
&& // we are allowed to lead
255 (my_rank
< from
|| from_disallowed
); // we are a better choice than them
256 bool their_win
= !from_disallowed
&& // they are allowed to lead
257 (my_rank
> from
|| me_disallowed
) && // they are a better choice than us
258 (leader_acked
< 0 || leader_acked
>= from
); // they are a better choice than our previously-acked choice
262 // i would win over them.
263 if (leader_acked
>= 0) { // we already acked someone
264 ceph_assert(leader_acked
< from
|| from_disallowed
); // and they still win, of course
265 ldout(cct
, 5) << "no, we already acked " << leader_acked
<< dendl
;
267 // wait, i should win!
269 elector
->trigger_new_election();
273 // they would win over me
278 ldout(cct
, 5) << "no, we already acked " << leader_acked
<< dendl
;
// Rank-only propose handler (lower rank number wins).
// After the shared epoch check (propose_classic_prefix):
//  - if our rank is lower than `from`, we would win: when someone has already
//    been acked it must out-rank `from` (asserted) and the propose is
//    refused with a log message; otherwise a new election is triggered;
//  - otherwise `from` would win over us, and the visible condition accepts
//    it when nobody is acked yet, when `from` out-ranks the acked leader, or
//    when `from` is the acked leader.
// NOTE(review): the statements executed when that condition holds, and the
// `else` lines, are not visible in this extract.
283 void ElectionLogic::propose_classic_handler(int from
, epoch_t mepoch
)
285 if (propose_classic_prefix(from
, mepoch
)) {
288 if (elector
->get_my_rank() < from
) {
289 // i would win over them.
290 if (leader_acked
>= 0) { // we already acked someone
291 ceph_assert(leader_acked
< from
); // and they still win, of course
292 ldout(cct
, 5) << "no, we already acked " << leader_acked
<< dendl
;
294 // wait, i should win!
296 elector
->trigger_new_election();
300 // they would win over me
301 if (leader_acked
< 0 || // haven't acked anyone yet, or
302 leader_acked
> from
|| // they would win over who you did ack, or
303 leader_acked
== from
) { // this is the guy we're already deferring to
307 ldout(cct
, 5) << "no, we already acked " << leader_acked
<< dendl
;
// Computes the connectivity score used to compare candidates for `rank`.
// A rank listed in the elector's disallowed-leaders set is special-cased
// first. Otherwise the score and liveness are queried through
// get_total_connection_score(): stable_peer_tracker is used when it is set,
// and the peer_tracker call below is presumably the fallback branch.
// NOTE(review): the declarations of score/liveness, the value produced for a
// disallowed rank, the `else` line and the return statement are not visible
// in this extract.
312 double ElectionLogic::connectivity_election_score(int rank
)
314 if (elector
->get_disallowed_leaders().count(rank
)) {
319 if (stable_peer_tracker
) {
320 stable_peer_tracker
->get_total_connection_score(rank
, &score
, &liveness
);
322 peer_tracker
->get_total_connection_score(rank
, &score
, &liveness
);
// Propose handler for the connectivity strategy; `ct` is the proposer's
// ConnectionTracker, which is copied into leader_peer_tracker on the paths
// shown below.
//
// Visible stages:
//  1. Flap damping: in an even epoch, when we were not the last winner and
//     `from` is not a current member, every rank is scored; if the best
//     scorer is the rank we last voted for, or its lead over that rank is
//     below ignore_propose_margin, the propose is dropped.
//  2. Epoch check: a newer mepoch goes through
//     connectivity_bump_epoch_in_election(); an older one either triggers a
//     new election (even epoch, sender outside the quorum) or is ignored.
//  3. Scoring: my_win / their_win are derived from connectivity scores, with
//     rank used as a tie-breaker.
//  4. When `from` would win but another leader is already acked, that
//     leader's own view (leader_peer_tracker) must also prefer `from`;
//     otherwise the current peer_tracker view is consulted and the log
//     reports either a score disagreement or a refusal.
// NOTE(review): many statements (declaration of best_scorer, returns,
// defer()/start() calls, `else` lines) are not visible in this extract.
327 void ElectionLogic::propose_connectivity_handler(int from
, epoch_t mepoch
,
328 const ConnectionTracker
*ct
)
330 if ((epoch
% 2 == 0) &&
331 last_election_winner
!= elector
->get_my_rank() &&
332 !elector
->is_current_member(from
)) {
333 // To prevent election flapping, peons ignore proposals from out-of-quorum
334 // peers unless their vote would materially change from the last election
336 double best_score
= 0;
337 double last_voted_for_score
= 0;
// Score every rank, remembering the best one and the rank we last voted for.
338 for (unsigned i
= 0; i
< elector
->paxos_size(); ++i
) {
339 double score
= connectivity_election_score(i
);
340 if (score
> best_score
) {
344 if (last_voted_for
>= 0 && i
== static_cast<unsigned>(last_voted_for
)) {
345 last_voted_for_score
= score
;
348 if (best_scorer
== last_voted_for
||
349 (best_score
- last_voted_for_score
< ignore_propose_margin
)) {
350 // drop this message; it won't change our vote so we defer to leader
354 if (mepoch
> epoch
) {
355 connectivity_bump_epoch_in_election(mepoch
);
356 } else if (mepoch
< epoch
) {
357 // got an "old" propose,
358 if (epoch
% 2 == 0 && // in a non-election cycle
359 !elector
->is_current_member(from
)) { // from someone outside the quorum
360 // a mon just started up, call a new election so they can rejoin!
361 ldout(cct
, 5) << " got propose from old epoch, "
362 << from
<< " must have just started" << dendl
;
363 // we may be active; make sure we reset things in the monitor appropriately.
364 elector
->trigger_new_election();
366 ldout(cct
, 5) << " ignoring old propose" << dendl
;
371 int my_rank
= elector
->get_my_rank();
372 double my_score
= connectivity_election_score(my_rank
);
373 double from_score
= connectivity_election_score(from
);
// -1 stands for "no leader acked yet".
374 double leader_score
= -1;
375 if (leader_acked
>= 0) {
376 leader_score
= connectivity_election_score(leader_acked
);
379 ldout(cct
, 30) << "propose from rank=" << from
<< ", tracker: "
380 << (stable_peer_tracker
? *stable_peer_tracker
: *peer_tracker
) << dendl
;
382 ldout(cct
, 10) << "propose from rank=" << from
<< ",score=" << from_score
383 << "; my score=" << my_score
384 << "; currently acked " << leader_acked
385 << ",score=" << leader_score
<< dendl
;
387 bool my_win
= (my_score
>= 0) && // My score is non-negative; I am allowed to lead
388 ((my_rank
< from
&& my_score
>= from_score
) || // We have same scores and I have lower rank, or
389 (my_score
> from_score
)); // my score is higher
391 bool their_win
= (from_score
>= 0) && // Their score is non-negative; they're allowed to lead, AND
392 ((from
< my_rank
&& from_score
>= my_score
) || // Either they have lower rank and same score, or
393 (from_score
> my_score
)) && // their score is higher, AND
394 ((from
<= leader_acked
&& from_score
>= leader_score
) || // same conditions compared to leader, or IS leader
395 (from_score
> leader_score
));
398 // i would win over them.
399 if (leader_acked
>= 0) { // we already acked someone
400 ceph_assert(leader_score
>= from_score
); // and they still win, of course
401 ldout(cct
, 5) << "no, we already acked " << leader_acked
<< dendl
;
403 // wait, i should win!
405 elector
->trigger_new_election();
409 // they would win over me
410 if (their_win
|| from
== leader_acked
) {
411 if (leader_acked
>= 0 && from
!= leader_acked
) {
412 // we have to make sure our acked leader will ALSO defer to them, or else
413 // we can't, to maintain guarantees!
// Both scores below come from the acked leader's tracker, i.e. its view.
414 double leader_from_score
;
415 int leader_from_liveness
;
416 leader_peer_tracker
->
417 get_total_connection_score(from
, &leader_from_score
,
418 &leader_from_liveness
);
419 double leader_leader_score
;
420 int leader_leader_liveness
;
421 leader_peer_tracker
->
422 get_total_connection_score(leader_acked
, &leader_leader_score
,
423 &leader_leader_liveness
);
424 if ((from
< leader_acked
&& leader_from_score
>= leader_leader_score
) ||
425 (leader_from_score
> leader_leader_score
)) {
427 leader_peer_tracker
.reset(new ConnectionTracker(*ct
));
428 } else { // we can't defer to them *this* round even though they should win...
// Re-evaluate with the current peer_tracker scores rather than the leader's view.
429 double cur_leader_score
, cur_from_score
;
430 int cur_leader_live
, cur_from_live
;
431 peer_tracker
->get_total_connection_score(leader_acked
, &cur_leader_score
, &cur_leader_live
);
432 peer_tracker
->get_total_connection_score(from
, &cur_from_score
, &cur_from_live
);
433 if ((from
< leader_acked
&& cur_from_score
>= cur_leader_score
) ||
434 (cur_from_score
> cur_leader_score
)) {
435 ldout(cct
, 5) << "Bumping epoch and starting new election; acked "
436 << leader_acked
<< " should defer to " << from
437 << " but there is score disagreement!" << dendl
;
441 ldout(cct
, 5) << "no, we already acked " << leader_acked
442 << " and it won't defer to " << from
443 << " despite better round scores" << dendl
;
448 leader_peer_tracker
.reset(new ConnectionTracker(*ct
));
452 ldout(cct
, 5) << "no, we already acked " << leader_acked
<< " with score >=" << from_score
<< dendl
;
// Handles an ack from rank `from` sent at epoch `from_epoch`.
// The sender must be in an odd (election) epoch. A newer epoch than ours is
// logged and adopted via bump_epoch(). The ack is recorded in acked_me, and
// when every monitor (paxos_size()) has acked, the election is finished
// early. The last visible branch covers the case where we are already
// deferring to someone, asserting that leader_acked is set.
// NOTE(review): the restart after the bump, the condition selecting between
// recording the ack and the "deferring" branch, and the victory call are not
// visible in this extract.
457 void ElectionLogic::receive_ack(int from
, epoch_t from_epoch
)
459 ceph_assert(from_epoch
% 2 == 1); // sender in an election epoch
460 if (from_epoch
> epoch
) {
461 ldout(cct
, 5) << "woah, that's a newer epoch, i must have rebooted. bumping and re-starting!" << dendl
;
462 bump_epoch(from_epoch
);
466 // is that _everyone_?
468 acked_me
.insert(from
);
469 if (acked_me
.size() == elector
->paxos_size()) {
470 // if yes, shortcut to election finish
474 // ignore, i'm deferring already.
475 ceph_assert(leader_acked
>= 0);
// Decides whether a victory claim from rank `from` is plausible.
// Three rules are visible, ending in an assertion for an invalid strategy:
//  - rank only: `from` must have a lower rank than ours;
//  - rank with disallowed leaders: as above, or we are a disallowed leader;
//  - connectivity: the claimant's score must be at least our own.
// NOTE(review): the construct selecting a rule (presumably a switch on
// `strategy`) and the return statement are not visible in this extract.
479 bool ElectionLogic::victory_makes_sense(int from
)
481 bool makes_sense
= false;
484 makes_sense
= (from
< elector
->get_my_rank());
487 makes_sense
= (from
< elector
->get_my_rank()) ||
488 elector
->get_disallowed_leaders().count(elector
->get_my_rank());
491 double my_score
, leader_score
;
492 my_score
= connectivity_election_score(elector
->get_my_rank());
493 leader_score
= connectivity_election_score(from
);
494 ldout(cct
, 5) << "victory from " << from
<< " makes sense? lscore:"
496 << "; my score:" << my_score
<< dendl
;
// Equal scores still count as a sensible victory.
498 makes_sense
= (leader_score
>= my_score
);
501 ceph_assert(0 == "how did you get a nonsense election strategy assigned?");
// Processes a victory claim from rank `from` at epoch `from_epoch`.
// The claim is first checked with victory_makes_sense(); then `from` is
// recorded as last_election_winner, last_voted_for is set to leader_acked,
// and live election state is cleared. An implausible claim is only expected
// under the CONNECTIVITY strategy (asserted) and leads to a bump to
// from_epoch; per the log text an election restart follows. A claim whose
// epoch is not epoch + 1 is logged as unexpected and also causes a bump.
// The final visible statement bumps to from_epoch.
// NOTE(review): the restart calls and return statements are not visible in
// this extract, and the function may continue past the last line shown.
506 bool ElectionLogic::receive_victory_claim(int from
, epoch_t from_epoch
)
508 bool election_okay
= victory_makes_sense(from
);
510 last_election_winner
= from
;
511 last_voted_for
= leader_acked
;
512 clear_live_election_state();
514 if (!election_okay
) {
515 ceph_assert(strategy
== CONNECTIVITY
);
516 ldout(cct
, 1) << "I should have been elected over this leader; bumping and restarting!" << dendl
;
517 bump_epoch(from_epoch
);
522 // i should have seen this election if i'm getting the victory.
523 if (from_epoch
!= epoch
+ 1) {
524 ldout(cct
, 5) << "woah, that's a funny epoch, i must have rebooted. bumping and re-starting!" << dendl
;
525 bump_epoch(from_epoch
);
530 bump_epoch(from_epoch
);