]> git.proxmox.com Git - ceph.git/blob - ceph/src/mon/ElectionLogic.cc
757c86165f5a1d375834791f7dd51077d056e3c5
[ceph.git] / ceph / src / mon / ElectionLogic.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15 #include "ElectionLogic.h"
16
17 #include "include/ceph_assert.h"
18 #include "common/dout.h"
19
20 #define dout_subsys ceph_subsys_mon
21 #undef dout_prefix
22 #define dout_prefix _prefix(_dout, epoch, elector)
23 using std::cerr;
24 using std::cout;
25 using std::dec;
26 using std::hex;
27 using std::list;
28 using std::map;
29 using std::make_pair;
30 using std::ostream;
31 using std::ostringstream;
32 using std::pair;
33 using std::set;
34 using std::setfill;
35 using std::string;
36 using std::stringstream;
37 using std::to_string;
38 using std::vector;
39 using std::unique_ptr;
40
41 using ceph::bufferlist;
42 using ceph::decode;
43 using ceph::encode;
44 using ceph::Formatter;
45 using ceph::JSONFormatter;
46 using ceph::mono_clock;
47 using ceph::mono_time;
48 using ceph::timespan_str;
49 static ostream& _prefix(std::ostream *_dout, epoch_t epoch, ElectionOwner* elector) {
50 return *_dout << "paxos." << elector->get_my_rank()
51 << ").electionLogic(" << epoch << ") ";
52 }
53 void ElectionLogic::init()
54 {
55 epoch = elector->read_persisted_epoch();
56 if (!epoch) {
57 ldout(cct, 1) << "init, first boot, initializing epoch at 1 " << dendl;
58 epoch = 1;
59 } else if (epoch % 2) {
60 ldout(cct, 1) << "init, last seen epoch " << epoch
61 << ", mid-election, bumping" << dendl;
62 ++epoch;
63 elector->persist_epoch(epoch);
64 } else {
65 ldout(cct, 1) << "init, last seen epoch " << epoch << dendl;
66 }
67 }
68
69 void ElectionLogic::bump_epoch(epoch_t e)
70 {
71 ldout(cct, 10) << __func__ << epoch << " to " << e << dendl;
72 ceph_assert(epoch <= e);
73 epoch = e;
74 peer_tracker->increase_epoch(e);
75 elector->persist_epoch(epoch);
76 // clear up some state
77 electing_me = false;
78 acked_me.clear();
79 elector->notify_bump_epoch();
80 }
81
82 void ElectionLogic::declare_standalone_victory()
83 {
84 assert(elector->paxos_size() == 1 && elector->get_my_rank() == 0);
85 init();
86 bump_epoch(epoch+1);
87 }
88
89 void ElectionLogic::clear_live_election_state()
90 {
91 leader_acked = -1;
92 electing_me = false;
93 reset_stable_tracker();
94 leader_peer_tracker.reset();
95 }
96
97 void ElectionLogic::reset_stable_tracker()
98 {
99 stable_peer_tracker.reset(new ConnectionTracker(*peer_tracker));
100 }
101
102 void ElectionLogic::connectivity_bump_epoch_in_election(epoch_t mepoch)
103 {
104 ceph_assert(mepoch > epoch);
105 bump_epoch(mepoch);
106 reset_stable_tracker();
107 double lscore, my_score;
108 my_score = connectivity_election_score(elector->get_my_rank());
109 lscore = connectivity_election_score(leader_acked);
110 if (my_score > lscore) {
111 leader_acked = -1;
112 leader_peer_tracker.reset();
113 }
114 }
115
116 void ElectionLogic::start()
117 {
118 if (!participating) {
119 ldout(cct, 0) << "not starting new election -- not participating" << dendl;
120 return;
121 }
122 ldout(cct, 5) << "start -- can i be leader?" << dendl;
123
124 acked_me.clear();
125 init();
126
127 // start by trying to elect me
128 if (epoch % 2 == 0) {
129 bump_epoch(epoch+1); // odd == election cycle
130 } else {
131 elector->validate_store();
132 }
133 acked_me.insert(elector->get_my_rank());
134 clear_live_election_state();
135 reset_stable_tracker();
136 electing_me = true;
137
138 bufferlist bl;
139 if (strategy == CONNECTIVITY) {
140 stable_peer_tracker->encode(bl);
141 }
142 elector->propose_to_peers(epoch, bl);
143 elector->_start();
144 }
145
146 void ElectionLogic::defer(int who)
147 {
148 if (strategy == CLASSIC) {
149 ldout(cct, 5) << "defer to " << who << dendl;
150 ceph_assert(who < elector->get_my_rank());
151 } else {
152 ldout(cct, 5) << "defer to " << who << ", disallowed_leaders=" << elector->get_disallowed_leaders() << dendl;
153 ceph_assert(!elector->get_disallowed_leaders().count(who));
154 }
155
156 if (electing_me) {
157 // drop out
158 acked_me.clear();
159 electing_me = false;
160 }
161
162 // ack them
163 leader_acked = who;
164 elector->_defer_to(who);
165 }
166
167 void ElectionLogic::end_election_period()
168 {
169 ldout(cct, 5) << "election period ended" << dendl;
170
171 // did i win?
172 if (electing_me &&
173 acked_me.size() > (elector->paxos_size() / 2)) {
174 // i win
175 declare_victory();
176 } else {
177 // whoever i deferred to didn't declare victory quickly enough.
178 if (elector->ever_participated())
179 start();
180 else
181 elector->reset_election();
182 }
183 }
184
185
186 void ElectionLogic::declare_victory()
187 {
188 ldout(cct, 5) << "I win! acked_me=" << acked_me << dendl;
189 last_election_winner = elector->get_my_rank();
190 last_voted_for = last_election_winner;
191 clear_live_election_state();
192
193 set<int> new_quorum;
194 new_quorum.swap(acked_me);
195
196 ceph_assert(epoch % 2 == 1); // election
197 bump_epoch(epoch+1); // is over!
198
199 elector->message_victory(new_quorum);
200 }
201
202 bool ElectionLogic::propose_classic_prefix(int from, epoch_t mepoch)
203 {
204 if (mepoch > epoch) {
205 bump_epoch(mepoch);
206 } else if (mepoch < epoch) {
207 // got an "old" propose,
208 if (epoch % 2 == 0 && // in a non-election cycle
209 !elector->is_current_member(from)) { // from someone outside the quorum
210 // a mon just started up, call a new election so they can rejoin!
211 ldout(cct, 5) << " got propose from old epoch, "
212 << from << " must have just started" << dendl;
213 // we may be active; make sure we reset things in the monitor appropriately.
214 elector->trigger_new_election();
215 } else {
216 ldout(cct, 5) << " ignoring old propose" << dendl;
217 }
218 return true;
219 }
220 return false;
221 }
222
223 void ElectionLogic::receive_propose(int from, epoch_t mepoch,
224 const ConnectionTracker *ct)
225 {
226 if (from == elector->get_my_rank()) {
227 lderr(cct) << "I got a propose from my own rank, hopefully this is startup weirdness,dropping" << dendl;
228 return;
229 }
230 switch (strategy) {
231 case CLASSIC:
232 propose_classic_handler(from, mepoch);
233 break;
234 case DISALLOW:
235 propose_disallow_handler(from, mepoch);
236 break;
237 case CONNECTIVITY:
238 propose_connectivity_handler(from, mepoch, ct);
239 break;
240 default:
241 ceph_assert(0 == "how did election strategy become an invalid value?");
242 }
243 }
244
245 void ElectionLogic::propose_disallow_handler(int from, epoch_t mepoch)
246 {
247 if (propose_classic_prefix(from, mepoch)) {
248 return;
249 }
250 const set<int>& disallowed_leaders = elector->get_disallowed_leaders();
251 int my_rank = elector->get_my_rank();
252 bool me_disallowed = disallowed_leaders.count(my_rank);
253 bool from_disallowed = disallowed_leaders.count(from);
254 bool my_win = !me_disallowed && // we are allowed to lead
255 (my_rank < from || from_disallowed); // we are a better choice than them
256 bool their_win = !from_disallowed && // they are allowed to lead
257 (my_rank > from || me_disallowed) && // they are a better choice than us
258 (leader_acked < 0 || leader_acked >= from); // they are a better choice than our previously-acked choice
259
260
261 if (my_win) {
262 // i would win over them.
263 if (leader_acked >= 0) { // we already acked someone
264 ceph_assert(leader_acked < from || from_disallowed); // and they still win, of course
265 ldout(cct, 5) << "no, we already acked " << leader_acked << dendl;
266 } else {
267 // wait, i should win!
268 if (!electing_me) {
269 elector->trigger_new_election();
270 }
271 }
272 } else {
273 // they would win over me
274 if (their_win) {
275 defer(from);
276 } else {
277 // ignore them!
278 ldout(cct, 5) << "no, we already acked " << leader_acked << dendl;
279 }
280 }
281 }
282
283 void ElectionLogic::propose_classic_handler(int from, epoch_t mepoch)
284 {
285 if (propose_classic_prefix(from, mepoch)) {
286 return;
287 }
288 if (elector->get_my_rank() < from) {
289 // i would win over them.
290 if (leader_acked >= 0) { // we already acked someone
291 ceph_assert(leader_acked < from); // and they still win, of course
292 ldout(cct, 5) << "no, we already acked " << leader_acked << dendl;
293 } else {
294 // wait, i should win!
295 if (!electing_me) {
296 elector->trigger_new_election();
297 }
298 }
299 } else {
300 // they would win over me
301 if (leader_acked < 0 || // haven't acked anyone yet, or
302 leader_acked > from || // they would win over who you did ack, or
303 leader_acked == from) { // this is the guy we're already deferring to
304 defer(from);
305 } else {
306 // ignore them!
307 ldout(cct, 5) << "no, we already acked " << leader_acked << dendl;
308 }
309 }
310 }
311
312 double ElectionLogic::connectivity_election_score(int rank)
313 {
314 if (elector->get_disallowed_leaders().count(rank)) {
315 return -1;
316 }
317 double score;
318 int liveness;
319 if (stable_peer_tracker) {
320 stable_peer_tracker->get_total_connection_score(rank, &score, &liveness);
321 } else {
322 peer_tracker->get_total_connection_score(rank, &score, &liveness);
323 }
324 return score;
325 }
326
327 void ElectionLogic::propose_connectivity_handler(int from, epoch_t mepoch,
328 const ConnectionTracker *ct)
329 {
330 if ((epoch % 2 == 0) &&
331 last_election_winner != elector->get_my_rank() &&
332 !elector->is_current_member(from)) {
333 // To prevent election flapping, peons ignore proposals from out-of-quorum
334 // peers unless their vote would materially change from the last election
335 int best_scorer = 0;
336 double best_score = 0;
337 double last_voted_for_score = 0;
338 for (unsigned i = 0; i < elector->paxos_size(); ++i) {
339 double score = connectivity_election_score(i);
340 if (score > best_score) {
341 best_scorer = i;
342 best_score = score;
343 }
344 if (last_voted_for >= 0 && i == static_cast<unsigned>(last_voted_for)) {
345 last_voted_for_score = score;
346 }
347 }
348 if (best_scorer == last_voted_for ||
349 (best_score - last_voted_for_score < ignore_propose_margin)) {
350 // drop this message; it won't change our vote so we defer to leader
351 return;
352 }
353 }
354 if (mepoch > epoch) {
355 connectivity_bump_epoch_in_election(mepoch);
356 } else if (mepoch < epoch) {
357 // got an "old" propose,
358 if (epoch % 2 == 0 && // in a non-election cycle
359 !elector->is_current_member(from)) { // from someone outside the quorum
360 // a mon just started up, call a new election so they can rejoin!
361 ldout(cct, 5) << " got propose from old epoch, "
362 << from << " must have just started" << dendl;
363 // we may be active; make sure we reset things in the monitor appropriately.
364 elector->trigger_new_election();
365 } else {
366 ldout(cct, 5) << " ignoring old propose" << dendl;
367 }
368 return;
369 }
370
371 int my_rank = elector->get_my_rank();
372 double my_score = connectivity_election_score(my_rank);
373 double from_score = connectivity_election_score(from);
374 double leader_score = -1;
375 if (leader_acked >= 0) {
376 leader_score = connectivity_election_score(leader_acked);
377 }
378
379 ldout(cct, 30) << "propose from rank=" << from << ", tracker: "
380 << (stable_peer_tracker ? *stable_peer_tracker : *peer_tracker) << dendl;
381
382 ldout(cct, 10) << "propose from rank=" << from << ",score=" << from_score
383 << "; my score=" << my_score
384 << "; currently acked " << leader_acked
385 << ",score=" << leader_score << dendl;
386
387 bool my_win = (my_score >= 0) && // My score is non-zero; I am allowed to lead
388 ((my_rank < from && my_score >= from_score) || // We have same scores and I have lower rank, or
389 (my_score > from_score)); // my score is higher
390
391 bool their_win = (from_score >= 0) && // Their score is non-zero; they're allowed to lead, AND
392 ((from < my_rank && from_score >= my_score) || // Either they have lower rank and same score, or
393 (from_score > my_score)) && // their score is higher, AND
394 ((from <= leader_acked && from_score >= leader_score) || // same conditions compared to leader, or IS leader
395 (from_score > leader_score));
396
397 if (my_win) {
398 // i would win over them.
399 if (leader_acked >= 0) { // we already acked someone
400 ceph_assert(leader_score >= from_score); // and they still win, of course
401 ldout(cct, 5) << "no, we already acked " << leader_acked << dendl;
402 } else {
403 // wait, i should win!
404 if (!electing_me) {
405 elector->trigger_new_election();
406 }
407 }
408 } else {
409 // they would win over me
410 if (their_win || from == leader_acked) {
411 if (leader_acked >= 0 && from != leader_acked) {
412 // we have to make sure our acked leader will ALSO defer to them, or else
413 // we can't, to maintain guarantees!
414 double leader_from_score;
415 int leader_from_liveness;
416 leader_peer_tracker->
417 get_total_connection_score(from, &leader_from_score,
418 &leader_from_liveness);
419 double leader_leader_score;
420 int leader_leader_liveness;
421 leader_peer_tracker->
422 get_total_connection_score(leader_acked, &leader_leader_score,
423 &leader_leader_liveness);
424 if ((from < leader_acked && leader_from_score >= leader_leader_score) ||
425 (leader_from_score > leader_leader_score)) {
426 defer(from);
427 leader_peer_tracker.reset(new ConnectionTracker(*ct));
428 } else { // we can't defer to them *this* round even though they should win...
429 double cur_leader_score, cur_from_score;
430 int cur_leader_live, cur_from_live;
431 peer_tracker->get_total_connection_score(leader_acked, &cur_leader_score, &cur_leader_live);
432 peer_tracker->get_total_connection_score(from, &cur_from_score, &cur_from_live);
433 if ((from < leader_acked && cur_from_score >= cur_leader_score) ||
434 (cur_from_score > cur_leader_score)) {
435 ldout(cct, 5) << "Bumping epoch and starting new election; acked "
436 << leader_acked << " should defer to " << from
437 << " but there is score disagreement!" << dendl;
438 bump_epoch(epoch+1);
439 start();
440 } else {
441 ldout(cct, 5) << "no, we already acked " << leader_acked
442 << " and it won't defer to " << from
443 << " despite better round scores" << dendl;
444 }
445 }
446 } else {
447 defer(from);
448 leader_peer_tracker.reset(new ConnectionTracker(*ct));
449 }
450 } else {
451 // ignore them!
452 ldout(cct, 5) << "no, we already acked " << leader_acked << " with score >=" << from_score << dendl;
453 }
454 }
455 }
456
457 void ElectionLogic::receive_ack(int from, epoch_t from_epoch)
458 {
459 ceph_assert(from_epoch % 2 == 1); // sender in an election epoch
460 if (from_epoch > epoch) {
461 ldout(cct, 5) << "woah, that's a newer epoch, i must have rebooted. bumping and re-starting!" << dendl;
462 bump_epoch(from_epoch);
463 start();
464 return;
465 }
466 // is that _everyone_?
467 if (electing_me) {
468 acked_me.insert(from);
469 if (acked_me.size() == elector->paxos_size()) {
470 // if yes, shortcut to election finish
471 declare_victory();
472 }
473 } else {
474 // ignore, i'm deferring already.
475 ceph_assert(leader_acked >= 0);
476 }
477 }
478
479 bool ElectionLogic::victory_makes_sense(int from)
480 {
481 bool makes_sense = false;
482 switch (strategy) {
483 case CLASSIC:
484 makes_sense = (from < elector->get_my_rank());
485 break;
486 case DISALLOW:
487 makes_sense = (from < elector->get_my_rank()) ||
488 elector->get_disallowed_leaders().count(elector->get_my_rank());
489 break;
490 case CONNECTIVITY:
491 double my_score, leader_score;
492 my_score = connectivity_election_score(elector->get_my_rank());
493 leader_score = connectivity_election_score(from);
494 ldout(cct, 5) << "victory from " << from << " makes sense? lscore:"
495 << leader_score
496 << "; my score:" << my_score << dendl;
497
498 makes_sense = (leader_score >= my_score);
499 break;
500 default:
501 ceph_assert(0 == "how did you get a nonsense election strategy assigned?");
502 }
503 return makes_sense;
504 }
505
506 bool ElectionLogic::receive_victory_claim(int from, epoch_t from_epoch)
507 {
508 bool election_okay = victory_makes_sense(from);
509
510 last_election_winner = from;
511 last_voted_for = leader_acked;
512 clear_live_election_state();
513
514 if (!election_okay) {
515 ceph_assert(strategy == CONNECTIVITY);
516 ldout(cct, 1) << "I should have been elected over this leader; bumping and restarting!" << dendl;
517 bump_epoch(from_epoch);
518 start();
519 return false;
520 }
521
522 // i should have seen this election if i'm getting the victory.
523 if (from_epoch != epoch + 1) {
524 ldout(cct, 5) << "woah, that's a funny epoch, i must have rebooted. bumping and re-starting!" << dendl;
525 bump_epoch(from_epoch);
526 start();
527 return false;
528 }
529
530 bump_epoch(from_epoch);
531
532 // they win
533 return true;
534 }