// ceph.git: ceph/src/test/mon/test_election.cc
// (blob ca5415ff6a930a84aead5d1a7af8cca9b3f8037e, as served by git.proxmox.com)
1 // -*- mode:C; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3
4 #include "gtest/gtest.h"
5 #include "mon/ElectionLogic.h"
6 #include "mon/ConnectionTracker.h"
7 #include "common/dout.h"
8
9 #include "global/global_context.h"
10 #include "global/global_init.h"
11 #include "common/common_init.h"
12 #include "common/ceph_argparse.h"
13
14 using namespace std;
15
// Subsystem used by the dout/ldout logging macros in this file.
#define dout_subsys ceph_subsys_test
#undef dout_prefix
// Every log line starts with _prefix(...). prefix_name() and timestep_count()
// are looked up at the point where the logging macro is used, so code inside
// Election/Owner member functions picks up the member versions and file-scope
// code picks up the free functions defined below.
#define dout_prefix _prefix(_dout, prefix_name(), timestep_count())
// Writes "<prefix><timesteps> " to the stream behind _dout and hands the
// stream back so the caller can keep appending to the same log line.
static std::ostream& _prefix(std::ostream *_dout, const char *prefix, int timesteps) {
  std::ostream& out = *_dout;
  out << prefix;
  out << timesteps;
  out << " ";
  return out;
}
22
// Log prefix for code that is not inside an Election or Owner member function.
const char* prefix_name() {
  static const char name[] = "test_election: ";
  return name;
}
// File-scope code is not tied to any simulated clock; report -1.
int timestep_count() {
  constexpr int no_timestep = -1;
  return no_timestep;
}
25
26 int main(int argc, char **argv) {
27 vector<const char*> args(argv, argv+argc);
28 bool user_set_debug = false;
29 for (auto& arg : args) {
30 if (strncmp("--debug_mon", arg, 11) == 0) user_set_debug = true;
31 }
32 auto cct = global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT,
33 CODE_ENVIRONMENT_UTILITY,
34 CINIT_FLAG_NO_DEFAULT_CONFIG_FILE);
35 common_init_finish(g_ceph_context);
36 if (!user_set_debug) g_ceph_context->_conf.set_val("debug mon", "0/20");
37
38 ::testing::InitGoogleTest(&argc, argv);
39 return RUN_ALL_TESTS();
40 }
41
42
43 class Owner;
44 struct Election {
45 map<int, Owner*> electors;
46 map<int, set<int> > blocked_messages;
47 int count;
48 ElectionLogic::election_strategy election_strategy;
49 int ping_interval;
50 set<int> disallowed_leaders;
51
52 vector< function<void()> > messages;
53 int pending_election_messages;
54 int timesteps_run = 0;
55 int last_quorum_change = 0;
56 int last_quorum_formed = -1;
57 set<int> last_quorum_reported;
58 int last_leader = -1;
59
60 Election(int c, ElectionLogic::election_strategy es, int pingi=1, double tracker_halflife=5);
61 ~Election();
62 // ElectionOwner interfaces
63 int get_paxos_size() { return count; }
64 const set<int>& get_disallowed_leaders() const { return disallowed_leaders; }
65 void propose_to(int from, int to, epoch_t e, bufferlist& cbl);
66 void defer_to(int from, int to, epoch_t e);
67 void claim_victory(int from, int to, epoch_t e, const set<int>& members);
68 void accept_victory(int from, int to, epoch_t e);
69 void report_quorum(const set<int>& quorum);
70 void queue_stable_message(int from, int to, function<void()> m);
71 void queue_timeout_message(int from, int to, function<void()> m);
72 void queue_stable_or_timeout(int from, int to,
73 function<void()> m, function<void()> t);
74 void queue_election_message(int from, int to, function<void()> m);
75
76 // test runner interfaces
77 int run_timesteps(int max);
78 void start_one(int who);
79 void start_all();
80 bool election_stable() const;
81 bool quorum_stable(int timesteps_stable) const;
82 bool all_agree_on_leader() const;
83 bool check_epoch_agreement() const;
84 void block_messages(int from, int to);
85 void block_bidirectional_messages(int a, int b);
86 void unblock_messages(int from, int to);
87 void unblock_bidirectional_messages(int a, int b);
88 void add_disallowed_leader(int disallowed) { disallowed_leaders.insert(disallowed); }
89 void remove_elector(int rank);
90 const char* prefix_name() const { return "Election: "; }
91 int timestep_count() const { return timesteps_run; }
92 };
93 struct Owner : public ElectionOwner, RankProvider {
94 Election *parent;
95 int rank;
96 epoch_t persisted_epoch;
97 bool ever_joined;
98 ConnectionTracker peer_tracker;
99 ElectionLogic logic;
100 set<int> quorum;
101 int victory_accepters;
102 int timer_steps; // timesteps until we trigger timeout
103 bool timer_election; // the timeout is for normal election, or victory
104 bool rank_deleted = false;
105 string prefix_str;
106 Owner(int r, ElectionLogic::election_strategy es, double tracker_halflife,
107 Election *p) : parent(p), rank(r), persisted_epoch(0),
108 ever_joined(false),
109 peer_tracker(this, rank, tracker_halflife, 5),
110 logic(this, es, &peer_tracker, 0.0005, g_ceph_context),
111 victory_accepters(0),
112 timer_steps(-1), timer_election(true) {
113 std::stringstream str;
114 str << "Owner" << rank << " ";
115 prefix_str = str.str();
116 }
117
118 // in-memory store: just save to variable
119 void persist_epoch(epoch_t e) { persisted_epoch = e; }
120 // in-memory store: just return variable
121 epoch_t read_persisted_epoch() const { return persisted_epoch; }
122 // in-memory store: don't need to validate
123 void validate_store() { return; }
124 // don't need to do anything with our state right now
125 void notify_bump_epoch() {}
126 void notify_rank_removed(int removed_rank) {
127 peer_tracker.notify_rank_removed(removed_rank);
128 if (rank > removed_rank)
129 --rank;
130 }
131 void notify_deleted() { rank_deleted = true; rank = -1; cancel_timer(); }
132 // pass back to ElectionLogic; we don't need this redirect ourselves
133 void trigger_new_election() { ceph_assert (!rank_deleted); logic.start(); }
134 int get_my_rank() const { return rank; }
135 // we don't need to persist scores as we don't reset and lose memory state
136 void persist_connectivity_scores() {}
137 void propose_to_peers(epoch_t e, bufferlist& bl) {
138 ceph_assert (!rank_deleted);
139 for (int i = 0; i < parent->get_paxos_size(); ++i) {
140 if (i == rank) continue;
141 parent->propose_to(rank, i, e, bl);
142 }
143 }
144 void reset_election() {
145 ceph_assert (!rank_deleted);
146 _start();
147 logic.start();
148 }
149 bool ever_participated() const { return ever_joined; }
150 unsigned paxos_size() const { return parent->get_paxos_size(); }
151 const set<int>& get_disallowed_leaders() const {
152 return parent->get_disallowed_leaders();
153 }
154 void cancel_timer() {
155 timer_steps = -1;
156 }
157 void reset_timer(int steps) {
158 cancel_timer();
159 timer_steps = 3 + steps; // FIXME? magic number, current step + roundtrip
160 timer_election = true;
161 }
162 void start_victory_timer() {
163 cancel_timer();
164 timer_election = false;
165 timer_steps = 3; // FIXME? current step + roundtrip
166 }
167 void _start() {
168 reset_timer(0);
169 quorum.clear();
170 }
171 void _defer_to(int who) {
172 ceph_assert (!rank_deleted);
173 parent->defer_to(rank, who, logic.get_epoch());
174 reset_timer(0); // wtf does changing this 0->1 cause breakage?
175 }
176 void message_victory(const std::set<int>& members) {
177 ceph_assert (!rank_deleted);
178 for (auto i : members) {
179 if (i == rank) continue;
180 parent->claim_victory(rank, i, logic.get_epoch(), members);
181 }
182 start_victory_timer();
183 quorum = members;
184 victory_accepters = 1;
185 }
186 bool is_current_member(int r) const { return quorum.count(r) != 0; }
187 void receive_propose(int from, epoch_t e, ConnectionTracker *oct) {
188 if (rank_deleted) return;
189 logic.receive_propose(from, e, oct);
190 delete oct;
191 }
192 void receive_ack(int from, epoch_t e) {
193 if (rank_deleted) return;
194 if (e < logic.get_epoch())
195 return;
196 logic.receive_ack(from, e);
197 }
198 void receive_victory_claim(int from, epoch_t e, const set<int>& members) {
199 if (rank_deleted) return;
200 if (e < logic.get_epoch())
201 return;
202 if (logic.receive_victory_claim(from, e)) {
203 quorum = members;
204 cancel_timer();
205 parent->accept_victory(rank, from, e);
206 }
207 }
208 void receive_victory_ack(int from, epoch_t e) {
209 if (rank_deleted) return;
210 if (e < logic.get_epoch())
211 return;
212 ++victory_accepters;
213 if (victory_accepters == static_cast<int>(quorum.size())) {
214 cancel_timer();
215 parent->report_quorum(quorum);
216 }
217 }
218 void receive_scores(bufferlist bl) {
219 ConnectionTracker oct(bl);
220 peer_tracker.receive_peer_report(oct);
221 ldout(g_ceph_context, 10) << "received scores " << oct << dendl;
222 }
223 void receive_ping(int from_rank, bufferlist bl) {
224 ldout(g_ceph_context, 6) << "receive ping from " << from_rank << dendl;
225 peer_tracker.report_live_connection(from_rank, parent->ping_interval);
226 receive_scores(bl);
227 }
228 void receive_ping_timeout(int from_rank) {
229 ldout(g_ceph_context, 6) << "timeout ping from " << from_rank << dendl;
230 peer_tracker.report_dead_connection(from_rank, parent->ping_interval);
231 }
232 void election_timeout() {
233 ldout(g_ceph_context, 2) << "election epoch " << logic.get_epoch()
234 << " timed out for " << rank
235 << ", electing me:" << logic.electing_me
236 << ", acked_me:" << logic.acked_me << dendl;
237 ceph_assert (!rank_deleted);
238 logic.end_election_period();
239 }
240 void victory_timeout() {
241 ldout(g_ceph_context, 2) << "victory epoch " << logic.get_epoch()
242 << " timed out for " << rank
243 << ", electing me:" << logic.electing_me
244 << ", acked_me:" << logic.acked_me << dendl;
245 ceph_assert (!rank_deleted);
246 reset_election();
247 }
248 void encode_scores(bufferlist& bl) {
249 encode(peer_tracker, bl);
250 }
251 void send_pings() {
252 ceph_assert (!rank_deleted);
253 if (!parent->ping_interval ||
254 parent->timesteps_run % parent->ping_interval != 0) {
255 return;
256 }
257
258 bufferlist bl;
259 encode_scores(bl);
260 for (int i = 0; i < parent->get_paxos_size(); ++i) {
261 if (i == rank)
262 continue;
263 Owner *o = parent->electors[i];
264 parent->queue_stable_or_timeout(rank, i,
265 [o, r=rank, bl] { o->receive_ping(r, bl); },
266 [o, r=rank] { o->receive_ping_timeout(r); }
267 );
268 }
269 }
270 void notify_timestep() {
271 ceph_assert (!rank_deleted);
272 assert(timer_steps != 0);
273 if (timer_steps > 0) {
274 --timer_steps;
275 }
276 if (timer_steps == 0) {
277 if (timer_election) {
278 election_timeout();
279 } else {
280 victory_timeout();
281 }
282 }
283 send_pings();
284 }
285 const char *prefix_name() const {
286 return prefix_str.c_str();
287 }
288 int timestep_count() const { return parent->timesteps_run; }
289 };
290
291 Election::Election(int c, ElectionLogic::election_strategy es, int pingi,
292 double tracker_halflife) : count(c), election_strategy(es), ping_interval(pingi),
293 pending_election_messages(0), timesteps_run(0), last_quorum_change(0), last_quorum_formed(-1)
294 {
295 for (int i = 0; i < count; ++i) {
296 electors[i] = new Owner(i, election_strategy, tracker_halflife, this);
297 }
298 }
299
300 Election::~Election()
301 {
302 {
303 for (auto i : electors) {
304 delete i.second;
305 }
306 }
307 }
308
309 void Election::queue_stable_message(int from, int to, function<void()> m)
310 {
311 if (!blocked_messages[from].count(to)) {
312 messages.push_back(m);
313 }
314 }
315
316 void Election::queue_election_message(int from, int to, function<void()> m)
317 {
318 if (last_quorum_reported.count(from)) {
319 last_quorum_change = timesteps_run;
320 last_quorum_reported.clear();
321 last_leader = -1;
322 }
323 if (!blocked_messages[from].count(to)) {
324 bufferlist bl;
325 electors[from]->encode_scores(bl);
326 Owner *o = electors[to];
327 messages.push_back([this,m,o,bl] {
328 --this->pending_election_messages;
329 o->receive_scores(bl);
330 m();
331 });
332 ++pending_election_messages;
333 }
334 }
335
336 void Election::queue_timeout_message(int from, int to, function<void()> m)
337 {
338 ceph_assert(blocked_messages[from].count(to));
339 messages.push_back(m);
340 }
341
342 void Election::queue_stable_or_timeout(int from, int to,
343 function<void()> m, function<void()> t)
344 {
345 if (blocked_messages[from].count(to)) {
346 queue_timeout_message(from, to, t);
347 } else {
348 queue_stable_message(from, to, m);
349 }
350 }
351
352 void Election::defer_to(int from, int to, epoch_t e)
353 {
354 Owner *o = electors[to];
355 queue_election_message(from, to, [o, from, e] {
356 o->receive_ack(from, e);
357 });
358 }
359
360 void Election::propose_to(int from, int to, epoch_t e, bufferlist& cbl)
361 {
362 Owner *o = electors[to];
363 ConnectionTracker *oct = NULL;
364 if (cbl.length()) {
365 oct = new ConnectionTracker(cbl); // we leak these on blocked cons, meh
366 }
367 queue_election_message(from, to, [o, from, e, oct] {
368 o->receive_propose(from, e, oct);
369 });
370 }
371
372 void Election::claim_victory(int from, int to, epoch_t e, const set<int>& members)
373 {
374 Owner *o = electors[to];
375 queue_election_message(from, to, [o, from, e, members] {
376 o->receive_victory_claim(from, e, members);
377 });
378 }
379
380 void Election::accept_victory(int from, int to, epoch_t e)
381 {
382 Owner *o = electors[to];
383 queue_election_message(from, to, [o, from, e] {
384 o->receive_victory_ack(from, e);
385 });
386 }
387
388 void Election::report_quorum(const set<int>& quorum)
389 {
390 for (int i : quorum) {
391 electors[i]->ever_joined = true;
392 }
393 last_quorum_formed = last_quorum_change = timesteps_run;
394 last_quorum_reported = quorum;
395 last_leader = electors[*(quorum.begin())]->logic.get_election_winner();
396 }
397
398 int Election::run_timesteps(int max)
399 {
400 vector< function<void()> > current_m;
401 int steps = 0;
402 for (; (!max || steps < max) && // we have timesteps left AND ONE OF
403 (pending_election_messages || // there are messages pending.
404 !election_stable()); // somebody's not happy and will act in future
405 ++steps) {
406 current_m.clear();
407 current_m.swap(messages);
408 ++timesteps_run;
409 for (auto& m : current_m) {
410 m();
411 }
412 for (auto o : electors) {
413 o.second->notify_timestep();
414 }
415 }
416
417 return steps;
418 }
419
420 void Election::start_one(int who)
421 {
422 assert(who < static_cast<int>(electors.size()));
423 electors[who]->logic.start();
424 }
425
426 void Election::start_all() {
427 for (auto e : electors) {
428 e.second->logic.start();
429 }
430 }
431
432 bool Election::election_stable() const
433 {
434 // see if anybody has a timer running
435 for (auto i : electors) {
436 if (i.second->timer_steps != -1) {
437 ldout(g_ceph_context, 30) << "rank " << i.first << " has timer value " << i.second->timer_steps << dendl;
438 return false;
439 }
440 }
441 return (pending_election_messages == 0);
442 }
443
444 bool Election::quorum_stable(int timesteps_stable) const
445 {
446 ldout(g_ceph_context, 1) << "quorum_stable? last formed:" << last_quorum_formed
447 << ", last changed " << last_quorum_change
448 << ", last reported members " << last_quorum_reported << dendl;
449 if (last_quorum_reported.empty()) {
450 return false;
451 }
452 if (last_quorum_formed < last_quorum_change) {
453 return false;
454 }
455 for (auto i : last_quorum_reported) {
456 if (electors.find(i)->second->timer_steps != -1) {
457 return false;
458 }
459 }
460 if (timesteps_run - timesteps_stable > last_quorum_change)
461 return true;
462 return election_stable();
463 }
464
465 bool Election::all_agree_on_leader() const
466 {
467 int leader = electors.find(0)->second->logic.get_election_winner();
468 ldout(g_ceph_context, 10) << "all_agree_on_leader on " << leader << dendl;
469 for (auto& i: electors) {
470 if (leader != i.second->logic.get_election_winner()) {
471 ldout(g_ceph_context, 10) << "rank " << i.first << " has different leader "
472 << i.second->logic.get_election_winner() << dendl;
473 return false;
474 }
475 }
476 if (disallowed_leaders.count(leader)) {
477 ldout(g_ceph_context, 10) << "that leader is disallowed! member of "
478 << disallowed_leaders << dendl;
479 return false;
480 }
481 return true;
482 }
483
484 bool Election::check_epoch_agreement() const
485 {
486 epoch_t epoch = electors.find(0)->second->logic.get_epoch();
487 for (auto& i : electors) {
488 if (epoch != i.second->logic.get_epoch()) {
489 return false;
490 }
491 }
492 return true;
493 }
494
495 void Election::block_messages(int from, int to)
496 {
497 blocked_messages[from].insert(to);
498 }
499 void Election::block_bidirectional_messages(int a, int b)
500 {
501 block_messages(a, b);
502 block_messages(b, a);
503 }
504 void Election::unblock_messages(int from, int to)
505 {
506 blocked_messages[from].erase(to);
507 }
508 void Election::unblock_bidirectional_messages(int a, int b)
509 {
510 unblock_messages(a, b);
511 unblock_messages(b, a);
512 }
513
514 void Election::remove_elector(int rank)
515 {
516 for (auto ei = electors.begin(); ei != electors.end(); ) {
517 if (ei->first == rank) {
518 ei->second->notify_deleted();
519 electors.erase(ei++);
520 continue;
521 }
522 ei->second->notify_rank_removed(rank);
523 if (ei->first > rank) {
524 electors[ei->first - 1] = ei->second;
525 electors.erase(ei++);
526 continue;
527 }
528 ++ei;
529 }
530 for (auto bi = blocked_messages.begin(); bi != blocked_messages.end(); ) {
531 if (bi->first == rank) {
532 blocked_messages.erase(bi++);
533 continue;
534 }
535 bi->second.erase(rank);
536 for (auto i = bi->second.upper_bound(rank);
537 i != bi->second.end();) {
538 bi->second.insert(*i - 1);
539 bi->second.erase(*(i++));
540 }
541 ++bi;
542 }
543 --count;
544 }
545
546 void single_startup_election_completes(ElectionLogic::election_strategy strategy)
547 {
548 for (int starter = 0; starter < 5; ++starter) {
549 Election election(5, strategy);
550 election.start_one(starter);
551 // This test is not actually legit since you should start
552 // all the ElectionLogics, but it seems to work
553 int steps = election.run_timesteps(0);
554 ldout(g_ceph_context, 1) << "ran in " << steps << " timesteps" << dendl;
555 ASSERT_TRUE(election.election_stable());
556 ASSERT_TRUE(election.quorum_stable(6)); // double the timer_steps we use
557 ASSERT_TRUE(election.all_agree_on_leader());
558 ASSERT_TRUE(election.check_epoch_agreement());
559 }
560 }
561
562 void everybody_starts_completes(ElectionLogic::election_strategy strategy)
563 {
564 Election election(5, strategy);
565 election.start_all();
566 int steps = election.run_timesteps(0);
567 ldout(g_ceph_context, 1) << "ran in " << steps << " timesteps" << dendl;
568 ASSERT_TRUE(election.election_stable());
569 ASSERT_TRUE(election.quorum_stable(6)); // double the timer_steps we use
570 ASSERT_TRUE(election.all_agree_on_leader());
571 ASSERT_TRUE(election.check_epoch_agreement());
572 }
573
574 void blocked_connection_continues_election(ElectionLogic::election_strategy strategy)
575 {
576 Election election(5, strategy);
577 election.block_bidirectional_messages(0, 1);
578 election.start_all();
579 int steps = election.run_timesteps(100);
580 ldout(g_ceph_context, 1) << "ran in " << steps << " timesteps" << dendl;
581 // This is a failure mode!
582 ASSERT_FALSE(election.election_stable());
583 ASSERT_FALSE(election.quorum_stable(6)); // double the timer_steps we use
584 election.unblock_bidirectional_messages(0, 1);
585 steps = election.run_timesteps(100);
586 ldout(g_ceph_context, 1) << "ran in " << steps << " timesteps" << dendl;
587 ASSERT_TRUE(election.election_stable());
588 ASSERT_TRUE(election.quorum_stable(6)); // double the timer_steps we use
589 ASSERT_TRUE(election.all_agree_on_leader());
590 ASSERT_TRUE(election.check_epoch_agreement());
591 }
592
593 void blocked_connection_converges_election(ElectionLogic::election_strategy strategy)
594 {
595 Election election(5, strategy);
596 election.block_bidirectional_messages(0, 1);
597 election.start_all();
598 int steps = election.run_timesteps(100);
599 ldout(g_ceph_context, 1) << "ran in " << steps << " timesteps" << dendl;
600 ASSERT_TRUE(election.election_stable());
601 ASSERT_TRUE(election.all_agree_on_leader());
602 ASSERT_TRUE(election.check_epoch_agreement());
603 election.unblock_bidirectional_messages(0, 1);
604 steps = election.run_timesteps(100);
605 ldout(g_ceph_context, 1) << "ran in " << steps << " timesteps" << dendl;
606 ASSERT_TRUE(election.election_stable());
607 ASSERT_TRUE(election.all_agree_on_leader());
608 ASSERT_TRUE(election.check_epoch_agreement());
609 }
610
611 void disallowed_doesnt_win(ElectionLogic::election_strategy strategy)
612 {
613 int MON_COUNT = 5;
614 for (int i = 0; i < MON_COUNT - 1; ++i) {
615 Election election(MON_COUNT, strategy);
616 for (int j = 0; j <= i; ++j) {
617 election.add_disallowed_leader(j);
618 }
619 election.start_all();
620 int steps = election.run_timesteps(0);
621 ldout(g_ceph_context, 1) << "ran in " << steps << " timesteps" << dendl;
622 ASSERT_TRUE(election.election_stable());
623 ASSERT_TRUE(election.quorum_stable(6)); // double the timer_steps we use
624 ASSERT_TRUE(election.all_agree_on_leader());
625 ASSERT_TRUE(election.check_epoch_agreement());
626 int leader = election.electors[0]->logic.get_election_winner();
627 for (int j = 0; j <= i; ++j) {
628 ASSERT_NE(j, leader);
629 }
630 }
631 for (int i = MON_COUNT - 1; i > 0; --i) {
632 Election election(MON_COUNT, strategy);
633 for (int j = i; j <= MON_COUNT - 1; ++j) {
634 election.add_disallowed_leader(j);
635 }
636 election.start_all();
637 int steps = election.run_timesteps(0);
638 ldout(g_ceph_context, 1) << "ran in " << steps << " timesteps" << dendl;
639 ASSERT_TRUE(election.election_stable());
640 ASSERT_TRUE(election.quorum_stable(6)); // double the timer_steps we use
641 ASSERT_TRUE(election.all_agree_on_leader());
642 ASSERT_TRUE(election.check_epoch_agreement());
643 int leader = election.electors[0]->logic.get_election_winner();
644 for (int j = i; j < MON_COUNT; ++j) {
645 ASSERT_NE(j, leader);
646 }
647 }
648 }
649
650 void converges_after_flapping(ElectionLogic::election_strategy strategy)
651 {
652 Election election(5, strategy);
653 auto block_cons = [&] {
654 auto& e = election;
655 // leave 4 connected to both sides so it will trigger but not trivially win
656 e.block_bidirectional_messages(0, 2);
657 e.block_bidirectional_messages(0, 3);
658 e.block_bidirectional_messages(1, 2);
659 e.block_bidirectional_messages(1, 3);
660 };
661 auto unblock_cons = [&] {
662 auto& e = election;
663 e.unblock_bidirectional_messages(0, 2);
664 e.unblock_bidirectional_messages(0, 3);
665 e.unblock_bidirectional_messages(1, 2);
666 e.unblock_bidirectional_messages(1, 3);
667 };
668 block_cons();
669 election.start_all();
670 for (int i = 0; i < 5; ++i) {
671 election.run_timesteps(5);
672 unblock_cons();
673 election.run_timesteps(5);
674 block_cons();
675 }
676 unblock_cons();
677 election.run_timesteps(100);
678 ASSERT_TRUE(election.election_stable());
679 ASSERT_TRUE(election.quorum_stable(6)); // double the timer_steps we use
680 ASSERT_TRUE(election.all_agree_on_leader());
681 ASSERT_TRUE(election.check_epoch_agreement());
682 }
683
684 void converges_while_flapping(ElectionLogic::election_strategy strategy)
685 {
686 Election election(5, strategy);
687 auto block_cons = [&] {
688 auto& e = election;
689 // leave 4 connected to both sides so it will trigger but not trivially win
690 e.block_bidirectional_messages(0, 2);
691 e.block_bidirectional_messages(0, 3);
692 e.block_bidirectional_messages(1, 2);
693 e.block_bidirectional_messages(1, 3);
694 };
695 auto unblock_cons = [&] {
696 auto& e = election;
697 e.unblock_bidirectional_messages(0, 2);
698 e.unblock_bidirectional_messages(0, 3);
699 e.unblock_bidirectional_messages(1, 2);
700 e.unblock_bidirectional_messages(1, 3);
701 };
702 block_cons();
703 election.start_all();
704 for (int i = 0; i < 5; ++i) {
705 election.run_timesteps(10);
706 ASSERT_TRUE(election.quorum_stable(6));
707 unblock_cons();
708 election.run_timesteps(5);
709 block_cons();
710 ASSERT_TRUE(election.election_stable());
711 ASSERT_TRUE(election.all_agree_on_leader());
712 ASSERT_TRUE(election.check_epoch_agreement());
713 }
714 unblock_cons();
715 election.run_timesteps(100);
716 ASSERT_TRUE(election.election_stable());
717 ASSERT_TRUE(election.quorum_stable(6));
718 ASSERT_TRUE(election.all_agree_on_leader());
719 ASSERT_TRUE(election.check_epoch_agreement());
720 }
721
722 void netsplit_with_disallowed_tiebreaker_converges(ElectionLogic::election_strategy strategy)
723 {
724 Election election(5, strategy);
725 election.add_disallowed_leader(4);
726 auto netsplit = [&] {
727 auto& e = election;
728 e.block_bidirectional_messages(0, 2);
729 e.block_bidirectional_messages(0, 3);
730 e.block_bidirectional_messages(1, 2);
731 e.block_bidirectional_messages(1, 3);
732 };
733 auto unsplit = [&] {
734 auto& e = election;
735 e.unblock_bidirectional_messages(0, 2);
736 e.unblock_bidirectional_messages(0, 3);
737 e.unblock_bidirectional_messages(1, 2);
738 e.unblock_bidirectional_messages(1, 3);
739 };
740 // hmm, we don't have timeouts to call elections automatically yet
741 auto call_elections = [&] {
742 for (auto i : election.electors) {
743 i.second->trigger_new_election();
744 }
745 };
746 // turn everybody on, run happy for a while
747 election.start_all();
748 election.run_timesteps(0);
749 ASSERT_TRUE(election.election_stable());
750 ASSERT_TRUE(election.quorum_stable(6));
751 ASSERT_TRUE(election.all_agree_on_leader());
752 ASSERT_TRUE(election.check_epoch_agreement());
753 int starting_leader = election.last_leader;
754 // do some netsplits, but leave disallowed tiebreaker alive
755 for (int i = 0; i < 5; ++i) {
756 netsplit();
757 call_elections();
758 election.run_timesteps(15); // tests fail when I run 10 because 0 and 1 time out on same timestamp for some reason, why?
759 // this ASSERT_EQ only holds while we bias for ranks
760 ASSERT_EQ(starting_leader, election.last_leader);
761 ASSERT_TRUE(election.quorum_stable(6));
762 ASSERT_FALSE(election.election_stable());
763 unsplit();
764 call_elections();
765 election.run_timesteps(10);
766 ASSERT_EQ(starting_leader, election.last_leader);
767 ASSERT_TRUE(election.quorum_stable(6));
768 ASSERT_TRUE(election.election_stable());
769 ASSERT_TRUE(election.all_agree_on_leader());
770 ASSERT_TRUE(election.check_epoch_agreement());
771 }
772
773 // now disconnect the tiebreaker and make sure nobody can win
774 int presplit_quorum_time = election.last_quorum_formed;
775 netsplit();
776 election.block_bidirectional_messages(4, 0);
777 election.block_bidirectional_messages(4, 1);
778 election.block_bidirectional_messages(4, 2);
779 election.block_bidirectional_messages(4, 3);
780 call_elections();
781 election.run_timesteps(100);
782 ASSERT_EQ(election.last_quorum_formed, presplit_quorum_time);
783
784 // now let in the previously-losing side
785 election.unblock_bidirectional_messages(4, 2);
786 election.unblock_bidirectional_messages(4, 3);
787 call_elections();
788 election.run_timesteps(100);
789 ASSERT_TRUE(election.quorum_stable(50));
790 ASSERT_FALSE(election.election_stable());
791
792 // now reconnect everybody
793 unsplit();
794 election.unblock_bidirectional_messages(4, 0);
795 election.unblock_bidirectional_messages(4, 1);
796 call_elections();
797 election.run_timesteps(100);
798 ASSERT_TRUE(election.quorum_stable(50));
799 ASSERT_TRUE(election.election_stable());
800 ASSERT_TRUE(election.all_agree_on_leader());
801 ASSERT_TRUE(election.check_epoch_agreement());
802 }
803
804 void handles_singly_connected_peon(ElectionLogic::election_strategy strategy)
805 {
806 Election election(5, strategy);
807 election.block_bidirectional_messages(0, 1);
808 election.block_bidirectional_messages(0, 2);
809 election.block_bidirectional_messages(0, 3);
810 election.block_bidirectional_messages(0, 4);
811
812 election.start_all();
813 election.run_timesteps(20);
814 ASSERT_TRUE(election.quorum_stable(5));
815 ASSERT_FALSE(election.election_stable());
816
817 election.unblock_bidirectional_messages(0, 1);
818 election.run_timesteps(100);
819 ASSERT_TRUE(election.quorum_stable(50));
820 ASSERT_TRUE(election.election_stable());
821 ASSERT_TRUE(election.all_agree_on_leader());
822 ASSERT_TRUE(election.check_epoch_agreement());
823
824 election.block_bidirectional_messages(0, 1);
825 election.unblock_bidirectional_messages(0, 4);
826 for (auto i : election.electors) {
827 i.second->trigger_new_election();
828 }
829 election.run_timesteps(15);
830 ASSERT_TRUE(election.quorum_stable(50));
831 ASSERT_TRUE(election.election_stable());
832 ASSERT_TRUE(election.all_agree_on_leader());
833 ASSERT_TRUE(election.check_epoch_agreement());
834 }
835
836 ConnectionReport *get_connection_reports(ConnectionTracker& ct) {
837 return &ct.my_reports;
838 }
839 map<int,ConnectionReport> *get_peer_reports(ConnectionTracker& ct) {
840 return &ct.peer_reports;
841 }
842 void handles_outdated_scoring(ElectionLogic::election_strategy strategy)
843 {
844 Election election(3, strategy, 5); // ping every 5 timesteps so they start elections before settling scores!
845
846 // start everybody up and run for a bit
847 election.start_all();
848 election.run_timesteps(20);
849 ASSERT_TRUE(election.quorum_stable(5));
850 ASSERT_TRUE(election.election_stable());
851 ASSERT_TRUE(election.all_agree_on_leader());
852 ASSERT_TRUE(election.check_epoch_agreement());
853
854 // now mess up the scores to disagree
855 ConnectionTracker& ct0 = election.electors[0]->peer_tracker;
856 ConnectionReport& cr0 = *get_connection_reports(ct0);
857 cr0.history[1] = 0.5;
858 cr0.history[2] = 0.5;
859 ct0.increase_version();
860 ConnectionTracker& ct1 = election.electors[1]->peer_tracker;
861 ConnectionReport& cr1 = *get_connection_reports(ct1);
862 cr1.history[0] = 0.5;
863 cr1.history[2] = 0.5;
864 ct1.increase_version();
865 ConnectionTracker& ct2 = election.electors[2]->peer_tracker;
866 ConnectionReport& cr2 = *get_connection_reports(ct2);
867 cr2.history[0] = 0.5;
868 map<int,ConnectionReport>&cp2 = *get_peer_reports(ct2);
869 cp2[0].history[2] = 0;
870 cp2[1].history[2] = 0;
871 ct2.increase_version();
872 election.ping_interval = 0; // disable pinging to update the scores
873 ldout(g_ceph_context, 5) << "mangled the scores to be different" << dendl;
874
875 election.start_all();
876 election.run_timesteps(50);
877 ASSERT_TRUE(election.quorum_stable(30));
878 ASSERT_TRUE(election.election_stable());
879 ASSERT_TRUE(election.all_agree_on_leader());
880 ASSERT_TRUE(election.check_epoch_agreement());
881 }
882
883 void handles_disagreeing_connectivity(ElectionLogic::election_strategy strategy)
884 {
885 Election election(5, strategy, 5); // ping every 5 timesteps so they start elections before settling scores!
886
887 // start everybody up and run for a bit
888 election.start_all();
889 election.run_timesteps(20);
890 ASSERT_TRUE(election.quorum_stable(5));
891 ASSERT_TRUE(election.election_stable());
892 ASSERT_TRUE(election.all_agree_on_leader());
893 ASSERT_TRUE(election.check_epoch_agreement());
894
895 // block all the connections
896 for (int i = 0; i < 5; ++i) {
897 for (int j = i+1; j < 5; ++j) {
898 election.block_bidirectional_messages(i, j);
899 }
900 }
901
902 // now start them electing, which will obviously fail
903 election.start_all();
904 election.run_timesteps(50); // let them all demote scores of their peers
905 ASSERT_FALSE(election.quorum_stable(10));
906 ASSERT_FALSE(election.election_stable());
907
908 // now reconnect them, at which point they should start running an election before exchanging scores
909 for (int i = 0; i < 5; ++i) {
910 for (int j = i+1; j < 5; ++j) {
911 election.unblock_bidirectional_messages(i, j);
912 }
913 }
914 election.run_timesteps(100);
915
916 // these will pass if the nodes managed to converge on scores, but I expect failure
917 ASSERT_TRUE(election.quorum_stable(5));
918 ASSERT_TRUE(election.election_stable());
919 ASSERT_TRUE(election.all_agree_on_leader());
920 ASSERT_TRUE(election.check_epoch_agreement());
921 }
922
923 void handles_removing_ranks(ElectionLogic::election_strategy strategy)
924 {
925 ceph_assert(strategy == ElectionLogic::CONNECTIVITY);
926 for (int deletee = 0; deletee < 5; ++deletee) {
927 Election election(5, strategy);
928 election.start_all();
929 int steps = election.run_timesteps(0);
930 ldout(g_ceph_context, 10) << "ran in " << steps << " timesteps" << dendl;
931 ASSERT_TRUE(election.election_stable());
932 ASSERT_TRUE(election.quorum_stable(6)); // double the timer_steps we use
933 ASSERT_TRUE(election.all_agree_on_leader());
934 ASSERT_TRUE(election.check_epoch_agreement());
935 election.remove_elector(deletee);
936 ldout(g_ceph_context, 1) << "removed rank " << deletee << " from set" << dendl;
937 election.start_all();
938 steps = election.run_timesteps(0);
939 ASSERT_TRUE(election.election_stable());
940 ASSERT_TRUE(election.quorum_stable(6)); // double the timer_steps we use
941 ASSERT_TRUE(election.all_agree_on_leader());
942 ASSERT_TRUE(election.check_epoch_agreement());
943 }
944 {
945 Election election(7, strategy);
946 for (int i = 0; i < (7 - 3); ++i) {
947 election.start_all();
948 election.remove_elector(0);
949 int steps = election.run_timesteps(0);
950 ldout(g_ceph_context, 1) << "ran in " << steps << " timesteps" << dendl;
951 ASSERT_TRUE(election.election_stable());
952 ASSERT_TRUE(election.quorum_stable(6)); // double the timer_steps we use
953 ASSERT_TRUE(election.all_agree_on_leader());
954 ASSERT_TRUE(election.check_epoch_agreement());
955 }
956 }
957 }
958
959 // TODO: write a test with more complicated connectivity graphs and make sure
960 // they are stable with multiple disconnected ranks pinging peons
961
962 // TODO: Write a test that disallowing and disconnecting 0 is otherwise stable?
963
964 // TODO: figure out how to test for bumping election epochs with changing scores,
965 // a la what happened in run
966 // http://pulpito.ceph.com/gregf-2019-11-26_10:50:50-rados:monthrash-wip-elector-distro-basic-mira/
967
// Each macro registers a gtest case named after the scenario function
// `utest`, in a suite named after the strategy, and runs that scenario with
// the matching ElectionLogic strategy.

// suite "classic": ElectionLogic::CLASSIC
#define test_classic(utest) TEST(classic, utest) { utest(ElectionLogic::CLASSIC); }

// suite "disallowed": ElectionLogic::DISALLOW
#define test_disallowed(utest) TEST(disallowed, utest) { utest(ElectionLogic::DISALLOW); }

// suite "connectivity": ElectionLogic::CONNECTIVITY
#define test_connectivity(utest) TEST(connectivity, utest) { utest(ElectionLogic::CONNECTIVITY); }
973
974
975 // TODO: test for expected failures; gtest probably supports that?
// Scenarios run with the CLASSIC strategy.
test_classic(single_startup_election_completes)
test_classic(everybody_starts_completes)
test_classic(blocked_connection_continues_election)
test_classic(converges_after_flapping)

// Scenarios run with the DISALLOW strategy.
test_disallowed(single_startup_election_completes)
test_disallowed(everybody_starts_completes)
test_disallowed(blocked_connection_continues_election)
test_disallowed(disallowed_doesnt_win)
test_disallowed(converges_after_flapping)

// Scenarios run with the CONNECTIVITY strategy.
/* skip single_startup_election_completes because we crash
   on init conditions. That's fine since as noted above it's not
   quite following the rules anyway. */
test_connectivity(everybody_starts_completes)
test_connectivity(blocked_connection_converges_election)
test_connectivity(disallowed_doesnt_win)
test_connectivity(converges_after_flapping)
test_connectivity(converges_while_flapping)
test_connectivity(netsplit_with_disallowed_tiebreaker_converges)
test_connectivity(handles_singly_connected_peon)
test_connectivity(handles_disagreeing_connectivity)
test_connectivity(handles_outdated_scoring)
test_connectivity(handles_removing_ranks)