1 // -*- mode:C; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 #include "gtest/gtest.h"
5 #include "mon/ElectionLogic.h"
6 #include "mon/ConnectionTracker.h"
7 #include "common/dout.h"
9 #include "global/global_context.h"
10 #include "global/global_init.h"
11 #include "common/common_init.h"
12 #include "common/ceph_argparse.h"
16 #define dout_subsys ceph_subsys_test
18 #define dout_prefix _prefix(_dout, prefix_name(), timestep_count())
/// Writes the dout line prefix (component name, then the timestep count and a
/// trailing space) to *_dout and returns that stream so callers can chain.
static std::ostream& _prefix(std::ostream *_dout, const char *prefix,
                             int timesteps)
{
  std::ostream& out = *_dout;
  out << prefix << timesteps << " ";
  return out;
}
/// Log prefix used by dout statements issued at file scope.
const char* prefix_name()
{
  return "test_election: ";
}
/// Timestep value reported by file-scope dout statements; always -1.
int timestep_count()
{
  return -1;
}
26 int main(int argc
, char **argv
) {
27 vector
<const char*> args(argv
, argv
+argc
);
28 bool user_set_debug
= false;
29 for (auto& arg
: args
) {
30 if (strncmp("--debug_mon", arg
, 11) == 0) user_set_debug
= true;
32 auto cct
= global_init(NULL
, args
, CEPH_ENTITY_TYPE_CLIENT
,
33 CODE_ENVIRONMENT_UTILITY
,
34 CINIT_FLAG_NO_DEFAULT_CONFIG_FILE
);
35 common_init_finish(g_ceph_context
);
36 if (!user_set_debug
) g_ceph_context
->_conf
.set_val("debug mon", "0/20");
38 ::testing::InitGoogleTest(&argc
, argv
);
39 return RUN_ALL_TESTS();
45 map
<int, Owner
*> electors
;
46 map
<int, set
<int> > blocked_messages
;
48 ElectionLogic::election_strategy election_strategy
;
50 set
<int> disallowed_leaders
;
52 vector
< function
<void()> > messages
;
53 int pending_election_messages
;
54 int timesteps_run
= 0;
55 int last_quorum_change
= 0;
56 int last_quorum_formed
= -1;
57 set
<int> last_quorum_reported
;
60 Election(int c
, ElectionLogic::election_strategy es
, int pingi
=1, double tracker_halflife
=5);
62 // ElectionOwner interfaces
63 int get_paxos_size() { return count
; }
64 const set
<int>& get_disallowed_leaders() const { return disallowed_leaders
; }
65 void propose_to(int from
, int to
, epoch_t e
, bufferlist
& cbl
);
66 void defer_to(int from
, int to
, epoch_t e
);
67 void claim_victory(int from
, int to
, epoch_t e
, const set
<int>& members
);
68 void accept_victory(int from
, int to
, epoch_t e
);
69 void report_quorum(const set
<int>& quorum
);
70 void queue_stable_message(int from
, int to
, function
<void()> m
);
71 void queue_timeout_message(int from
, int to
, function
<void()> m
);
72 void queue_stable_or_timeout(int from
, int to
,
73 function
<void()> m
, function
<void()> t
);
74 void queue_election_message(int from
, int to
, function
<void()> m
);
76 // test runner interfaces
77 int run_timesteps(int max
);
78 void start_one(int who
);
80 bool election_stable() const;
81 bool quorum_stable(int timesteps_stable
) const;
82 bool all_agree_on_leader() const;
83 bool check_epoch_agreement() const;
84 void block_messages(int from
, int to
);
85 void block_bidirectional_messages(int a
, int b
);
86 void unblock_messages(int from
, int to
);
87 void unblock_bidirectional_messages(int a
, int b
);
88 void add_disallowed_leader(int disallowed
) { disallowed_leaders
.insert(disallowed
); }
89 void remove_elector(int rank
);
90 const char* prefix_name() const { return "Election: "; }
91 int timestep_count() const { return timesteps_run
; }
93 struct Owner
: public ElectionOwner
, RankProvider
{
96 epoch_t persisted_epoch
;
98 ConnectionTracker peer_tracker
;
101 int victory_accepters
;
102 int timer_steps
; // timesteps until we trigger timeout
103 bool timer_election
; // the timeout is for normal election, or victory
104 bool rank_deleted
= false;
106 Owner(int r
, ElectionLogic::election_strategy es
, double tracker_halflife
,
107 Election
*p
) : parent(p
), rank(r
), persisted_epoch(0),
109 peer_tracker(this, rank
, tracker_halflife
, 5),
110 logic(this, es
, &peer_tracker
, 0.0005, g_ceph_context
),
111 victory_accepters(0),
112 timer_steps(-1), timer_election(true) {
113 std::stringstream str
;
114 str
<< "Owner" << rank
<< " ";
115 prefix_str
= str
.str();
118 // in-memory store: just save to variable
119 void persist_epoch(epoch_t e
) { persisted_epoch
= e
; }
120 // in-memory store: just return variable
121 epoch_t
read_persisted_epoch() const { return persisted_epoch
; }
// Nothing to validate for an in-memory store; intentionally a no-op.
void validate_store()
{
  return;
}
// Epoch bumps require no reaction from this test owner right now.
void notify_bump_epoch()
{
}
126 void notify_rank_removed(int removed_rank
) {
127 peer_tracker
.notify_rank_removed(removed_rank
);
128 if (rank
> removed_rank
)
131 void notify_deleted() { rank_deleted
= true; rank
= -1; cancel_timer(); }
132 // pass back to ElectionLogic; we don't need this redirect ourselves
133 void trigger_new_election() { ceph_assert (!rank_deleted
); logic
.start(); }
134 int get_my_rank() const { return rank
; }
// Scores never need persisting here: the owner is never reset, so its
// in-memory state is never lost.
void persist_connectivity_scores()
{
}
137 void propose_to_peers(epoch_t e
, bufferlist
& bl
) {
138 ceph_assert (!rank_deleted
);
139 for (int i
= 0; i
< parent
->get_paxos_size(); ++i
) {
140 if (i
== rank
) continue;
141 parent
->propose_to(rank
, i
, e
, bl
);
144 void reset_election() {
145 ceph_assert (!rank_deleted
);
149 bool ever_participated() const { return ever_joined
; }
150 unsigned paxos_size() const { return parent
->get_paxos_size(); }
151 const set
<int>& get_disallowed_leaders() const {
152 return parent
->get_disallowed_leaders();
154 void cancel_timer() {
157 void reset_timer(int steps
) {
159 timer_steps
= 3 + steps
; // FIXME? magic number, current step + roundtrip
160 timer_election
= true;
162 void start_victory_timer() {
164 timer_election
= false;
165 timer_steps
= 3; // FIXME? current step + roundtrip
171 void _defer_to(int who
) {
172 ceph_assert (!rank_deleted
);
173 parent
->defer_to(rank
, who
, logic
.get_epoch());
174 reset_timer(0); // wtf does changing this 0->1 cause breakage?
176 void message_victory(const std::set
<int>& members
) {
177 ceph_assert (!rank_deleted
);
178 for (auto i
: members
) {
179 if (i
== rank
) continue;
180 parent
->claim_victory(rank
, i
, logic
.get_epoch(), members
);
182 start_victory_timer();
184 victory_accepters
= 1;
186 bool is_current_member(int r
) const { return quorum
.count(r
) != 0; }
187 void receive_propose(int from
, epoch_t e
, ConnectionTracker
*oct
) {
188 if (rank_deleted
) return;
189 logic
.receive_propose(from
, e
, oct
);
192 void receive_ack(int from
, epoch_t e
) {
193 if (rank_deleted
) return;
194 if (e
< logic
.get_epoch())
196 logic
.receive_ack(from
, e
);
198 void receive_victory_claim(int from
, epoch_t e
, const set
<int>& members
) {
199 if (rank_deleted
) return;
200 if (e
< logic
.get_epoch())
202 if (logic
.receive_victory_claim(from
, e
)) {
205 parent
->accept_victory(rank
, from
, e
);
208 void receive_victory_ack(int from
, epoch_t e
) {
209 if (rank_deleted
) return;
210 if (e
< logic
.get_epoch())
213 if (victory_accepters
== static_cast<int>(quorum
.size())) {
215 parent
->report_quorum(quorum
);
218 void receive_scores(bufferlist bl
) {
219 ConnectionTracker
oct(bl
);
220 peer_tracker
.receive_peer_report(oct
);
221 ldout(g_ceph_context
, 10) << "received scores " << oct
<< dendl
;
223 void receive_ping(int from_rank
, bufferlist bl
) {
224 ldout(g_ceph_context
, 6) << "receive ping from " << from_rank
<< dendl
;
225 peer_tracker
.report_live_connection(from_rank
, parent
->ping_interval
);
228 void receive_ping_timeout(int from_rank
) {
229 ldout(g_ceph_context
, 6) << "timeout ping from " << from_rank
<< dendl
;
230 peer_tracker
.report_dead_connection(from_rank
, parent
->ping_interval
);
232 void election_timeout() {
233 ldout(g_ceph_context
, 2) << "election epoch " << logic
.get_epoch()
234 << " timed out for " << rank
235 << ", electing me:" << logic
.electing_me
236 << ", acked_me:" << logic
.acked_me
<< dendl
;
237 ceph_assert (!rank_deleted
);
238 logic
.end_election_period();
240 void victory_timeout() {
241 ldout(g_ceph_context
, 2) << "victory epoch " << logic
.get_epoch()
242 << " timed out for " << rank
243 << ", electing me:" << logic
.electing_me
244 << ", acked_me:" << logic
.acked_me
<< dendl
;
245 ceph_assert (!rank_deleted
);
248 void encode_scores(bufferlist
& bl
) {
249 encode(peer_tracker
, bl
);
252 ceph_assert (!rank_deleted
);
253 if (!parent
->ping_interval
||
254 parent
->timesteps_run
% parent
->ping_interval
!= 0) {
260 for (int i
= 0; i
< parent
->get_paxos_size(); ++i
) {
263 Owner
*o
= parent
->electors
[i
];
264 parent
->queue_stable_or_timeout(rank
, i
,
265 [o
, r
=rank
, bl
] { o
->receive_ping(r
, bl
); },
266 [o
, r
=rank
] { o
->receive_ping_timeout(r
); }
270 void notify_timestep() {
271 ceph_assert (!rank_deleted
);
272 assert(timer_steps
!= 0);
273 if (timer_steps
> 0) {
276 if (timer_steps
== 0) {
277 if (timer_election
) {
285 const char *prefix_name() const {
286 return prefix_str
.c_str();
288 int timestep_count() const { return parent
->timesteps_run
; }
291 Election::Election(int c
, ElectionLogic::election_strategy es
, int pingi
,
292 double tracker_halflife
) : count(c
), election_strategy(es
), ping_interval(pingi
),
293 pending_election_messages(0), timesteps_run(0), last_quorum_change(0), last_quorum_formed(-1)
295 for (int i
= 0; i
< count
; ++i
) {
296 electors
[i
] = new Owner(i
, election_strategy
, tracker_halflife
, this);
300 Election::~Election()
303 for (auto i
: electors
) {
309 void Election::queue_stable_message(int from
, int to
, function
<void()> m
)
311 if (!blocked_messages
[from
].count(to
)) {
312 messages
.push_back(m
);
316 void Election::queue_election_message(int from
, int to
, function
<void()> m
)
318 if (last_quorum_reported
.count(from
)) {
319 last_quorum_change
= timesteps_run
;
320 last_quorum_reported
.clear();
323 if (!blocked_messages
[from
].count(to
)) {
325 electors
[from
]->encode_scores(bl
);
326 Owner
*o
= electors
[to
];
327 messages
.push_back([this,m
,o
,bl
] {
328 --this->pending_election_messages
;
329 o
->receive_scores(bl
);
332 ++pending_election_messages
;
336 void Election::queue_timeout_message(int from
, int to
, function
<void()> m
)
338 ceph_assert(blocked_messages
[from
].count(to
));
339 messages
.push_back(m
);
342 void Election::queue_stable_or_timeout(int from
, int to
,
343 function
<void()> m
, function
<void()> t
)
345 if (blocked_messages
[from
].count(to
)) {
346 queue_timeout_message(from
, to
, t
);
348 queue_stable_message(from
, to
, m
);
352 void Election::defer_to(int from
, int to
, epoch_t e
)
354 Owner
*o
= electors
[to
];
355 queue_election_message(from
, to
, [o
, from
, e
] {
356 o
->receive_ack(from
, e
);
360 void Election::propose_to(int from
, int to
, epoch_t e
, bufferlist
& cbl
)
362 Owner
*o
= electors
[to
];
363 ConnectionTracker
*oct
= NULL
;
365 oct
= new ConnectionTracker(cbl
); // we leak these on blocked cons, meh
367 queue_election_message(from
, to
, [o
, from
, e
, oct
] {
368 o
->receive_propose(from
, e
, oct
);
372 void Election::claim_victory(int from
, int to
, epoch_t e
, const set
<int>& members
)
374 Owner
*o
= electors
[to
];
375 queue_election_message(from
, to
, [o
, from
, e
, members
] {
376 o
->receive_victory_claim(from
, e
, members
);
380 void Election::accept_victory(int from
, int to
, epoch_t e
)
382 Owner
*o
= electors
[to
];
383 queue_election_message(from
, to
, [o
, from
, e
] {
384 o
->receive_victory_ack(from
, e
);
388 void Election::report_quorum(const set
<int>& quorum
)
390 for (int i
: quorum
) {
391 electors
[i
]->ever_joined
= true;
393 last_quorum_formed
= last_quorum_change
= timesteps_run
;
394 last_quorum_reported
= quorum
;
395 last_leader
= electors
[*(quorum
.begin())]->logic
.get_election_winner();
398 int Election::run_timesteps(int max
)
400 vector
< function
<void()> > current_m
;
402 for (; (!max
|| steps
< max
) && // we have timesteps left AND ONE OF
403 (pending_election_messages
|| // there are messages pending.
404 !election_stable()); // somebody's not happy and will act in future
407 current_m
.swap(messages
);
409 for (auto& m
: current_m
) {
412 for (auto o
: electors
) {
413 o
.second
->notify_timestep();
420 void Election::start_one(int who
)
422 assert(who
< static_cast<int>(electors
.size()));
423 electors
[who
]->logic
.start();
426 void Election::start_all() {
427 for (auto e
: electors
) {
428 e
.second
->logic
.start();
432 bool Election::election_stable() const
434 // see if anybody has a timer running
435 for (auto i
: electors
) {
436 if (i
.second
->timer_steps
!= -1) {
437 ldout(g_ceph_context
, 30) << "rank " << i
.first
<< " has timer value " << i
.second
->timer_steps
<< dendl
;
441 return (pending_election_messages
== 0);
444 bool Election::quorum_stable(int timesteps_stable
) const
446 ldout(g_ceph_context
, 1) << "quorum_stable? last formed:" << last_quorum_formed
447 << ", last changed " << last_quorum_change
448 << ", last reported members " << last_quorum_reported
<< dendl
;
449 if (last_quorum_reported
.empty()) {
452 if (last_quorum_formed
< last_quorum_change
) {
455 for (auto i
: last_quorum_reported
) {
456 if (electors
.find(i
)->second
->timer_steps
!= -1) {
460 if (timesteps_run
- timesteps_stable
> last_quorum_change
)
462 return election_stable();
465 bool Election::all_agree_on_leader() const
467 int leader
= electors
.find(0)->second
->logic
.get_election_winner();
468 ldout(g_ceph_context
, 10) << "all_agree_on_leader on " << leader
<< dendl
;
469 for (auto& i
: electors
) {
470 if (leader
!= i
.second
->logic
.get_election_winner()) {
471 ldout(g_ceph_context
, 10) << "rank " << i
.first
<< " has different leader "
472 << i
.second
->logic
.get_election_winner() << dendl
;
476 if (disallowed_leaders
.count(leader
)) {
477 ldout(g_ceph_context
, 10) << "that leader is disallowed! member of "
478 << disallowed_leaders
<< dendl
;
484 bool Election::check_epoch_agreement() const
486 epoch_t epoch
= electors
.find(0)->second
->logic
.get_epoch();
487 for (auto& i
: electors
) {
488 if (epoch
!= i
.second
->logic
.get_epoch()) {
495 void Election::block_messages(int from
, int to
)
497 blocked_messages
[from
].insert(to
);
499 void Election::block_bidirectional_messages(int a
, int b
)
501 block_messages(a
, b
);
502 block_messages(b
, a
);
504 void Election::unblock_messages(int from
, int to
)
506 blocked_messages
[from
].erase(to
);
508 void Election::unblock_bidirectional_messages(int a
, int b
)
510 unblock_messages(a
, b
);
511 unblock_messages(b
, a
);
514 void Election::remove_elector(int rank
)
516 for (auto ei
= electors
.begin(); ei
!= electors
.end(); ) {
517 if (ei
->first
== rank
) {
518 ei
->second
->notify_deleted();
519 electors
.erase(ei
++);
522 ei
->second
->notify_rank_removed(rank
);
523 if (ei
->first
> rank
) {
524 electors
[ei
->first
- 1] = ei
->second
;
525 electors
.erase(ei
++);
530 for (auto bi
= blocked_messages
.begin(); bi
!= blocked_messages
.end(); ) {
531 if (bi
->first
== rank
) {
532 blocked_messages
.erase(bi
++);
535 bi
->second
.erase(rank
);
536 for (auto i
= bi
->second
.upper_bound(rank
);
537 i
!= bi
->second
.end();) {
538 bi
->second
.insert(*i
- 1);
539 bi
->second
.erase(*(i
++));
546 void single_startup_election_completes(ElectionLogic::election_strategy strategy
)
548 for (int starter
= 0; starter
< 5; ++starter
) {
549 Election
election(5, strategy
);
550 election
.start_one(starter
);
551 // This test is not actually legit since you should start
552 // all the ElectionLogics, but it seems to work
553 int steps
= election
.run_timesteps(0);
554 ldout(g_ceph_context
, 1) << "ran in " << steps
<< " timesteps" << dendl
;
555 ASSERT_TRUE(election
.election_stable());
556 ASSERT_TRUE(election
.quorum_stable(6)); // double the timer_steps we use
557 ASSERT_TRUE(election
.all_agree_on_leader());
558 ASSERT_TRUE(election
.check_epoch_agreement());
562 void everybody_starts_completes(ElectionLogic::election_strategy strategy
)
564 Election
election(5, strategy
);
565 election
.start_all();
566 int steps
= election
.run_timesteps(0);
567 ldout(g_ceph_context
, 1) << "ran in " << steps
<< " timesteps" << dendl
;
568 ASSERT_TRUE(election
.election_stable());
569 ASSERT_TRUE(election
.quorum_stable(6)); // double the timer_steps we use
570 ASSERT_TRUE(election
.all_agree_on_leader());
571 ASSERT_TRUE(election
.check_epoch_agreement());
574 void blocked_connection_continues_election(ElectionLogic::election_strategy strategy
)
576 Election
election(5, strategy
);
577 election
.block_bidirectional_messages(0, 1);
578 election
.start_all();
579 int steps
= election
.run_timesteps(100);
580 ldout(g_ceph_context
, 1) << "ran in " << steps
<< " timesteps" << dendl
;
581 // This is a failure mode!
582 ASSERT_FALSE(election
.election_stable());
583 ASSERT_FALSE(election
.quorum_stable(6)); // double the timer_steps we use
584 election
.unblock_bidirectional_messages(0, 1);
585 steps
= election
.run_timesteps(100);
586 ldout(g_ceph_context
, 1) << "ran in " << steps
<< " timesteps" << dendl
;
587 ASSERT_TRUE(election
.election_stable());
588 ASSERT_TRUE(election
.quorum_stable(6)); // double the timer_steps we use
589 ASSERT_TRUE(election
.all_agree_on_leader());
590 ASSERT_TRUE(election
.check_epoch_agreement());
593 void blocked_connection_converges_election(ElectionLogic::election_strategy strategy
)
595 Election
election(5, strategy
);
596 election
.block_bidirectional_messages(0, 1);
597 election
.start_all();
598 int steps
= election
.run_timesteps(100);
599 ldout(g_ceph_context
, 1) << "ran in " << steps
<< " timesteps" << dendl
;
600 ASSERT_TRUE(election
.election_stable());
601 ASSERT_TRUE(election
.all_agree_on_leader());
602 ASSERT_TRUE(election
.check_epoch_agreement());
603 election
.unblock_bidirectional_messages(0, 1);
604 steps
= election
.run_timesteps(100);
605 ldout(g_ceph_context
, 1) << "ran in " << steps
<< " timesteps" << dendl
;
606 ASSERT_TRUE(election
.election_stable());
607 ASSERT_TRUE(election
.all_agree_on_leader());
608 ASSERT_TRUE(election
.check_epoch_agreement());
611 void disallowed_doesnt_win(ElectionLogic::election_strategy strategy
)
614 for (int i
= 0; i
< MON_COUNT
- 1; ++i
) {
615 Election
election(MON_COUNT
, strategy
);
616 for (int j
= 0; j
<= i
; ++j
) {
617 election
.add_disallowed_leader(j
);
619 election
.start_all();
620 int steps
= election
.run_timesteps(0);
621 ldout(g_ceph_context
, 1) << "ran in " << steps
<< " timesteps" << dendl
;
622 ASSERT_TRUE(election
.election_stable());
623 ASSERT_TRUE(election
.quorum_stable(6)); // double the timer_steps we use
624 ASSERT_TRUE(election
.all_agree_on_leader());
625 ASSERT_TRUE(election
.check_epoch_agreement());
626 int leader
= election
.electors
[0]->logic
.get_election_winner();
627 for (int j
= 0; j
<= i
; ++j
) {
628 ASSERT_NE(j
, leader
);
631 for (int i
= MON_COUNT
- 1; i
> 0; --i
) {
632 Election
election(MON_COUNT
, strategy
);
633 for (int j
= i
; j
<= MON_COUNT
- 1; ++j
) {
634 election
.add_disallowed_leader(j
);
636 election
.start_all();
637 int steps
= election
.run_timesteps(0);
638 ldout(g_ceph_context
, 1) << "ran in " << steps
<< " timesteps" << dendl
;
639 ASSERT_TRUE(election
.election_stable());
640 ASSERT_TRUE(election
.quorum_stable(6)); // double the timer_steps we use
641 ASSERT_TRUE(election
.all_agree_on_leader());
642 ASSERT_TRUE(election
.check_epoch_agreement());
643 int leader
= election
.electors
[0]->logic
.get_election_winner();
644 for (int j
= i
; j
< MON_COUNT
; ++j
) {
645 ASSERT_NE(j
, leader
);
650 void converges_after_flapping(ElectionLogic::election_strategy strategy
)
652 Election
election(5, strategy
);
653 auto block_cons
= [&] {
655 // leave 4 connected to both sides so it will trigger but not trivially win
656 e
.block_bidirectional_messages(0, 2);
657 e
.block_bidirectional_messages(0, 3);
658 e
.block_bidirectional_messages(1, 2);
659 e
.block_bidirectional_messages(1, 3);
661 auto unblock_cons
= [&] {
663 e
.unblock_bidirectional_messages(0, 2);
664 e
.unblock_bidirectional_messages(0, 3);
665 e
.unblock_bidirectional_messages(1, 2);
666 e
.unblock_bidirectional_messages(1, 3);
669 election
.start_all();
670 for (int i
= 0; i
< 5; ++i
) {
671 election
.run_timesteps(5);
673 election
.run_timesteps(5);
677 election
.run_timesteps(100);
678 ASSERT_TRUE(election
.election_stable());
679 ASSERT_TRUE(election
.quorum_stable(6)); // double the timer_steps we use
680 ASSERT_TRUE(election
.all_agree_on_leader());
681 ASSERT_TRUE(election
.check_epoch_agreement());
684 void converges_while_flapping(ElectionLogic::election_strategy strategy
)
686 Election
election(5, strategy
);
687 auto block_cons
= [&] {
689 // leave 4 connected to both sides so it will trigger but not trivially win
690 e
.block_bidirectional_messages(0, 2);
691 e
.block_bidirectional_messages(0, 3);
692 e
.block_bidirectional_messages(1, 2);
693 e
.block_bidirectional_messages(1, 3);
695 auto unblock_cons
= [&] {
697 e
.unblock_bidirectional_messages(0, 2);
698 e
.unblock_bidirectional_messages(0, 3);
699 e
.unblock_bidirectional_messages(1, 2);
700 e
.unblock_bidirectional_messages(1, 3);
703 election
.start_all();
704 for (int i
= 0; i
< 5; ++i
) {
705 election
.run_timesteps(10);
706 ASSERT_TRUE(election
.quorum_stable(6));
708 election
.run_timesteps(5);
710 ASSERT_TRUE(election
.election_stable());
711 ASSERT_TRUE(election
.all_agree_on_leader());
712 ASSERT_TRUE(election
.check_epoch_agreement());
715 election
.run_timesteps(100);
716 ASSERT_TRUE(election
.election_stable());
717 ASSERT_TRUE(election
.quorum_stable(6));
718 ASSERT_TRUE(election
.all_agree_on_leader());
719 ASSERT_TRUE(election
.check_epoch_agreement());
722 void netsplit_with_disallowed_tiebreaker_converges(ElectionLogic::election_strategy strategy
)
724 Election
election(5, strategy
);
725 election
.add_disallowed_leader(4);
726 auto netsplit
= [&] {
728 e
.block_bidirectional_messages(0, 2);
729 e
.block_bidirectional_messages(0, 3);
730 e
.block_bidirectional_messages(1, 2);
731 e
.block_bidirectional_messages(1, 3);
735 e
.unblock_bidirectional_messages(0, 2);
736 e
.unblock_bidirectional_messages(0, 3);
737 e
.unblock_bidirectional_messages(1, 2);
738 e
.unblock_bidirectional_messages(1, 3);
740 // hmm, we don't have timeouts to call elections automatically yet
741 auto call_elections
= [&] {
742 for (auto i
: election
.electors
) {
743 i
.second
->trigger_new_election();
746 // turn everybody on, run happy for a while
747 election
.start_all();
748 election
.run_timesteps(0);
749 ASSERT_TRUE(election
.election_stable());
750 ASSERT_TRUE(election
.quorum_stable(6));
751 ASSERT_TRUE(election
.all_agree_on_leader());
752 ASSERT_TRUE(election
.check_epoch_agreement());
753 int starting_leader
= election
.last_leader
;
754 // do some netsplits, but leave disallowed tiebreaker alive
755 for (int i
= 0; i
< 5; ++i
) {
758 election
.run_timesteps(15); // tests fail when I run 10 because 0 and 1 time out on same timestamp for some reason, why?
759 // this ASSERT_EQ only holds while we bias for ranks
760 ASSERT_EQ(starting_leader
, election
.last_leader
);
761 ASSERT_TRUE(election
.quorum_stable(6));
762 ASSERT_FALSE(election
.election_stable());
765 election
.run_timesteps(10);
766 ASSERT_EQ(starting_leader
, election
.last_leader
);
767 ASSERT_TRUE(election
.quorum_stable(6));
768 ASSERT_TRUE(election
.election_stable());
769 ASSERT_TRUE(election
.all_agree_on_leader());
770 ASSERT_TRUE(election
.check_epoch_agreement());
773 // now disconnect the tiebreaker and make sure nobody can win
774 int presplit_quorum_time
= election
.last_quorum_formed
;
776 election
.block_bidirectional_messages(4, 0);
777 election
.block_bidirectional_messages(4, 1);
778 election
.block_bidirectional_messages(4, 2);
779 election
.block_bidirectional_messages(4, 3);
781 election
.run_timesteps(100);
782 ASSERT_EQ(election
.last_quorum_formed
, presplit_quorum_time
);
784 // now let in the previously-losing side
785 election
.unblock_bidirectional_messages(4, 2);
786 election
.unblock_bidirectional_messages(4, 3);
788 election
.run_timesteps(100);
789 ASSERT_TRUE(election
.quorum_stable(50));
790 ASSERT_FALSE(election
.election_stable());
792 // now reconnect everybody
794 election
.unblock_bidirectional_messages(4, 0);
795 election
.unblock_bidirectional_messages(4, 1);
797 election
.run_timesteps(100);
798 ASSERT_TRUE(election
.quorum_stable(50));
799 ASSERT_TRUE(election
.election_stable());
800 ASSERT_TRUE(election
.all_agree_on_leader());
801 ASSERT_TRUE(election
.check_epoch_agreement());
804 void handles_singly_connected_peon(ElectionLogic::election_strategy strategy
)
806 Election
election(5, strategy
);
807 election
.block_bidirectional_messages(0, 1);
808 election
.block_bidirectional_messages(0, 2);
809 election
.block_bidirectional_messages(0, 3);
810 election
.block_bidirectional_messages(0, 4);
812 election
.start_all();
813 election
.run_timesteps(20);
814 ASSERT_TRUE(election
.quorum_stable(5));
815 ASSERT_FALSE(election
.election_stable());
817 election
.unblock_bidirectional_messages(0, 1);
818 election
.run_timesteps(100);
819 ASSERT_TRUE(election
.quorum_stable(50));
820 ASSERT_TRUE(election
.election_stable());
821 ASSERT_TRUE(election
.all_agree_on_leader());
822 ASSERT_TRUE(election
.check_epoch_agreement());
824 election
.block_bidirectional_messages(0, 1);
825 election
.unblock_bidirectional_messages(0, 4);
826 for (auto i
: election
.electors
) {
827 i
.second
->trigger_new_election();
829 election
.run_timesteps(15);
830 ASSERT_TRUE(election
.quorum_stable(50));
831 ASSERT_TRUE(election
.election_stable());
832 ASSERT_TRUE(election
.all_agree_on_leader());
833 ASSERT_TRUE(election
.check_epoch_agreement());
836 ConnectionReport
*get_connection_reports(ConnectionTracker
& ct
) {
837 return &ct
.my_reports
;
839 map
<int,ConnectionReport
> *get_peer_reports(ConnectionTracker
& ct
) {
840 return &ct
.peer_reports
;
842 void handles_outdated_scoring(ElectionLogic::election_strategy strategy
)
844 Election
election(3, strategy
, 5); // ping every 5 timesteps so they start elections before settling scores!
846 // start everybody up and run for a bit
847 election
.start_all();
848 election
.run_timesteps(20);
849 ASSERT_TRUE(election
.quorum_stable(5));
850 ASSERT_TRUE(election
.election_stable());
851 ASSERT_TRUE(election
.all_agree_on_leader());
852 ASSERT_TRUE(election
.check_epoch_agreement());
854 // now mess up the scores to disagree
855 ConnectionTracker
& ct0
= election
.electors
[0]->peer_tracker
;
856 ConnectionReport
& cr0
= *get_connection_reports(ct0
);
857 cr0
.history
[1] = 0.5;
858 cr0
.history
[2] = 0.5;
859 ct0
.increase_version();
860 ConnectionTracker
& ct1
= election
.electors
[1]->peer_tracker
;
861 ConnectionReport
& cr1
= *get_connection_reports(ct1
);
862 cr1
.history
[0] = 0.5;
863 cr1
.history
[2] = 0.5;
864 ct1
.increase_version();
865 ConnectionTracker
& ct2
= election
.electors
[2]->peer_tracker
;
866 ConnectionReport
& cr2
= *get_connection_reports(ct2
);
867 cr2
.history
[0] = 0.5;
868 map
<int,ConnectionReport
>&cp2
= *get_peer_reports(ct2
);
869 cp2
[0].history
[2] = 0;
870 cp2
[1].history
[2] = 0;
871 ct2
.increase_version();
872 election
.ping_interval
= 0; // disable pinging to update the scores
873 ldout(g_ceph_context
, 5) << "mangled the scores to be different" << dendl
;
875 election
.start_all();
876 election
.run_timesteps(50);
877 ASSERT_TRUE(election
.quorum_stable(30));
878 ASSERT_TRUE(election
.election_stable());
879 ASSERT_TRUE(election
.all_agree_on_leader());
880 ASSERT_TRUE(election
.check_epoch_agreement());
883 void handles_disagreeing_connectivity(ElectionLogic::election_strategy strategy
)
885 Election
election(5, strategy
, 5); // ping every 5 timesteps so they start elections before settling scores!
887 // start everybody up and run for a bit
888 election
.start_all();
889 election
.run_timesteps(20);
890 ASSERT_TRUE(election
.quorum_stable(5));
891 ASSERT_TRUE(election
.election_stable());
892 ASSERT_TRUE(election
.all_agree_on_leader());
893 ASSERT_TRUE(election
.check_epoch_agreement());
895 // block all the connections
896 for (int i
= 0; i
< 5; ++i
) {
897 for (int j
= i
+1; j
< 5; ++j
) {
898 election
.block_bidirectional_messages(i
, j
);
902 // now start them electing, which will obviously fail
903 election
.start_all();
904 election
.run_timesteps(50); // let them all demote scores of their peers
905 ASSERT_FALSE(election
.quorum_stable(10));
906 ASSERT_FALSE(election
.election_stable());
908 // now reconnect them, at which point they should start running an election before exchanging scores
909 for (int i
= 0; i
< 5; ++i
) {
910 for (int j
= i
+1; j
< 5; ++j
) {
911 election
.unblock_bidirectional_messages(i
, j
);
914 election
.run_timesteps(100);
916 // these will pass if the nodes managed to converge on scores, but I expect failure
917 ASSERT_TRUE(election
.quorum_stable(5));
918 ASSERT_TRUE(election
.election_stable());
919 ASSERT_TRUE(election
.all_agree_on_leader());
920 ASSERT_TRUE(election
.check_epoch_agreement());
923 void handles_removing_ranks(ElectionLogic::election_strategy strategy
)
925 ceph_assert(strategy
== ElectionLogic::CONNECTIVITY
);
926 for (int deletee
= 0; deletee
< 5; ++deletee
) {
927 Election
election(5, strategy
);
928 election
.start_all();
929 int steps
= election
.run_timesteps(0);
930 ldout(g_ceph_context
, 10) << "ran in " << steps
<< " timesteps" << dendl
;
931 ASSERT_TRUE(election
.election_stable());
932 ASSERT_TRUE(election
.quorum_stable(6)); // double the timer_steps we use
933 ASSERT_TRUE(election
.all_agree_on_leader());
934 ASSERT_TRUE(election
.check_epoch_agreement());
935 election
.remove_elector(deletee
);
936 ldout(g_ceph_context
, 1) << "removed rank " << deletee
<< " from set" << dendl
;
937 election
.start_all();
938 steps
= election
.run_timesteps(0);
939 ASSERT_TRUE(election
.election_stable());
940 ASSERT_TRUE(election
.quorum_stable(6)); // double the timer_steps we use
941 ASSERT_TRUE(election
.all_agree_on_leader());
942 ASSERT_TRUE(election
.check_epoch_agreement());
945 Election
election(7, strategy
);
946 for (int i
= 0; i
< (7 - 3); ++i
) {
947 election
.start_all();
948 election
.remove_elector(0);
949 int steps
= election
.run_timesteps(0);
950 ldout(g_ceph_context
, 1) << "ran in " << steps
<< " timesteps" << dendl
;
951 ASSERT_TRUE(election
.election_stable());
952 ASSERT_TRUE(election
.quorum_stable(6)); // double the timer_steps we use
953 ASSERT_TRUE(election
.all_agree_on_leader());
954 ASSERT_TRUE(election
.check_epoch_agreement());
959 // TODO: write a test with more complicated connectivity graphs and make sure
960 // they are stable with multiple disconnected ranks pinging peons
962 // TODO: Write a test that disallowing and disconnecting 0 is otherwise stable?
964 // TODO: figure out how to test for bumping election epochs with changing scores,
965 // a la what happened in run
966 // http://pulpito.ceph.com/gregf-2019-11-26_10:50:50-rados:monthrash-wip-elector-distro-basic-mira/
// Registration helpers: each one defines a gtest TEST in the suite named
// after the strategy, whose body calls the given scenario function with that
// strategy.
#define test_classic(utest) \
  TEST(classic, utest) { utest(ElectionLogic::CLASSIC); }

#define test_disallowed(utest) \
  TEST(disallowed, utest) { utest(ElectionLogic::DISALLOW); }

#define test_connectivity(utest) \
  TEST(connectivity, utest) { utest(ElectionLogic::CONNECTIVITY); }
975 // TODO: test for expected failures; gtest probably supports that?
976 test_classic(single_startup_election_completes
)
977 test_classic(everybody_starts_completes
)
978 test_classic(blocked_connection_continues_election
)
979 test_classic(converges_after_flapping
)
981 test_disallowed(single_startup_election_completes
)
982 test_disallowed(everybody_starts_completes
)
983 test_disallowed(blocked_connection_continues_election
)
984 test_disallowed(disallowed_doesnt_win
)
985 test_disallowed(converges_after_flapping
)
987 /* skip single_startup_election_completes because we crash
988 on init conditions. That's fine since as noted above it's not
989 quite following the rules anyway. */
990 test_connectivity(everybody_starts_completes
)
991 test_connectivity(blocked_connection_converges_election
)
992 test_connectivity(disallowed_doesnt_win
)
993 test_connectivity(converges_after_flapping
)
994 test_connectivity(converges_while_flapping
)
995 test_connectivity(netsplit_with_disallowed_tiebreaker_converges
)
996 test_connectivity(handles_singly_connected_peon
)
997 test_connectivity(handles_disagreeing_connectivity
)
998 test_connectivity(handles_outdated_scoring
)
999 test_connectivity(handles_removing_ranks
)