]>
Commit | Line | Data |
---|---|---|
1 | // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- | |
2 | // vim: ts=8 sw=2 smarttab | |
3 | /* | |
4 | * Ceph - scalable distributed file system | |
5 | * | |
6 | * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> | |
7 | * | |
8 | * This is free software; you can redistribute it and/or | |
9 | * modify it under the terms of the GNU Lesser General Public | |
10 | * License version 2.1, as published by the Free Software | |
11 | * Foundation. See file COPYING. | |
12 | * | |
13 | */ | |
14 | ||
15 | #include "Elector.h" | |
16 | #include "Monitor.h" | |
17 | ||
18 | #include "common/Timer.h" | |
19 | #include "MonitorDBStore.h" | |
20 | #include "messages/MMonElection.h" | |
21 | ||
22 | #include "common/config.h" | |
23 | #include "include/assert.h" | |
24 | ||
25 | #define dout_subsys ceph_subsys_mon | |
26 | #undef dout_prefix | |
27 | #define dout_prefix _prefix(_dout, mon, epoch) | |
28 | static ostream& _prefix(std::ostream *_dout, Monitor *mon, epoch_t epoch) { | |
29 | return *_dout << "mon." << mon->name << "@" << mon->rank | |
30 | << "(" << mon->get_state_name() | |
31 | << ").elector(" << epoch << ") "; | |
32 | } | |
33 | ||
34 | ||
35 | void Elector::init() | |
36 | { | |
37 | epoch = mon->store->get(Monitor::MONITOR_NAME, "election_epoch"); | |
38 | if (!epoch) { | |
39 | dout(1) << "init, first boot, initializing epoch at 1 " << dendl; | |
40 | epoch = 1; | |
41 | } else if (epoch % 2) { | |
42 | dout(1) << "init, last seen epoch " << epoch | |
43 | << ", mid-election, bumping" << dendl; | |
44 | ++epoch; | |
45 | auto t(std::make_shared<MonitorDBStore::Transaction>()); | |
46 | t->put(Monitor::MONITOR_NAME, "election_epoch", epoch); | |
47 | mon->store->apply_transaction(t); | |
48 | } else { | |
49 | dout(1) << "init, last seen epoch " << epoch << dendl; | |
50 | } | |
51 | } | |
52 | ||
53 | void Elector::shutdown() | |
54 | { | |
55 | cancel_timer(); | |
56 | } | |
57 | ||
58 | void Elector::bump_epoch(epoch_t e) | |
59 | { | |
60 | dout(10) << "bump_epoch " << epoch << " to " << e << dendl; | |
61 | assert(epoch <= e); | |
62 | epoch = e; | |
63 | auto t(std::make_shared<MonitorDBStore::Transaction>()); | |
64 | t->put(Monitor::MONITOR_NAME, "election_epoch", epoch); | |
65 | mon->store->apply_transaction(t); | |
66 | ||
67 | mon->join_election(); | |
68 | ||
69 | // clear up some state | |
70 | electing_me = false; | |
71 | acked_me.clear(); | |
72 | } | |
73 | ||
74 | ||
75 | void Elector::start() | |
76 | { | |
77 | if (!participating) { | |
78 | dout(0) << "not starting new election -- not participating" << dendl; | |
79 | return; | |
80 | } | |
81 | dout(5) << "start -- can i be leader?" << dendl; | |
82 | ||
83 | acked_me.clear(); | |
84 | init(); | |
85 | ||
86 | // start by trying to elect me | |
87 | if (epoch % 2 == 0) { | |
88 | bump_epoch(epoch+1); // odd == election cycle | |
89 | } else { | |
90 | // do a trivial db write just to ensure it is writeable. | |
91 | auto t(std::make_shared<MonitorDBStore::Transaction>()); | |
92 | t->put(Monitor::MONITOR_NAME, "election_writeable_test", rand()); | |
93 | int r = mon->store->apply_transaction(t); | |
94 | assert(r >= 0); | |
95 | } | |
96 | start_stamp = ceph_clock_now(); | |
97 | electing_me = true; | |
98 | acked_me[mon->rank].cluster_features = CEPH_FEATURES_ALL; | |
99 | acked_me[mon->rank].mon_features = ceph::features::mon::get_supported(); | |
100 | mon->collect_metadata(&acked_me[mon->rank].metadata); | |
101 | leader_acked = -1; | |
102 | ||
103 | // bcast to everyone else | |
104 | for (unsigned i=0; i<mon->monmap->size(); ++i) { | |
105 | if ((int)i == mon->rank) continue; | |
106 | MMonElection *m = | |
107 | new MMonElection(MMonElection::OP_PROPOSE, epoch, mon->monmap); | |
108 | m->mon_features = ceph::features::mon::get_supported(); | |
109 | mon->messenger->send_message(m, mon->monmap->get_inst(i)); | |
110 | } | |
111 | ||
112 | reset_timer(); | |
113 | } | |
114 | ||
115 | void Elector::defer(int who) | |
116 | { | |
117 | dout(5) << "defer to " << who << dendl; | |
118 | ||
119 | if (electing_me) { | |
120 | // drop out | |
121 | acked_me.clear(); | |
122 | electing_me = false; | |
123 | } | |
124 | ||
125 | // ack them | |
126 | leader_acked = who; | |
127 | ack_stamp = ceph_clock_now(); | |
128 | MMonElection *m = new MMonElection(MMonElection::OP_ACK, epoch, mon->monmap); | |
129 | m->mon_features = ceph::features::mon::get_supported(); | |
130 | mon->collect_metadata(&m->metadata); | |
131 | ||
132 | // This field is unused completely in luminous, but jewel uses it to | |
133 | // determine whether we are a dumpling mon due to some crufty old | |
134 | // code. It only needs to see this buffer non-empty, so put | |
135 | // something useless there. | |
136 | m->sharing_bl = mon->get_local_commands_bl(mon->get_required_mon_features()); | |
137 | ||
138 | mon->messenger->send_message(m, mon->monmap->get_inst(who)); | |
139 | ||
140 | // set a timer | |
141 | reset_timer(1.0); // give the leader some extra time to declare victory | |
142 | } | |
143 | ||
144 | ||
145 | void Elector::reset_timer(double plus) | |
146 | { | |
147 | // set the timer | |
148 | cancel_timer(); | |
149 | /** | |
150 | * This class is used as the callback when the expire_event timer fires up. | |
151 | * | |
152 | * If the expire_event is fired, then it means that we had an election going, | |
153 | * either started by us or by some other participant, but it took too long, | |
154 | * thus expiring. | |
155 | * | |
156 | * When the election expires, we will check if we were the ones who won, and | |
157 | * if so we will declare victory. If that is not the case, then we assume | |
158 | * that the one we defered to didn't declare victory quickly enough (in fact, | |
159 | * as far as we know, we may even be dead); so, just propose ourselves as the | |
160 | * Leader. | |
161 | */ | |
162 | expire_event = new C_MonContext(mon, [this](int) { | |
163 | expire(); | |
164 | }); | |
165 | mon->timer.add_event_after(g_conf->mon_election_timeout + plus, | |
166 | expire_event); | |
167 | } | |
168 | ||
169 | ||
170 | void Elector::cancel_timer() | |
171 | { | |
172 | if (expire_event) { | |
173 | mon->timer.cancel_event(expire_event); | |
174 | expire_event = 0; | |
175 | } | |
176 | } | |
177 | ||
178 | void Elector::expire() | |
179 | { | |
180 | dout(5) << "election timer expired" << dendl; | |
181 | ||
182 | // did i win? | |
183 | if (electing_me && | |
184 | acked_me.size() > (unsigned)(mon->monmap->size() / 2)) { | |
185 | // i win | |
186 | victory(); | |
187 | } else { | |
188 | // whoever i deferred to didn't declare victory quickly enough. | |
189 | if (mon->has_ever_joined) | |
190 | start(); | |
191 | else | |
192 | mon->bootstrap(); | |
193 | } | |
194 | } | |
195 | ||
196 | ||
197 | void Elector::victory() | |
198 | { | |
199 | leader_acked = -1; | |
200 | electing_me = false; | |
201 | ||
202 | uint64_t cluster_features = CEPH_FEATURES_ALL; | |
203 | mon_feature_t mon_features = ceph::features::mon::get_supported(); | |
204 | set<int> quorum; | |
205 | map<int,Metadata> metadata; | |
206 | for (map<int, elector_info_t>::iterator p = acked_me.begin(); | |
207 | p != acked_me.end(); | |
208 | ++p) { | |
209 | quorum.insert(p->first); | |
210 | cluster_features &= p->second.cluster_features; | |
211 | mon_features &= p->second.mon_features; | |
212 | metadata[p->first] = p->second.metadata; | |
213 | } | |
214 | ||
215 | cancel_timer(); | |
216 | ||
217 | assert(epoch % 2 == 1); // election | |
218 | bump_epoch(epoch+1); // is over! | |
219 | ||
220 | // tell everyone! | |
221 | for (set<int>::iterator p = quorum.begin(); | |
222 | p != quorum.end(); | |
223 | ++p) { | |
224 | if (*p == mon->rank) continue; | |
225 | MMonElection *m = new MMonElection(MMonElection::OP_VICTORY, epoch, | |
226 | mon->monmap); | |
227 | m->quorum = quorum; | |
228 | m->quorum_features = cluster_features; | |
229 | m->mon_features = mon_features; | |
230 | m->sharing_bl = mon->get_local_commands_bl(mon_features); | |
231 | mon->messenger->send_message(m, mon->monmap->get_inst(*p)); | |
232 | } | |
233 | ||
234 | // tell monitor | |
235 | mon->win_election(epoch, quorum, | |
236 | cluster_features, mon_features, metadata); | |
237 | } | |
238 | ||
239 | ||
240 | void Elector::handle_propose(MonOpRequestRef op) | |
241 | { | |
242 | op->mark_event("elector:handle_propose"); | |
243 | MMonElection *m = static_cast<MMonElection*>(op->get_req()); | |
244 | dout(5) << "handle_propose from " << m->get_source() << dendl; | |
245 | int from = m->get_source().num(); | |
246 | ||
247 | assert(m->epoch % 2 == 1); // election | |
248 | uint64_t required_features = mon->get_required_features(); | |
249 | mon_feature_t required_mon_features = mon->get_required_mon_features(); | |
250 | ||
251 | dout(10) << __func__ << " required features " << required_features | |
252 | << " " << required_mon_features | |
253 | << ", peer features " << m->get_connection()->get_features() | |
254 | << " " << m->mon_features | |
255 | << dendl; | |
256 | ||
257 | if ((required_features ^ m->get_connection()->get_features()) & | |
258 | required_features) { | |
259 | dout(5) << " ignoring propose from mon" << from | |
260 | << " without required features" << dendl; | |
261 | nak_old_peer(op); | |
262 | return; | |
263 | } else if (!m->mon_features.contains_all(required_mon_features)) { | |
264 | // all the features in 'required_mon_features' not in 'm->mon_features' | |
265 | mon_feature_t missing = required_mon_features.diff(m->mon_features); | |
266 | dout(5) << " ignoring propose from mon." << from | |
267 | << " without required mon_features " << missing | |
268 | << dendl; | |
269 | nak_old_peer(op); | |
270 | } else if (m->epoch > epoch) { | |
271 | bump_epoch(m->epoch); | |
272 | } else if (m->epoch < epoch) { | |
273 | // got an "old" propose, | |
274 | if (epoch % 2 == 0 && // in a non-election cycle | |
275 | mon->quorum.count(from) == 0) { // from someone outside the quorum | |
276 | // a mon just started up, call a new election so they can rejoin! | |
277 | dout(5) << " got propose from old epoch, quorum is " << mon->quorum | |
278 | << ", " << m->get_source() << " must have just started" << dendl; | |
279 | // we may be active; make sure we reset things in the monitor appropriately. | |
280 | mon->start_election(); | |
281 | } else { | |
282 | dout(5) << " ignoring old propose" << dendl; | |
283 | return; | |
284 | } | |
285 | } | |
286 | ||
287 | if (mon->rank < from) { | |
288 | // i would win over them. | |
289 | if (leader_acked >= 0) { // we already acked someone | |
290 | assert(leader_acked < from); // and they still win, of course | |
291 | dout(5) << "no, we already acked " << leader_acked << dendl; | |
292 | } else { | |
293 | // wait, i should win! | |
294 | if (!electing_me) { | |
295 | mon->start_election(); | |
296 | } | |
297 | } | |
298 | } else { | |
299 | // they would win over me | |
300 | if (leader_acked < 0 || // haven't acked anyone yet, or | |
301 | leader_acked > from || // they would win over who you did ack, or | |
302 | leader_acked == from) { // this is the guy we're already deferring to | |
303 | defer(from); | |
304 | } else { | |
305 | // ignore them! | |
306 | dout(5) << "no, we already acked " << leader_acked << dendl; | |
307 | } | |
308 | } | |
309 | } | |
310 | ||
311 | void Elector::handle_ack(MonOpRequestRef op) | |
312 | { | |
313 | op->mark_event("elector:handle_ack"); | |
314 | MMonElection *m = static_cast<MMonElection*>(op->get_req()); | |
315 | dout(5) << "handle_ack from " << m->get_source() << dendl; | |
316 | int from = m->get_source().num(); | |
317 | ||
318 | assert(m->epoch % 2 == 1); // election | |
319 | if (m->epoch > epoch) { | |
320 | dout(5) << "woah, that's a newer epoch, i must have rebooted. bumping and re-starting!" << dendl; | |
321 | bump_epoch(m->epoch); | |
322 | start(); | |
323 | return; | |
324 | } | |
325 | assert(m->epoch == epoch); | |
326 | uint64_t required_features = mon->get_required_features(); | |
327 | if ((required_features ^ m->get_connection()->get_features()) & | |
328 | required_features) { | |
329 | dout(5) << " ignoring ack from mon" << from | |
330 | << " without required features" << dendl; | |
331 | return; | |
332 | } | |
333 | ||
334 | mon_feature_t required_mon_features = mon->get_required_mon_features(); | |
335 | if (!m->mon_features.contains_all(required_mon_features)) { | |
336 | mon_feature_t missing = required_mon_features.diff(m->mon_features); | |
337 | dout(5) << " ignoring ack from mon." << from | |
338 | << " without required mon_features " << missing | |
339 | << dendl; | |
340 | return; | |
341 | } | |
342 | ||
343 | if (electing_me) { | |
344 | // thanks | |
345 | acked_me[from].cluster_features = m->get_connection()->get_features(); | |
346 | acked_me[from].mon_features = m->mon_features; | |
347 | acked_me[from].metadata = m->metadata; | |
348 | dout(5) << " so far i have {"; | |
349 | for (map<int, elector_info_t>::const_iterator p = acked_me.begin(); | |
350 | p != acked_me.end(); | |
351 | ++p) { | |
352 | if (p != acked_me.begin()) | |
353 | *_dout << ","; | |
354 | *_dout << " mon." << p->first << ":" | |
355 | << " features " << p->second.cluster_features | |
356 | << " " << p->second.mon_features; | |
357 | } | |
358 | *_dout << " }" << dendl; | |
359 | ||
360 | // is that _everyone_? | |
361 | if (acked_me.size() == mon->monmap->size()) { | |
362 | // if yes, shortcut to election finish | |
363 | victory(); | |
364 | } | |
365 | } else { | |
366 | // ignore, i'm deferring already. | |
367 | assert(leader_acked >= 0); | |
368 | } | |
369 | } | |
370 | ||
371 | ||
372 | void Elector::handle_victory(MonOpRequestRef op) | |
373 | { | |
374 | op->mark_event("elector:handle_victory"); | |
375 | MMonElection *m = static_cast<MMonElection*>(op->get_req()); | |
376 | dout(5) << "handle_victory from " << m->get_source() | |
377 | << " quorum_features " << m->quorum_features | |
378 | << " " << m->mon_features | |
379 | << dendl; | |
380 | int from = m->get_source().num(); | |
381 | ||
382 | assert(from < mon->rank); | |
383 | assert(m->epoch % 2 == 0); | |
384 | ||
385 | leader_acked = -1; | |
386 | ||
387 | // i should have seen this election if i'm getting the victory. | |
388 | if (m->epoch != epoch + 1) { | |
389 | dout(5) << "woah, that's a funny epoch, i must have rebooted. bumping and re-starting!" << dendl; | |
390 | bump_epoch(m->epoch); | |
391 | start(); | |
392 | return; | |
393 | } | |
394 | ||
395 | bump_epoch(m->epoch); | |
396 | ||
397 | // they win | |
398 | mon->lose_election(epoch, m->quorum, from, | |
399 | m->quorum_features, m->mon_features); | |
400 | ||
401 | // cancel my timer | |
402 | cancel_timer(); | |
403 | ||
404 | // stash leader's commands | |
405 | assert(m->sharing_bl.length()); | |
406 | vector<MonCommand> new_cmds; | |
407 | bufferlist::iterator bi = m->sharing_bl.begin(); | |
408 | MonCommand::decode_vector(new_cmds, bi); | |
409 | mon->set_leader_commands(new_cmds); | |
410 | } | |
411 | ||
412 | void Elector::nak_old_peer(MonOpRequestRef op) | |
413 | { | |
414 | op->mark_event("elector:nak_old_peer"); | |
415 | MMonElection *m = static_cast<MMonElection*>(op->get_req()); | |
416 | uint64_t supported_features = m->get_connection()->get_features(); | |
417 | uint64_t required_features = mon->get_required_features(); | |
418 | mon_feature_t required_mon_features = mon->get_required_mon_features(); | |
419 | dout(10) << "sending nak to peer " << m->get_source() | |
420 | << " that only supports " << supported_features | |
421 | << " " << m->mon_features | |
422 | << " of the required " << required_features | |
423 | << " " << required_mon_features | |
424 | << dendl; | |
425 | ||
426 | MMonElection *reply = new MMonElection(MMonElection::OP_NAK, m->epoch, | |
427 | mon->monmap); | |
428 | reply->quorum_features = required_features; | |
429 | reply->mon_features = required_mon_features; | |
430 | mon->features.encode(reply->sharing_bl); | |
431 | m->get_connection()->send_message(reply); | |
432 | } | |
433 | ||
434 | void Elector::handle_nak(MonOpRequestRef op) | |
435 | { | |
436 | op->mark_event("elector:handle_nak"); | |
437 | MMonElection *m = static_cast<MMonElection*>(op->get_req()); | |
438 | dout(1) << "handle_nak from " << m->get_source() | |
439 | << " quorum_features " << m->quorum_features | |
440 | << " " << m->mon_features | |
441 | << dendl; | |
442 | ||
443 | CompatSet other; | |
444 | bufferlist::iterator bi = m->sharing_bl.begin(); | |
445 | other.decode(bi); | |
446 | CompatSet diff = Monitor::get_supported_features().unsupported(other); | |
447 | ||
448 | mon_feature_t mon_supported = ceph::features::mon::get_supported(); | |
449 | // all features in 'm->mon_features' not in 'mon_supported' | |
450 | mon_feature_t mon_diff = m->mon_features.diff(mon_supported); | |
451 | ||
452 | derr << "Shutting down because I do not support required monitor features: { " | |
453 | << diff << " } " << mon_diff << dendl; | |
454 | ||
455 | exit(0); | |
456 | // the end! | |
457 | } | |
458 | ||
459 | void Elector::dispatch(MonOpRequestRef op) | |
460 | { | |
461 | op->mark_event("elector:dispatch"); | |
462 | assert(op->is_type_election()); | |
463 | ||
464 | switch (op->get_req()->get_type()) { | |
465 | ||
466 | case MSG_MON_ELECTION: | |
467 | { | |
468 | if (!participating) { | |
469 | return; | |
470 | } | |
471 | if (op->get_req()->get_source().num() >= mon->monmap->size()) { | |
472 | dout(5) << " ignoring bogus election message with bad mon rank " | |
473 | << op->get_req()->get_source() << dendl; | |
474 | return; | |
475 | } | |
476 | ||
477 | MMonElection *em = static_cast<MMonElection*>(op->get_req()); | |
478 | ||
479 | // assume an old message encoding would have matched | |
480 | if (em->fsid != mon->monmap->fsid) { | |
481 | dout(0) << " ignoring election msg fsid " | |
482 | << em->fsid << " != " << mon->monmap->fsid << dendl; | |
483 | return; | |
484 | } | |
485 | ||
486 | if (!mon->monmap->contains(em->get_source_addr())) { | |
487 | dout(1) << "discarding election message: " << em->get_source_addr() | |
488 | << " not in my monmap " << *mon->monmap << dendl; | |
489 | return; | |
490 | } | |
491 | ||
492 | MonMap peermap; | |
493 | peermap.decode(em->monmap_bl); | |
494 | if (peermap.epoch > mon->monmap->epoch) { | |
495 | dout(0) << em->get_source_inst() << " has newer monmap epoch " << peermap.epoch | |
496 | << " > my epoch " << mon->monmap->epoch | |
497 | << ", taking it" | |
498 | << dendl; | |
499 | mon->monmap->decode(em->monmap_bl); | |
500 | auto t(std::make_shared<MonitorDBStore::Transaction>()); | |
501 | t->put("monmap", mon->monmap->epoch, em->monmap_bl); | |
502 | t->put("monmap", "last_committed", mon->monmap->epoch); | |
503 | mon->store->apply_transaction(t); | |
504 | //mon->monmon()->paxos->stash_latest(mon->monmap->epoch, em->monmap_bl); | |
505 | cancel_timer(); | |
506 | mon->bootstrap(); | |
507 | return; | |
508 | } | |
509 | if (peermap.epoch < mon->monmap->epoch) { | |
510 | dout(0) << em->get_source_inst() << " has older monmap epoch " << peermap.epoch | |
511 | << " < my epoch " << mon->monmap->epoch | |
512 | << dendl; | |
513 | } | |
514 | ||
515 | switch (em->op) { | |
516 | case MMonElection::OP_PROPOSE: | |
517 | handle_propose(op); | |
518 | return; | |
519 | } | |
520 | ||
521 | if (em->epoch < epoch) { | |
522 | dout(5) << "old epoch, dropping" << dendl; | |
523 | break; | |
524 | } | |
525 | ||
526 | switch (em->op) { | |
527 | case MMonElection::OP_ACK: | |
528 | handle_ack(op); | |
529 | return; | |
530 | case MMonElection::OP_VICTORY: | |
531 | handle_victory(op); | |
532 | return; | |
533 | case MMonElection::OP_NAK: | |
534 | handle_nak(op); | |
535 | return; | |
536 | default: | |
537 | ceph_abort(); | |
538 | } | |
539 | } | |
540 | break; | |
541 | ||
542 | default: | |
543 | ceph_abort(); | |
544 | } | |
545 | } | |
546 | ||
547 | void Elector::start_participating() | |
548 | { | |
549 | if (!participating) { | |
550 | participating = true; | |
551 | } | |
552 | } |