]> git.proxmox.com Git - ceph.git/blame - ceph/src/mon/Paxos.h
update sources to 12.2.7
[ceph.git] / ceph / src / mon / Paxos.h
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15/*
16time---->
17
18cccccccccccccccccca????????????????????????????????????????
19cccccccccccccccccca????????????????????????????????????????
20cccccccccccccccccca???????????????????????????????????????? leader
21cccccccccccccccccc?????????????????????????????????????????
22ccccc??????????????????????????????????????????????????????
23
24last_committed
25
26pn_from
27pn
28
29a 12v
30b 12v
31c 14v
32d
33e 12v
34*/
35
36/**
37 * Paxos storage layout and behavior
38 *
39 * Currently, we use a key/value store to hold all the Paxos-related data, but
40 * it can logically be depicted as this:
41 *
42 * paxos:
43 * first_committed -> 1
44 * last_committed -> 4
45 * 1 -> value_1
46 * 2 -> value_2
47 * 3 -> value_3
48 * 4 -> value_4
49 *
50 * Since we are relying on a k/v store supporting atomic transactions, we can
51 * guarantee that if 'last_committed' has a value of '4', then we have up to
52 * version 4 on the store, and no more than that; the same applies to
53 * 'first_committed', which, holding '1', strictly means that our lowest
54 * version is 1.
55 *
56 * Each version's value (value_1, value_2, ..., value_n) is a blob of data,
57 * incomprehensible to the Paxos. These values are proposed to the Paxos on
58 * propose_new_value() and each one is a transaction encoded in a bufferlist.
59 *
60 * The Paxos will write the value to disk, associating it with its version,
61 * but will take a step further: the value shall be decoded, and the operations
62 * on that transaction shall be applied during the same transaction that will
63 * write the value's encoded bufferlist to disk. This behavior ensures that
64 * whatever is being proposed will only be available on the store when it is
65 * applied by Paxos, which will then be aware of such new values, guaranteeing
66 * the store state is always consistent without requiring shady workarounds.
67 *
68 * So, let's say that FooMonitor proposes the following transaction, neatly
69 * encoded on a bufferlist of course:
70 *
71 * Tx_Foo
72 * put(foo, last_committed, 3)
73 * put(foo, 3, foo_value_3)
74 * erase(foo, 2)
75 * erase(foo, 1)
76 * put(foo, first_committed, 3)
77 *
78 * And knowing that the Paxos is proposed Tx_Foo as a bufferlist, once it is
79 * ready to commit, and assuming we are now committing version 5 of the Paxos,
80 * we will do something along the lines of:
81 *
82 * Tx proposed_tx;
83 * proposed_tx.decode(Tx_foo_bufferlist);
84 *
85 * Tx our_tx;
86 * our_tx.put(paxos, last_committed, 5);
87 * our_tx.put(paxos, 5, Tx_foo_bufferlist);
88 * our_tx.append(proposed_tx);
89 *
90 * store_apply(our_tx);
91 *
92 * And the store should look like this after we apply 'our_tx':
93 *
94 * paxos:
95 * first_committed -> 1
96 * last_committed -> 5
97 * 1 -> value_1
98 * 2 -> value_2
99 * 3 -> value_3
100 * 4 -> value_4
101 * 5 -> Tx_foo_bufferlist
102 * foo:
103 * first_committed -> 3
104 * last_committed -> 3
105 * 3 -> foo_value_3
106 *
107 */
108
109#ifndef CEPH_MON_PAXOS_H
110#define CEPH_MON_PAXOS_H
111
112#include "include/types.h"
113#include "mon_types.h"
114#include "include/buffer.h"
115#include "msg/msg_types.h"
116#include "include/Context.h"
117#include "common/perf_counters.h"
118#include <errno.h>
119
120#include "MonitorDBStore.h"
121#include "mon/MonOpRequest.h"
122
123class Monitor;
124class MMonPaxos;
125
/**
 * Indices of the Paxos perf counters.
 *
 * Enumerators take consecutive values starting at l_paxos_first (45800)
 * and ending at l_paxos_last.
 *
 * NOTE(review): l_paxos_first / l_paxos_last look like range sentinels
 * rather than real counters (presumably the bounds used when the counters
 * are registered in init_logger()) -- confirm against Paxos.cc.  New
 * counters should be inserted between the two so the range stays contiguous.
 */
enum {
  l_paxos_first = 45800,
  l_paxos_start_leader,
  l_paxos_start_peon,
  l_paxos_restart,
  l_paxos_refresh,
  l_paxos_refresh_latency,
  l_paxos_begin,
  l_paxos_begin_keys,
  l_paxos_begin_bytes,
  l_paxos_begin_latency,
  l_paxos_commit,
  l_paxos_commit_keys,
  l_paxos_commit_bytes,
  l_paxos_commit_latency,
  l_paxos_collect,
  l_paxos_collect_keys,
  l_paxos_collect_bytes,
  l_paxos_collect_latency,
  l_paxos_collect_uncommitted,
  l_paxos_collect_timeout,
  l_paxos_accept_timeout,
  l_paxos_lease_ack_timeout,
  l_paxos_lease_timeout,
  l_paxos_store_state,
  l_paxos_store_state_keys,
  l_paxos_store_state_bytes,
  l_paxos_store_state_latency,
  l_paxos_share_state,
  l_paxos_share_state_keys,
  l_paxos_share_state_bytes,
  l_paxos_new_pn,
  l_paxos_new_pn_latency,
  l_paxos_last,
};
161
162
163// i am one state machine.
164/**
165 * This library is based on the Paxos algorithm, but varies in a few key ways:
166 * 1- Only a single new value is generated at a time, simplifying the recovery logic.
167 * 2- Nodes track "committed" values, and share them generously (and trustingly)
168 * 3- A 'leasing' mechanism is built-in, allowing nodes to determine when it is
169 * safe to "read" their copy of the last committed value.
170 *
171 * This provides a simple replication substrate that services can be built on top of.
172 * See PaxosService.h
173 */
174class Paxos {
175 /**
176 * @defgroup Paxos_h_class Paxos
177 * @{
178 */
179 /**
180 * The Monitor with which this Paxos instance is associated.
181 */
182 Monitor *mon;
183
184 /// perf counter for internal instrumentations
185 PerfCounters *logger;
186
187 void init_logger();
188
189 // my state machine info
190 const string paxos_name;
191
192 friend class Monitor;
193 friend class PaxosService;
194
195 list<std::string> extra_state_dirs;
196
197 // LEADER+PEON
198
199 // -- generic state --
200public:
201 /**
202 * @defgroup Paxos_h_states States on which the leader/peon may be.
203 * @{
204 */
enum {
  /**
   * Leader/Peon is in Paxos' Recovery state
   */
  STATE_RECOVERING,
  /**
   * Leader/Peon is idle, and the Peon may or may not have a valid lease.
   */
  STATE_ACTIVE,
  /**
   * Leader/Peon is updating to a new value.
   */
  STATE_UPDATING,
  /**
   * Leader proposing an old value
   */
  STATE_UPDATING_PREVIOUS,
  /**
   * Leader/Peon is writing a new commit. readable, but not
   * writeable.
   */
  STATE_WRITING,
  /**
   * Leader/Peon is writing a new commit from a previous round.
   */
  STATE_WRITING_PREVIOUS,
  /**
   * Leader: refresh following a commit
   */
  STATE_REFRESH,
  /**
   * Shutdown after WRITING or WRITING_PREVIOUS
   */
  STATE_SHUTDOWN
};

/**
 * Obtain state name from constant value.
 *
 * @note A value that is not one of the STATE_* constants is not treated
 *       as an error: the function returns the string "UNKNOWN".
 *
 * @param s State value.
 * @return The state's name, or "UNKNOWN" if @p s is not a valid state.
 */
static const std::string get_statename(int s) {
  switch (s) {
  case STATE_RECOVERING:
    return "recovering";
  case STATE_ACTIVE:
    return "active";
  case STATE_UPDATING:
    return "updating";
  case STATE_UPDATING_PREVIOUS:
    return "updating-previous";
  case STATE_WRITING:
    return "writing";
  case STATE_WRITING_PREVIOUS:
    return "writing-previous";
  case STATE_REFRESH:
    return "refresh";
  case STATE_SHUTDOWN:
    return "shutdown";
  default:
    // Deliberately non-fatal: callers get a printable placeholder.
    return "UNKNOWN";
  }
}
268
269private:
270 /**
271 * The state we are in.
272 */
273 int state;
274 /**
275 * @}
276 */
35e4c445
FG
277 int commits_started = 0;
278
279 Cond shutdown_cond;
7c673cae
FG
280
281public:
282 /**
283 * Check if we are recovering.
284 *
285 * @return 'true' if we are on the Recovering state; 'false' otherwise.
286 */
287 bool is_recovering() const { return (state == STATE_RECOVERING); }
288 /**
289 * Check if we are active.
290 *
291 * @return 'true' if we are on the Active state; 'false' otherwise.
292 */
293 bool is_active() const { return state == STATE_ACTIVE; }
294 /**
295 * Check if we are updating.
296 *
297 * @return 'true' if we are on the Updating state; 'false' otherwise.
298 */
299 bool is_updating() const { return state == STATE_UPDATING; }
300
301 /**
302 * Check if we are updating/proposing a previous value from a
303 * previous quorum
304 */
305 bool is_updating_previous() const { return state == STATE_UPDATING_PREVIOUS; }
306
307 /// @return 'true' if we are writing an update to disk
308 bool is_writing() const { return state == STATE_WRITING; }
309
310 /// @return 'true' if we are writing an update-previous to disk
311 bool is_writing_previous() const { return state == STATE_WRITING_PREVIOUS; }
312
313 /// @return 'true' if we are refreshing an update just committed
314 bool is_refresh() const { return state == STATE_REFRESH; }
315
35e4c445
FG
316 /// @return 'true' if we are in the process of shutting down
317 bool is_shutdown() const { return state == STATE_SHUTDOWN; }
318
7c673cae
FG
319private:
320 /**
321 * @defgroup Paxos_h_recovery_vars Common recovery-related member variables
322 * @note These variables are common to both the Leader and the Peons.
323 * @{
324 */
325 /**
326 * First committed value's version, i.e., the lowest version we hold in the store.
327 */
328 version_t first_committed;
329 /**
330 * Last Proposal Number
331 *
332 * @todo Expand description
333 */
334 version_t last_pn;
335 /**
336 * Last committed value's version.
337 *
338 * On both the Leader and the Peons, this is the last value's version that
339 * was accepted by a given quorum and thus committed, that this instance
340 * knows about.
341 *
342 * @note It may not be the last committed value's version throughout the
343 * system. If we are a Peon, we may have not been part of the quorum
344 * that accepted the value, and for this very same reason we may still
345 * be a (couple of) version(s) behind, until we learn about the most
346 * recent version. This should only happen if we are not active (i.e.,
347 * part of the quorum), which should not happen if we are up, running
348 * and able to communicate with others -- thus able to be part of the
349 * monmap and trigger new elections.
350 */
351 version_t last_committed;
352 /**
353 * Last committed value's time.
354 *
355 * When the commit finished.
356 */
357 utime_t last_commit_time;
358 /**
359 * The last Proposal Number we have accepted.
360 *
361 * On the Leader, it will be the Proposal Number picked by the Leader
362 * itself. On the Peon, however, it will be the proposal sent by the Leader
31f18b77 363 * and it will only be updated if its value is higher than the one
7c673cae
FG
364 * already known by the Peon.
365 */
366 version_t accepted_pn;
367 /**
368 * The last_committed epoch of the leader at the time we accepted the last pn.
369 *
370 * This has NO SEMANTIC MEANING, and is there only for the debug output.
371 */
372 version_t accepted_pn_from;
373 /**
374 * Map holding the first committed version by each quorum member.
375 *
376 * The versions kept in this map are updated during the collect phase.
377 * When the Leader starts the collect phase, each Peon will reply with its
378 * first committed version, which will then be kept in this map.
379 */
380 map<int,version_t> peer_first_committed;
381 /**
382 * Map holding the last committed version by each quorum member.
383 *
384 * The versions kept in this map are updated during the collect phase.
385 * When the Leader starts the collect phase, each Peon will reply with its
386 * last committed version, which will then be kept in this map.
387 */
388 map<int,version_t> peer_last_committed;
389 /**
390 * @}
391 */
392
393 // active (phase 2)
394 /**
395 * @defgroup Paxos_h_active_vars Common active-related member variables
396 * @{
397 */
398 /**
399 * When our read lease expires.
400 *
401 * Instead of performing a full commit each time a read is requested, we
402 * keep leases. Each lease will have an expiration date, which may or may
403 * not be extended.
404 */
405 utime_t lease_expire;
406 /**
407 * List of callbacks waiting for our state to change into STATE_ACTIVE.
408 */
409 list<Context*> waiting_for_active;
410 /**
411 * List of callbacks waiting for the chance to read a version from us.
412 *
413 * Each entry on the list may result from an attempt to read a version that
414 * wasn't available at the time, or an attempt made during a period during
415 * which we could not satisfy the read request. The first case happens if
416 * the requested version is greater than our last committed version. The
417 * second scenario may happen if we are recovering, or if we don't have a
418 * valid lease.
419 *
420 * The list will be woken up once we change to STATE_ACTIVE with an extended
421 * lease -- which can be achieved if we have everyone on the quorum on board
422 * with the latest proposal, or if we don't really care about the remaining
423 * uncommitted values --, or if we're on a quorum of one.
424 */
425 list<Context*> waiting_for_readable;
426 /**
427 * @}
428 */
429
430 // -- leader --
431 // recovery (paxos phase 1)
432 /**
433 * @defgroup Paxos_h_leader_recovery Leader-specific Recovery-related vars
434 * @{
435 */
436 /**
437 * Number of replies to the collect phase we've received so far.
438 *
439 * This variable is reset to 1 each time we start a collect phase; it is
440 * incremented each time we receive a reply to the collect message, and
441 * is used to determine whether or not we have received replies from the
442 * whole quorum.
443 */
444 unsigned num_last;
445 /**
446 * Uncommitted value's version.
447 *
448 * If we have, or end up knowing about, an uncommitted value, then its
449 * version will be kept in this variable.
450 *
451 * @note If this version equals @p last_committed+1 when we reach the final
452 * steps of recovery, then the algorithm will assume this is a value
453 * the Leader does not know about, and trustingly the Leader will
454 * propose this version's value.
455 */
456 version_t uncommitted_v;
457 /**
458 * Uncommitted value's Proposal Number.
459 *
460 * We use this variable to assess if the Leader should take into consideration
461 * an uncommitted value sent by a Peon. Given that the Peon will send back to
462 * the Leader the last Proposal Number it accepted, the Leader will be able
463 * to infer if this value is more recent than the one the Leader has, thus
464 * more relevant.
465 */
466 version_t uncommitted_pn;
467 /**
468 * Uncommitted Value.
469 *
470 * If the system fails in-between the accept replies from the Peons and the
471 * instruction to commit from the Leader, then we may end up with accepted
472 * but yet-uncommitted values. During the Leader's recovery, it will attempt
473 * to bring the whole system to the latest state, and that means committing
474 * past accepted but uncommitted values.
475 *
476 * This variable will hold an uncommitted value, which may originate either
477 * on the Leader, or learnt by the Leader from a Peon during the collect
478 * phase.
479 */
480 bufferlist uncommitted_value;
481 /**
482 * Used to specify when an on-going collect phase times out.
483 */
484 Context *collect_timeout_event;
485 /**
486 * @}
487 */
488
489 // active
490 /**
491 * @defgroup Paxos_h_leader_active Leader-specific Active-related vars
492 * @{
493 */
494 /**
495 * Set of participants (Leader & Peons) that have acked a lease extension.
496 *
497 * Each Peon that acknowledges a lease extension will have its place in this
498 * set, which will be used to account for all the acks from all the quorum
499 * members, guaranteeing that we trigger new elections if some don't ack in
500 * the expected timeframe.
501 */
502 set<int> acked_lease;
503 /**
504 * Callback responsible for extending the lease periodically.
505 */
506 Context *lease_renew_event;
507 /**
508 * Callback to trigger new elections once the time for acks is out.
509 */
510 Context *lease_ack_timeout_event;
511 /**
512 * @}
513 */
514 /**
515 * @defgroup Paxos_h_peon_active Peon-specific Active-related vars
516 * @{
517 */
518 /**
519 * Callback to trigger new elections when the Peon's lease times out.
520 *
521 * If the Peon's lease is extended, this callback will be reset (i.e.,
522 * we cancel the event and schedule a new one, starting again from the
523 * beginning).
524 */
525 Context *lease_timeout_event;
526 /**
527 * @}
528 */
529
530 // updating (paxos phase 2)
531 /**
532 * @defgroup Paxos_h_leader_updating Leader-specific Updating-related vars
533 * @{
534 */
535 /**
536 * New Value being proposed to the Peons.
537 *
538 * This bufferlist holds the value the Leader is proposing to the Peons, and
539 * that will be committed if the Peons do accept the proposal.
540 */
541 bufferlist new_value;
542 /**
543 * Set of participants (Leader & Peons) that accepted the new proposed value.
544 *
545 * This set is used to keep track of those who have accepted the proposed
546 * value, so the leader may know when to issue a commit (when a majority of
547 * participants has accepted the proposal), and when to extend the lease
548 * (when all the quorum members have accepted the proposal).
549 */
550 set<int> accepted;
551 /**
552 * Callback to trigger a new election if the proposal is not accepted by the
553 * full quorum within a given timeframe.
554 *
555 * If the full quorum does not accept the proposal, then it means that the
556 * Leader may no longer be recognized as the leader, or that the quorum has
557 * changed, and the value may have not reached all the participants. Thus,
558 * the leader must call new elections, and go through a recovery phase in
559 * order to propagate the new value throughout the system.
560 *
561 * This does not mean that we won't commit. We will commit as soon as we
562 * have a majority of acceptances. But if we do not have full acceptance
563 * from the quorum, then we cannot extend the lease, as some participants
564 * may not have the latest committed value.
565 */
566 Context *accept_timeout_event;
567
568 /**
569 * List of callbacks waiting for it to be possible to write again.
570 *
571 * @remarks It is not possible to write if we are not the Leader, or we are
572 * not on the active state, or if the lease has expired.
573 */
574 list<Context*> waiting_for_writeable;
575 /**
576 * List of callbacks waiting for a commit to finish.
577 *
578 * @remarks This may be used to a) wait for an on-going commit to finish
579 * before we proceed with, say, a new proposal; or b) wait for the
580 * next commit to be finished so we are sure that our value was
581 * fully committed.
582 */
583 list<Context*> waiting_for_commit;
584
585 /**
586 * Pending proposal transaction
587 *
588 * This is the transaction that is under construction and pending
589 * proposal. We will add operations to it until we decide it is
590 * time to start a paxos round.
591 */
592 MonitorDBStore::TransactionRef pending_proposal;
593
594 /**
595 * Finishers for pending transaction
596 *
597 * These are waiting for updates in the pending proposal/transaction
598 * to be committed.
599 */
600 list<Context*> pending_finishers;
601
602 /**
603 * Finishers for committing transaction
604 *
605 * When the pending_proposal is submitted, pending_finishers move to
606 * this list. When it commits, these finishers are notified.
607 */
608 list<Context*> committing_finishers;
609
610 /**
611 * @defgroup Paxos_h_sync_warns Synchronization warnings
612 * @todo Describe these variables
613 * @{
614 */
615 utime_t last_clock_drift_warn;
616 int clock_drift_warned;
617 /**
618 * @}
619 */
620
621 /**
622 * Should be true if we have proposed to trim, or are in the middle of
623 * trimming; false otherwise.
624 */
625 bool trimming;
626
31f18b77
FG
627 /**
628 * true if we want trigger_propose to *not* propose (yet)
629 */
630 bool plugged = false;
631
7c673cae
FG
632 /**
633 * @defgroup Paxos_h_callbacks Callback classes.
634 * @{
635 */
636 /**
637 * Callback class responsible for handling a Collect Timeout.
638 */
639 class C_CollectTimeout;
640 /**
641 * Callback class responsible for handling an Accept Timeout.
642 */
643 class C_AcceptTimeout;
644 /**
645 * Callback class responsible for handling a Lease Ack Timeout.
646 */
647 class C_LeaseAckTimeout;
648
649 /**
650 * Callback class responsible for handling a Lease Timeout.
651 */
652 class C_LeaseTimeout;
653
654 /**
655 * Callback class responsible for handling a Lease Renew Timeout.
656 */
657 class C_LeaseRenew;
658
659 class C_Trimmed;
660 /**
661 *
662 */
663public:
664 class C_Proposal : public Context {
665 Context *proposer_context;
666 public:
667 bufferlist bl;
668 // for debug purposes. Will go away. Soon.
669 bool proposed;
670 utime_t proposal_time;
671
672 C_Proposal(Context *c, bufferlist& proposal_bl) :
673 proposer_context(c),
674 bl(proposal_bl),
675 proposed(false),
676 proposal_time(ceph_clock_now())
677 { }
678
679 void finish(int r) override {
680 if (proposer_context) {
681 proposer_context->complete(r);
682 proposer_context = NULL;
683 }
684 }
685 };
686 /**
687 * @}
688 */
689private:
690 /**
691 * @defgroup Paxos_h_election_triggered Steps triggered by an election.
692 *
693 * @note All these functions play a significant role in the Recovery Phase,
694 * which is triggered right after an election once someone becomes
695 * the Leader.
696 * @{
697 */
698 /**
699 * Create a new Proposal Number and propose it to the Peons.
700 *
701 * This function starts the Recovery Phase, which can be directly mapped
702 * onto the original Paxos' Prepare phase. Basically, we'll generate a
703 * Proposal Number, taking @p oldpn into consideration, and we will send
704 * it to a quorum, along with our first and last committed versions. By
705 * sending this information in a message to the quorum, we expect to
706 * obtain acceptances from a majority, allowing us to commit, or be
707 * informed of a higher Proposal Number known by one or more of the Peons
708 * in the quorum.
709 *
710 * @pre We are the Leader.
711 * @post Recovery Phase initiated by sending messages to the quorum.
712 *
713 * @param oldpn A proposal number taken as the highest known so far, that
714 * should be taken into consideration when generating a new
715 * Proposal Number for the Recovery Phase.
716 */
717 void collect(version_t oldpn);
718 /**
719 * Handle the reception of a collect message from the Leader and reply
720 * accordingly.
721 *
722 * Once a Peon receives a collect message from the Leader it will reply
723 * with its first and last committed versions, as well as information so
724 * the Leader may know if its Proposal Number was, or was not, accepted by
31f18b77 725 * the Peon. The Peon will accept the Leader's Proposal Number if it is
7c673cae
FG
726 * higher than the Peon's currently accepted Proposal Number. The Peon may
727 * also inform the Leader of accepted but uncommitted values.
728 *
729 * @invariant The message is an operation of type OP_COLLECT.
730 * @pre We are a Peon.
731 * @post Replied to the Leader, accepting or not accepting its PN.
732 *
733 * @param op The collect message sent by the Leader to the Peon.
734 */
735 void handle_collect(MonOpRequestRef op);
736 /**
737 * Handle a response from a Peon to the Leader's collect phase.
738 *
739 * The received message will state the Peon's last committed version, as
740 * well as its last proposal number. This will lead to one of the following
741 * scenarios: if the replied Proposal Number is equal to the one we proposed,
742 * then the Peon has accepted our proposal, and if all the Peons do accept
743 * our Proposal Number, then we are allowed to proceed with the commit;
744 * however, if a Peon replies with a higher Proposal Number, we assume he
745 * knows something we don't and the Leader will have to abort the current
746 * proposal in order to retry with the Proposal Number specified by the Peon.
747 * It may also occur that the Peon replied with a lower Proposal Number, in
748 * which case we assume it is a reply to an older value and we'll simply
749 * drop it.
750 * This function will also check if the Peon replied with an accepted but
751 * yet uncommitted value. In this case, if its version is higher than our
752 * last committed value by one, we assume that the Peon knows a value from a
753 * previous proposal that has never been committed, and we should try to
754 * commit that value by proposing it next. On the other hand, if that is
755 * not the case, we'll assume it is an old, uncommitted value, we do not
756 * care about and we'll consider the system active by extending the leases.
757 *
758 * @invariant The message is an operation of type OP_LAST.
759 * @pre We are the Leader.
760 * @post We initiate a commit, or we retry with a higher Proposal Number,
761 * or we drop the message.
762 * @post We move from STATE_RECOVERING to STATE_ACTIVE.
763 *
764 * @param op The message sent by the Peon to the Leader.
765 */
766 void handle_last(MonOpRequestRef op);
767 /**
768 * The Recovery Phase timed out, meaning that a significant part of the
769 * quorum does not believe we are the Leader, and we thus should trigger new
770 * elections.
771 *
772 * @pre We believe we are the Leader.
773 * @post Trigger new elections.
774 */
775 void collect_timeout();
776 /**
777 * @}
778 */
779
780 /**
781 * @defgroup Paxos_h_updating_funcs Functions used during the Updating State
782 *
783 * These functions may easily be mapped to the original Paxos Algorithm's
784 * phases.
785 *
786 * Taking into account the algorithm can be divided in 4 phases (Prepare,
787 * Promise, Accept Request and Accepted), we can easily map Paxos::begin to
788 * both the Prepare and Accept Request phases; the Paxos::handle_begin to
789 * the Promise phase; and the Paxos::handle_accept to the Accepted phase.
790 * @{
791 */
792 /**
793 * Start a new proposal with the intent of committing @p value.
794 *
795 * If we are alone on the system (i.e., a quorum of one), then we will
796 * simply commit the value, but if we are not alone, then we need to propose
797 * the value to the quorum.
798 *
799 * @pre We are the Leader
800 * @pre We are on STATE_ACTIVE
31f18b77 801 * @post We commit, if we are alone, or we send a message to each quorum
7c673cae 802 * member
31f18b77 803 * @post We are on STATE_ACTIVE, if we are alone, or on
7c673cae
FG
804 * STATE_UPDATING otherwise
805 *
806 * @param value The value being proposed to the quorum
807 */
808 void begin(bufferlist& value);
809 /**
810 * Accept or decline (by ignoring) a proposal from the Leader.
811 *
812 * We will decline the proposal (by ignoring it) if we have promised to
813 * accept a higher numbered proposal. If that is not the case, we will
814 * accept it and accordingly reply to the Leader.
815 *
816 * @pre We are a Peon
817 * @pre We are on STATE_ACTIVE
31f18b77
FG
818 * @post We are on STATE_UPDATING if we accept the Leader's proposal
819 * @post We send a reply message to the Leader if we accept its proposal
7c673cae
FG
820 *
821 * @invariant The received message is an operation of type OP_BEGIN
822 *
823 * @param op The message sent by the Leader to the Peon during the
824 * Paxos::begin function
825 *
826 */
827 void handle_begin(MonOpRequestRef op);
828 /**
829 * Handle an Accept message sent by a Peon.
830 *
831 * In order to commit, the Leader has to receive accepts from a majority of
832 * the quorum. If that does happen, then the Leader may proceed with the
833 * commit. However, the Leader needs the accepts from all the quorum members
834 * in order to extend the lease and move on to STATE_ACTIVE.
835 *
836 * This function handles these two situations, accounting for the amount of
837 * received accepts.
838 *
839 * @pre We are the Leader
840 * @pre We are on STATE_UPDATING
31f18b77
FG
841 * @post We are on STATE_ACTIVE if we received accepts from the full quorum
842 * @post We extended the lease if we moved on to STATE_ACTIVE
843 * @post We are on STATE_UPDATING if we didn't receive accepts from the
7c673cae 844 * full quorum
31f18b77 845 * @post We have committed if we received accepts from a majority
7c673cae
FG
846 *
847 * @invariant The received message is an operation of type OP_ACCEPT
848 *
849 * @param op The message sent by the Peons to the Leader during the
850 * Paxos::handle_begin function
851 */
852 void handle_accept(MonOpRequestRef op);
853 /**
854 * Trigger a fresh election.
855 *
856 * During Paxos::begin we set a Callback of type Paxos::C_AcceptTimeout in
857 * order to limit the amount of time we spend waiting for Accept replies.
858 * This callback will call Paxos::accept_timeout when it is fired.
859 *
860 * This is essential to the algorithm because there may be the chance that
861 * we are no longer the Leader (i.e., others don't believe in us) and we
862 * are getting ignored, or we dropped out of the quorum and haven't realised
863 * it. So, our only option is to trigger fresh elections.
864 *
865 * @pre We are the Leader
866 * @pre We are on STATE_UPDATING
867 * @post Triggered fresh elections
868 */
869 void accept_timeout();
870 /**
871 * @}
872 */
873
874
875 utime_t commit_start_stamp;
876 friend struct C_Committed;
877
878 /**
879 * Commit a value throughout the system.
880 *
881 * The Leader will cancel the current lease (as it was for the old value),
882 * and will store the committed value locally. It will then instruct every
883 * quorum member to do so as well.
884 *
885 * @pre We are the Leader
886 * @pre We are on STATE_UPDATING
887 * @pre A majority of quorum members accepted our proposal
888 * @post Value locally stored
889 * @post Quorum members instructed to commit the new value.
890 */
891 void commit_start();
892 void commit_finish(); ///< finish a commit after txn becomes durable
35e4c445 893 void abort_commit(); ///< Handle commit finish after shutdown started
7c673cae
FG
894 /**
895 * Commit the new value to stable storage as being the latest available
896 * version.
897 *
898 * @pre We are a Peon
899 * @post The new value is locally stored
900 * @post Fire up the callbacks waiting on waiting_for_commit
901 *
902 * @invariant The received message is an operation of type OP_COMMIT
903 *
904 * @param op The message sent by the Leader to the Peon during
905 * Paxos::commit
906 */
907 void handle_commit(MonOpRequestRef op);
908 /**
909 * Extend the system's lease.
910 *
911 * This means that the Leader considers that it should now be safe to read from
912 * any node on the system, since every quorum member is now in possession of
913 * the latest version. Therefore, the Leader will send a message stating just
914 * this to each quorum member, and will impose a limited timeframe during
915 * which acks will be accepted. If there aren't as many acks as expected
916 * (i.e, if at least one quorum member does not ack the lease) during this
917 * timeframe, then we will force fresh elections.
918 *
919 * @pre We are the Leader
920 * @pre We are on STATE_ACTIVE
921 * @post A message extending the lease is sent to each quorum member
922 * @post A timeout callback is set to limit the amount of time we will wait
923 * for lease acks.
924 * @post A timer is set in order to renew the lease after a certain amount
925 * of time.
926 */
927 void extend_lease();
928 /**
929 * Update the lease on the Peon's side of things.
930 *
931 * Once a Peon receives a Lease message, it will update its lease_expire
932 * variable, reply to the Leader acknowledging the lease update and set a
933 * timeout callback to be fired upon the lease's expiration. Finally, the
934 * Peon will fire up all the callbacks waiting for it to become active,
935 * which it just did, and all those waiting for it to become readable,
936 * which should be true if the Peon's lease didn't expire in the mean time.
937 *
938 * @pre We are a Peon
939 * @post We update the lease accordingly
940 * @post A lease timeout callback is set
941 * @post Move to STATE_ACTIVE
942 * @post Fire up all the callbacks waiting for STATE_ACTIVE
31f18b77 943 * @post Fire up all the callbacks waiting for readable if we are readable
7c673cae
FG
944 * @post Ack the lease to the Leader
945 *
946 * @invariant The received message is an operation of type OP_LEASE
947 *
948 * @param op The request for the message sent by the Leader to the Peon during the
949 * Paxos::extend_lease function
950 */
951 void handle_lease(MonOpRequestRef op);
952 /**
953 * Account for all the Lease Acks the Leader receives from the Peons.
954 *
955 * Once the Leader receives all the Lease Acks from the Peons, it will be
956 * able to cancel the Lease Ack timeout callback, thus avoiding calling
957 * fresh elections.
958 *
959 * @pre We are the Leader
31f18b77 960 * @post Cancel the Lease Ack timeout callback if we receive acks from all
7c673cae
FG
961 * the quorum members
962 *
963 * @invariant The received message is an operation of type OP_LEASE_ACK
964 *
965 * @param op The request for the message sent by a Peon to the Leader during the
966 * Paxos::handle_lease function
967 */
968 void handle_lease_ack(MonOpRequestRef op);
969 /**
970 * Call fresh elections because at least one Peon didn't ack our lease.
971 *
972 * @pre We are the Leader
973 * @pre We are on STATE_ACTIVE
974 * @post Trigger fresh elections
975 */
976 void lease_ack_timeout();
977 /**
978 * Extend lease since we haven't had new committed values meanwhile.
979 *
980 * @pre We are the Leader
981 * @pre We are on STATE_ACTIVE
982 * @post Go through with Paxos::extend_lease
983 */
984 void lease_renew_timeout();
985 /**
986 * Call fresh elections because the Peon's lease expired without being
987 * renewed or receiving a fresh lease.
988 *
989 * This means that the Peon is no longer assumed as being in the quorum
990 * (or there is no Leader to speak of), so just trigger fresh elections
991 * to circumvent this issue.
992 *
993 * @pre We are a Peon
994 * @post Trigger fresh elections
995 */
996 void lease_timeout(); // on peon, if lease isn't extended
997
998 /// restart the lease timeout timer
999 void reset_lease_timeout();
1000
1001 /**
1002 * Cancel all of Paxos' timeout/renew events.
1003 */
1004 void cancel_events();
1005 /**
1006 * Shutdown this Paxos machine
1007 */
1008 void shutdown();
1009
1010 /**
1011 * Generate a new Proposal Number based on @p gt
1012 *
1013 * @todo Check what @p gt actually means and what its usage entails
1014 * @param gt A hint for the generation of the Proposal Number
1015 * @return A globally unique, monotonically increasing Proposal Number
1016 */
1017 version_t get_new_proposal_number(version_t gt=0);
1018
1019 /**
1020 * @todo document sync function
1021 */
1022 void warn_on_future_time(utime_t t, entity_name_t from);
1023
1024 /**
1025 * Begin proposing the pending_proposal.
1026 */
1027 void propose_pending();
1028
1029 /**
1030 * refresh state from store
1031 *
1032 * Called when we have new state for the mon to consume. If we return false,
1033 * abort (we triggered a bootstrap).
1034 *
1035 * @returns true on success, false if we are now bootstrapping
1036 */
1037 bool do_refresh();
1038
1039 void commit_proposal();
1040 void finish_round();
1041
1042public:
1043 /**
1044 * @param m A monitor
1045 * @param name A name for the paxos service. It serves as the naming space
1046 * of the underlying persistent storage for this service.
1047 */
1048 Paxos(Monitor *m, const string &name) // keeps the monitor pointer and the service name; everything else starts zeroed
1049 : mon(m),
1050 logger(NULL), // no logger is attached at construction
1051 paxos_name(name),
1052 state(STATE_RECOVERING), // a fresh instance starts in STATE_RECOVERING
1053 first_committed(0), // committed versions and proposal numbers all start at 0
1054 last_pn(0),
1055 last_committed(0),
1056 accepted_pn(0),
1057 accepted_pn_from(0),
1058 num_last(0),
1059 uncommitted_v(0), uncommitted_pn(0),
1060 collect_timeout_event(0), // all five *_event members start at 0
1061 lease_renew_event(0),
1062 lease_ack_timeout_event(0),
1063 lease_timeout_event(0),
1064 accept_timeout_event(0),
1065 clock_drift_warned(0),
1066 trimming(false) { } // the constructor body itself does no work
1067
1068 const string get_name() const { // returns (by value) the service name passed to the constructor
1069 return paxos_name;
1070 }
1071
1072 void dispatch(MonOpRequestRef op);
1073
1074 void read_and_prepare_transactions(MonitorDBStore::TransactionRef tx,
1075 version_t from, version_t last);
1076
1077 void init();
1078
1079 /**
1080 * dump state info to a formatter
1081 */
1082 void dump_info(Formatter *f);
1083
1084 /**
1085 * This function runs basic consistency checks. Importantly, if
1086 * it is inconsistent and shouldn't be, it asserts out.
1087 *
1088 * @return True if consistent, false if not.
1089 */
1090 bool is_consistent();
1091
1092 void restart();
1093 /**
1094 * Initiate the Leader after it wins an election.
1095 *
1096 * Once an election is won, the Leader will be initiated and there are two
1097 * possible outcomes of this method: the Leader directly jumps to the active
1098 * state (STATE_ACTIVE) if it believes to be the only one in the quorum, or
1099 * will start recovering (STATE_RECOVERING) by initiating the collect phase.
1100 *
1101 * @pre Our monitor is the Leader.
1102 * @post We are either on STATE_ACTIVE if we're the only one in the quorum,
1103 * or on STATE_RECOVERING otherwise.
1104 */
1105 void leader_init();
1106 /**
1107 * Initiate a Peon after it loses an election.
1108 *
1109 * If we are a Peon, then there must be a Leader and we are not alone in the
1110 * quorum, thus automatically assume we are on STATE_RECOVERING, which means
1111 * we will soon be enrolled into the Leader's collect phase.
1112 *
1113 * @pre There is a Leader, and it's about to start the collect phase.
1114 * @post We are on STATE_RECOVERING and will soon receive collect phase's
1115 * messages.
1116 */
1117 void peon_init();
1118
1119 /**
1120 * Include an incremental state of values, ranging from peer_first_committed
1121 * to the last committed value, on the message m
1122 *
1123 * @param m A message
1124 * @param peer_first_committed Lowest version to take into account
1125 * @param peer_last_committed Highest version to take into account
1126 */
1127 void share_state(MMonPaxos *m, version_t peer_first_committed,
1128 version_t peer_last_committed);
1129 /**
1130 * Store on disk a state that was shared with us
1131 *
1132 * Basically, we received a set of versions. Or just one. It doesn't matter.
1133 * What matters is that we have to stash it in the store. So, we will simply
1134 * write every single bufferlist into their own versions on our side (i.e.,
1135 * onto paxos-related keys), and then we will decode those same bufferlists
1136 * we just wrote and apply the transactions they hold. We will also update
1137 * our first and last committed values to point to the new values, if need
1138 * be. All this is done tightly wrapped in a transaction to ensure we
1139 * enjoy the atomicity guarantees given by our awesome k/v store.
1140 *
1141 * @param m A message
1142 * @returns true if we stored something new; false otherwise
1143 */
1144 bool store_state(MMonPaxos *m);
1145 void _sanity_check_store();
1146
1147 /**
1148 * Helper function to decode a bufferlist into a transaction and append it
1149 * to another transaction.
1150 *
1151 * This function is used during the Leader's commit and during the
1152 * Paxos::store_state in order to apply the bufferlist's transaction onto
1153 * the store.
1154 *
1155 * @param t The transaction to which we will append the operations
1156 * @param bl A bufferlist containing an encoded transaction
1157 */
1158 static void decode_append_transaction(MonitorDBStore::TransactionRef t,
1159 bufferlist& bl) {
1160 auto vt(std::make_shared<MonitorDBStore::Transaction>()); // scratch transaction to decode into
1161 bufferlist::iterator it = bl.begin(); // decoding starts at the beginning of bl
1162 vt->decode(it);
1163 t->append(vt); // the decoded transaction is appended to t
1164 }
1165
1166 /**
1167 * @todo This appears to be used only by the OSDMonitor, and I would say
1168 * its objective is to allow a third-party to have a "private"
1169 * state dir. -JL
1170 */
1171 void add_extra_state_dir(string s) { // s is taken by value and appended to extra_state_dirs
1172 extra_state_dirs.push_back(s);
1173 }
1174
1175 // -- service interface --
1176 /**
1177 * Add c to the list of callbacks waiting for us to become active.
1178 *
1179 * @param c A callback
1180 */
1181 void wait_for_active(MonOpRequestRef op, Context *c) { // op is optional; c is always queued
1182 if (op) // only mark the event when an op was actually supplied
1183 op->mark_event("paxos:wait_for_active");
1184 waiting_for_active.push_back(c);
1185 }
1186 void wait_for_active(Context *c) { // convenience overload for callers that have no op
1187 MonOpRequestRef o; // default-constructed op handed to the two-argument overload
1188 wait_for_active(o, c);
1189 }
1190
1191 /**
1192 * Trim the Paxos state as much as we can.
1193 */
1194 void trim();
1195
1196 /**
1197 * Check if we should trim.
1198 *
1199 * If trimming is disabled, we must take that into consideration and only
1200 * return true if we are positively sure that we should trim soon.
1201 *
1202 * @returns true if we should trim; false otherwise.
1203 */
1204 bool should_trim() {
1205 int available_versions = get_version() - get_first_committed(); // last_committed minus first_committed
1206 int maximum_versions = g_conf->paxos_min + g_conf->paxos_trim_min; // threshold taken from two config values
1207
1208 if (trimming || (available_versions <= maximum_versions)) // false when the trimming flag is set or we are at/below the threshold
1209 return false;
1210
1211 return true;
1212 }
31f18b77
FG
1213
1214 bool is_plugged() const { // reports the current value of the plugged flag
1215 return plugged;
1216 }
1217 void plug() { // sets the plugged flag
1218 assert(plugged == false); // plugging twice in a row is treated as a programming error
1219 plugged = true;
1220 }
1221 void unplug() { // clears the plugged flag
1222 assert(plugged == true); // must currently be plugged
1223 plugged = false;
1224 }
1225
7c673cae
FG
1226 // read
1227 /**
1228 * @defgroup Paxos_h_read_funcs Read-related functions
1229 * @{
1230 */
1231 /**
1232 * Get latest committed version
1233 *
1234 * @return latest committed version
1235 */
1236 version_t get_version() { return last_committed; } // NOTE(review): only reads a member but is not declared const
1237 /**
1238 * Get first committed version
1239 *
1240 * @return the first committed version
1241 */
1242 version_t get_first_committed() { return first_committed; } // NOTE(review): only reads a member but is not declared const
7c673cae
FG
1243 /**
1244 * Check if a given version is readable.
1245 *
1246 * A version may not be readable for a myriad of reasons:
1247 * @li the version @e seen is higher than the last committed version
1248 * @li we are not the Leader nor a Peon (election may be on-going)
1249 * @li we do not have a committed value yet
1250 * @li we do not have a valid lease
1251 *
1252 * @param seen The version we want to check if it is readable.
1253 * @return 'true' if the version is readable; 'false' otherwise.
1254 */
1255 bool is_readable(version_t seen=0);
1256 /**
1257 * Read version @e v and store its value in @e bl
1258 *
1259 * @param[in] v The version we want to read
1260 * @param[out] bl The version's value
1261 * @return 'true' if we successfully read the value; 'false' otherwise
1262 */
1263 bool read(version_t v, bufferlist &bl);
1264 /**
1265 * Read the latest committed version
1266 *
1267 * @param[out] bl The version's value
1268 * @return the latest committed version if we successfully read the value;
1269 * or 0 (zero) otherwise.
1270 */
1271 version_t read_current(bufferlist &bl);
1272 /**
1273 * Add onreadable to the list of callbacks waiting for us to become readable.
1274 *
1275 * @param onreadable A callback
1276 */
1277 void wait_for_readable(MonOpRequestRef op, Context *onreadable) { // op is optional; onreadable is always queued
1278 assert(!is_readable()); // callers may only wait while is_readable() is false
1279 if (op) // only mark the event when an op was actually supplied
1280 op->mark_event("paxos:wait_for_readable");
1281 waiting_for_readable.push_back(onreadable);
1282 }
1283 void wait_for_readable(Context *onreadable) { // convenience overload for callers that have no op
1284 MonOpRequestRef o; // default-constructed op handed to the two-argument overload
1285 wait_for_readable(o, onreadable);
1286 }
1287 /**
1288 * @}
1289 */
1290
1291 /**
1292 * Check if we have a valid lease.
1293 *
1294 * @returns true if the lease is still valid; false otherwise.
1295 */
1296 bool is_lease_valid();
1297 // write
1298 /**
1299 * @defgroup Paxos_h_write_funcs Write-related functions
1300 * @{
1301 */
1302 /**
1303 * Check if we are writeable.
1304 *
1305 * We are writeable if we are alone (i.e., a quorum of one), or if we match
1306 * all the following conditions:
1307 * @li We are the Leader
1308 * @li We are on STATE_ACTIVE
1309 * @li We have a valid lease
1310 *
1311 * @return 'true' if we are writeable; 'false' otherwise.
1312 */
1313 bool is_writeable();
1314 /**
1315 * Add c to the list of callbacks waiting for us to become writeable.
1316 *
1317 * @param c A callback
1318 */
1319 void wait_for_writeable(MonOpRequestRef op, Context *c) { // op is optional; c is always queued
1320 assert(!is_writeable()); // callers may only wait while is_writeable() is false
1321 if (op) // only mark the event when an op was actually supplied
1322 op->mark_event("paxos:wait_for_writeable");
1323 waiting_for_writeable.push_back(c);
1324 }
1325 void wait_for_writeable(Context *c) { // convenience overload for callers that have no op
1326 MonOpRequestRef o; // default-constructed op handed to the two-argument overload
1327 wait_for_writeable(o, c);
1328 }
1329
1330 /**
1331 * Get a transaction to submit operations to propose against
1332 *
1333 * Apply operations to this transaction. It will eventually be proposed
1334 * to paxos.
1335 */
1336 MonitorDBStore::TransactionRef get_pending_transaction();
1337
1338 /**
1339 * Queue a completion for the pending proposal
1340 *
1341 * This completion will get triggered when the pending proposal
1342 * transaction commits.
1343 */
1344 void queue_pending_finisher(Context *onfinished);
1345
1346 /**
1347 * (try to) trigger a proposal
1348 *
1349 * Tell paxos that it should submit the pending proposal. Note that if it
1350 * is not active (e.g., because it is already in the midst of committing
1351 * something) that will be deferred (e.g., until the current round finishes).
1352 */
1353 bool trigger_propose();
1354
1355 /**
1356 * Add oncommit to the back of the list of callbacks waiting for us to
1357 * finish committing.
1358 *
1359 * @param oncommit A callback
1360 */
1361 void wait_for_commit(Context *oncommit) {
1362 waiting_for_commit.push_back(oncommit); // appended after any callbacks already queued
1363 }
1364 /**
1365 * Add oncommit to the front of the list of callbacks waiting for us to
1366 * finish committing.
1367 *
1368 * @param oncommit A callback
1369 */
1370 void wait_for_commit_front(Context *oncommit) {
1371 waiting_for_commit.push_front(oncommit); // inserted ahead of any callbacks already queued
1372 }
1373 /**
1374 * @}
1375 */
1376
1377 /**
1378 * @}
1379 */
1380 protected:
1381 MonitorDBStore *get_store();
1382};
1383
1384 inline ostream& operator<<(ostream& out, Paxos::C_Proposal& p) // prints a proposal's status, queue age and a JSON dump of its transaction
1385 {
1386 string proposed = (p.proposed ? "proposed" : "unproposed"); // label chosen from the p.proposed flag
1387 out << " " << proposed
1388 << " queued " << (ceph_clock_now() - p.proposal_time) // time elapsed since p.proposal_time
1389 << " tx dump:\n";
1390 auto t(std::make_shared<MonitorDBStore::Transaction>()); // scratch transaction used only for dumping
1391 bufferlist::iterator p_it = p.bl.begin(); // decode the encoded transaction held in p.bl from its start
1392 t->decode(p_it);
1393 JSONFormatter f(true); // NOTE(review): `true` presumably enables pretty-printing - confirm
1394 t->dump(&f);
1395 f.flush(out); // write the formatted dump to the stream
1396 return out;
1397 }
1398
1399#endif
1400