1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
18 cccccccccccccccccca????????????????????????????????????????
19 cccccccccccccccccca????????????????????????????????????????
20 cccccccccccccccccca???????????????????????????????????????? leader
21 cccccccccccccccccc?????????????????????????????????????????
22 ccccc??????????????????????????????????????????????????????
37 * Paxos storage layout and behavior
39 * Currently, we use a key/value store to hold all the Paxos-related data, but
40 * it can logically be depicted as this:
43 * first_committed -> 1
50 * Since we are relying on a k/v store supporting atomic transactions, we can
51 * guarantee that if 'last_committed' has a value of '4', then we have up to
52 * version 4 on the store, and no more than that; the same applies to
53 * 'first_committed', which, holding '1', strictly means that our lowest
56 * Each version's value (value_1, value_2, ..., value_n) is a blob of data,
57 * incomprehensible to the Paxos. These values are proposed to the Paxos on
58 * propose_new_value() and each one is a transaction encoded in a bufferlist.
60 * The Paxos will write the value to disk, associating it with its version,
61 * but will take a step further: the value shall be decoded, and the operations
62 * on that transaction shall be applied during the same transaction that will
63 * write the value's encoded bufferlist to disk. This behavior ensures that
64 * whatever is being proposed will only be available on the store when it is
65 * applied by Paxos, which will then be aware of such new values, guaranteeing
66 * the store state is always consistent without requiring shady workarounds.
68 * So, let's say that FooMonitor proposes the following transaction, neatly
69 * encoded on a bufferlist of course:
72 * put(foo, last_committed, 3)
73 * put(foo, 3, foo_value_3)
76 * put(foo, first_committed, 3)
78 * And knowing that the Paxos is proposed Tx_Foo as a bufferlist, once it is
79 * ready to commit, and assuming we are now committing version 5 of the Paxos,
80 * we will do something along the lines of:
83 * proposed_tx.decode(Tx_foo_bufferlist);
86 * our_tx.put(paxos, last_committed, 5);
87 * our_tx.put(paxos, 5, Tx_foo_bufferlist);
88 * our_tx.append(proposed_tx);
90 * store_apply(our_tx);
92 * And the store should look like this after we apply 'our_tx':
95 * first_committed -> 1
101 * 5 -> Tx_foo_bufferlist
103 * first_committed -> 3
104 * last_committed -> 3
109 #ifndef CEPH_MON_PAXOS_H
110 #define CEPH_MON_PAXOS_H
112 #include "include/types.h"
113 #include "mon_types.h"
114 #include "include/buffer.h"
115 #include "msg/msg_types.h"
116 #include "include/Context.h"
117 #include "common/perf_counters.h"
120 #include "MonitorDBStore.h"
121 #include "mon/MonOpRequest.h"
127 l_paxos_first
= 45800,
128 l_paxos_start_leader
,
132 l_paxos_refresh_latency
,
136 l_paxos_begin_latency
,
139 l_paxos_commit_bytes
,
140 l_paxos_commit_latency
,
142 l_paxos_collect_keys
,
143 l_paxos_collect_bytes
,
144 l_paxos_collect_latency
,
145 l_paxos_collect_uncommitted
,
146 l_paxos_collect_timeout
,
147 l_paxos_accept_timeout
,
148 l_paxos_lease_ack_timeout
,
149 l_paxos_lease_timeout
,
151 l_paxos_store_state_keys
,
152 l_paxos_store_state_bytes
,
153 l_paxos_store_state_latency
,
155 l_paxos_share_state_keys
,
156 l_paxos_share_state_bytes
,
158 l_paxos_new_pn_latency
,
163 // i am one state machine.
165 * This library is based on the Paxos algorithm, but varies in a few key ways:
166 * 1- Only a single new value is generated at a time, simplifying the recovery logic.
167 * 2- Nodes track "committed" values, and share them generously (and trustingly)
168 * 3- A 'leasing' mechanism is built-in, allowing nodes to determine when it is
169 * safe to "read" their copy of the last committed value.
171 * This provides a simple replication substrate that services can be built on top of.
176 * @defgroup Paxos_h_class Paxos
180 * The Monitor with which this Paxos class is associated.
184 /// perf counter for internal instrumentations
185 PerfCounters
*logger
;
189 // my state machine info
190 const string paxos_name
;
192 friend class Monitor
;
193 friend class PaxosService
;
195 list
<std::string
> extra_state_dirs
;
199 // -- generic state --
202 * @defgroup Paxos_h_states States on which the leader/peon may be.
207 * Leader/Peon is in Paxos' Recovery state
211 * Leader/Peon is idle, and the Peon may or may not have a valid lease.
215 * Leader/Peon is updating to a new value.
219 * Leader proposing an old value
221 STATE_UPDATING_PREVIOUS
,
223 * Leader/Peon is writing a new commit. readable, but not
228 * Leader/Peon is writing a new commit from a previous round.
230 STATE_WRITING_PREVIOUS
,
231 // leader: refresh following a commit
233 // Shutdown after WRITING or WRITING_PREVIOUS
238 * Obtain state name from constant value.
240 * @note This function will raise a fatal error if @p s is not
241 * a valid state value.
243 * @param s State value.
244 * @return The state's name.
246 static const string
get_statename(int s
) {
248 case STATE_RECOVERING
:
254 case STATE_UPDATING_PREVIOUS
:
255 return "updating-previous";
258 case STATE_WRITING_PREVIOUS
:
259 return "writing-previous";
271 * The state we are in.
277 int commits_started
= 0;
283 * Check if we are recovering.
285 * @return 'true' if we are on the Recovering state; 'false' otherwise.
// Predicate on the current state: true only while state == STATE_RECOVERING.
287 bool is_recovering() const { return (state
== STATE_RECOVERING
); }
289 * Check if we are active.
291 * @return 'true' if we are on the Active state; 'false' otherwise.
// Predicate on the current state: true only while state == STATE_ACTIVE.
293 bool is_active() const { return state
== STATE_ACTIVE
; }
295 * Check if we are updating.
297 * @return 'true' if we are on the Updating state; 'false' otherwise.
// Predicate on the current state: true only while state == STATE_UPDATING.
299 bool is_updating() const { return state
== STATE_UPDATING
; }
302 * Check if we are updating/proposing a previous value from a
// Predicate on the current state: true only while
// state == STATE_UPDATING_PREVIOUS.
305 bool is_updating_previous() const { return state
== STATE_UPDATING_PREVIOUS
; }
307 /// @return 'true' if we are writing an update to disk (state == STATE_WRITING)
308 bool is_writing() const { return state
== STATE_WRITING
; }
310 /// @return 'true' if we are writing an update-previous to disk (state == STATE_WRITING_PREVIOUS)
311 bool is_writing_previous() const { return state
== STATE_WRITING_PREVIOUS
; }
313 /// @return 'true' if we are refreshing an update just committed (state == STATE_REFRESH)
314 bool is_refresh() const { return state
== STATE_REFRESH
; }
316 /// @return 'true' if we are in the process of shutting down (state == STATE_SHUTDOWN)
317 bool is_shutdown() const { return state
== STATE_SHUTDOWN
; }
321 * @defgroup Paxos_h_recovery_vars Common recovery-related member variables
322 * @note These variables are common to both the Leader and the Peons.
328 version_t first_committed
;
330 * Last Proposal Number
332 * @todo Expand description
336 * Last committed value's version.
338 * On both the Leader and the Peons, this is the last value's version that
339 * was accepted by a given quorum and thus committed, that this instance
342 * @note It may not be the last committed value's version throughout the
343 * system. If we are a Peon, we may have not been part of the quorum
344 * that accepted the value, and for this very same reason we may still
345 * be a (couple of) version(s) behind, until we learn about the most
346 * recent version. This should only happen if we are not active (i.e.,
347 * part of the quorum), which should not happen if we are up, running
348 * and able to communicate with others -- thus able to be part of the
349 * monmap and trigger new elections.
351 version_t last_committed
;
353 * Last committed value's time.
355 * When the commit finished.
357 utime_t last_commit_time
;
359 * The last Proposal Number we have accepted.
361 * On the Leader, it will be the Proposal Number picked by the Leader
362 * itself. On the Peon, however, it will be the proposal sent by the Leader
363 * and it will only be updated if its value is higher than the one
364 * already known by the Peon.
366 version_t accepted_pn
;
368 * The last_committed epoch of the leader at the time we accepted the last pn.
370 * This has NO SEMANTIC MEANING, and is there only for the debug output.
372 version_t accepted_pn_from
;
374 * Map holding the first committed version by each quorum member.
376 * The versions kept in this map are updated during the collect phase.
377 * When the Leader starts the collect phase, each Peon will reply with its
378 * first committed version, which will then be kept in this map.
380 map
<int,version_t
> peer_first_committed
;
382 * Map holding the last committed version by each quorum member.
384 * The versions kept in this map are updated during the collect phase.
385 * When the Leader starts the collect phase, each Peon will reply with its
386 * last committed version, which will then be kept in this map.
388 map
<int,version_t
> peer_last_committed
;
395 * @defgroup Paxos_h_active_vars Common active-related member variables
399 * When our read lease expires.
401 * Instead of performing a full commit each time a read is requested, we
402 * keep leases. Each lease will have an expiration date, which may or may
405 utime_t lease_expire
;
407 * List of callbacks waiting for our state to change into STATE_ACTIVE.
409 list
<Context
*> waiting_for_active
;
411 * List of callbacks waiting for the chance to read a version from us.
413 * Each entry on the list may result from an attempt to read a version that
414 * wasn't available at the time, or an attempt made during a period during
415 * which we could not satisfy the read request. The first case happens if
416 * the requested version is greater than our last committed version. The
417 * second scenario may happen if we are recovering, or if we don't have a
420 * The list will be woken up once we change to STATE_ACTIVE with an extended
421 * lease -- which can be achieved if we have everyone on the quorum on board
422 * with the latest proposal, or if we don't really care about the remaining
423 * uncommitted values --, or if we're on a quorum of one.
425 list
<Context
*> waiting_for_readable
;
431 // recovery (paxos phase 1)
433 * @defgroup Paxos_h_leader_recovery Leader-specific Recovery-related vars
437 * Number of replies to the collect phase we've received so far.
439 * This variable is reset to 1 each time we start a collect phase; it is
440 * incremented each time we receive a reply to the collect message, and
441 * is used to determine whether or not we have received replies from the
446 * Uncommitted value's version.
448 * If we have, or end up knowing about, an uncommitted value, then its
449 * version will be kept in this variable.
451 * @note If this version equals @p last_committed+1 when we reach the final
452 * steps of recovery, then the algorithm will assume this is a value
453 * the Leader does not know about, and trustingly the Leader will
454 * propose this version's value.
456 version_t uncommitted_v
;
458 * Uncommitted value's Proposal Number.
460 * We use this variable to assess if the Leader should take into consideration
461 * an uncommitted value sent by a Peon. Given that the Peon will send back to
462 * the Leader the last Proposal Number it accepted, the Leader will be able
463 * to infer if this value is more recent than the one the Leader has, thus
466 version_t uncommitted_pn
;
470 * If the system fails in-between the accept replies from the Peons and the
471 * instruction to commit from the Leader, then we may end up with accepted
472 * but yet-uncommitted values. During the Leader's recovery, it will attempt
473 * to bring the whole system to the latest state, and that means committing
474 * past accepted but uncommitted values.
476 * This variable will hold an uncommitted value, which may originate either
477 * on the Leader, or learnt by the Leader from a Peon during the collect
480 bufferlist uncommitted_value
;
482 * Used to specify when an on-going collect phase times out.
484 Context
*collect_timeout_event
;
491 * @defgroup Paxos_h_leader_active Leader-specific Active-related vars
495 * Set of participants (Leader & Peons) that have acked a lease extension.
497 * Each Peon that acknowledges a lease extension will have its place in this
498 * set, which will be used to account for all the acks from all the quorum
499 * members, guaranteeing that we trigger new elections if some don't ack in
500 * the expected timeframe.
502 set
<int> acked_lease
;
504 * Callback responsible for extending the lease periodically.
506 Context
*lease_renew_event
;
508 * Callback to trigger new elections once the time for acks is out.
510 Context
*lease_ack_timeout_event
;
515 * @defgroup Paxos_h_peon_active Peon-specific Active-related vars
519 * Callback to trigger new elections when the Peon's lease times out.
521 * If the Peon's lease is extended, this callback will be reset (i.e.,
522 * we cancel the event and reschedule a new one starting from the
525 Context
*lease_timeout_event
;
530 // updating (paxos phase 2)
532 * @defgroup Paxos_h_leader_updating Leader-specific Updating-related vars
536 * New Value being proposed to the Peons.
538 * This bufferlist holds the value the Leader is proposing to the Peons, and
539 * that will be committed if the Peons do accept the proposal.
541 bufferlist new_value
;
543 * Set of participants (Leader & Peons) that accepted the new proposed value.
545 * This set is used to keep track of those who have accepted the proposed
546 * value, so the leader may know when to issue a commit (when a majority of
547 * participants has accepted the proposal), and when to extend the lease
548 * (when all the quorum members have accepted the proposal).
552 * Callback to trigger a new election if the proposal is not accepted by the
553 * full quorum within a given timeframe.
555 * If the full quorum does not accept the proposal, then it means that the
556 * Leader may no longer be recognized as the leader, or that the quorum has
557 * changed, and the value may have not reached all the participants. Thus,
558 * the leader must call new elections, and go through a recovery phase in
559 * order to propagate the new value throughout the system.
561 * This does not mean that we won't commit. We will commit as soon as we
562 * have a majority of acceptances. But if we do not have full acceptance
563 * from the quorum, then we cannot extend the lease, as some participants
564 * may not have the latest committed value.
566 Context
*accept_timeout_event
;
569 * List of callbacks waiting for it to be possible to write again.
571 * @remarks It is not possible to write if we are not the Leader, or we are
572 * not on the active state, or if the lease has expired.
574 list
<Context
*> waiting_for_writeable
;
577 * Pending proposal transaction
579 * This is the transaction that is under construction and pending
580 * proposal. We will add operations to it until we decide it is
581 * time to start a paxos round.
583 MonitorDBStore::TransactionRef pending_proposal
;
586 * Finishers for pending transaction
588 * These are waiting for updates in the pending proposal/transaction
591 list
<Context
*> pending_finishers
;
594 * Finishers for committing transaction
596 * When the pending_proposal is submitted, pending_finishers move to
597 * this list. When it commits, these finishers are notified.
599 list
<Context
*> committing_finishers
;
602 * @defgroup Paxos_h_sync_warns Synchronization warnings
603 * @todo Describe these variables
606 utime_t last_clock_drift_warn
;
607 int clock_drift_warned
;
613 * Should be true if we have proposed to trim, or are in the middle of
614 * trimming; false otherwise.
619 * true if we want trigger_propose to *not* propose (yet)
621 bool plugged
= false;
624 * @defgroup Paxos_h_callbacks Callback classes.
628 * Callback class responsible for handling a Collect Timeout.
630 class C_CollectTimeout
;
632 * Callback class responsible for handling an Accept Timeout.
634 class C_AcceptTimeout
;
636 * Callback class responsible for handling a Lease Ack Timeout.
638 class C_LeaseAckTimeout
;
641 * Callback class responsible for handling a Lease Timeout.
643 class C_LeaseTimeout
;
646 * Callback class responsible for handling a Lease Renew Timeout.
655 class C_Proposal
: public Context
{
656 Context
*proposer_context
;
659 // for debug purposes. Will go away. Soon.
661 utime_t proposal_time
;
663 C_Proposal(Context
*c
, bufferlist
& proposal_bl
) :
667 proposal_time(ceph_clock_now())
670 void finish(int r
) override
{
671 if (proposer_context
) {
672 proposer_context
->complete(r
);
673 proposer_context
= NULL
;
682 * @defgroup Paxos_h_election_triggered Steps triggered by an election.
684 * @note All these functions play a significant role in the Recovery Phase,
685 * which is triggered right after an election once someone becomes
690 * Create a new Proposal Number and propose it to the Peons.
692 * This function starts the Recovery Phase, which can be directly mapped
693 * onto the original Paxos' Prepare phase. Basically, we'll generate a
694 * Proposal Number, taking @p oldpn into consideration, and we will send
695 * it to a quorum, along with our first and last committed versions. By
696 * sending this information in a message to the quorum, we expect to
697 * obtain acceptances from a majority, allowing us to commit, or be
698 * informed of a higher Proposal Number known by one or more of the Peons
701 * @pre We are the Leader.
702 * @post Recovery Phase initiated by sending messages to the quorum.
704 * @param oldpn A proposal number taken as the highest known so far, that
705 * should be taken into consideration when generating a new
706 * Proposal Number for the Recovery Phase.
708 void collect(version_t oldpn
);
710 * Handle the reception of a collect message from the Leader and reply
713 * Once a Peon receives a collect message from the Leader it will reply
714 * with its first and last committed versions, as well as information so
715 * the Leader may know if its Proposal Number was, or was not, accepted by
716 * the Peon. The Peon will accept the Leader's Proposal Number if it is
717 * higher than the Peon's currently accepted Proposal Number. The Peon may
718 * also inform the Leader of accepted but uncommitted values.
720 * @invariant The message is an operation of type OP_COLLECT.
721 * @pre We are a Peon.
722 * @post Replied to the Leader, accepting or not accepting its PN.
724 * @param op The collect message sent by the Leader to the Peon.
726 void handle_collect(MonOpRequestRef op
);
728 * Handle a response from a Peon to the Leader's collect phase.
730 * The received message will state the Peon's last committed version, as
731 * well as its last proposal number. This will lead to one of the following
732 * scenarios: if the replied Proposal Number is equal to the one we proposed,
733 * then the Peon has accepted our proposal, and if all the Peons do accept
734 * our Proposal Number, then we are allowed to proceed with the commit;
735 * however, if a Peon replies with a higher Proposal Number, we assume he
736 * knows something we don't and the Leader will have to abort the current
737 * proposal in order to retry with the Proposal Number specified by the Peon.
738 * It may also occur that the Peon replied with a lower Proposal Number, in
739 * which case we assume it is a reply to an older value and we'll simply
741 * This function will also check if the Peon replied with an accepted but
742 * yet uncommitted value. In this case, if its version is higher than our
743 * last committed value by one, we assume that the Peon knows a value from a
744 * previous proposal that has never been committed, and we should try to
745 * commit that value by proposing it next. On the other hand, if that is
746 * not the case, we'll assume it is an old, uncommitted value, we do not
747 * care about and we'll consider the system active by extending the leases.
749 * @invariant The message is an operation of type OP_LAST.
750 * @pre We are the Leader.
751 * @post We initiate a commit, or we retry with a higher Proposal Number,
752 * or we drop the message.
753 * @post We move from STATE_RECOVERING to STATE_ACTIVE.
755 * @param op The message sent by the Peon to the Leader.
757 void handle_last(MonOpRequestRef op
);
759 * The Recovery Phase timed out, meaning that a significant part of the
760 * quorum does not believe we are the Leader, and we thus should trigger new
763 * @pre We believe to be the Leader.
764 * @post Trigger new elections.
766 void collect_timeout();
772 * @defgroup Paxos_h_updating_funcs Functions used during the Updating State
774 * These functions may easily be mapped to the original Paxos Algorithm's
777 * Taking into account the algorithm can be divided in 4 phases (Prepare,
778 * Promise, Accept Request and Accepted), we can easily map Paxos::begin to
779 * both the Prepare and Accept Request phases; the Paxos::handle_begin to
780 * the Promise phase; and the Paxos::handle_accept to the Accepted phase.
784 * Start a new proposal with the intent of committing @p value.
786 * If we are alone on the system (i.e., a quorum of one), then we will
787 * simply commit the value, but if we are not alone, then we need to propose
788 * the value to the quorum.
790 * @pre We are the Leader
791 * @pre We are on STATE_ACTIVE
792 * @post We commit, if we are alone, or we send a message to each quorum
794 * @post We are on STATE_ACTIVE, if we are alone, or on
795 * STATE_UPDATING otherwise
797 * @param value The value being proposed to the quorum
799 void begin(bufferlist
& value
);
801 * Accept or decline (by ignoring) a proposal from the Leader.
803 * We will decline the proposal (by ignoring it) if we have promised to
804 * accept a higher numbered proposal. If that is not the case, we will
805 * accept it and accordingly reply to the Leader.
808 * @pre We are on STATE_ACTIVE
809 * @post We are on STATE_UPDATING if we accept the Leader's proposal
810 * @post We send a reply message to the Leader if we accept its proposal
812 * @invariant The received message is an operation of type OP_BEGIN
814 * @param op The message sent by the Leader to the Peon during the
815 * Paxos::begin function
818 void handle_begin(MonOpRequestRef op
);
820 * Handle an Accept message sent by a Peon.
822 * In order to commit, the Leader has to receive accepts from a majority of
823 * the quorum. If that does happen, then the Leader may proceed with the
824 * commit. However, the Leader needs the accepts from all the quorum members
825 * in order to extend the lease and move on to STATE_ACTIVE.
827 * This function handles these two situations, accounting for the amount of
830 * @pre We are the Leader
831 * @pre We are on STATE_UPDATING
832 * @post We are on STATE_ACTIVE if we received accepts from the full quorum
833 * @post We extended the lease if we moved on to STATE_ACTIVE
834 * @post We are on STATE_UPDATING if we didn't receive accepts from the
836 * @post We have committed if we received accepts from a majority
838 * @invariant The received message is an operation of type OP_ACCEPT
840 * @param op The message sent by the Peons to the Leader during the
841 * Paxos::handle_begin function
843 void handle_accept(MonOpRequestRef op
);
845 * Trigger a fresh election.
847 * During Paxos::begin we set a Callback of type Paxos::C_AcceptTimeout in
848 * order to limit the amount of time we spend waiting for Accept replies.
849 * This callback will call Paxos::accept_timeout when it is fired.
851 * This is essential to the algorithm because there may be the chance that
852 * we are no longer the Leader (i.e., others don't believe in us) and we
853 * are getting ignored, or we dropped out of the quorum and haven't realised
854 * it. So, our only option is to trigger fresh elections.
856 * @pre We are the Leader
857 * @pre We are on STATE_UPDATING
858 * @post Triggered fresh elections
860 void accept_timeout();
866 utime_t commit_start_stamp
;
867 friend struct C_Committed
;
870 * Commit a value throughout the system.
872 * The Leader will cancel the current lease (as it was for the old value),
873 * and will store the committed value locally. It will then instruct every
874 * quorum member to do so as well.
876 * @pre We are the Leader
877 * @pre We are on STATE_UPDATING
878 * @pre A majority of quorum members accepted our proposal
879 * @post Value locally stored
880 * @post Quorum members instructed to commit the new value.
883 void commit_finish(); ///< finish a commit after txn becomes durable
884 void abort_commit(); ///< Handle commit finish after shutdown started
886 * Commit the new value to stable storage as being the latest available
890 * @post The new value is locally stored
891 * @post Fire up the callbacks waiting on waiting_for_commit
893 * @invariant The received message is an operation of type OP_COMMIT
895 * @param op The message sent by the Leader to the Peon during
898 void handle_commit(MonOpRequestRef op
);
900 * Extend the system's lease.
902 * This means that the Leader considers that it should now be safe to read from
903 * any node on the system, since every quorum member is now in possession of
904 * the latest version. Therefore, the Leader will send a message stating just
905 * this to each quorum member, and will impose a limited timeframe during
906 * which acks will be accepted. If there aren't as many acks as expected
907 * (i.e., if at least one quorum member does not ack the lease) during this
908 * timeframe, then we will force fresh elections.
910 * @pre We are the Leader
911 * @pre We are on STATE_ACTIVE
912 * @post A message extending the lease is sent to each quorum member
913 * @post A timeout callback is set to limit the amount of time we will wait
915 * @post A timer is set in order to renew the lease after a certain amount
920 * Update the lease on the Peon's side of things.
922 * Once a Peon receives a Lease message, it will update its lease_expire
923 * variable, reply to the Leader acknowledging the lease update and set a
924 * timeout callback to be fired upon the lease's expiration. Finally, the
925 * Peon will fire up all the callbacks waiting for it to become active,
926 * which it just did, and all those waiting for it to become readable,
927 * which should be true if the Peon's lease didn't expire in the mean time.
930 * @post We update the lease accordingly
931 * @post A lease timeout callback is set
932 * @post Move to STATE_ACTIVE
933 * @post Fire up all the callbacks waiting for STATE_ACTIVE
934 * @post Fire up all the callbacks waiting for readable if we are readable
935 * @post Ack the lease to the Leader
937 * @invariant The received message is an operation of type OP_LEASE
939 * @param op The message sent by the Leader to the Peon during the
940 * Paxos::extend_lease function
942 void handle_lease(MonOpRequestRef op
);
944 * Account for all the Lease Acks the Leader receives from the Peons.
946 * Once the Leader receives all the Lease Acks from the Peons, it will be
947 * able to cancel the Lease Ack timeout callback, thus avoiding calling
950 * @pre We are the Leader
951 * @post Cancel the Lease Ack timeout callback if we receive acks from all
954 * @invariant The received message is an operation of type OP_LEASE_ACK
956 * @param op The message sent by a Peon to the Leader during the
957 * Paxos::handle_lease function
959 void handle_lease_ack(MonOpRequestRef op
);
961 * Call fresh elections because at least one Peon didn't ack our lease.
963 * @pre We are the Leader
964 * @pre We are on STATE_ACTIVE
965 * @post Trigger fresh elections
967 void lease_ack_timeout();
969 * Extend lease since we haven't had new committed values meanwhile.
971 * @pre We are the Leader
972 * @pre We are on STATE_ACTIVE
973 * @post Go through with Paxos::extend_lease
975 void lease_renew_timeout();
977 * Call fresh elections because the Peon's lease expired without being
978 * renewed or receiving a fresh lease.
980 * This means that the Peon is no longer assumed as being in the quorum
981 * (or there is no Leader to speak of), so just trigger fresh elections
982 * to circumvent this issue.
985 * @post Trigger fresh elections
987 void lease_timeout(); // on peon, if lease isn't extended
989 /// restart the lease timeout timer
990 void reset_lease_timeout();
993 * Cancel all of Paxos' timeout/renew events.
995 void cancel_events();
997 * Shutdown this Paxos machine
1002 * Generate a new Proposal Number based on @p gt
1004 * @todo Check what @p gt actually means and what its usage entails
1005 * @param gt A hint for the generation of the Proposal Number
1006 * @return A globally unique, monotonically increasing Proposal Number
1008 version_t
get_new_proposal_number(version_t gt
=0);
1011 * @todo document sync function
1013 void warn_on_future_time(utime_t t
, entity_name_t from
);
1016 * Begin proposing the pending_proposal.
1018 void propose_pending();
1021 * refresh state from store
1023 * Called when we have new state for the mon to consume. If we return false,
1024 * abort (we triggered a bootstrap).
1026 * @returns true on success, false if we are now bootstrapping
1030 void commit_proposal();
1031 void finish_round();
1035 * @param m A monitor
1036 * @param name A name for the paxos service. It serves as the naming space
1037 * of the underlying persistent storage for this service.
1039 Paxos(Monitor
*m
, const string
&name
)
1043 state(STATE_RECOVERING
),
1048 accepted_pn_from(0),
1050 uncommitted_v(0), uncommitted_pn(0),
1051 collect_timeout_event(0),
1052 lease_renew_event(0),
1053 lease_ack_timeout_event(0),
1054 lease_timeout_event(0),
1055 accept_timeout_event(0),
1056 clock_drift_warned(0),
1059 const string
get_name() const {
1063 void dispatch(MonOpRequestRef op
);
1065 void read_and_prepare_transactions(MonitorDBStore::TransactionRef tx
,
1066 version_t from
, version_t last
);
1071 * dump state info to a formatter
1073 void dump_info(Formatter
*f
);
1076 * This function runs basic consistency checks. Importantly, if
1077 * it is inconsistent and shouldn't be, it asserts out.
1079 * @return True if consistent, false if not.
1081 bool is_consistent();
1085 * Initiate the Leader after it wins an election.
1087 * Once an election is won, the Leader will be initiated and there are two
1088 * possible outcomes of this method: the Leader directly jumps to the active
1089 * state (STATE_ACTIVE) if it believes to be the only one in the quorum, or
1090 * will start recovering (STATE_RECOVERING) by initiating the collect phase.
1092 * @pre Our monitor is the Leader.
1093 * @post We are either on STATE_ACTIVE if we're the only one in the quorum,
1094 * or on STATE_RECOVERING otherwise.
1098 * Initiate a Peon after it loses an election.
1100 * If we are a Peon, then there must be a Leader and we are not alone in the
1101 * quorum, thus automatically assume we are on STATE_RECOVERING, which means
1102 * we will soon be enrolled into the Leader's collect phase.
1104 * @pre There is a Leader, and it's about to start the collect phase.
1105 * @post We are on STATE_RECOVERING and will soon receive collect phase's
1111 * Include an incremental state of values, ranging from peer_first_committed
1112 * to the last committed value, on the message m
1114 * @param m A message
1115 * @param peer_first_committed Lowest version to take into account
1116 * @param peer_last_committed Highest version to take into account
1118 void share_state(MMonPaxos
*m
, version_t peer_first_committed
,
1119 version_t peer_last_committed
);
1121 * Store on disk a state that was shared with us
1123 * Basically, we received a set of versions. Or just one. It doesn't matter.
1124 * What matters is that we have to stash it in the store. So, we will simply
1125 * write every single bufferlist into their own versions on our side (i.e.,
1126 * onto paxos-related keys), and then we will decode those same bufferlists
1127 * we just wrote and apply the transactions they hold. We will also update
1128 * our first and last committed values to point to the new values, if need
1129 * be. All this is done tightly wrapped in a transaction to ensure we
1130 * enjoy the atomicity guarantees given by our awesome k/v store.
1132 * @param m A message
1133 * @returns true if we stored something new; false otherwise
1135 bool store_state(MMonPaxos
*m
);
/**
 * Internal sanity check of the store.
 *
 * NOTE(review): what is checked, and how a failure is reported, is defined
 * out of line -- confirm against the implementation.
 */
void _sanity_check_store();
1139 * Helper function to decode a bufferlist into a transaction and append it
1140 * to another transaction.
1142 * This function is used during the Leader's commit and during the
1143 * Paxos::store_state in order to apply the bufferlist's transaction onto
1146 * @param t The transaction to which we will append the operations
1147 * @param bl A bufferlist containing an encoded transaction
// Helper that decodes an encoded transaction held in a bufferlist (bl) and
// appends its operations to the transaction t.
// NOTE(review): the second parameter and the decode/append statements are not
// visible in this view; only the setup lines below are.
1149 static void decode_append_transaction(MonitorDBStore::TransactionRef t
,
// Scratch transaction, freshly created through std::make_shared.
1151 auto vt(std::make_shared
<MonitorDBStore::Transaction
>());
// Const iterator positioned at the beginning of bl.
1152 auto it
= bl
.cbegin();
1158 * @todo This appears to be used only by the OSDMonitor, and I would say
1159 * its objective is to allow a third-party to have a "private"
// Appends s to extra_state_dirs.
// NOTE(review): s is taken by value and copied again by push_back.
// NOTE(review): the closing brace of this function is not visible in this view.
1162 void add_extra_state_dir(string s
) {
1163 extra_state_dirs
.push_back(s
);
1166 // -- service interface --
1168 * Add c to the list of callbacks waiting for us to become active.
1170 * @param c A callback
// Marks op with the "paxos:wait_for_active" event and appends c to
// waiting_for_active.
// NOTE(review): one line between the signature and the mark_event call is not
// visible in this view (presumably a guard on op) -- confirm before relying on
// op being dereferenced unconditionally.
1172 void wait_for_active(MonOpRequestRef op
, Context
*c
) {
1174 op
->mark_event("paxos:wait_for_active");
// The callback is queued on the waiting_for_active list.
1175 waiting_for_active
.push_back(c
);
// Overload taking only the callback: forwards to wait_for_active(o, c).
// NOTE(review): the declaration of `o` is not visible in this view --
// presumably a default-constructed MonOpRequestRef; confirm.
1177 void wait_for_active(Context
*c
) {
1179 wait_for_active(o
, c
);
1183 * Trim the Paxos state as much as we can.
1188 * Check if we should trim.
1190 * If trimming is disabled, we must take that into consideration and only
1191 * return true if we are positively sure that we should trim soon.
1193 * @returns true if we should trim; false otherwise.
// Decides whether trimming should happen, based on how many versions are held
// compared with a configured threshold.
// NOTE(review): the statements after the condition (the return paths) are not
// visible in this view.
1195 bool should_trim() {
// Span between the last committed and the first committed version.
// NOTE(review): both getters return version_t but the difference is stored in
// an int -- confirm the narrowing is acceptable for the expected range.
1196 int available_versions
= get_version() - get_first_committed();
// Threshold: sum of the paxos_min and paxos_trim_min configuration values.
1197 int maximum_versions
= g_conf()->paxos_min
+ g_conf()->paxos_trim_min
;
// Condition holds when `trimming` is set or the held span does not exceed the
// threshold.
1199 if (trimming
|| (available_versions
<= maximum_versions
))
// Const query; presumably reports the `plugged` flag.
// NOTE(review): the body is not visible in this view -- confirm.
1205 bool is_plugged() const {
// Asserts that `plugged` is currently false.
// NOTE(review): the signature of the function enclosing this statement is not
// visible in this view.
1209 ceph_assert(plugged
== false);
// Asserts that `plugged` is currently true.
// NOTE(review): the signature of the function enclosing this statement is not
// visible in this view.
1213 ceph_assert(plugged
== true);
1219 * @defgroup Paxos_h_read_funcs Read-related functions
1223 * Get latest committed version
1225 * @return latest committed version
1227 version_t
get_version() { return last_committed
; }
1229 * Get first committed version
1231 * @return the first committed version
1233 version_t
get_first_committed() { return first_committed
; }
1235 * Check if a given version is readable.
1237 * A version may not be readable for a myriad of reasons:
1238 * @li the version @e v is higher that the last committed version
1239 * @li we are not the Leader nor a Peon (election may be on-going)
1240 * @li we do not have a committed value yet
1241 * @li we do not have a valid lease
1243 * @param seen The version we want to check if it is readable.
1244 * @return 'true' if the version is readable; 'false' otherwise.
1246 bool is_readable(version_t seen
=0);
1248 * Read version @e v and store its value in @e bl
1250 * @param[in] v The version we want to read
1251 * @param[out] bl The version's value
1252 * @return 'true' if we successfully read the value; 'false' otherwise
1254 bool read(version_t v
, bufferlist
&bl
);
1256 * Read the latest committed version
1258 * @param[out] bl The version's value
1259 * @return the latest committed version if we successfully read the value;
1260 * or 0 (zero) otherwise.
1262 version_t
read_current(bufferlist
&bl
);
1264 * Add onreadable to the list of callbacks waiting for us to become readable.
1266 * @param onreadable A callback
// Marks op with the "paxos:wait_for_readable" event and appends onreadable to
// waiting_for_readable.
// NOTE(review): one line between the assert and the mark_event call is not
// visible in this view (presumably a guard on op) -- confirm.
1268 void wait_for_readable(MonOpRequestRef op
, Context
*onreadable
) {
// Waiting only makes sense while we are not readable; anything else is a
// caller bug.
1269 ceph_assert(!is_readable());
1271 op
->mark_event("paxos:wait_for_readable");
// The callback is queued on the waiting_for_readable list.
1272 waiting_for_readable
.push_back(onreadable
);
// Overload taking only the callback: forwards to
// wait_for_readable(o, onreadable).
// NOTE(review): the declaration of `o` is not visible in this view --
// presumably a default-constructed MonOpRequestRef; confirm.
1274 void wait_for_readable(Context
*onreadable
) {
1276 wait_for_readable(o
, onreadable
);
/**
 * Check if we have a valid lease.
 *
 * @returns true if the lease is still valid; false otherwise.
 */
bool is_lease_valid();
1290 * @defgroup Paxos_h_write_funcs Write-related functions
/**
 * Check if we are writeable.
 *
 * We are writeable if we are alone (i.e., a quorum of one), or if we match
 * all the following conditions:
 *  @li We are the Leader
 *  @li We are on STATE_ACTIVE
 *  @li We have a valid lease
 *
 * @return 'true' if we are writeable; 'false' otherwise.
 */
bool is_writeable();
1306 * Add c to the list of callbacks waiting for us to become writeable.
1308 * @param c A callback
// Marks op with the "paxos:wait_for_writeable" event and appends c to
// waiting_for_writeable.
// NOTE(review): one line between the assert and the mark_event call is not
// visible in this view (presumably a guard on op) -- confirm.
1310 void wait_for_writeable(MonOpRequestRef op
, Context
*c
) {
// Waiting only makes sense while we are not writeable; anything else is a
// caller bug.
1311 ceph_assert(!is_writeable());
1313 op
->mark_event("paxos:wait_for_writeable");
// The callback is queued on the waiting_for_writeable list.
1314 waiting_for_writeable
.push_back(c
);
// Overload taking only the callback: forwards to wait_for_writeable(o, c).
// NOTE(review): the declaration of `o` is not visible in this view --
// presumably a default-constructed MonOpRequestRef; confirm.
1316 void wait_for_writeable(Context
*c
) {
1318 wait_for_writeable(o
, c
);
1322 * Get a transaction to submit operations to propose against
1324 * Apply operations to this transaction. It will eventually be proposed
1327 MonitorDBStore::TransactionRef
get_pending_transaction();
1330 * Queue a completion for the pending proposal
1332 * This completion will get triggered when the pending proposal
1333 * transaction commits.
1335 void queue_pending_finisher(Context
*onfinished
);
/**
 * (Try to) trigger a proposal.
 *
 * Tell paxos that it should submit the pending proposal. Note that if it
 * is not active (e.g., because it is already in the midst of committing
 * something) that will be deferred (e.g., until the current round finishes).
 *
 * @return NOTE(review): presumably whether the proposal was started right
 *         away rather than deferred -- confirm against the implementation.
 */
bool trigger_propose();
1353 MonitorDBStore
*get_store();
1356 inline ostream
& operator<<(ostream
& out
, Paxos::C_Proposal
& p
)
1358 string proposed
= (p
.proposed
? "proposed" : "unproposed");
1359 out
<< " " << proposed
1360 << " queued " << (ceph_clock_now() - p
.proposal_time
)
1362 auto t(std::make_shared
<MonitorDBStore::Transaction
>());
1363 auto p_it
= p
.bl
.cbegin();
1365 JSONFormatter
f(true);