1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
15 #ifndef CEPH_PAXOSSERVICE_H
16 #define CEPH_PAXOSSERVICE_H
18 #include "include/Context.h"
21 #include "MonitorDBStore.h"
27 * A Paxos Service is an abstraction that easily allows one to obtain an
28 * association between a Monitor and a Paxos class, in order to implement any
33 * @defgroup PaxosService_h_class Paxos Service
38 * The Monitor with which this class is associated
42 * The Paxos instance with which this class is associated
46 * Our name. This will be associated with the class implementing us, and will
47 * be used mainly for store-related operations.
51 * If we are or have queued anything for proposal, this variable will be true
52 * until our proposal has been finished.
56 bool need_immediate_propose
= false;
60 * Services implementing us used to depend on the Paxos version, back when
61 * each service would have a Paxos instance for itself. However, now we only
62 * have a single Paxos instance, shared by all the services. Each service now
63 * must keep its own version, if so they wish. This variable should be used
66 version_t service_version
;
70 * Event callback responsible for proposing our pending value once a timer
73 Context
*proposal_timer
;
75 * If the implementation class has anything pending to be proposed to Paxos,
76 * then have_pending should be true; otherwise, false.
83 * format of our state in leveldb, 0 for default
85 version_t format_version
;
90 * @defgroup PaxosService_h_callbacks Callback classes
94 * Retry dispatching a given service message
96 * This callback class is used when we had to wait for some condition to
97 * become true while we were dispatching it.
99 * For instance, if the message's version isn't readable, according to Paxos,
100 * then we must wait for it to become readable. So, we just queue an
101 * instance of this class onto the Paxos::wait_for_readable function, and
102 * we will retry the whole dispatch again once the callback is fired.
104 class C_RetryMessage
: public C_MonOp
{
107 C_RetryMessage(PaxosService
*s
, MonOpRequestRef op_
) :
108 C_MonOp(op_
), svc(s
) { }
109 void _finish(int r
) override
{
110 if (r
== -EAGAIN
|| r
>= 0)
112 else if (r
== -ECANCELED
)
115 assert(0 == "bad C_RetryMessage return value");
125 * @param mn A Monitor instance
126 * @param p A Paxos instance
127 * @param name Our service's name.
129 PaxosService(Monitor
*mn
, Paxos
*p
, string name
)
130 : mon(mn
), paxos(p
), service_name(name
),
132 service_version(0), proposal_timer(0), have_pending(false),
134 last_committed_name("last_committed"),
135 first_committed_name("first_committed"),
136 full_prefix_name("full"), full_latest_name("latest"),
137 cached_first_committed(0), cached_last_committed(0)
141 virtual ~PaxosService() {} // virtual destructor with an empty body; nothing is released here
144 * Get the service's name.
146 * @returns The service's name.
148 string
get_service_name() { return service_name
; } // returns service_name by value, i.e. the caller receives a copy
151 * Get the store prefixes we utilize
153 virtual void get_store_prefixes(set
<string
>& s
) {
154 s
.insert(service_name
);
157 // i implement and you ignore
159 * Informs this instance that it should consider itself restarted.
161 * This means that we will cancel our proposal_timer event, if any exists.
165 * Informs this instance that an election has finished.
167 * This means that we will invoke a PaxosService::discard_pending while
168 * setting have_pending to false (basically, ignore our pending state) and
169 * we will then make sure we obtain a new state.
171 * Our state shall be updated by PaxosService::_active if the Paxos is
172 * active; otherwise, we will wait for it to become active by adding a
173 * PaxosService::C_Active callback to it.
175 void election_finished();
177 * Informs this instance that it is supposed to shutdown.
179 * Basically, it will instruct Paxos to cancel all events/callbacks and then
180 * will cancel the proposal_timer event if any exists.
186 * Update our state by updating it from Paxos, and then creating a new
187 * pending state if need be.
189 * @remarks We only create a pending state when our Monitor is the Leader.
191 * @pre Paxos is active
192 * @post have_pending is true if our Monitor is the Leader and Paxos is
199 * Propose a new value through Paxos.
201 * This function should be called by the classes implementing
202 * PaxosService, in order to propose a new value through Paxos.
204 * @pre The implementation class implements the encode_pending function.
205 * @pre have_pending is true
206 * @pre Our monitor is the Leader
207 * @pre Paxos is active
208 * @post Cancel the proposal timer, if any
209 * @post have_pending is false
210 * @post propose pending value through Paxos
212 * @note This function depends on the implementation of encode_pending on
213 * the class that is implementing PaxosService
215 void propose_pending();
218 * Let others request us to propose.
220 * At the moment, this is just a wrapper to propose_pending() with an
221 * extra check for is_writeable(), but it's a good practice to dissociate
222 * requests for proposals from direct usage of propose_pending() for
223 * future use -- we might want to perform additional checks or put a
224 * request on hold, for instance.
226 void request_proposal() {
227 assert(is_writeable());
232 * Request service @p other to perform a proposal.
234 * We could simply use the function above, requesting @p other directly,
235 * but we might eventually want to do something to the request -- say,
236 * set a flag stating we're waiting on a cross-proposal to be finished.
238 void request_proposal(PaxosService
*other
) {
239 assert(other
!= NULL
);
240 assert(other
->is_writeable());
242 other
->request_proposal();
246 * Dispatch a message by passing it to several different functions that are
247 * either implemented directly by this service, or that should be implemented
248 * by the class implementing this service.
251 * @returns 'true' on successful dispatch; 'false' otherwise.
253 bool dispatch(MonOpRequestRef op
);
255 void refresh(bool *need_bootstrap
);
259 * @defgroup PaxosService_h_override_funcs Functions that should be
262 * These functions should be overridden at will by the class implementing
267 * Create the initial state for your system.
269 * In some of ours the state is actually set up elsewhere so this does
272 virtual void create_initial() = 0;
275 * Query the Paxos system for the latest state and apply it if it's newer
276 * than the current Monitor state.
278 virtual void update_from_paxos(bool *need_bootstrap
) = 0;
281 * Hook called after all services have refreshed their state from paxos
283 * This is useful for doing any update work that depends on other
284 * services having up-to-date state.
286 virtual void post_paxos_update() {} // default implementation is a no-op; override to react after the refresh
291 * This is called on mon startup, after all of the PaxosService instances'
292 * update_from_paxos() methods have been called
294 virtual void init() {} // default implementation is a no-op; override if startup work is needed
297 * Create the pending state.
299 * @invariant This function is only called on a Leader.
300 * @remarks This created state is then modified by incoming messages.
301 * @remarks Called at startup and after every Paxos ratification round.
303 virtual void create_pending() = 0;
306 * Encode the pending state into a bufferlist for ratification and
307 * transmission as the next state.
309 * @invariant This function is only called on a Leader.
311 * @param t The transaction to hold all changes.
313 virtual void encode_pending(MonitorDBStore::TransactionRef t
) = 0;
316 * Discard the pending state
318 * @invariant This function is only called on a Leader.
320 * @remarks This function is NOT overridden in any of our code, but it is
321 * called in PaxosService::election_finished if have_pending is
324 virtual void discard_pending() { } // default implementation is a no-op
327 * Look at the query; if the query can be handled without changing state,
330 * @param op A query message
331 * @returns 'true' if the query was handled (e.g., was a read that got
332 * answered, was a state change that has no effect); 'false'
335 virtual bool preprocess_query(MonOpRequestRef op
) = 0;
338 * Apply the message to the pending state.
340 * @invariant This function is only called on a Leader.
342 * @param op An update message
343 * @returns 'true' if the update message was handled (e.g., a command that
344 * went through); 'false' otherwise.
346 virtual bool prepare_update(MonOpRequestRef op
) = 0;
352 * Determine if the Paxos system should vote on pending, and if so how long
353 * it should wait to vote.
355 * @param[out] delay The wait time, used so we can limit the update traffic
357 * @returns 'true' if the Paxos system should propose; 'false' otherwise.
359 virtual bool should_propose(double &delay
);
362 * force an immediate propose.
364 * This is meant to be called from prepare_update(op).
366 void force_immediate_propose() {
367 need_immediate_propose
= true;
371 * @defgroup PaxosService_h_courtesy Courtesy functions
373 * Courtesy functions, in case the class implementing this service has
374 * anything it wants/needs to do at these times.
378 * This is called when the Paxos state goes to active.
380 * On the peon, this is after each election.
381 * On the leader, this is after each election, *and* after each completed
384 * @note This function may get called twice in certain recovery cases.
386 virtual void on_active() { } // default implementation is a no-op
389 * This is called when we are shutting down
391 virtual void on_shutdown() {} // default implementation is a no-op
394 * this is called when activating on the leader
396 * it should conditionally upgrade the on-disk format by proposing a transaction
398 virtual void upgrade_format() { } // default implementation is a no-op (no upgrade transaction is proposed)
401 * this is called when we detect the store has just upgraded underneath us
403 virtual void on_upgrade() {} // default implementation is a no-op
406 * Called when the Paxos system enters a Leader election.
408 * @remarks It's a courtesy method, in case the class implementing this
409 * service has anything it wants/needs to do at that time.
411 virtual void on_restart() { } // default implementation is a no-op
// Overridable hook whose default implementation does nothing.
// NOTE(review): presumably invoked periodically -- the trim documentation
// mentions "same interval as tick()" -- confirm against the caller.
419 virtual void tick() {}
422 * Get health information
424 * @param summary list of summary strings and associated severity
425 * @param detail optional list of detailed problem reports; may be NULL
427 virtual void get_health(list
<pair
<health_status_t
,string
> >& summary
,
428 list
<pair
<health_status_t
,string
> > *detail
,
429 CephContext
*cct
) const { } // default reports nothing: summary and detail are left untouched
433 * @defgroup PaxosService_h_store_keys Set of keys that are usually used on
434 * all the services implementing this
435 * class, and, being almost the only keys
436 * used, should be standardized to avoid
440 const string last_committed_name
;
441 const string first_committed_name
;
442 const string full_prefix_name
;
443 const string full_latest_name
;
449 * @defgroup PaxosService_h_version_cache Variables holding cached values
450 * for the most used versions (first
451 * and last committed); we only have
452 * to read them when the store is
453 * updated, so in-between updates we
454 * may very well use cached versions
455 * and avoid the overhead.
458 version_t cached_first_committed
;
459 version_t cached_last_committed
;
465 * Callback list to be used whenever we are running a proposal through
466 * Paxos. These callbacks will be awakened whenever the said proposal
469 list
<Context
*> waiting_for_finished_proposal
;
474 * Check if we are proposing a value through Paxos
476 * @returns true if we are proposing; false otherwise.
478 bool is_proposing() {
483 * Check if we are in the Paxos ACTIVE state.
485 * @note This function is a wrapper for Paxos::is_active
487 * @returns true if in state ACTIVE; false otherwise.
492 (paxos
->is_active() || paxos
->is_updating() || paxos
->is_writing());
496 * Check if we are readable.
498 * This mirrors the paxos check, except that we also verify that
500 * - the client hasn't seen the future relative to this PaxosService
501 * - this service isn't proposing.
502 * - we have committed our initial state (last_committed > 0)
504 * @param ver The version we want to check if is readable
505 * @returns true if it is readable; false otherwise
507 bool is_readable(version_t ver
= 0) {
508 if (ver
> get_last_committed() ||
509 !paxos
->is_readable(0) ||
510 get_last_committed() == 0)
516 * Check if we are writeable.
518 * We consider ourselves to be writeable iff:
520 * - we are not proposing a new version;
521 * - we are ready to be written to -- i.e., we have a pending value.
522 * - paxos is (active or updating or writing or refresh)
524 * @returns true if writeable; false otherwise
526 bool is_writeable() {
527 return is_write_ready();
531 * Check if we are ready to be written to. This means we must have a
532 * pending value and be active.
534 * @returns true if we are ready to be written to; false otherwise.
536 bool is_write_ready() {
537 return is_active() && have_pending
;
541 * Wait for a proposal to finish.
543 * Add a callback to be awakened whenever our current proposal finishes being
544 * proposed through Paxos.
546 * @param c The callback to be awakened once the proposal is finished.
548 void wait_for_finished_proposal(MonOpRequestRef op
, Context
*c
) {
550 op
->mark_event_string(service_name
+ ":wait_for_finished_proposal");
551 waiting_for_finished_proposal
.push_back(c
);
553 void wait_for_finished_proposal_ctx(Context
*c
) {
555 wait_for_finished_proposal(o
, c
);
559 * Wait for us to become active
561 * @param c The callback to be awakened once we become active.
563 void wait_for_active(MonOpRequestRef op
, Context
*c
) {
565 op
->mark_event_string(service_name
+ ":wait_for_active");
567 if (!is_proposing()) {
568 paxos
->wait_for_active(op
, c
);
571 wait_for_finished_proposal(op
, c
);
573 void wait_for_active_ctx(Context
*c
) {
575 wait_for_active(o
, c
);
579 * Wait for us to become readable
581 * @param c The callback to be awakened once we become readable.
582 * @param ver The version we want to wait on.
584 void wait_for_readable(MonOpRequestRef op
, Context
*c
, version_t ver
= 0) {
585 /* This is somewhat of a hack. We only check whether a version is readable in
586 * PaxosService::dispatch(), but, nonetheless, we must make sure that if that
587 * is why we are not readable, then we must wait on PaxosService and not on
588 * Paxos; otherwise, we may assert on Paxos::wait_for_readable() if it
589 * happens to be readable at that specific point in time.
592 op
->mark_event_string(service_name
+ ":wait_for_readable");
594 if (is_proposing() ||
595 ver
> get_last_committed() ||
596 get_last_committed() == 0)
597 wait_for_finished_proposal(op
, c
);
600 op
->mark_event_string(service_name
+ ":wait_for_readable/paxos");
602 paxos
->wait_for_readable(op
, c
);
606 void wait_for_readable_ctx(Context
*c
, version_t ver
= 0) {
607 MonOpRequestRef o
; // will initialize the shared_ptr to NULL
608 wait_for_readable(o
, c
, ver
);
612 * Wait for us to become writeable
614 * @param c The callback to be awakened once we become writeable.
616 void wait_for_writeable(MonOpRequestRef op
, Context
*c
) {
618 op
->mark_event_string(service_name
+ ":wait_for_writeable");
621 wait_for_finished_proposal(op
, c
);
622 else if (!is_write_ready())
623 wait_for_active(op
, c
);
625 paxos
->wait_for_writeable(op
, c
);
627 void wait_for_writeable_ctx(Context
*c
) {
629 wait_for_writeable(o
, c
);
634 * @defgroup PaxosService_h_Trim Functions for trimming states
638 * trim service states if appropriate
640 * Called at same interval as tick()
645 * Auxiliary function to trim our state from version @p from to version
646 * @p to, not including; i.e., the interval [from, to[
648 * @param t The transaction to which we will add the trim operations.
649 * @param from the lower limit of the interval to be trimmed
650 * @param to the upper limit of the interval to be trimmed (not including)
652 void trim(MonitorDBStore::TransactionRef t
, version_t from
, version_t to
);
655 * encode service-specific extra bits into trim transaction
657 * @param tx transaction
658 * @param first new first_committed value
660 virtual void encode_trim_extra(MonitorDBStore::TransactionRef tx
,
664 * Get the version we should trim to.
666 * Should be overridden by a service if it wants to trim states.
668 * @returns the version we should trim to; if we return zero, it should be
669 * assumed that there's no version to trim to.
671 virtual version_t
get_trim_to() {
679 * @defgroup PaxosService_h_Stash_Full
682 virtual bool should_stash_full();
684 * Encode a full version on @p t
686 * @note We force every service to implement this function, since we strongly
687 * desire the encoding of full versions.
688 * @note Services that do not trim their state, will be bound to only create
689 * one full version. Full version stashing is determined/controlled by
690 * trimming: we stash a version each time a trim is bound to erase the
691 * latest full version.
693 * @param t Transaction on which the full version shall be encoded.
695 virtual void encode_full(MonitorDBStore::TransactionRef t
) = 0;
704 * @note This function is a wrapper for Paxos::cancel_events
706 void cancel_events() {
707 paxos
->cancel_events();
711 * @defgroup PaxosService_h_store_funcs Back storage interface functions
715 * @defgroup PaxosService_h_store_modify Wrapper function interface to access
716 * the back store for modification
720 void put_first_committed(MonitorDBStore::TransactionRef t
, version_t ver
) {
721 t
->put(get_service_name(), first_committed_name
, ver
);
724 * Set the last committed version to @p ver
726 * @param t A transaction to which we add this put operation
727 * @param ver The last committed version number being put
729 void put_last_committed(MonitorDBStore::TransactionRef t
, version_t ver
) {
730 t
->put(get_service_name(), last_committed_name
, ver
);
732 /* We only need to do this once, and that is when we are about to make our
733 * first proposal. There are some services that rely on first_committed
734 * being set -- and it should! -- so we need to guarantee that it is,
735 * especially because the services do not do it themselves. They do
736 * rely on it, but they expect us to deal with it, and so we shall.
738 if (!get_first_committed())
739 put_first_committed(t
, ver
);
742 * Put the contents of @p bl into version @p ver
744 * @param t A transaction to which we will add this put operation
745 * @param ver The version to which we will add the value
746 * @param bl A bufferlist containing the version's value
748 void put_version(MonitorDBStore::TransactionRef t
, version_t ver
,
750 t
->put(get_service_name(), ver
, bl
);
753 * Put the contents of @p bl into a full version key for this service, that
754 * will be created with @p ver in mind.
756 * @param t The transaction to which we will add this put operation
757 * @param ver A version number
758 * @param bl A bufferlist containing the version's value
760 void put_version_full(MonitorDBStore::TransactionRef t
,
761 version_t ver
, bufferlist
& bl
) {
762 string key
= mon
->store
->combine_strings(full_prefix_name
, ver
);
763 t
->put(get_service_name(), key
, bl
);
766 * Put the version number in @p ver into the key pointing to the latest full
767 * version of this service.
769 * @param t The transaction to which we will add this put operation
770 * @param ver A version number
772 void put_version_latest_full(MonitorDBStore::TransactionRef t
, version_t ver
) {
773 string key
= mon
->store
->combine_strings(full_prefix_name
, full_latest_name
);
774 t
->put(get_service_name(), key
, ver
);
777 * Put the contents of @p bl into the key @p key.
779 * @param t A transaction to which we will add this put operation
780 * @param key The key to which we will add the value
781 * @param bl A bufferlist containing the value
783 void put_value(MonitorDBStore::TransactionRef t
,
784 const string
& key
, bufferlist
& bl
) {
785 t
->put(get_service_name(), key
, bl
);
789 * Put integer value @p v into the key @p key.
791 * @param t A transaction to which we will add this put operation
792 * @param key The key to which we will add the value
793 * @param v An integer
795 void put_value(MonitorDBStore::TransactionRef t
,
796 const string
& key
, version_t v
) {
797 t
->put(get_service_name(), key
, v
);
805 * @defgroup PaxosService_h_store_get Wrapper function interface to access
806 * the back store for reading purposes
811 * @defgroup PaxosService_h_version_cache Obtain cached versions for this
816 * Get the first committed version
818 * @returns Our first committed version (that is available)
820 version_t
get_first_committed() const{
821 return cached_first_committed
;
824 * Get the last committed version
826 * @returns Our last committed version
828 version_t
get_last_committed() const{
829 return cached_last_committed
;
837 * Get the contents of a given version @p ver
839 * @param ver The version being obtained
840 * @param bl The bufferlist to be populated
841 * @return 0 on success; <0 otherwise
843 virtual int get_version(version_t ver
, bufferlist
& bl
) {
844 return mon
->store
->get(get_service_name(), ver
, bl
);
847 * Get the contents of a given full version of this service.
849 * @param ver A version number
850 * @param bl The bufferlist to be populated
851 * @returns 0 on success; <0 otherwise
853 virtual int get_version_full(version_t ver
, bufferlist
& bl
) {
854 string key
= mon
->store
->combine_strings(full_prefix_name
, ver
);
855 return mon
->store
->get(get_service_name(), key
, bl
);
858 * Get the latest full version number
860 * @returns A version number
862 version_t
get_version_latest_full() {
863 string key
= mon
->store
->combine_strings(full_prefix_name
, full_latest_name
);
864 return mon
->store
->get(get_service_name(), key
);
868 * Get a value from a given key.
870 * @param[in] key The key
871 * @param[out] bl The bufferlist to be populated with the value
873 int get_value(const string
& key
, bufferlist
& bl
) {
874 return mon
->store
->get(get_service_name(), key
, bl
);
877 * Get an integer value from a given key.
879 * @param[in] key The key
881 version_t
get_value(const string
& key
) {
882 return mon
->store
->get(get_service_name(), key
);