]> git.proxmox.com Git - ceph.git/blame - ceph/src/mon/PaxosService.h
bump version to 12.0.3-pve3
[ceph.git] / ceph / src / mon / PaxosService.h
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15#ifndef CEPH_PAXOSSERVICE_H
16#define CEPH_PAXOSSERVICE_H
17
18#include "include/Context.h"
19#include "Paxos.h"
20#include "Monitor.h"
21#include "MonitorDBStore.h"
22
23class Monitor;
24class Paxos;
25
26/**
27 * A Paxos Service is an abstraction that easily allows one to obtain an
28 * association between a Monitor and a Paxos class, in order to implement any
29 * service.
30 */
31class PaxosService {
32 /**
33 * @defgroup PaxosService_h_class Paxos Service
34 * @{
35 */
36 public:
37 /**
38 * The Monitor to which this class is associated with
39 */
40 Monitor *mon;
41 /**
42 * The Paxos instance to which this class is associated with
43 */
44 Paxos *paxos;
45 /**
46 * Our name. This will be associated with the class implementing us, and will
47 * be used mainly for store-related operations.
48 */
49 string service_name;
50 /**
51 * If we are or have queued anything for proposal, this variable will be true
52 * until our proposal has been finished.
53 */
54 bool proposing;
55
56 protected:
57 /**
58 * Services implementing us used to depend on the Paxos version, back when
59 * each service would have a Paxos instance for itself. However, now we only
60 * have a single Paxos instance, shared by all the services. Each service now
61 * must keep its own version, if so they wish. This variable should be used
62 * for that purpose.
63 */
64 version_t service_version;
65
66 private:
67 /**
68 * Event callback responsible for proposing our pending value once a timer
69 * runs out and fires.
70 */
71 Context *proposal_timer;
72 /**
73 * If the implementation class has anything pending to be proposed to Paxos,
74 * then have_pending should be true; otherwise, false.
75 */
76 bool have_pending;
77
78protected:
79
80 /**
81 * format of our state in leveldb, 0 for default
82 */
83 version_t format_version;
84
85
86
87 /**
88 * @defgroup PaxosService_h_callbacks Callback classes
89 * @{
90 */
91 /**
92 * Retry dispatching a given service message
93 *
94 * This callback class is used when we had to wait for some condition to
95 * become true while we were dispatching it.
96 *
97 * For instance, if the message's version isn't readable, according to Paxos,
98 * then we must wait for it to become readable. So, we just queue an
99 * instance of this class onto the Paxos::wait_for_readable function, and
100 * we will retry the whole dispatch again once the callback is fired.
101 */
102 class C_RetryMessage : public C_MonOp {
103 PaxosService *svc;
104 public:
105 C_RetryMessage(PaxosService *s, MonOpRequestRef op_) :
106 C_MonOp(op_), svc(s) { }
107 void _finish(int r) override {
108 if (r == -EAGAIN || r >= 0)
109 svc->dispatch(op);
110 else if (r == -ECANCELED)
111 return;
112 else
113 assert(0 == "bad C_RetryMessage return value");
114 }
115 };
116
117 /**
118 * @}
119 */
120
121public:
122 /**
123 * @param mn A Monitor instance
124 * @param p A Paxos instance
125 * @param name Our service's name.
126 */
127 PaxosService(Monitor *mn, Paxos *p, string name)
128 : mon(mn), paxos(p), service_name(name),
129 proposing(false),
130 service_version(0), proposal_timer(0), have_pending(false),
131 format_version(0),
132 last_committed_name("last_committed"),
133 first_committed_name("first_committed"),
134 full_prefix_name("full"), full_latest_name("latest"),
135 cached_first_committed(0), cached_last_committed(0)
136 {
137 }
138
139 virtual ~PaxosService() {}
140
141 /**
142 * Get the service's name.
143 *
144 * @returns The service's name.
145 */
146 string get_service_name() { return service_name; }
147
148 /**
149 * Get the store prefixes we utilize
150 */
151 virtual void get_store_prefixes(set<string>& s) {
152 s.insert(service_name);
153 }
154
155 // i implement and you ignore
156 /**
157 * Informs this instance that it should consider itself restarted.
158 *
159 * This means that we will cancel our proposal_timer event, if any exists.
160 */
161 void restart();
162 /**
163 * Informs this instance that an election has finished.
164 *
165 * This means that we will invoke a PaxosService::discard_pending while
166 * setting have_pending to false (basically, ignore our pending state) and
167 * we will then make sure we obtain a new state.
168 *
169 * Our state shall be updated by PaxosService::_active if the Paxos is
170 * active; otherwise, we will wait for it to become active by adding a
171 * PaxosService::C_Active callback to it.
172 */
173 void election_finished();
174 /**
175 * Informs this instance that it is supposed to shutdown.
176 *
177 * Basically, it will instruct Paxos to cancel all events/callbacks and then
178 * will cancel the proposal_timer event if any exists.
179 */
180 void shutdown();
181
182private:
183 /**
184 * Update our state by updating it from Paxos, and then creating a new
185 * pending state if need be.
186 *
187 * @remarks We only create a pending state we our Monitor is the Leader.
188 *
189 * @pre Paxos is active
190 * @post have_pending is true iif our Monitor is the Leader and Paxos is
191 * active
192 */
193 void _active();
194
195public:
196 /**
197 * Propose a new value through Paxos.
198 *
199 * This function should be called by the classes implementing
200 * PaxosService, in order to propose a new value through Paxos.
201 *
202 * @pre The implementation class implements the encode_pending function.
203 * @pre have_pending is true
204 * @pre Our monitor is the Leader
205 * @pre Paxos is active
206 * @post Cancel the proposal timer, if any
207 * @post have_pending is false
208 * @post propose pending value through Paxos
209 *
210 * @note This function depends on the implementation of encode_pending on
211 * the class that is implementing PaxosService
212 */
213 void propose_pending();
214
215 /**
216 * Let others request us to propose.
217 *
218 * At the moment, this is just a wrapper to propose_pending() with an
219 * extra check for is_writeable(), but it's a good practice to dissociate
220 * requests for proposals from direct usage of propose_pending() for
221 * future use -- we might want to perform additional checks or put a
222 * request on hold, for instance.
223 */
224 void request_proposal() {
225 assert(is_writeable());
226
227 propose_pending();
228 }
229 /**
230 * Request service @p other to perform a proposal.
231 *
232 * We could simply use the function above, requesting @p other directly,
233 * but we might eventually want to do something to the request -- say,
234 * set a flag stating we're waiting on a cross-proposal to be finished.
235 */
236 void request_proposal(PaxosService *other) {
237 assert(other != NULL);
238 assert(other->is_writeable());
239
240 other->request_proposal();
241 }
242
243 /**
244 * Dispatch a message by passing it to several different functions that are
245 * either implemented directly by this service, or that should be implemented
246 * by the class implementing this service.
247 *
248 * @param m A message
249 * @returns 'true' on successful dispatch; 'false' otherwise.
250 */
251 bool dispatch(MonOpRequestRef op);
252
253 void refresh(bool *need_bootstrap);
254 void post_refresh();
255
256 /**
257 * @defgroup PaxosService_h_override_funcs Functions that should be
258 * overridden.
259 *
260 * These functions should be overridden at will by the class implementing
261 * this service.
262 * @{
263 */
264 /**
265 * Create the initial state for your system.
266 *
267 * In some of ours the state is actually set up elsewhere so this does
268 * nothing.
269 */
270 virtual void create_initial() = 0;
271
272 /**
273 * Query the Paxos system for the latest state and apply it if it's newer
274 * than the current Monitor state.
275 */
276 virtual void update_from_paxos(bool *need_bootstrap) = 0;
277
278 /**
279 * Hook called after all services have refreshed their state from paxos
280 *
281 * This is useful for doing any update work that depends on other
282 * service's having up-to-date state.
283 */
284 virtual void post_paxos_update() {}
285
286 /**
287 * Init on startup
288 *
289 * This is called on mon startup, after all of the PaxosService instances'
290 * update_from_paxos() methods have been called
291 */
292 virtual void init() {}
293
294 /**
295 * Create the pending state.
296 *
297 * @invariant This function is only called on a Leader.
298 * @remarks This created state is then modified by incoming messages.
299 * @remarks Called at startup and after every Paxos ratification round.
300 */
301 virtual void create_pending() = 0;
302
303 /**
304 * Encode the pending state into a bufferlist for ratification and
305 * transmission as the next state.
306 *
307 * @invariant This function is only called on a Leader.
308 *
309 * @param t The transaction to hold all changes.
310 */
311 virtual void encode_pending(MonitorDBStore::TransactionRef t) = 0;
312
313 /**
314 * Discard the pending state
315 *
316 * @invariant This function is only called on a Leader.
317 *
318 * @remarks This function is NOT overridden in any of our code, but it is
319 * called in PaxosService::election_finished if have_pending is
320 * true.
321 */
322 virtual void discard_pending() { }
323
324 /**
325 * Look at the query; if the query can be handled without changing state,
326 * do so.
327 *
328 * @param m A query message
329 * @returns 'true' if the query was handled (e.g., was a read that got
330 * answered, was a state change that has no effect); 'false'
331 * otherwise.
332 */
333 virtual bool preprocess_query(MonOpRequestRef op) = 0;
334
335 /**
336 * Apply the message to the pending state.
337 *
338 * @invariant This function is only called on a Leader.
339 *
340 * @param m An update message
341 * @returns 'true' if the update message was handled (e.g., a command that
342 * went through); 'false' otherwise.
343 */
344 virtual bool prepare_update(MonOpRequestRef op) = 0;
345 /**
346 * @}
347 */
348
349 /**
350 * Determine if the Paxos system should vote on pending, and if so how long
351 * it should wait to vote.
352 *
353 * @param[out] delay The wait time, used so we can limit the update traffic
354 * spamming.
355 * @returns 'true' if the Paxos system should propose; 'false' otherwise.
356 */
357 virtual bool should_propose(double &delay);
358
359 /**
360 * @defgroup PaxosService_h_courtesy Courtesy functions
361 *
362 * Courtesy functions, in case the class implementing this service has
363 * anything it wants/needs to do at these times.
364 * @{
365 */
366 /**
367 * This is called when the Paxos state goes to active.
368 *
369 * On the peon, this is after each election.
370 * On the leader, this is after each election, *and* after each completed
371 * proposal.
372 *
373 * @note This function may get called twice in certain recovery cases.
374 */
375 virtual void on_active() { }
376
377 /**
378 * This is called when we are shutting down
379 */
380 virtual void on_shutdown() {}
381
382 /**
383 * this is called when activating on the leader
384 *
385 * it should conditionally upgrade the on-disk format by proposing a transaction
386 */
387 virtual void upgrade_format() { }
388
389 /**
390 * this is called when we detect the store has just upgraded underneath us
391 */
392 virtual void on_upgrade() {}
393
394 /**
395 * Called when the Paxos system enters a Leader election.
396 *
397 * @remarks It's a courtesy method, in case the class implementing this
398 * service has anything it wants/needs to do at that time.
399 */
400 virtual void on_restart() { }
401 /**
402 * @}
403 */
404
405 /**
406 * Tick.
407 */
408 virtual void tick() {}
409
410 /**
411 * Get health information
412 *
413 * @param summary list of summary strings and associated severity
414 * @param detail optional list of detailed problem reports; may be NULL
415 */
416 virtual void get_health(list<pair<health_status_t,string> >& summary,
417 list<pair<health_status_t,string> > *detail,
418 CephContext *cct) const { }
419
420 private:
421 /**
422 * @defgroup PaxosService_h_store_keys Set of keys that are usually used on
423 * all the services implementing this
424 * class, and, being almost the only keys
425 * used, should be standardized to avoid
426 * mistakes.
427 * @{
428 */
429 const string last_committed_name;
430 const string first_committed_name;
431 const string full_prefix_name;
432 const string full_latest_name;
433 /**
434 * @}
435 */
436
437 /**
438 * @defgroup PaxosService_h_version_cache Variables holding cached values
439 * for the most used versions (first
440 * and last committed); we only have
441 * to read them when the store is
442 * updated, so in-between updates we
443 * may very well use cached versions
444 * and avoid the overhead.
445 * @{
446 */
447 version_t cached_first_committed;
448 version_t cached_last_committed;
449 /**
450 * @}
451 */
452
453 /**
454 * Callback list to be used whenever we are running a proposal through
455 * Paxos. These callbacks will be awaken whenever the said proposal
456 * finishes.
457 */
458 list<Context*> waiting_for_finished_proposal;
459
460 public:
461
462 /**
463 * Check if we are proposing a value through Paxos
464 *
465 * @returns true if we are proposing; false otherwise.
466 */
467 bool is_proposing() {
468 return proposing;
469 }
470
471 /**
472 * Check if we are in the Paxos ACTIVE state.
473 *
474 * @note This function is a wrapper for Paxos::is_active
475 *
476 * @returns true if in state ACTIVE; false otherwise.
477 */
478 bool is_active() {
479 return
480 !is_proposing() &&
481 (paxos->is_active() || paxos->is_updating() || paxos->is_writing());
482 }
483
484 /**
485 * Check if we are readable.
486 *
487 * This mirrors on the paxos check, except that we also verify that
488 *
489 * - the client hasn't seen the future relative to this PaxosService
490 * - this service isn't proposing.
491 * - we have committed our initial state (last_committed > 0)
492 *
493 * @param ver The version we want to check if is readable
494 * @returns true if it is readable; false otherwise
495 */
496 bool is_readable(version_t ver = 0) {
497 if (ver > get_last_committed() ||
498 !paxos->is_readable(0) ||
499 get_last_committed() == 0)
500 return false;
501 return true;
502 }
503
504 /**
505 * Check if we are writeable.
506 *
507 * We consider to be writeable iff:
508 *
509 * - we are not proposing a new version;
510 * - we are ready to be written to -- i.e., we have a pending value.
511 * - paxos is (active or updating or writing or refresh)
512 *
513 * @returns true if writeable; false otherwise
514 */
515 bool is_writeable() {
516 return is_write_ready();
517 }
518
519 /**
520 * Check if we are ready to be written to. This means we must have a
521 * pending value and be active.
522 *
523 * @returns true if we are ready to be written to; false otherwise.
524 */
525 bool is_write_ready() {
526 return is_active() && have_pending;
527 }
528
529 /**
530 * Wait for a proposal to finish.
531 *
532 * Add a callback to be awaken whenever our current proposal finishes being
533 * proposed through Paxos.
534 *
535 * @param c The callback to be awaken once the proposal is finished.
536 */
537 void wait_for_finished_proposal(MonOpRequestRef op, Context *c) {
538 if (op)
539 op->mark_event_string(service_name + ":wait_for_finished_proposal");
540 waiting_for_finished_proposal.push_back(c);
541 }
542 void wait_for_finished_proposal_ctx(Context *c) {
543 MonOpRequestRef o;
544 wait_for_finished_proposal(o, c);
545 }
546
547 /**
548 * Wait for us to become active
549 *
550 * @param c The callback to be awaken once we become active.
551 */
552 void wait_for_active(MonOpRequestRef op, Context *c) {
553 if (op)
554 op->mark_event_string(service_name + ":wait_for_active");
555
556 if (!is_proposing()) {
557 paxos->wait_for_active(op, c);
558 return;
559 }
560 wait_for_finished_proposal(op, c);
561 }
562 void wait_for_active_ctx(Context *c) {
563 MonOpRequestRef o;
564 wait_for_active(o, c);
565 }
566
567 /**
568 * Wait for us to become readable
569 *
570 * @param c The callback to be awaken once we become active.
571 * @param ver The version we want to wait on.
572 */
573 void wait_for_readable(MonOpRequestRef op, Context *c, version_t ver = 0) {
574 /* This is somewhat of a hack. We only do check if a version is readable on
575 * PaxosService::dispatch(), but, nonetheless, we must make sure that if that
576 * is why we are not readable, then we must wait on PaxosService and not on
577 * Paxos; otherwise, we may assert on Paxos::wait_for_readable() if it
578 * happens to be readable at that specific point in time.
579 */
580 if (op)
581 op->mark_event_string(service_name + ":wait_for_readable");
582
583 if (is_proposing() ||
584 ver > get_last_committed() ||
585 get_last_committed() == 0)
586 wait_for_finished_proposal(op, c);
587 else {
588 if (op)
589 op->mark_event_string(service_name + ":wait_for_readable/paxos");
590
591 paxos->wait_for_readable(op, c);
592 }
593 }
594
595 void wait_for_readable_ctx(Context *c, version_t ver = 0) {
596 MonOpRequestRef o; // will initialize the shared_ptr to NULL
597 wait_for_readable(o, c, ver);
598 }
599
600 /**
601 * Wait for us to become writeable
602 *
603 * @param c The callback to be awaken once we become writeable.
604 */
605 void wait_for_writeable(MonOpRequestRef op, Context *c) {
606 if (op)
607 op->mark_event_string(service_name + ":wait_for_writeable");
608
609 if (is_proposing())
610 wait_for_finished_proposal(op, c);
611 else if (!is_write_ready())
612 wait_for_active(op, c);
613 else
614 paxos->wait_for_writeable(op, c);
615 }
616 void wait_for_writeable_ctx(Context *c) {
617 MonOpRequestRef o;
618 wait_for_writeable(o, c);
619 }
620
621
622 /**
623 * @defgroup PaxosService_h_Trim Functions for trimming states
624 * @{
625 */
626 /**
627 * trim service states if appropriate
628 *
629 * Called at same interval as tick()
630 */
631 void maybe_trim();
632
633 /**
634 * Auxiliary function to trim our state from version @p from to version
635 * @p to, not including; i.e., the interval [from, to[
636 *
637 * @param t The transaction to which we will add the trim operations.
638 * @param from the lower limit of the interval to be trimmed
639 * @param to the upper limit of the interval to be trimmed (not including)
640 */
641 void trim(MonitorDBStore::TransactionRef t, version_t from, version_t to);
642
643 /**
644 * encode service-specific extra bits into trim transaction
645 *
646 * @param tx transaction
647 * @param first new first_committed value
648 */
649 virtual void encode_trim_extra(MonitorDBStore::TransactionRef tx,
650 version_t first) {}
651
652 /**
653 * Get the version we should trim to.
654 *
655 * Should be overloaded by service if it wants to trim states.
656 *
657 * @returns the version we should trim to; if we return zero, it should be
658 * assumed that there's no version to trim to.
659 */
660 virtual version_t get_trim_to() {
661 return 0;
662 }
663
664 /**
665 * @}
666 */
667 /**
668 * @defgroup PaxosService_h_Stash_Full
669 * @{
670 */
671 virtual bool should_stash_full();
672 /**
673 * Encode a full version on @p t
674 *
675 * @note We force every service to implement this function, since we strongly
676 * desire the encoding of full versions.
677 * @note Services that do not trim their state, will be bound to only create
678 * one full version. Full version stashing is determined/controled by
679 * trimming: we stash a version each time a trim is bound to erase the
680 * latest full version.
681 *
682 * @param t Transaction on which the full version shall be encoded.
683 */
684 virtual void encode_full(MonitorDBStore::TransactionRef t) = 0;
685
686 /**
687 * @}
688 */
689
690 /**
691 * Cancel events.
692 *
693 * @note This function is a wrapper for Paxos::cancel_events
694 */
695 void cancel_events() {
696 paxos->cancel_events();
697 }
698
699 /**
700 * @defgroup PaxosService_h_store_funcs Back storage interface functions
701 * @{
702 */
703 /**
704 * @defgroup PaxosService_h_store_modify Wrapper function interface to access
705 * the back store for modification
706 * purposes
707 * @{
708 */
709 void put_first_committed(MonitorDBStore::TransactionRef t, version_t ver) {
710 t->put(get_service_name(), first_committed_name, ver);
711 }
712 /**
713 * Set the last committed version to @p ver
714 *
715 * @param t A transaction to which we add this put operation
716 * @param ver The last committed version number being put
717 */
718 void put_last_committed(MonitorDBStore::TransactionRef t, version_t ver) {
719 t->put(get_service_name(), last_committed_name, ver);
720
721 /* We only need to do this once, and that is when we are about to make our
722 * first proposal. There are some services that rely on first_committed
723 * being set -- and it should! -- so we need to guarantee that it is,
724 * specially because the services itself do not do it themselves. They do
725 * rely on it, but they expect us to deal with it, and so we shall.
726 */
727 if (!get_first_committed())
728 put_first_committed(t, ver);
729 }
730 /**
731 * Put the contents of @p bl into version @p ver
732 *
733 * @param t A transaction to which we will add this put operation
734 * @param ver The version to which we will add the value
735 * @param bl A bufferlist containing the version's value
736 */
737 void put_version(MonitorDBStore::TransactionRef t, version_t ver,
738 bufferlist& bl) {
739 t->put(get_service_name(), ver, bl);
740 }
741 /**
742 * Put the contents of @p bl into a full version key for this service, that
743 * will be created with @p ver in mind.
744 *
745 * @param t The transaction to which we will add this put operation
746 * @param ver A version number
747 * @param bl A bufferlist containing the version's value
748 */
749 void put_version_full(MonitorDBStore::TransactionRef t,
750 version_t ver, bufferlist& bl) {
751 string key = mon->store->combine_strings(full_prefix_name, ver);
752 t->put(get_service_name(), key, bl);
753 }
754 /**
755 * Put the version number in @p ver into the key pointing to the latest full
756 * version of this service.
757 *
758 * @param t The transaction to which we will add this put operation
759 * @param ver A version number
760 */
761 void put_version_latest_full(MonitorDBStore::TransactionRef t, version_t ver) {
762 string key = mon->store->combine_strings(full_prefix_name, full_latest_name);
763 t->put(get_service_name(), key, ver);
764 }
765 /**
766 * Put the contents of @p bl into the key @p key.
767 *
768 * @param t A transaction to which we will add this put operation
769 * @param key The key to which we will add the value
770 * @param bl A bufferlist containing the value
771 */
772 void put_value(MonitorDBStore::TransactionRef t,
773 const string& key, bufferlist& bl) {
774 t->put(get_service_name(), key, bl);
775 }
776
777 /**
778 * @}
779 */
780
781 /**
782 * @defgroup PaxosService_h_store_get Wrapper function interface to access
783 * the back store for reading purposes
784 * @{
785 */
786
787 /**
788 * @defgroup PaxosService_h_version_cache Obtain cached versions for this
789 * service.
790 * @{
791 */
792 /**
793 * Get the first committed version
794 *
795 * @returns Our first committed version (that is available)
796 */
797 version_t get_first_committed() const{
798 return cached_first_committed;
799 }
800 /**
801 * Get the last committed version
802 *
803 * @returns Our last committed version
804 */
805 version_t get_last_committed() const{
806 return cached_last_committed;
807 }
808
809 /**
810 * @}
811 */
812
813 /**
814 * Get the contents of a given version @p ver
815 *
816 * @param ver The version being obtained
817 * @param bl The bufferlist to be populated
818 * @return 0 on success; <0 otherwise
819 */
820 virtual int get_version(version_t ver, bufferlist& bl) {
821 return mon->store->get(get_service_name(), ver, bl);
822 }
823 /**
824 * Get the contents of a given full version of this service.
825 *
826 * @param ver A version number
827 * @param bl The bufferlist to be populated
828 * @returns 0 on success; <0 otherwise
829 */
830 virtual int get_version_full(version_t ver, bufferlist& bl) {
831 string key = mon->store->combine_strings(full_prefix_name, ver);
832 return mon->store->get(get_service_name(), key, bl);
833 }
834 /**
835 * Get the latest full version number
836 *
837 * @returns A version number
838 */
839 version_t get_version_latest_full() {
840 string key = mon->store->combine_strings(full_prefix_name, full_latest_name);
841 return mon->store->get(get_service_name(), key);
842 }
843
844 /**
845 * Get a value from a given key.
846 *
847 * @param[in] key The key
848 * @param[out] bl The bufferlist to be populated with the value
849 */
850 int get_value(const string& key, bufferlist& bl) {
851 return mon->store->get(get_service_name(), key, bl);
852 }
853 /**
854 * Get an integer value from a given key.
855 *
856 * @param[in] key The key
857 */
858 version_t get_value(const string& key) {
859 return mon->store->get(get_service_name(), key);
860 }
861
862 /**
863 * @}
864 */
865 /**
866 * @}
867 */
868};
869
870#endif
871