]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- |
2 | // vim: ts=8 sw=2 smarttab | |
3 | /* | |
4 | * Ceph - scalable distributed file system | |
5 | * | |
6 | * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> | |
7 | * | |
8 | * This is free software; you can redistribute it and/or | |
9 | * modify it under the terms of the GNU Lesser General Public | |
10 | * License version 2.1, as published by the Free Software | |
11 | * Foundation. See file COPYING. | |
12 | * | |
13 | */ | |
14 | ||
15 | /* | |
16 | time----> | |
17 | ||
18 | cccccccccccccccccca???????????????????????????????????????? | |
19 | cccccccccccccccccca???????????????????????????????????????? | |
20 | cccccccccccccccccca???????????????????????????????????????? leader | |
21 | cccccccccccccccccc????????????????????????????????????????? | |
22 | ccccc?????????????????????????????????????????????????????? | |
23 | ||
24 | last_committed | |
25 | ||
26 | pn_from | |
27 | pn | |
28 | ||
29 | a 12v | |
30 | b 12v | |
31 | c 14v | |
32 | d | |
33 | e 12v | |
34 | */ | |
35 | ||
36 | /** | |
37 | * Paxos storage layout and behavior | |
38 | * | |
39 | * Currently, we use a key/value store to hold all the Paxos-related data, but | |
40 | * it can logically be depicted as this: | |
41 | * | |
42 | * paxos: | |
43 | * first_committed -> 1 | |
44 | * last_committed -> 4 | |
45 | * 1 -> value_1 | |
46 | * 2 -> value_2 | |
47 | * 3 -> value_3 | |
48 | * 4 -> value_4 | |
49 | * | |
50 | * Since we are relying on a k/v store supporting atomic transactions, we can | |
51 | * guarantee that if 'last_committed' has a value of '4', then we have up to | |
52 | * version 4 on the store, and no more than that; the same applies to | |
53 | * 'first_committed', which, holding '1', strictly means that our lowest | |
54 | * version is 1. | |
55 | * | |
56 | * Each version's value (value_1, value_2, ..., value_n) is a blob of data, | |
57 | * incomprehensible to the Paxos. These values are proposed to the Paxos on | |
58 | * propose_new_value() and each one is a transaction encoded in a bufferlist. | |
59 | * | |
60 | * The Paxos will write the value to disk, associating it with its version, | |
61 | * but will take a step further: the value shall be decoded, and the operations | |
62 | * on that transaction shall be applied during the same transaction that will | |
63 | * write the value's encoded bufferlist to disk. This behavior ensures that | |
64 | * whatever is being proposed will only be available on the store when it is | |
65 | * applied by Paxos, which will then be aware of such new values, guaranteeing | |
66 | * the store state is always consistent without requiring shady workarounds. | |
67 | * | |
68 | * So, let's say that FooMonitor proposes the following transaction, neatly | |
69 | * encoded on a bufferlist of course: | |
70 | * | |
71 | * Tx_Foo | |
72 | * put(foo, last_committed, 3) | |
73 | * put(foo, 3, foo_value_3) | |
74 | * erase(foo, 2) | |
75 | * erase(foo, 1) | |
76 | * put(foo, first_committed, 3) | |
77 | * | |
78 | * And knowing that the Paxos is proposed Tx_Foo as a bufferlist, once it is | |
79 | * ready to commit, and assuming we are now committing version 5 of the Paxos, | |
80 | * we will do something along the lines of: | |
81 | * | |
82 | * Tx proposed_tx; | |
83 | * proposed_tx.decode(Tx_foo_bufferlist); | |
84 | * | |
85 | * Tx our_tx; | |
86 | * our_tx.put(paxos, last_committed, 5); | |
87 | * our_tx.put(paxos, 5, Tx_foo_bufferlist); | |
88 | * our_tx.append(proposed_tx); | |
89 | * | |
90 | * store_apply(our_tx); | |
91 | * | |
92 | * And the store should look like this after we apply 'our_tx': | |
93 | * | |
94 | * paxos: | |
95 | * first_committed -> 1 | |
96 | * last_committed -> 5 | |
97 | * 1 -> value_1 | |
98 | * 2 -> value_2 | |
99 | * 3 -> value_3 | |
100 | * 4 -> value_4 | |
101 | * 5 -> Tx_foo_bufferlist | |
102 | * foo: | |
103 | * first_committed -> 3 | |
104 | * last_committed -> 3 | |
105 | * 3 -> foo_value_3 | |
106 | * | |
107 | */ | |
108 | ||
109 | #ifndef CEPH_MON_PAXOS_H | |
110 | #define CEPH_MON_PAXOS_H | |
111 | ||
112 | #include "include/types.h" | |
113 | #include "mon_types.h" | |
114 | #include "include/buffer.h" | |
115 | #include "msg/msg_types.h" | |
116 | #include "include/Context.h" | |
117 | #include "common/perf_counters.h" | |
118 | #include <errno.h> | |
119 | ||
120 | #include "MonitorDBStore.h" | |
121 | #include "mon/MonOpRequest.h" | |
122 | ||
123 | class Monitor; | |
124 | class MMonPaxos; | |
125 | ||
/**
 * Identifiers of the Paxos perf counters.
 *
 * l_paxos_first and l_paxos_last are sentinels bracketing the range; every
 * enumerator in between takes the next consecutive value after
 * l_paxos_first (45800).  Do not reorder: the numeric values follow from
 * the declaration order.
 *
 * NOTE(review): presumably consumed by Paxos::init_logger() when the
 * PerfCounters instance is built -- confirm against Paxos.cc.
 */
enum {
  l_paxos_first = 45800,

  // start / restart
  l_paxos_start_leader,
  l_paxos_start_peon,
  l_paxos_restart,

  // refresh
  l_paxos_refresh,
  l_paxos_refresh_latency,

  // begin
  l_paxos_begin,
  l_paxos_begin_keys,
  l_paxos_begin_bytes,
  l_paxos_begin_latency,

  // commit
  l_paxos_commit,
  l_paxos_commit_keys,
  l_paxos_commit_bytes,
  l_paxos_commit_latency,

  // collect
  l_paxos_collect,
  l_paxos_collect_keys,
  l_paxos_collect_bytes,
  l_paxos_collect_latency,
  l_paxos_collect_uncommitted,

  // timeouts
  l_paxos_collect_timeout,
  l_paxos_accept_timeout,
  l_paxos_lease_ack_timeout,
  l_paxos_lease_timeout,

  // store_state
  l_paxos_store_state,
  l_paxos_store_state_keys,
  l_paxos_store_state_bytes,
  l_paxos_store_state_latency,

  // share_state
  l_paxos_share_state,
  l_paxos_share_state_keys,
  l_paxos_share_state_bytes,

  // new proposal number
  l_paxos_new_pn,
  l_paxos_new_pn_latency,

  l_paxos_last,
};
161 | ||
162 | ||
163 | // i am one state machine. | |
164 | /** | |
165 | * This library is based on the Paxos algorithm, but varies in a few key ways: | |
166 | * 1- Only a single new value is generated at a time, simplifying the recovery logic. | |
167 | * 2- Nodes track "committed" values, and share them generously (and trustingly) | |
168 | * 3- A 'leasing' mechanism is built-in, allowing nodes to determine when it is | |
169 | * safe to "read" their copy of the last committed value. | |
170 | * | |
171 | * This provides a simple replication substrate that services can be built on top of. | |
172 | * See PaxosService.h | |
173 | */ | |
174 | class Paxos { | |
175 | /** | |
176 | * @defgroup Paxos_h_class Paxos | |
177 | * @{ | |
178 | */ | |
179 | /** | |
180 | * The Monitor with which this Paxos instance is associated. | |
181 | */ | |
182 | Monitor *mon; | |
183 | ||
184 | /// perf counter for internal instrumentations | |
185 | PerfCounters *logger; | |
186 | ||
187 | void init_logger(); | |
188 | ||
189 | // my state machine info | |
190 | const string paxos_name; | |
191 | ||
192 | friend class Monitor; | |
193 | friend class PaxosService; | |
194 | ||
195 | list<std::string> extra_state_dirs; | |
196 | ||
197 | // LEADER+PEON | |
198 | ||
199 | // -- generic state -- | |
200 | public: | |
201 | /** | |
202 | * @defgroup Paxos_h_states States on which the leader/peon may be. | |
203 | * @{ | |
204 | */ | |
/// States of the Paxos state machine; get_statename() maps each to a string.
enum {
  /// Leader/Peon is in Paxos' Recovery state.
  STATE_RECOVERING,

  /// Leader/Peon is idle, and the Peon may or may not have a valid lease.
  STATE_ACTIVE,

  /// Leader/Peon is updating to a new value.
  STATE_UPDATING,

  /// Leader proposing an old value.
  STATE_UPDATING_PREVIOUS,

  /// Leader/Peon is writing a new commit. Readable, but not writeable.
  STATE_WRITING,

  /// Leader/Peon is writing a new commit from a previous round.
  STATE_WRITING_PREVIOUS,

  /// Leader: refresh following a commit.
  STATE_REFRESH,

  /// Shutdown after WRITING or WRITING_PREVIOUS.
  STATE_SHUTDOWN
};
236 | ||
237 | /** | |
238 | * Obtain state name from constant value. | |
239 | * | |
240 | * @note This function will raise a fatal error if @p s is not | |
241 | * a valid state value. | |
242 | * | |
243 | * @param s State value. | |
244 | * @return The state's name. | |
245 | */ | |
246 | static const string get_statename(int s) { | |
247 | switch (s) { | |
248 | case STATE_RECOVERING: | |
249 | return "recovering"; | |
250 | case STATE_ACTIVE: | |
251 | return "active"; | |
252 | case STATE_UPDATING: | |
253 | return "updating"; | |
254 | case STATE_UPDATING_PREVIOUS: | |
255 | return "updating-previous"; | |
256 | case STATE_WRITING: | |
257 | return "writing"; | |
258 | case STATE_WRITING_PREVIOUS: | |
259 | return "writing-previous"; | |
260 | case STATE_REFRESH: | |
261 | return "refresh"; | |
35e4c445 FG |
262 | case STATE_SHUTDOWN: |
263 | return "shutdown"; | |
7c673cae FG |
264 | default: |
265 | return "UNKNOWN"; | |
266 | } | |
267 | } | |
268 | ||
269 | private: | |
270 | /** | |
271 | * The state we are in. | |
272 | */ | |
273 | int state; | |
274 | /** | |
275 | * @} | |
276 | */ | |
35e4c445 FG |
277 | int commits_started = 0; |
278 | ||
279 | Cond shutdown_cond; | |
7c673cae FG |
280 | |
281 | public: | |
282 | /** | |
283 | * Check if we are recovering. | |
284 | * | |
285 | * @return 'true' if we are on the Recovering state; 'false' otherwise. | |
286 | */ | |
287 | bool is_recovering() const { return (state == STATE_RECOVERING); } | |
288 | /** | |
289 | * Check if we are active. | |
290 | * | |
291 | * @return 'true' if we are on the Active state; 'false' otherwise. | |
292 | */ | |
293 | bool is_active() const { return state == STATE_ACTIVE; } | |
294 | /** | |
295 | * Check if we are updating. | |
296 | * | |
297 | * @return 'true' if we are on the Updating state; 'false' otherwise. | |
298 | */ | |
299 | bool is_updating() const { return state == STATE_UPDATING; } | |
300 | ||
301 | /** | |
302 | * Check if we are updating/proposing a previous value from a | |
303 | * previous quorum | |
304 | */ | |
305 | bool is_updating_previous() const { return state == STATE_UPDATING_PREVIOUS; } | |
306 | ||
307 | /// @return 'true' if we are writing an update to disk | |
308 | bool is_writing() const { return state == STATE_WRITING; } | |
309 | ||
310 | /// @return 'true' if we are writing an update-previous to disk | |
311 | bool is_writing_previous() const { return state == STATE_WRITING_PREVIOUS; } | |
312 | ||
313 | /// @return 'true' if we are refreshing an update just committed | |
314 | bool is_refresh() const { return state == STATE_REFRESH; } | |
315 | ||
35e4c445 FG |
316 | /// @return 'true' if we are in the process of shutting down |
317 | bool is_shutdown() const { return state == STATE_SHUTDOWN; } | |
318 | ||
7c673cae FG |
319 | private: |
320 | /** | |
321 | * @defgroup Paxos_h_recovery_vars Common recovery-related member variables | |
322 | * @note These variables are common to both the Leader and the Peons. | |
323 | * @{ | |
324 | */ | |
325 | /** | |
326 | * First committed value's version, i.e. the lowest version we hold. | |
327 | */ | |
328 | version_t first_committed; | |
329 | /** | |
330 | * Last Proposal Number | |
331 | * | |
332 | * @todo Expand description | |
333 | */ | |
334 | version_t last_pn; | |
335 | /** | |
336 | * Last committed value's version. | |
337 | * | |
338 | * On both the Leader and the Peons, this is the last value's version that | |
339 | * was accepted by a given quorum and thus committed, that this instance | |
340 | * knows about. | |
341 | * | |
342 | * @note It may not be the last committed value's version throughout the | |
343 | * system. If we are a Peon, we may have not been part of the quorum | |
344 | * that accepted the value, and for this very same reason we may still | |
345 | * be a (couple of) version(s) behind, until we learn about the most | |
346 | * recent version. This should only happen if we are not active (i.e., | |
347 | * part of the quorum), which should not happen if we are up, running | |
348 | * and able to communicate with others -- thus able to be part of the | |
349 | * monmap and trigger new elections. | |
350 | */ | |
351 | version_t last_committed; | |
352 | /** | |
353 | * Last committed value's time. | |
354 | * | |
355 | * When the commit finished. | |
356 | */ | |
357 | utime_t last_commit_time; | |
358 | /** | |
359 | * The last Proposal Number we have accepted. | |
360 | * | |
361 | * On the Leader, it will be the Proposal Number picked by the Leader | |
362 | * itself. On the Peon, however, it will be the proposal sent by the Leader | |
31f18b77 | 363 | * and it will only be updated if its value is higher than the one |
7c673cae FG |
364 | * already known by the Peon. |
365 | */ | |
366 | version_t accepted_pn; | |
367 | /** | |
368 | * The last_committed epoch of the leader at the time we accepted the last pn. | |
369 | * | |
370 | * This has NO SEMANTIC MEANING, and is there only for the debug output. | |
371 | */ | |
372 | version_t accepted_pn_from; | |
373 | /** | |
374 | * Map holding the first committed version by each quorum member. | |
375 | * | |
376 | * The versions kept in this map are updated during the collect phase. | |
377 | * When the Leader starts the collect phase, each Peon will reply with its | |
378 | * first committed version, which will then be kept in this map. | |
379 | */ | |
380 | map<int,version_t> peer_first_committed; | |
381 | /** | |
382 | * Map holding the last committed version by each quorum member. | |
383 | * | |
384 | * The versions kept in this map are updated during the collect phase. | |
385 | * When the Leader starts the collect phase, each Peon will reply with its | |
386 | * last committed version, which will then be kept in this map. | |
387 | */ | |
388 | map<int,version_t> peer_last_committed; | |
389 | /** | |
390 | * @} | |
391 | */ | |
392 | ||
393 | // active (phase 2) | |
394 | /** | |
395 | * @defgroup Paxos_h_active_vars Common active-related member variables | |
396 | * @{ | |
397 | */ | |
398 | /** | |
399 | * When our read lease expires. | |
400 | * | |
401 | * Instead of performing a full commit each time a read is requested, we | |
402 | * keep leases. Each lease will have an expiration date, which may or may | |
403 | * not be extended. | |
404 | */ | |
405 | utime_t lease_expire; | |
406 | /** | |
407 | * List of callbacks waiting for our state to change into STATE_ACTIVE. | |
408 | */ | |
409 | list<Context*> waiting_for_active; | |
410 | /** | |
411 | * List of callbacks waiting for the chance to read a version from us. | |
412 | * | |
413 | * Each entry on the list may result from an attempt to read a version that | |
414 | * wasn't available at the time, or an attempt made during a period during | |
415 | * which we could not satisfy the read request. The first case happens if | |
416 | * the requested version is greater than our last committed version. The | |
417 | * second scenario may happen if we are recovering, or if we don't have a | |
418 | * valid lease. | |
419 | * | |
420 | * The list will be woken up once we change to STATE_ACTIVE with an extended | |
421 | * lease -- which can be achieved if we have everyone on the quorum on board | |
422 | * with the latest proposal, or if we don't really care about the remaining | |
423 | * uncommitted values --, or if we're on a quorum of one. | |
424 | */ | |
425 | list<Context*> waiting_for_readable; | |
426 | /** | |
427 | * @} | |
428 | */ | |
429 | ||
430 | // -- leader -- | |
431 | // recovery (paxos phase 1) | |
432 | /** | |
433 | * @defgroup Paxos_h_leader_recovery Leader-specific Recovery-related vars | |
434 | * @{ | |
435 | */ | |
436 | /** | |
437 | * Number of replies to the collect phase we've received so far. | |
438 | * | |
439 | * This variable is reset to 1 each time we start a collect phase; it is | |
440 | * incremented each time we receive a reply to the collect message, and | |
441 | * is used to determine whether or not we have received replies from the | |
442 | * whole quorum. | |
443 | */ | |
444 | unsigned num_last; | |
445 | /** | |
446 | * Uncommitted value's version. | |
447 | * | |
448 | * If we have, or end up knowing about, an uncommitted value, then its | |
449 | * version will be kept in this variable. | |
450 | * | |
451 | * @note If this version equals @p last_committed+1 when we reach the final | |
452 | * steps of recovery, then the algorithm will assume this is a value | |
453 | * the Leader does not know about, and trustingly the Leader will | |
454 | * propose this version's value. | |
455 | */ | |
456 | version_t uncommitted_v; | |
457 | /** | |
458 | * Uncommitted value's Proposal Number. | |
459 | * | |
460 | * We use this variable to assess if the Leader should take into consideration | |
461 | * an uncommitted value sent by a Peon. Given that the Peon will send back to | |
462 | * the Leader the last Proposal Number it accepted, the Leader will be able | |
463 | * to infer if this value is more recent than the one the Leader has, thus | |
464 | * more relevant. | |
465 | */ | |
466 | version_t uncommitted_pn; | |
467 | /** | |
468 | * Uncommitted Value. | |
469 | * | |
470 | * If the system fails in-between the accept replies from the Peons and the | |
471 | * instruction to commit from the Leader, then we may end up with accepted | |
472 | * but yet-uncommitted values. During the Leader's recovery, it will attempt | |
473 | * to bring the whole system to the latest state, and that means committing | |
474 | * past accepted but uncommitted values. | |
475 | * | |
476 | * This variable will hold an uncommitted value, which may originate either | |
477 | * on the Leader, or learnt by the Leader from a Peon during the collect | |
478 | * phase. | |
479 | */ | |
480 | bufferlist uncommitted_value; | |
481 | /** | |
482 | * Used to specify when an on-going collect phase times out. | |
483 | */ | |
484 | Context *collect_timeout_event; | |
485 | /** | |
486 | * @} | |
487 | */ | |
488 | ||
489 | // active | |
490 | /** | |
491 | * @defgroup Paxos_h_leader_active Leader-specific Active-related vars | |
492 | * @{ | |
493 | */ | |
494 | /** | |
495 | * Set of participants (Leader & Peons) that have acked a lease extension. | |
496 | * | |
497 | * Each Peon that acknowledges a lease extension will have its place in this | |
498 | * set, which will be used to account for all the acks from all the quorum | |
499 | * members, guaranteeing that we trigger new elections if some don't ack in | |
500 | * the expected timeframe. | |
501 | */ | |
502 | set<int> acked_lease; | |
503 | /** | |
504 | * Callback responsible for extending the lease periodically. | |
505 | */ | |
506 | Context *lease_renew_event; | |
507 | /** | |
508 | * Callback to trigger new elections once the time for acks is out. | |
509 | */ | |
510 | Context *lease_ack_timeout_event; | |
511 | /** | |
512 | * @} | |
513 | */ | |
514 | /** | |
515 | * @defgroup Paxos_h_peon_active Peon-specific Active-related vars | |
516 | * @{ | |
517 | */ | |
518 | /** | |
519 | * Callback to trigger new elections when the Peon's lease times out. | |
520 | * | |
521 | * If the Peon's lease is extended, this callback will be reset (i.e., | |
522 | * we cancel the event and reschedule a new one, starting from the | |
523 | * beginning). | |
524 | */ | |
525 | Context *lease_timeout_event; | |
526 | /** | |
527 | * @} | |
528 | */ | |
529 | ||
530 | // updating (paxos phase 2) | |
531 | /** | |
532 | * @defgroup Paxos_h_leader_updating Leader-specific Updating-related vars | |
533 | * @{ | |
534 | */ | |
535 | /** | |
536 | * New Value being proposed to the Peons. | |
537 | * | |
538 | * This bufferlist holds the value the Leader is proposing to the Peons, and | |
539 | * that will be committed if the Peons do accept the proposal. | |
540 | */ | |
541 | bufferlist new_value; | |
542 | /** | |
543 | * Set of participants (Leader & Peons) that accepted the new proposed value. | |
544 | * | |
545 | * This set is used to keep track of those who have accepted the proposed | |
546 | * value, so the leader may know when to issue a commit (when a majority of | |
547 | * participants has accepted the proposal), and when to extend the lease | |
548 | * (when all the quorum members have accepted the proposal). | |
549 | */ | |
550 | set<int> accepted; | |
551 | /** | |
552 | * Callback to trigger a new election if the proposal is not accepted by the | |
553 | * full quorum within a given timeframe. | |
554 | * | |
555 | * If the full quorum does not accept the proposal, then it means that the | |
556 | * Leader may no longer be recognized as the leader, or that the quorum has | |
557 | * changed, and the value may have not reached all the participants. Thus, | |
558 | * the leader must call new elections, and go through a recovery phase in | |
559 | * order to propagate the new value throughout the system. | |
560 | * | |
561 | * This does not mean that we won't commit. We will commit as soon as we | |
562 | * have a majority of acceptances. But if we do not have full acceptance | |
563 | * from the quorum, then we cannot extend the lease, as some participants | |
564 | * may not have the latest committed value. | |
565 | */ | |
566 | Context *accept_timeout_event; | |
567 | ||
568 | /** | |
569 | * List of callbacks waiting for it to be possible to write again. | |
570 | * | |
571 | * @remarks It is not possible to write if we are not the Leader, or we are | |
572 | * not on the active state, or if the lease has expired. | |
573 | */ | |
574 | list<Context*> waiting_for_writeable; | |
575 | /** | |
576 | * List of callbacks waiting for a commit to finish. | |
577 | * | |
578 | * @remarks This may be used to a) wait for an on-going commit to finish | |
579 | * before we proceed with, say, a new proposal; or b) wait for the | |
580 | * next commit to be finished so we are sure that our value was | |
581 | * fully committed. | |
582 | */ | |
583 | list<Context*> waiting_for_commit; | |
584 | ||
585 | /** | |
586 | * Pending proposal transaction | |
587 | * | |
588 | * This is the transaction that is under construction and pending | |
589 | * proposal. We will add operations to it until we decide it is | |
590 | * time to start a paxos round. | |
591 | */ | |
592 | MonitorDBStore::TransactionRef pending_proposal; | |
593 | ||
594 | /** | |
595 | * Finishers for pending transaction | |
596 | * | |
597 | * These are waiting for updates in the pending proposal/transaction | |
598 | * to be committed. | |
599 | */ | |
600 | list<Context*> pending_finishers; | |
601 | ||
602 | /** | |
603 | * Finishers for committing transaction | |
604 | * | |
605 | * When the pending_proposal is submitted, pending_finishers move to | |
606 | * this list. When it commits, these finishers are notified. | |
607 | */ | |
608 | list<Context*> committing_finishers; | |
609 | ||
610 | /** | |
611 | * @defgroup Paxos_h_sync_warns Synchronization warnings | |
612 | * @todo Describe these variables | |
613 | * @{ | |
614 | */ | |
615 | utime_t last_clock_drift_warn; | |
616 | int clock_drift_warned; | |
617 | /** | |
618 | * @} | |
619 | */ | |
620 | ||
621 | /** | |
622 | * Should be true if we have proposed to trim, or are in the middle of | |
623 | * trimming; false otherwise. | |
624 | */ | |
625 | bool trimming; | |
626 | ||
31f18b77 FG |
627 | /** |
628 | * true if we want trigger_propose to *not* propose (yet) | |
629 | */ | |
630 | bool plugged = false; | |
631 | ||
7c673cae FG |
632 | /** |
633 | * @defgroup Paxos_h_callbacks Callback classes. | |
634 | * @{ | |
635 | */ | |
636 | /** | |
637 | * Callback class responsible for handling a Collect Timeout. | |
638 | */ | |
639 | class C_CollectTimeout; | |
640 | /** | |
641 | * Callback class responsible for handling an Accept Timeout. | |
642 | */ | |
643 | class C_AcceptTimeout; | |
644 | /** | |
645 | * Callback class responsible for handling a Lease Ack Timeout. | |
646 | */ | |
647 | class C_LeaseAckTimeout; | |
648 | ||
649 | /** | |
650 | * Callback class responsible for handling a Lease Timeout. | |
651 | */ | |
652 | class C_LeaseTimeout; | |
653 | ||
654 | /** | |
655 | * Callback class responsible for handling a Lease Renew Timeout. | |
656 | */ | |
657 | class C_LeaseRenew; | |
658 | ||
659 | class C_Trimmed; | |
660 | /** | |
661 | * | |
662 | */ | |
663 | public: | |
664 | class C_Proposal : public Context { | |
665 | Context *proposer_context; | |
666 | public: | |
667 | bufferlist bl; | |
668 | // for debug purposes. Will go away. Soon. | |
669 | bool proposed; | |
670 | utime_t proposal_time; | |
671 | ||
672 | C_Proposal(Context *c, bufferlist& proposal_bl) : | |
673 | proposer_context(c), | |
674 | bl(proposal_bl), | |
675 | proposed(false), | |
676 | proposal_time(ceph_clock_now()) | |
677 | { } | |
678 | ||
679 | void finish(int r) override { | |
680 | if (proposer_context) { | |
681 | proposer_context->complete(r); | |
682 | proposer_context = NULL; | |
683 | } | |
684 | } | |
685 | }; | |
686 | /** | |
687 | * @} | |
688 | */ | |
689 | private: | |
690 | /** | |
691 | * @defgroup Paxos_h_election_triggered Steps triggered by an election. | |
692 | * | |
693 | * @note All these functions play a significant role in the Recovery Phase, | |
694 | * which is triggered right after an election once someone becomes | |
695 | * the Leader. | |
696 | * @{ | |
697 | */ | |
698 | /** | |
699 | * Create a new Proposal Number and propose it to the Peons. | |
700 | * | |
701 | * This function starts the Recovery Phase, which can be directly mapped | |
702 | * onto the original Paxos' Prepare phase. Basically, we'll generate a | |
703 | * Proposal Number, taking @p oldpn into consideration, and we will send | |
704 | * it to a quorum, along with our first and last committed versions. By | |
705 | * sending this information in a message to the quorum, we expect to | |
706 | * obtain acceptances from a majority, allowing us to commit, or be | |
707 | * informed of a higher Proposal Number known by one or more of the Peons | |
708 | * in the quorum. | |
709 | * | |
710 | * @pre We are the Leader. | |
711 | * @post Recovery Phase initiated by sending messages to the quorum. | |
712 | * | |
713 | * @param oldpn A proposal number taken as the highest known so far, that | |
714 | * should be taken into consideration when generating a new | |
715 | * Proposal Number for the Recovery Phase. | |
716 | */ | |
717 | void collect(version_t oldpn); | |
718 | /** | |
719 | * Handle the reception of a collect message from the Leader and reply | |
720 | * accordingly. | |
721 | * | |
722 | * Once a Peon receives a collect message from the Leader it will reply | |
723 | * with its first and last committed versions, as well as information so | |
724 | * the Leader may know if its Proposal Number was, or was not, accepted by | |
31f18b77 | 725 | * the Peon. The Peon will accept the Leader's Proposal Number if it is |
7c673cae FG |
726 | * higher than the Peon's currently accepted Proposal Number. The Peon may |
727 | * also inform the Leader of accepted but uncommitted values. | |
728 | * | |
729 | * @invariant The message is an operation of type OP_COLLECT. | |
730 | * @pre We are a Peon. | |
731 | * @post Replied to the Leader, accepting or not accepting its PN. | |
732 | * | |
733 | * @param op The collect message sent by the Leader to the Peon. | |
734 | */ | |
735 | void handle_collect(MonOpRequestRef op); | |
736 | /** | |
737 | * Handle a response from a Peon to the Leader's collect phase. | |
738 | * | |
739 | * The received message will state the Peon's last committed version, as | |
740 | * well as its last proposal number. This will lead to one of the following | |
741 | * scenarios: if the replied Proposal Number is equal to the one we proposed, | |
742 | * then the Peon has accepted our proposal, and if all the Peons do accept | |
743 | * our Proposal Number, then we are allowed to proceed with the commit; | |
744 | * however, if a Peon replies with a higher Proposal Number, we assume he | |
745 | * knows something we don't and the Leader will have to abort the current | |
746 | * proposal in order to retry with the Proposal Number specified by the Peon. | |
747 | * It may also occur that the Peon replied with a lower Proposal Number, in | |
748 | * which case we assume it is a reply to an older value and we'll simply | |
749 | * drop it. | |
750 | * This function will also check if the Peon replied with an accepted but | |
751 | * yet uncommitted value. In this case, if its version is higher than our | |
752 | * last committed value by one, we assume that the Peon knows a value from a | |
753 | * previous proposal that has never been committed, and we should try to | |
754 | * commit that value by proposing it next. On the other hand, if that is | |
755 | * not the case, we'll assume it is an old, uncommitted value, we do not | |
756 | * care about and we'll consider the system active by extending the leases. | |
757 | * | |
758 | * @invariant The message is an operation of type OP_LAST. | |
759 | * @pre We are the Leader. | |
760 | * @post We initiate a commit, or we retry with a higher Proposal Number, | |
761 | * or we drop the message. | |
762 | * @post We move from STATE_RECOVERING to STATE_ACTIVE. | |
763 | * | |
764 | * @param op The message sent by the Peon to the Leader. | |
765 | */ | |
766 | void handle_last(MonOpRequestRef op); | |
767 | /** | |
768 | * The Recovery Phase timed out, meaning that a significant part of the | |
769 | * quorum does not believe we are the Leader, and we thus should trigger new | |
770 | * elections. | |
771 | * | |
772 | * @pre We believe we are the Leader. | |
773 | * @post Trigger new elections. | |
774 | */ | |
775 | void collect_timeout(); | |
776 | /** | |
777 | * @} | |
778 | */ | |
779 | ||
780 | /** | |
781 | * @defgroup Paxos_h_updating_funcs Functions used during the Updating State | |
782 | * | |
783 | * These functions may easily be mapped to the original Paxos Algorithm's | |
784 | * phases. | |
785 | * | |
786 | * Taking into account the algorithm can be divided in 4 phases (Prepare, | |
787 | * Promise, Accept Request and Accepted), we can easily map Paxos::begin to | |
788 | * both the Prepare and Accept Request phases; the Paxos::handle_begin to | |
789 | * the Promise phase; and the Paxos::handle_accept to the Accepted phase. | |
790 | * @{ | |
791 | */ | |
792 | /** | |
793 | * Start a new proposal with the intent of committing @p value. | |
794 | * | |
795 | * If we are alone on the system (i.e., a quorum of one), then we will | |
796 | * simply commit the value, but if we are not alone, then we need to propose | |
797 | * the value to the quorum. | |
798 | * | |
799 | * @pre We are the Leader | |
800 | * @pre We are on STATE_ACTIVE | |
31f18b77 | 801 | * @post We commit, if we are alone, or we send a message to each quorum |
7c673cae | 802 | * member |
31f18b77 | 803 | * @post We are on STATE_ACTIVE, if we are alone, or on |
7c673cae FG |
804 | * STATE_UPDATING otherwise |
805 | * | |
806 | * @param value The value being proposed to the quorum | |
807 | */ | |
808 | void begin(bufferlist& value); | |
809 | /** | |
810 | * Accept or decline (by ignoring) a proposal from the Leader. | |
811 | * | |
812 | * We will decline the proposal (by ignoring it) if we have promised to | |
813 | * accept a higher numbered proposal. If that is not the case, we will | |
814 | * accept it and accordingly reply to the Leader. | |
815 | * | |
816 | * @pre We are a Peon | |
817 | * @pre We are on STATE_ACTIVE | |
31f18b77 FG |
818 | * @post We are on STATE_UPDATING if we accept the Leader's proposal |
819 | * @post We send a reply message to the Leader if we accept its proposal | |
7c673cae FG |
820 | * |
821 | * @invariant The received message is an operation of type OP_BEGIN | |
822 | * | |
823 | * @param op The message sent by the Leader to the Peon during the | |
824 | * Paxos::begin function | |
825 | * | |
826 | */ | |
827 | void handle_begin(MonOpRequestRef op); | |
828 | /** | |
829 | * Handle an Accept message sent by a Peon. | |
830 | * | |
831 | * In order to commit, the Leader has to receive accepts from a majority of | |
832 | * the quorum. If that does happen, then the Leader may proceed with the | |
833 | * commit. However, the Leader needs the accepts from all the quorum members | |
834 | * in order to extend the lease and move on to STATE_ACTIVE. | |
835 | * | |
836 | * This function handles these two situations, accounting for the amount of | |
837 | * received accepts. | |
838 | * | |
839 | * @pre We are the Leader | |
840 | * @pre We are on STATE_UPDATING | |
31f18b77 FG |
841 | * @post We are on STATE_ACTIVE if we received accepts from the full quorum |
842 | * @post We extended the lease if we moved on to STATE_ACTIVE | |
843 | * @post We are on STATE_UPDATING if we didn't receive accepts from the | |
7c673cae | 844 | * full quorum |
31f18b77 | 845 | * @post We have committed if we received accepts from a majority |
7c673cae FG |
846 | * |
847 | * @invariant The received message is an operation of type OP_ACCEPT | |
848 | * | |
849 | * @param op The message sent by the Peons to the Leader during the | |
850 | * Paxos::handle_begin function | |
851 | */ | |
852 | void handle_accept(MonOpRequestRef op); | |
853 | /** | |
854 | * Trigger a fresh election. | |
855 | * | |
856 | * During Paxos::begin we set a Callback of type Paxos::C_AcceptTimeout in | |
857 | * order to limit the amount of time we spend waiting for Accept replies. | |
858 | * This callback will call Paxos::accept_timeout when it is fired. | |
859 | * | |
860 | * This is essential to the algorithm because there may be the chance that | |
861 | * we are no longer the Leader (i.e., others don't believe in us) and we | |
862 | * are getting ignored, or we dropped out of the quorum and haven't realised | |
863 | * it. So, our only option is to trigger fresh elections. | |
864 | * | |
865 | * @pre We are the Leader | |
866 | * @pre We are on STATE_UPDATING | |
867 | * @post Triggered fresh elections | |
868 | */ | |
869 | void accept_timeout(); | |
870 | /** | |
871 | * @} | |
872 | */ | |
873 | ||
874 | ||
875 | utime_t commit_start_stamp; | |
876 | friend struct C_Committed; | |
877 | ||
878 | /** | |
879 | * Commit a value throughout the system. | |
880 | * | |
881 | * The Leader will cancel the current lease (as it was for the old value), | |
882 | * and will store the committed value locally. It will then instruct every | |
883 | * quorum member to do so as well. | |
884 | * | |
885 | * @pre We are the Leader | |
886 | * @pre We are on STATE_UPDATING | |
887 | * @pre A majority of quorum members accepted our proposal | |
888 | * @post Value locally stored | |
889 | * @post Quorum members instructed to commit the new value. | |
890 | */ | |
891 | void commit_start(); | |
892 | void commit_finish(); ///< finish a commit after txn becomes durable | |
35e4c445 | 893 | void abort_commit(); ///< Handle commit finish after shutdown started |
7c673cae FG |
894 | /** |
895 | * Commit the new value to stable storage as being the latest available | |
896 | * version. | |
897 | * | |
898 | * @pre We are a Peon | |
899 | * @post The new value is locally stored | |
900 | * @post Fire up the callbacks waiting on waiting_for_commit | |
901 | * | |
902 | * @invariant The received message is an operation of type OP_COMMIT | |
903 | * | |
904 | * @param op The message sent by the Leader to the Peon during | |
905 | * Paxos::commit | |
906 | */ | |
907 | void handle_commit(MonOpRequestRef op); | |
908 | /** | |
909 | * Extend the system's lease. | |
910 | * | |
911 | * This means that the Leader considers that it should now be safe to read from | |
912 | * any node on the system, since every quorum member is now in possession of | |
913 | * the latest version. Therefore, the Leader will send a message stating just | |
914 | * this to each quorum member, and will impose a limited timeframe during | |
915 | * which acks will be accepted. If there aren't as many acks as expected | |
916 | * (i.e., if at least one quorum member does not ack the lease) during this | |
917 | * timeframe, then we will force fresh elections. | |
918 | * | |
919 | * @pre We are the Leader | |
920 | * @pre We are on STATE_ACTIVE | |
921 | * @post A message extending the lease is sent to each quorum member | |
922 | * @post A timeout callback is set to limit the amount of time we will wait | |
923 | * for lease acks. | |
924 | * @post A timer is set in order to renew the lease after a certain amount | |
925 | * of time. | |
926 | */ | |
927 | void extend_lease(); | |
928 | /** | |
929 | * Update the lease on the Peon's side of things. | |
930 | * | |
931 | * Once a Peon receives a Lease message, it will update its lease_expire | |
932 | * variable, reply to the Leader acknowledging the lease update and set a | |
933 | * timeout callback to be fired upon the lease's expiration. Finally, the | |
934 | * Peon will fire up all the callbacks waiting for it to become active, | |
935 | * which it just did, and all those waiting for it to become readable, | |
936 | * which should be true if the Peon's lease didn't expire in the mean time. | |
937 | * | |
938 | * @pre We are a Peon | |
939 | * @post We update the lease accordingly | |
940 | * @post A lease timeout callback is set | |
941 | * @post Move to STATE_ACTIVE | |
942 | * @post Fire up all the callbacks waiting for STATE_ACTIVE | |
31f18b77 | 943 | * @post Fire up all the callbacks waiting for readable if we are readable |
7c673cae FG |
944 | * @post Ack the lease to the Leader |
945 | * | |
946 | * @invariant The received message is an operation of type OP_LEASE | |
947 | * | |
948 | * @param op The message sent by the Leader to the Peon during the | |
949 | * Paxos::extend_lease function | |
950 | */ | |
951 | void handle_lease(MonOpRequestRef op); | |
952 | /** | |
953 | * Account for all the Lease Acks the Leader receives from the Peons. | |
954 | * | |
955 | * Once the Leader receives all the Lease Acks from the Peons, it will be | |
956 | * able to cancel the Lease Ack timeout callback, thus avoiding calling | |
957 | * fresh elections. | |
958 | * | |
959 | * @pre We are the Leader | |
31f18b77 | 960 | * @post Cancel the Lease Ack timeout callback if we receive acks from all |
7c673cae FG |
961 | * the quorum members |
962 | * | |
963 | * @invariant The received message is an operation of type OP_LEASE_ACK | |
964 | * | |
965 | * @param op The message sent by a Peon to the Leader during the | |
966 | * Paxos::handle_lease function | |
967 | */ | |
968 | void handle_lease_ack(MonOpRequestRef op); | |
969 | /** | |
970 | * Call fresh elections because at least one Peon didn't ack our lease. | |
971 | * | |
972 | * @pre We are the Leader | |
973 | * @pre We are on STATE_ACTIVE | |
974 | * @post Trigger fresh elections | |
975 | */ | |
976 | void lease_ack_timeout(); | |
977 | /** | |
978 | * Extend lease since we haven't had new committed values meanwhile. | |
979 | * | |
980 | * @pre We are the Leader | |
981 | * @pre We are on STATE_ACTIVE | |
982 | * @post Go through with Paxos::extend_lease | |
983 | */ | |
984 | void lease_renew_timeout(); | |
985 | /** | |
986 | * Call fresh elections because the Peon's lease expired without being | |
987 | * renewed or receiving a fresh lease. | |
988 | * | |
989 | * This means that the Peon is no longer assumed as being in the quorum | |
990 | * (or there is no Leader to speak of), so just trigger fresh elections | |
991 | * to circumvent this issue. | |
992 | * | |
993 | * @pre We are a Peon | |
994 | * @post Trigger fresh elections | |
995 | */ | |
996 | void lease_timeout(); // on peon, if lease isn't extended | |
997 | ||
998 | /// restart the lease timeout timer | |
999 | void reset_lease_timeout(); | |
1000 | ||
1001 | /** | |
1002 | * Cancel all of Paxos' timeout/renew events. | |
1003 | */ | |
1004 | void cancel_events(); | |
1005 | /** | |
1006 | * Shutdown this Paxos machine | |
1007 | */ | |
1008 | void shutdown(); | |
1009 | ||
1010 | /** | |
1011 | * Generate a new Proposal Number based on @p gt | |
1012 | * | |
1013 | * @todo Check what @p gt actually means and what its usage entails | |
1014 | * @param gt A hint for the generation of the Proposal Number | |
1015 | * @return A globally unique, monotonically increasing Proposal Number | |
1016 | */ | |
1017 | version_t get_new_proposal_number(version_t gt=0); | |
1018 | ||
1019 | /** | |
1020 | * @todo document sync function | |
1021 | */ | |
1022 | void warn_on_future_time(utime_t t, entity_name_t from); | |
1023 | ||
1024 | /** | |
1025 | * Begin proposing the pending_proposal. | |
1026 | */ | |
1027 | void propose_pending(); | |
1028 | ||
1029 | /** | |
1030 | * refresh state from store | |
1031 | * | |
1032 | * Called when we have new state for the mon to consume. If we return false, | |
1033 | * abort (we triggered a bootstrap). | |
1034 | * | |
1035 | * @returns true on success, false if we are now bootstrapping | |
1036 | */ | |
1037 | bool do_refresh(); | |
1038 | ||
1039 | void commit_proposal(); | |
1040 | void finish_round(); | |
1041 | ||
1042 | public: | |
1043 | /** | |
1044 | * @param m A monitor | |
1045 | * @param name A name for the paxos service. It serves as the naming space | |
1046 | * of the underlying persistent storage for this service. | |
1047 | */ | |
1048 | Paxos(Monitor *m, const string &name) /* all members listed below are set in the initializer list; the constructor body is empty */ | |
1049 | : mon(m), | |
1050 | logger(NULL), /* no logger at construction time */ | |
1051 | paxos_name(name), | |
1052 | state(STATE_RECOVERING), /* a freshly constructed instance starts in STATE_RECOVERING */ | |
1053 | first_committed(0), | |
1054 | last_pn(0), | |
1055 | last_committed(0), | |
1056 | accepted_pn(0), | |
1057 | accepted_pn_from(0), | |
1058 | num_last(0), | |
1059 | uncommitted_v(0), uncommitted_pn(0), | |
1060 | collect_timeout_event(0), /* the five *_event members start at 0 (presumably null pointers meaning "nothing scheduled" -- confirm against their declarations) */ | |
1061 | lease_renew_event(0), | |
1062 | lease_ack_timeout_event(0), | |
1063 | lease_timeout_event(0), | |
1064 | accept_timeout_event(0), | |
1065 | clock_drift_warned(0), | |
1066 | trimming(false) { } | |
1067 | ||
1068 | const string get_name() const { /* returns paxos_name by value, i.e. the name given to the constructor */ | |
1069 | return paxos_name; | |
1070 | } | |
1071 | ||
1072 | void dispatch(MonOpRequestRef op); | |
1073 | ||
1074 | void read_and_prepare_transactions(MonitorDBStore::TransactionRef tx, | |
1075 | version_t from, version_t last); | |
1076 | ||
1077 | void init(); | |
1078 | ||
1079 | /** | |
1080 | * dump state info to a formatter | |
1081 | */ | |
1082 | void dump_info(Formatter *f); | |
1083 | ||
1084 | /** | |
1085 | * This function runs basic consistency checks. Importantly, if | |
1086 | * it is inconsistent and shouldn't be, it asserts out. | |
1087 | * | |
1088 | * @return True if consistent, false if not. | |
1089 | */ | |
1090 | bool is_consistent(); | |
1091 | ||
1092 | void restart(); | |
1093 | /** | |
1094 | * Initiate the Leader after it wins an election. | |
1095 | * | |
1096 | * Once an election is won, the Leader will be initiated and there are two | |
1097 | * possible outcomes of this method: the Leader directly jumps to the active | |
1098 | * state (STATE_ACTIVE) if it believes itself to be the only one in the quorum, or | |
1099 | * will start recovering (STATE_RECOVERING) by initiating the collect phase. | |
1100 | * | |
1101 | * @pre Our monitor is the Leader. | |
1102 | * @post We are either on STATE_ACTIVE if we're the only one in the quorum, | |
1103 | * or on STATE_RECOVERING otherwise. | |
1104 | */ | |
1105 | void leader_init(); | |
1106 | /** | |
1107 | * Initiate a Peon after it loses an election. | |
1108 | * | |
1109 | * If we are a Peon, then there must be a Leader and we are not alone in the | |
1110 | * quorum, thus automatically assume we are on STATE_RECOVERING, which means | |
1111 | * we will soon be enrolled into the Leader's collect phase. | |
1112 | * | |
1113 | * @pre There is a Leader, and it's about to start the collect phase. | |
1114 | * @post We are on STATE_RECOVERING and will soon receive collect phase's | |
1115 | * messages. | |
1116 | */ | |
1117 | void peon_init(); | |
1118 | ||
1119 | /** | |
1120 | * Include an incremental state of values, ranging from peer_first_committed | |
1121 | * to the last committed value, on the message m | |
1122 | * | |
1123 | * @param m A message | |
1124 | * @param peer_first_committed Lowest version to take into account | |
1125 | * @param peer_last_committed Highest version to take into account | |
1126 | */ | |
1127 | void share_state(MMonPaxos *m, version_t peer_first_committed, | |
1128 | version_t peer_last_committed); | |
1129 | /** | |
1130 | * Store on disk a state that was shared with us | |
1131 | * | |
1132 | * Basically, we received a set of versions. Or just one. It doesn't matter. | |
1133 | * What matters is that we have to stash it in the store. So, we will simply | |
1134 | * write every single bufferlist into their own versions on our side (i.e., | |
1135 | * onto paxos-related keys), and then we will decode those same bufferlists | |
1136 | * we just wrote and apply the transactions they hold. We will also update | |
1137 | * our first and last committed values to point to the new values, if need | |
1138 | * be. All this is done tightly wrapped in a transaction to ensure we | |
1139 | * enjoy the atomicity guarantees given by our awesome k/v store. | |
1140 | * | |
1141 | * @param m A message | |
1142 | * @returns true if we stored something new; false otherwise | |
1143 | */ | |
1144 | bool store_state(MMonPaxos *m); | |
1145 | void _sanity_check_store(); | |
1146 | ||
1147 | /** | |
1148 | * Helper function to decode a bufferlist into a transaction and append it | |
1149 | * to another transaction. | |
1150 | * | |
1151 | * This function is used during the Leader's commit and during the | |
1152 | * Paxos::store_state in order to apply the bufferlist's transaction onto | |
1153 | * the store. | |
1154 | * | |
1155 | * @param t The transaction to which we will append the operations | |
1156 | * @param bl A bufferlist containing an encoded transaction | |
1157 | */ | |
1158 | static void decode_append_transaction(MonitorDBStore::TransactionRef t, | |
1159 | bufferlist& bl) { | |
1160 | auto vt(std::make_shared<MonitorDBStore::Transaction>()); /* scratch transaction that receives the decoded operations */ | |
1161 | bufferlist::iterator it = bl.begin(); /* decoding starts at the beginning of bl */ | |
1162 | vt->decode(it); | |
1163 | t->append(vt); /* add the decoded transaction to the caller-supplied transaction t */ | |
1164 | } | |
1165 | ||
1166 | /** | |
1167 | * @todo This appears to be used only by the OSDMonitor, and I would say | |
1168 | * its objective is to allow a third-party to have a "private" | |
1169 | * state dir. -JL | |
1170 | */ | |
1171 | void add_extra_state_dir(string s) { /* appends s to the extra_state_dirs container */ | |
1172 | extra_state_dirs.push_back(s); | |
1173 | } | |
1174 | ||
1175 | // -- service interface -- | |
1176 | /** | |
1177 | * Add c to the list of callbacks waiting for us to become active. | |
1178 | * | |
1179 | * @param c A callback | |
1180 | */ | |
1181 | void wait_for_active(MonOpRequestRef op, Context *c) { | |
1182 | if (op) /* only annotate when an op was supplied; the overload below passes a default-constructed ref (presumably tests false -- confirm) */ | |
1183 | op->mark_event("paxos:wait_for_active"); | |
1184 | waiting_for_active.push_back(c); /* c is queued whether or not an op was given */ | |
1185 | } | |
1186 | void wait_for_active(Context *c) { /* convenience overload for callers that have no op to annotate */ | |
1187 | MonOpRequestRef o; | |
1188 | wait_for_active(o, c); | |
1189 | } | |
1190 | ||
1191 | /** | |
1192 | * Trim the Paxos state as much as we can. | |
1193 | */ | |
1194 | void trim(); | |
1195 | ||
1196 | /** | |
1197 | * Check if we should trim. | |
1198 | * | |
1199 | * If trimming is disabled, we must take that into consideration and only | |
1200 | * return true if we are positively sure that we should trim soon. | |
1201 | * | |
1202 | * @returns true if we should trim; false otherwise. | |
1203 | */ | |
1204 | bool should_trim() { | |
1205 | int available_versions = get_version() - get_first_committed(); /* difference of the two version_t getters, stored in an int */ | |
1206 | int maximum_versions = g_conf->paxos_min + g_conf->paxos_trim_min; /* threshold: sum of the two config values */ | |
1207 | |
1208 | if (trimming || (available_versions <= maximum_versions)) /* never report true while the trimming flag is set, nor while at or below the threshold */ | |
1209 | return false; | |
1210 | |
1211 | return true; | |
1212 | } | |
31f18b77 FG |
1213 | |
1214 | bool is_plugged() const { /* reports the current value of the plugged flag */ | |
1215 | return plugged; | |
1216 | } | |
1217 | void plug() { /* sets the plugged flag; asserts that it was not already set */ | |
1218 | assert(plugged == false); | |
1219 | plugged = true; | |
1220 | } | |
1221 | void unplug() { /* clears the plugged flag; asserts that it was set, so plug()/unplug() calls must be paired */ | |
1222 | assert(plugged == true); | |
1223 | plugged = false; | |
1224 | } | |
1225 | ||
7c673cae FG |
1226 | // read |
1227 | /** | |
1228 | * @defgroup Paxos_h_read_funcs Read-related functions | |
1229 | * @{ | |
1230 | */ | |
1231 | /** | |
1232 | * Get the latest committed version. | |
1233 | * | |
1234 | * @return the current value of last_committed | |
1235 | */ | |
1236 | version_t get_version() { return last_committed; } | |
1237 | /** | |
1238 | * Get the first committed version. | |
1239 | * | |
1240 | * @return the current value of first_committed | |
1241 | */ | |
1242 | version_t get_first_committed() { return first_committed; } | |
1243 | /** | |
1244 | * Get the last commit time. | |
1245 | * | |
1246 | * @returns A copy of last_commit_time | |
1247 | */ | |
1248 | utime_t get_last_commit_time() const{ | |
1249 | return last_commit_time; | |
1250 | } | |
1251 | /** | |
1252 | * Check if a given version is readable. | |
1253 | * | |
1254 | * A version may not be readable for a myriad of reasons: | |
1255 | * @li the version @e v is higher than the last committed version | |
1256 | * @li we are neither the Leader nor a Peon (election may be on-going) | |
1257 | * @li we do not have a committed value yet | |
1258 | * @li we do not have a valid lease | |
1259 | * | |
1260 | * @param seen The version we want to check if it is readable. | |
1261 | * @return 'true' if the version is readable; 'false' otherwise. | |
1262 | */ | |
1263 | bool is_readable(version_t seen=0); | |
1264 | /** | |
1265 | * Read version @e v and store its value in @e bl | |
1266 | * | |
1267 | * @param[in] v The version we want to read | |
1268 | * @param[out] bl The version's value | |
1269 | * @return 'true' if we successfully read the value; 'false' otherwise | |
1270 | */ | |
1271 | bool read(version_t v, bufferlist &bl); | |
1272 | /** | |
1273 | * Read the latest committed version | |
1274 | * | |
1275 | * @param[out] bl The version's value | |
1276 | * @return the latest committed version if we successfully read the value; | |
1277 | * or 0 (zero) otherwise. | |
1278 | */ | |
1279 | version_t read_current(bufferlist &bl); | |
1280 | /** | |
1281 | * Add onreadable to the list of callbacks waiting for us to become readable. | |
1282 | * | |
1283 | * @param onreadable A callback | |
1284 | */ | |
1285 | void wait_for_readable(MonOpRequestRef op, Context *onreadable) { | |
1286 | assert(!is_readable()); /* precondition: callers may only wait while is_readable() is false */ | |
1287 | if (op) /* only annotate when an op was supplied */ | |
1288 | op->mark_event("paxos:wait_for_readable"); | |
1289 | waiting_for_readable.push_back(onreadable); | |
1290 | } | |
1291 | void wait_for_readable(Context *onreadable) { /* convenience overload for callers that have no op to annotate */ | |
1292 | MonOpRequestRef o; | |
1293 | wait_for_readable(o, onreadable); | |
1294 | } | |
1295 | /** | |
1296 | * @} | |
1297 | */ | |
1298 | ||
1299 | /** | |
1300 | * Check if we have a valid lease. | |
1301 | * | |
1302 | * @returns true if the lease is still valid; false otherwise. | |
1303 | */ | |
1304 | bool is_lease_valid(); | |
1305 | // write | |
1306 | /** | |
1307 | * @defgroup Paxos_h_write_funcs Write-related functions | |
1308 | * @{ | |
1309 | */ | |
1310 | /** | |
1311 | * Check if we are writeable. | |
1312 | * | |
1313 | * We are writeable if we are alone (i.e., a quorum of one), or if we match | |
1314 | * all the following conditions: | |
1315 | * @li We are the Leader | |
1316 | * @li We are on STATE_ACTIVE | |
1317 | * @li We have a valid lease | |
1318 | * | |
1319 | * @return 'true' if we are writeable; 'false' otherwise. | |
1320 | */ | |
1321 | bool is_writeable(); | |
1322 | /** | |
1323 | * Add c to the list of callbacks waiting for us to become writeable. | |
1324 | * | |
1325 | * @param c A callback | |
1326 | */ | |
1327 | void wait_for_writeable(MonOpRequestRef op, Context *c) { | |
1328 | assert(!is_writeable()); /* precondition: callers may only wait while is_writeable() is false */ | |
1329 | if (op) /* only annotate when an op was supplied */ | |
1330 | op->mark_event("paxos:wait_for_writeable"); | |
1331 | waiting_for_writeable.push_back(c); | |
1332 | } | |
1333 | void wait_for_writeable(Context *c) { /* convenience overload for callers that have no op to annotate */ | |
1334 | MonOpRequestRef o; | |
1335 | wait_for_writeable(o, c); | |
1336 | } | |
1337 | ||
1338 | /** | |
1339 | * Get a transaction to submit operations to propose against | |
1340 | * | |
1341 | * Apply operations to this transaction. It will eventually be proposed | |
1342 | * to paxos. | |
1343 | */ | |
1344 | MonitorDBStore::TransactionRef get_pending_transaction(); | |
1345 | ||
1346 | /** | |
1347 | * Queue a completion for the pending proposal | |
1348 | * | |
1349 | * This completion will get triggered when the pending proposal | |
1350 | * transaction commits. | |
1351 | */ | |
1352 | void queue_pending_finisher(Context *onfinished); | |
1353 | ||
1354 | /** | |
1355 | * (try to) trigger a proposal | |
1356 | * | |
1357 | * Tell paxos that it should submit the pending proposal. Note that if it | |
1358 | * is not active (e.g., because it is already in the midst of committing | |
1359 | * something) that will be deferred (e.g., until the current round finishes). | |
1360 | */ | |
1361 | bool trigger_propose(); | |
1362 | ||
1363 | /** | |
1364 | * Add oncommit to the back of the list of callbacks waiting for us to | |
1365 | * finish committing. | |
1366 | * | |
1367 | * @param oncommit A callback | |
1368 | */ | |
1369 | void wait_for_commit(Context *oncommit) { /* queues oncommit at the back of waiting_for_commit */ | |
1370 | waiting_for_commit.push_back(oncommit); | |
1371 | } | |
1372 | /** | |
1373 | * Add oncommit to the front of the list of callbacks waiting for us to | |
1374 | * finish committing. | |
1375 | * | |
1376 | * @param oncommit A callback | |
1377 | */ | |
1378 | void wait_for_commit_front(Context *oncommit) { /* queues oncommit at the front of waiting_for_commit, ahead of callbacks already queued */ | |
1379 | waiting_for_commit.push_front(oncommit); | |
1380 | } | |
1381 | /** | |
1382 | * @} | |
1383 | */ | |
1384 | ||
1385 | /** | |
1386 | * @} | |
1387 | */ | |
1388 | protected: | |
1389 | MonitorDBStore *get_store(); | |
1390 | }; | |
1391 | ||
1392 | inline ostream& operator<<(ostream& out, Paxos::C_Proposal& p) /* prints a proposal's status, its queue age and a dump of its transaction to out */ | |
1393 | { | |
1394 | string proposed = (p.proposed ? "proposed" : "unproposed"); | |
1395 | out << " " << proposed | |
1396 | << " queued " << (ceph_clock_now() - p.proposal_time) /* time elapsed since p.proposal_time */ | |
1397 | << " tx dump:\n"; | |
1398 | auto t(std::make_shared<MonitorDBStore::Transaction>()); | |
1399 | bufferlist::iterator p_it = p.bl.begin(); | |
1400 | t->decode(p_it); /* p.bl is decoded as an encoded transaction so that it can be dumped */ | |
1401 | JSONFormatter f(true); /* NOTE(review): the `true` argument presumably enables pretty-printing -- confirm */ | |
1402 | t->dump(&f); | |
1403 | f.flush(out); /* write the formatter's output to the stream */ | |
1404 | return out; | |
1405 | } | |
1406 | ||
1407 | #endif | |
1408 |