[ceph.git] ceph/src/osd/PG.h (sources at v12.2.5)
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15 #ifndef CEPH_PG_H
16 #define CEPH_PG_H
17
18 #include <boost/statechart/custom_reaction.hpp>
19 #include <boost/statechart/event.hpp>
20 #include <boost/statechart/simple_state.hpp>
21 #include <boost/statechart/state.hpp>
22 #include <boost/statechart/state_machine.hpp>
23 #include <boost/statechart/transition.hpp>
24 #include <boost/statechart/event_base.hpp>
25 #include <boost/scoped_ptr.hpp>
26 #include <boost/circular_buffer.hpp>
27 #include "include/memory.h"
28 #include "include/mempool.h"
29
30 // re-include our assert to clobber boost's
31 #include "include/assert.h"
32
33 #include "include/types.h"
34 #include "include/stringify.h"
35 #include "osd_types.h"
36 #include "include/xlist.h"
37 #include "SnapMapper.h"
38 #include "Session.h"
39 #include "common/Timer.h"
40
41 #include "PGLog.h"
42 #include "OSDMap.h"
43 #include "messages/MOSDPGLog.h"
44 #include "include/str_list.h"
45 #include "PGBackend.h"
46
47 #include <atomic>
48 #include <list>
49 #include <memory>
50 #include <stack>
51 #include <string>
52 #include <tuple>
53 using namespace std;
54
55 // #include "include/unordered_map.h"
56 // #include "include/unordered_set.h"
57
58 //#define DEBUG_RECOVERY_OIDS // track set of recovering oids explicitly, to find counting bugs
59
60 class OSD;
61 class OSDService;
62 class MOSDOp;
63 class MOSDPGScan;
64 class MOSDPGBackfill;
65 class MOSDPGInfo;
66
67 class PG;
68 struct OpRequest;
69 typedef OpRequest::Ref OpRequestRef;
70 class MOSDPGLog;
71 class CephContext;
72
73 namespace Scrub {
74 class Store;
75 }
76
77 void intrusive_ptr_add_ref(PG *pg);
78 void intrusive_ptr_release(PG *pg);
79
80 using state_history_entry = std::tuple<utime_t, utime_t, const char*>;
81 using embedded_state = std::pair<utime_t, const char*>;
82
83 struct PGStateInstance {
84 // Time spent in pg states
85
86 void setepoch(const epoch_t current_epoch) {
87 this_epoch = current_epoch;
88 }
89
90 void enter_state(const utime_t entime, const char* state) {
91 embedded_states.push(std::make_pair(entime, state));
92 }
93
94 void exit_state(const utime_t extime) {
95 embedded_state this_state = embedded_states.top();
96 state_history.push_back(state_history_entry{
97 this_state.first, extime, this_state.second});
98 embedded_states.pop();
99 }
100
101 epoch_t this_epoch;
102 utime_t enter_time;
103 std::vector<state_history_entry> state_history;
104 std::stack<embedded_state> embedded_states;
105 };
106
107 class PGStateHistory {
108 // Member access protected with the PG lock
109 public:
110 PGStateHistory() : buffer(10) {}
111
112 void enter(PG* pg, const utime_t entime, const char* state);
113
114 void exit(const char* state);
115
116 void reset() {
117 pi = nullptr;
118 }
119
120 void set_pg_in_destructor() { pg_in_destructor = true; }
121
122 void dump(Formatter* f) const;
123
124 private:
125 bool pg_in_destructor = false;
126 PG* thispg = nullptr;
127 std::unique_ptr<PGStateInstance> tmppi;
128 PGStateInstance* pi = nullptr;
129 boost::circular_buffer<std::unique_ptr<PGStateInstance>> buffer;
130
131 };
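/* Usage sketch (illustrative): PGStateHistory is normally fed through
 * PG::NamedState, declared further below, whose constructor/destructor
 * bracket a recovery-machine state:
 *
 *   {
 *     PG::NamedState guard(pg, "Peering"); // ctor: pgstate_history.enter(pg, now, "Peering")
 *     // ... work done while the PG is in this state ...
 *   }                                      // dtor: pgstate_history.exit("Peering")
 *
 * The statechart states later in this header inherit from NamedState to get
 * exactly this enter/exit bookkeeping.
 */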
132
133 #ifdef PG_DEBUG_REFS
134 #include "common/tracked_int_ptr.hpp"
135 uint64_t get_with_id(PG *pg);
136 void put_with_id(PG *pg, uint64_t id);
137 typedef TrackedIntPtr<PG> PGRef;
138 #else
139 typedef boost::intrusive_ptr<PG> PGRef;
140 #endif
141
142 class PGRecoveryStats {
143 struct per_state_info {
144 uint64_t enter, exit; // enter/exit counts
145 uint64_t events;
146 utime_t event_time; // time spent processing events
147 utime_t total_time; // total time in state
148 utime_t min_time, max_time;
149
150 // cppcheck-suppress unreachableCode
151 per_state_info() : enter(0), exit(0), events(0) {}
152 };
153 map<const char *,per_state_info> info;
154 Mutex lock;
155
156 public:
157 PGRecoveryStats() : lock("PGRecoverStats::lock") {}
158
159 void reset() {
160 Mutex::Locker l(lock);
161 info.clear();
162 }
163 void dump(ostream& out) {
164 Mutex::Locker l(lock);
165 for (map<const char *,per_state_info>::iterator p = info.begin(); p != info.end(); ++p) {
166 per_state_info& i = p->second;
167 out << i.enter << "\t" << i.exit << "\t"
168 << i.events << "\t" << i.event_time << "\t"
169 << i.total_time << "\t"
170 << i.min_time << "\t" << i.max_time << "\t"
171 << p->first << "\n";
172 }
173 }
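  /* Illustrative output (values invented for the example): dump() emits one
   * tab-separated line per state, in the column order written above, e.g.
   *
   *   12   12   340   0.081   5.230   0.020   1.100   Started/Primary/Active/Clean
   *
   * i.e. enter, exit, events, event_time, total_time, min_time, max_time,
   * then the slash-separated nested state name used as the map key.
   */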
174
175 void dump_formatted(Formatter *f) {
176 Mutex::Locker l(lock);
177 f->open_array_section("pg_recovery_stats");
178 for (map<const char *,per_state_info>::iterator p = info.begin();
179 p != info.end(); ++p) {
180 per_state_info& i = p->second;
181 f->open_object_section("recovery_state");
182 f->dump_int("enter", i.enter);
183 f->dump_int("exit", i.exit);
184 f->dump_int("events", i.events);
185 f->dump_stream("event_time") << i.event_time;
186 f->dump_stream("total_time") << i.total_time;
187 f->dump_stream("min_time") << i.min_time;
188 f->dump_stream("max_time") << i.max_time;
189 vector<string> states;
190 get_str_vec(p->first, "/", states);
191 f->open_array_section("nested_states");
192 for (vector<string>::iterator st = states.begin();
193 st != states.end(); ++st) {
194 f->dump_string("state", *st);
195 }
196 f->close_section();
197 f->close_section();
198 }
199 f->close_section();
200 }
201
202 void log_enter(const char *s) {
203 Mutex::Locker l(lock);
204 info[s].enter++;
205 }
206 void log_exit(const char *s, utime_t dur, uint64_t events, utime_t event_dur) {
207 Mutex::Locker l(lock);
208 per_state_info &i = info[s];
209 i.exit++;
210 i.total_time += dur;
211 if (dur > i.max_time)
212 i.max_time = dur;
213 if (dur < i.min_time || i.min_time == utime_t())
214 i.min_time = dur;
215 i.events += events;
216 i.event_time += event_dur;
217 }
218 };
219
220 struct PGPool {
221 CephContext* cct;
222 epoch_t cached_epoch;
223 int64_t id;
224 string name;
225 uint64_t auid;
226
227 pg_pool_t info;
228 SnapContext snapc; // the default pool snapc, ready to go.
229
230 interval_set<snapid_t> cached_removed_snaps; // current removed_snaps set
231 interval_set<snapid_t> newly_removed_snaps; // newly removed in the last epoch
232
233 PGPool(CephContext* cct, OSDMapRef map, int64_t i)
234 : cct(cct),
235 cached_epoch(map->get_epoch()),
236 id(i),
237 name(map->get_pool_name(id)),
238 auid(map->get_pg_pool(id)->auid) {
239 const pg_pool_t *pi = map->get_pg_pool(id);
240 assert(pi);
241 info = *pi;
242 snapc = pi->get_snap_context();
243 pi->build_removed_snaps(cached_removed_snaps);
244 }
245
246 void update(OSDMapRef map);
247 };
248
249 /** PG - Replica Placement Group
250 *
251 */
252
253 class PG : public DoutPrefixProvider {
254 protected:
255 OSDService *osd;
256 CephContext *cct;
257 OSDriver osdriver;
258 SnapMapper snap_mapper;
259 bool eio_errors_to_process = false;
260
261 virtual PGBackend *get_pgbackend() = 0;
262 public:
263 std::string gen_prefix() const override;
264 CephContext *get_cct() const override { return cct; }
265 unsigned get_subsys() const override { return ceph_subsys_osd; }
266
267 /*** PG ****/
268 void update_snap_mapper_bits(uint32_t bits) {
269 snap_mapper.update_bits(bits);
270 }
271 /// get_is_recoverable_predicate: caller owns returned pointer and must delete when done
272 IsPGRecoverablePredicate *get_is_recoverable_predicate() {
273 return get_pgbackend()->get_is_recoverable_predicate();
274 }
275 protected:
276 OSDMapRef osdmap_ref;
277 OSDMapRef last_persisted_osdmap_ref;
278 PGPool pool;
279
280 void requeue_map_waiters();
281
282 void update_osdmap_ref(OSDMapRef newmap) {
283 assert(_lock.is_locked_by_me());
284 osdmap_ref = std::move(newmap);
285 }
286
287 public:
288 OSDMapRef get_osdmap() const {
289 assert(is_locked());
290 assert(osdmap_ref);
291 return osdmap_ref;
292 }
293 protected:
294
295 /** locking and reference counting.
296 * I destroy myself when the reference count hits zero.
297 * lock() should be called before doing anything.
298 * get() should be called on pointer copy (to another thread, etc.).
299 * put() should be called on destruction of some previously copied pointer.
300 * unlock() when done with the current pointer (_most common_).
301 */
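  /* Usage sketch (illustrative; the tag string and the hand-off step are
   * assumptions, not names defined in this header):
   *
   *   pg->get("<tag>");             // pin the PG before copying the pointer elsewhere
   *   hand_off_to_other_thread(pg); // hypothetical hand-off of the raw pointer
   *   ...
   *   pg->lock();                   // take _lock before reading/modifying PG state
   *   // ... use the PG; persist dirty_info/dirty_big_info before unlocking ...
   *   pg->unlock();                 // asserts !dirty_info && !dirty_big_info
   *   pg->put("<tag>");             // drop the pin; the PG frees itself at refcount 0
   */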
302 mutable Mutex _lock;
303 std::atomic_uint ref{0};
304
305 #ifdef PG_DEBUG_REFS
306 Mutex _ref_id_lock;
307 map<uint64_t, string> _live_ids;
308 map<string, uint64_t> _tag_counts;
309 uint64_t _ref_id;
310 #endif
311
312 public:
313   bool deleting;  // true while the PG is being removed or the OSD is shutting down
314
315 ZTracer::Endpoint trace_endpoint;
316
317 void lock_suspend_timeout(ThreadPool::TPHandle &handle);
318 void lock(bool no_lockdep = false) const;
319 void unlock() const {
320 //generic_dout(0) << this << " " << info.pgid << " unlock" << dendl;
321 assert(!dirty_info);
322 assert(!dirty_big_info);
323 _lock.Unlock();
324 }
325
326 bool is_locked() const {
327 return _lock.is_locked();
328 }
329
330 #ifdef PG_DEBUG_REFS
331 uint64_t get_with_id();
332 void put_with_id(uint64_t);
333 void dump_live_ids();
334 #endif
335 void get(const char* tag);
336 void put(const char* tag);
337
338 bool dirty_info, dirty_big_info;
339
340 public:
341 bool is_ec_pg() const {
342 return pool.info.ec_pool();
343 }
344 // pg state
345 pg_info_t info; ///< current pg info
346 pg_info_t last_written_info; ///< last written info
347 __u8 info_struct_v;
348 static const __u8 cur_struct_v = 10;
349 // v10 is the new past_intervals encoding
350 // v9 was fastinfo_key addition
351 // v8 was the move to a per-pg pgmeta object
352 // v7 was SnapMapper addition in 86658392516d5175b2756659ef7ffaaf95b0f8ad
353 // (first appeared in cuttlefish).
354 static const __u8 compat_struct_v = 7;
355 bool must_upgrade() {
356 return info_struct_v < cur_struct_v;
357 }
358 bool can_upgrade() {
359 return info_struct_v >= compat_struct_v;
360 }
361 void upgrade(ObjectStore *store);
362
363 const coll_t coll;
364 ObjectStore::CollectionHandle ch;
365 PGLog pg_log;
366 static string get_info_key(spg_t pgid) {
367 return stringify(pgid) + "_info";
368 }
369 static string get_biginfo_key(spg_t pgid) {
370 return stringify(pgid) + "_biginfo";
371 }
372 static string get_epoch_key(spg_t pgid) {
373 return stringify(pgid) + "_epoch";
374 }
375 ghobject_t pgmeta_oid;
376
377 class MissingLoc {
378 map<hobject_t, pg_missing_item> needs_recovery_map;
379 map<hobject_t, set<pg_shard_t> > missing_loc;
380 set<pg_shard_t> missing_loc_sources;
381 PG *pg;
382 set<pg_shard_t> empty_set;
383 public:
384 boost::scoped_ptr<IsPGReadablePredicate> is_readable;
385 boost::scoped_ptr<IsPGRecoverablePredicate> is_recoverable;
386 explicit MissingLoc(PG *pg)
387 : pg(pg) {}
388 void set_backend_predicates(
389 IsPGReadablePredicate *_is_readable,
390 IsPGRecoverablePredicate *_is_recoverable) {
391 is_readable.reset(_is_readable);
392 is_recoverable.reset(_is_recoverable);
393 }
394 string gen_prefix() const { return pg->gen_prefix(); }
395 bool needs_recovery(
396 const hobject_t &hoid,
397 eversion_t *v = 0) const {
398 map<hobject_t, pg_missing_item>::const_iterator i =
399 needs_recovery_map.find(hoid);
400 if (i == needs_recovery_map.end())
401 return false;
402 if (v)
403 *v = i->second.need;
404 return true;
405 }
406 bool is_deleted(const hobject_t &hoid) const {
407 auto i = needs_recovery_map.find(hoid);
408 if (i == needs_recovery_map.end())
409 return false;
410 return i->second.is_delete();
411 }
412 bool is_unfound(const hobject_t &hoid) const {
413 return needs_recovery(hoid) && !is_deleted(hoid) && (
414 !missing_loc.count(hoid) ||
415 !(*is_recoverable)(missing_loc.find(hoid)->second));
416 }
417 bool readable_with_acting(
418 const hobject_t &hoid,
419 const set<pg_shard_t> &acting) const;
420 uint64_t num_unfound() const {
421 uint64_t ret = 0;
422 for (map<hobject_t, pg_missing_item>::const_iterator i =
423 needs_recovery_map.begin();
424 i != needs_recovery_map.end();
425 ++i) {
426 if (is_unfound(i->first))
427 ++ret;
428 }
429 return ret;
430 }
431
432 bool have_unfound() const {
433 for (map<hobject_t, pg_missing_item>::const_iterator i =
434 needs_recovery_map.begin();
435 i != needs_recovery_map.end();
436 ++i) {
437 if (is_unfound(i->first))
438 return true;
439 }
440 return false;
441 }
442 void clear() {
443 needs_recovery_map.clear();
444 missing_loc.clear();
445 missing_loc_sources.clear();
446 }
447
448 void add_location(const hobject_t &hoid, pg_shard_t location) {
449 missing_loc[hoid].insert(location);
450 }
451 void remove_location(const hobject_t &hoid, pg_shard_t location) {
452 missing_loc[hoid].erase(location);
453 }
454 void add_active_missing(const pg_missing_t &missing) {
455 for (map<hobject_t, pg_missing_item>::const_iterator i =
456 missing.get_items().begin();
457 i != missing.get_items().end();
458 ++i) {
459 map<hobject_t, pg_missing_item>::const_iterator j =
460 needs_recovery_map.find(i->first);
461 if (j == needs_recovery_map.end()) {
462 needs_recovery_map.insert(*i);
463 } else {
464 lgeneric_dout(pg->cct, 0) << this << " " << pg->info.pgid << " unexpected need for "
465 << i->first << " have " << j->second
466 << " tried to add " << i->second << dendl;
467 assert(i->second.need == j->second.need);
468 }
469 }
470 }
471
472 void add_missing(const hobject_t &hoid, eversion_t need, eversion_t have) {
473 needs_recovery_map[hoid] = pg_missing_item(need, have);
474 }
475 void revise_need(const hobject_t &hoid, eversion_t need) {
476 assert(needs_recovery(hoid));
477 needs_recovery_map[hoid].need = need;
478 }
479
480 /// Adds info about a possible recovery source
481 bool add_source_info(
482 pg_shard_t source, ///< [in] source
483 const pg_info_t &oinfo, ///< [in] info
484 const pg_missing_t &omissing, ///< [in] (optional) missing
485 ThreadPool::TPHandle* handle ///< [in] ThreadPool handle
486 ); ///< @return whether a new object location was discovered
487
488 /// Adds recovery sources in batch
489 void add_batch_sources_info(
490       const set<pg_shard_t> &sources,  ///< [in] a set of sources which can be used for all objects
491 ThreadPool::TPHandle* handle ///< [in] ThreadPool handle
492 );
493
494 /// Uses osdmap to update structures for now down sources
495 void check_recovery_sources(const OSDMapRef& osdmap);
496
497 /// Call when hoid is no longer missing in acting set
498 void recovered(const hobject_t &hoid) {
499 needs_recovery_map.erase(hoid);
500 missing_loc.erase(hoid);
501 }
502
503 /// Call to update structures for hoid after a change
504 void rebuild(
505 const hobject_t &hoid,
506 pg_shard_t self,
507 const set<pg_shard_t> to_recover,
508 const pg_info_t &info,
509 const pg_missing_t &missing,
510 const map<pg_shard_t, pg_missing_t> &pmissing,
511 const map<pg_shard_t, pg_info_t> &pinfo) {
512 recovered(hoid);
513 boost::optional<pg_missing_item> item;
514 auto miter = missing.get_items().find(hoid);
515 if (miter != missing.get_items().end()) {
516 item = miter->second;
517 } else {
518 for (auto &&i: to_recover) {
519 if (i == self)
520 continue;
521 auto pmiter = pmissing.find(i);
522 assert(pmiter != pmissing.end());
523 miter = pmiter->second.get_items().find(hoid);
524 if (miter != pmiter->second.get_items().end()) {
525 item = miter->second;
526 break;
527 }
528 }
529 }
530 if (!item)
531 return; // recovered!
532
533 needs_recovery_map[hoid] = *item;
534 if (item->is_delete())
535 return;
536 auto mliter =
537 missing_loc.insert(make_pair(hoid, set<pg_shard_t>())).first;
538 assert(info.last_backfill.is_max());
539 assert(info.last_update >= item->need);
540 if (!missing.is_missing(hoid))
541 mliter->second.insert(self);
542 for (auto &&i: pmissing) {
543 auto pinfoiter = pinfo.find(i.first);
544 assert(pinfoiter != pinfo.end());
545 if (item->need <= pinfoiter->second.last_update &&
546 hoid <= pinfoiter->second.last_backfill &&
547 !i.second.is_missing(hoid))
548 mliter->second.insert(i.first);
549 }
550 }
551
552 const set<pg_shard_t> &get_locations(const hobject_t &hoid) const {
553 return missing_loc.count(hoid) ?
554 missing_loc.find(hoid)->second : empty_set;
555 }
556 const map<hobject_t, set<pg_shard_t>> &get_missing_locs() const {
557 return missing_loc;
558 }
559 const map<hobject_t, pg_missing_item> &get_needs_recovery() const {
560 return needs_recovery_map;
561 }
562 } missing_loc;
563
564 PastIntervals past_intervals;
565
566 interval_set<snapid_t> snap_trimq;
567
568 /* You should not use these items without taking their respective queue locks
569 * (if they have one) */
570 xlist<PG*>::item stat_queue_item;
571 bool scrub_queued;
572 bool recovery_queued;
573
574 int recovery_ops_active;
575 set<pg_shard_t> waiting_on_backfill;
576 #ifdef DEBUG_RECOVERY_OIDS
577 set<hobject_t> recovering_oids;
578 #endif
579
580 protected:
581   int role;    // 0 = primary, 1 = replica, -1 = none.
582 unsigned state; // PG_STATE_*
583
584 bool send_notify; ///< true if we are non-primary and should notify the primary
585
586 public:
587 eversion_t last_update_ondisk; // last_update that has committed; ONLY DEFINED WHEN is_active()
588 eversion_t last_complete_ondisk; // last_complete that has committed.
589 eversion_t last_update_applied;
590
591
592 struct C_UpdateLastRollbackInfoTrimmedToApplied : Context {
593 PGRef pg;
594 epoch_t e;
595 eversion_t v;
596 C_UpdateLastRollbackInfoTrimmedToApplied(PG *pg, epoch_t e, eversion_t v)
597 : pg(pg), e(e), v(v) {}
598 void finish(int) override {
599 pg->lock();
600 if (!pg->pg_has_reset_since(e)) {
601 pg->last_rollback_info_trimmed_to_applied = v;
602 }
603 pg->unlock();
604 }
605 };
606 // entries <= last_rollback_info_trimmed_to_applied have been trimmed,
607 // and the transaction has applied
608 eversion_t last_rollback_info_trimmed_to_applied;
609
610 // primary state
611 public:
612 pg_shard_t primary;
613 pg_shard_t pg_whoami;
614 pg_shard_t up_primary;
615 vector<int> up, acting, want_acting;
616 set<pg_shard_t> actingbackfill, actingset, upset;
617 map<pg_shard_t,eversion_t> peer_last_complete_ondisk;
618 eversion_t min_last_complete_ondisk; // up: min over last_complete_ondisk, peer_last_complete_ondisk
619 eversion_t pg_trim_to;
620
621 set<int> blocked_by; ///< osds we are blocked by (for pg stats)
622
623 // [primary only] content recovery state
624
625 public:
626 struct BufferedRecoveryMessages {
627 map<int, map<spg_t, pg_query_t> > query_map;
628 map<int, vector<pair<pg_notify_t, PastIntervals> > > info_map;
629 map<int, vector<pair<pg_notify_t, PastIntervals> > > notify_list;
630 };
631
632 public:
633 bool dne() { return info.dne(); }
634 struct RecoveryCtx {
635 utime_t start_time;
636 map<int, map<spg_t, pg_query_t> > *query_map;
637 map<int, vector<pair<pg_notify_t, PastIntervals> > > *info_map;
638 map<int, vector<pair<pg_notify_t, PastIntervals> > > *notify_list;
639 set<PGRef> created_pgs;
640 C_Contexts *on_applied;
641 C_Contexts *on_safe;
642 ObjectStore::Transaction *transaction;
643 ThreadPool::TPHandle* handle;
644 RecoveryCtx(map<int, map<spg_t, pg_query_t> > *query_map,
645 map<int,
646 vector<pair<pg_notify_t, PastIntervals> > > *info_map,
647 map<int,
648 vector<pair<pg_notify_t, PastIntervals> > > *notify_list,
649 C_Contexts *on_applied,
650 C_Contexts *on_safe,
651 ObjectStore::Transaction *transaction)
652 : query_map(query_map), info_map(info_map),
653 notify_list(notify_list),
654 on_applied(on_applied),
655 on_safe(on_safe),
656 transaction(transaction),
657 handle(NULL) {}
658
659 RecoveryCtx(BufferedRecoveryMessages &buf, RecoveryCtx &rctx)
660 : query_map(&(buf.query_map)),
661 info_map(&(buf.info_map)),
662 notify_list(&(buf.notify_list)),
663 on_applied(rctx.on_applied),
664 on_safe(rctx.on_safe),
665 transaction(rctx.transaction),
666 handle(rctx.handle) {}
667
668 void accept_buffered_messages(BufferedRecoveryMessages &m) {
669 assert(query_map);
670 assert(info_map);
671 assert(notify_list);
672 for (map<int, map<spg_t, pg_query_t> >::iterator i = m.query_map.begin();
673 i != m.query_map.end();
674 ++i) {
675 map<spg_t, pg_query_t> &omap = (*query_map)[i->first];
676 for (map<spg_t, pg_query_t>::iterator j = i->second.begin();
677 j != i->second.end();
678 ++j) {
679 omap[j->first] = j->second;
680 }
681 }
682 for (map<int, vector<pair<pg_notify_t, PastIntervals> > >::iterator i
683 = m.info_map.begin();
684 i != m.info_map.end();
685 ++i) {
686 vector<pair<pg_notify_t, PastIntervals> > &ovec =
687 (*info_map)[i->first];
688 ovec.reserve(ovec.size() + i->second.size());
689 ovec.insert(ovec.end(), i->second.begin(), i->second.end());
690 }
691 for (map<int, vector<pair<pg_notify_t, PastIntervals> > >::iterator i
692 = m.notify_list.begin();
693 i != m.notify_list.end();
694 ++i) {
695 vector<pair<pg_notify_t, PastIntervals> > &ovec =
696 (*notify_list)[i->first];
697 ovec.reserve(ovec.size() + i->second.size());
698 ovec.insert(ovec.end(), i->second.begin(), i->second.end());
699 }
700 }
701 };
702
703
704 PGStateHistory pgstate_history;
705
706 struct NamedState {
707 const char *state_name;
708 utime_t enter_time;
709 PG* pg;
710 const char *get_state_name() { return state_name; }
711 NamedState(PG *pg_, const char *state_name_)
712 : state_name(state_name_), enter_time(ceph_clock_now()), pg(pg_) {
713 pg->pgstate_history.enter(pg, enter_time, state_name);
714 }
715 virtual ~NamedState() { pg->pgstate_history.exit(state_name); }
716 };
717
718
719
720 protected:
721
722 /*
723 * peer_info -- projected (updates _before_ replicas ack)
724 * peer_missing -- committed (updates _after_ replicas ack)
725 */
726
727 bool need_up_thru;
728 set<pg_shard_t> stray_set; // non-acting osds that have PG data.
729 eversion_t oldest_update; // acting: lowest (valid) last_update in active set
730 map<pg_shard_t, pg_info_t> peer_info; // info from peers (stray or prior)
731 set<pg_shard_t> peer_purged; // peers purged
732 map<pg_shard_t, pg_missing_t> peer_missing;
733   set<pg_shard_t>    peer_log_requested;  // logs I've requested (and start stamps)
734 set<pg_shard_t> peer_missing_requested;
735
736   // I deleted these strays; ignore racing PGInfo from them
737 set<pg_shard_t> peer_activated;
738
739 // primary-only, recovery-only state
740 set<pg_shard_t> might_have_unfound; // These osds might have objects on them
741 // which are unfound on the primary
742 epoch_t last_peering_reset;
743
744
745 /* heartbeat peers */
746 void set_probe_targets(const set<pg_shard_t> &probe_set);
747 void clear_probe_targets();
748 public:
749 Mutex heartbeat_peer_lock;
750 set<int> heartbeat_peers;
751 set<int> probe_targets;
752
753 /**
754 * BackfillInterval
755 *
756 * Represents the objects in a range [begin, end)
757 *
758 * Possible states:
759    * 1) begin == end == hobject_t() indicates that the interval is unpopulated
760 * 2) Else, objects contains all objects in [begin, end)
761 */
762 struct BackfillInterval {
763 // info about a backfill interval on a peer
764 eversion_t version; /// version at which the scan occurred
765 map<hobject_t,eversion_t> objects;
766 hobject_t begin;
767 hobject_t end;
768
769 /// clear content
770 void clear() {
771 *this = BackfillInterval();
772 }
773
774 /// clear objects list only
775 void clear_objects() {
776 objects.clear();
777 }
778
779 /// reinstantiate with a new start+end position and sort order
780 void reset(hobject_t start) {
781 clear();
782 begin = end = start;
783 }
784
785 /// true if there are no objects in this interval
786 bool empty() const {
787 return objects.empty();
788 }
789
790 /// true if interval extends to the end of the range
791 bool extends_to_end() const {
792 return end.is_max();
793 }
794
795 /// removes items <= soid and adjusts begin to the first object
796 void trim_to(const hobject_t &soid) {
797 trim();
798 while (!objects.empty() &&
799 objects.begin()->first <= soid) {
800 pop_front();
801 }
802 }
803
804 /// Adjusts begin to the first object
805 void trim() {
806 if (!objects.empty())
807 begin = objects.begin()->first;
808 else
809 begin = end;
810 }
811
812 /// drop first entry, and adjust @begin accordingly
813 void pop_front() {
814 assert(!objects.empty());
815 objects.erase(objects.begin());
816 trim();
817 }
818
819 /// dump
820 void dump(Formatter *f) const {
821 f->dump_stream("begin") << begin;
822 f->dump_stream("end") << end;
823 f->open_array_section("objects");
824 for (map<hobject_t, eversion_t>::const_iterator i =
825 objects.begin();
826 i != objects.end();
827 ++i) {
828 f->open_object_section("object");
829 f->dump_stream("object") << i->first;
830 f->dump_stream("version") << i->second;
831 f->close_section();
832 }
833 f->close_section();
834 }
835 };
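  /* Usage sketch for the BackfillInterval struct above (illustrative; the
   * scan step and the 'scan_from' position are assumptions, not members of
   * this header):
   *
   *   BackfillInterval bi;
   *   bi.reset(scan_from);           // state (1): begin == end == scan_from
   *   // ... an object scan fills bi.objects and advances bi.end (state (2)) ...
   *   while (!bi.empty()) {
   *     hobject_t obj = bi.begin;    // trim() keeps begin at the first object
   *     // ... recover/push obj to the backfill target ...
   *     bi.pop_front();              // drop it and re-trim()
   *   }
   *   if (bi.extends_to_end())       // end.is_max(): nothing left to scan
   *     ...;
   */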
836
837 protected:
838 BackfillInterval backfill_info;
839 map<pg_shard_t, BackfillInterval> peer_backfill_info;
840 bool backfill_reserved;
841 bool backfill_reserving;
842
843 friend class OSD;
844
845 public:
846 set<pg_shard_t> backfill_targets;
847
848 bool is_backfill_targets(pg_shard_t osd) {
849 return backfill_targets.count(osd);
850 }
851
852 protected:
853
854 /*
855 * blocked request wait hierarchy
856 *
857 * In order to preserve request ordering we need to be careful about the
858 * order in which blocked requests get requeued. Generally speaking, we
859 * push the requests back up to the op_wq in reverse order (most recent
860 * request first) so that they come back out again in the original order.
861 * However, because there are multiple wait queues, we need to requeue
862 * waitlists in order. Generally speaking, we requeue the wait lists
863 * that are checked first.
864 *
865 * Here are the various wait lists, in the order they are used during
866 * request processing, with notes:
867 *
868 * - waiting_for_map
869 * - may start or stop blocking at any time (depending on client epoch)
870 * - waiting_for_peered
871 * - !is_peered() or flushes_in_progress
872 * - only starts blocking on interval change; never restarts
873 * - waiting_for_active
874 * - !is_active()
875 * - only starts blocking on interval change; never restarts
876 * - waiting_for_flush
877 * - is_active() and flushes_in_progress
878 * - waiting for final flush during activate
879 * - waiting_for_scrub
880 * - starts and stops blocking for varying intervals during scrub
881 * - waiting_for_unreadable_object
882 * - never restarts once object is readable (* except for EIO?)
883 * - waiting_for_degraded_object
884 * - never restarts once object is writeable (* except for EIO?)
885 * - waiting_for_blocked_object
886 * - starts and stops based on proxied op activity
887 * - obc rwlocks
888 * - starts and stops based on read/write activity
889 *
890 * Notes:
891 *
892    * 1. During an interval change, we requeue *everything* in the above order.
893 *
894 * 2. When an obc rwlock is released, we check for a scrub block and requeue
895 * the op there if it applies. We ignore the unreadable/degraded/blocked
896 * queues because we assume they cannot apply at that time (this is
897 * probably mostly true).
898 *
899 * 3. The requeue_ops helper will push ops onto the waiting_for_map list if
900 * it is non-empty.
901 *
902 * These three behaviors are generally sufficient to maintain ordering, with
903 * the possible exception of cases where we make an object degraded or
904 * unreadable that was previously okay, e.g. when scrub or op processing
905 * encounter an unexpected error. FIXME.
906 */
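  /* Illustrative sketch of the "reverse order" requeue described above
   * (hedged; the real implementation lives in PG.cc and may differ):
   *
   *   void PG::requeue_ops(list<OpRequestRef> &ls) {
   *     for (auto i = ls.rbegin(); i != ls.rend(); ++i)
   *       requeue_op(*i);  // push most recent first, so the op_wq pops them
   *     ls.clear();        // back out in their original order
   *   }
   */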
907
908 // pg waiters
909 unsigned flushes_in_progress;
910
911   // ops with newer maps than ours (or blocked behind them)
912 // track these by client, since inter-request ordering doesn't otherwise
913 // matter.
914 unordered_map<entity_name_t,list<OpRequestRef>> waiting_for_map;
915
916 // ops waiting on peered
917 list<OpRequestRef> waiting_for_peered;
918
919 // ops waiting on active (require peered as well)
920 list<OpRequestRef> waiting_for_active;
921 list<OpRequestRef> waiting_for_flush;
922 list<OpRequestRef> waiting_for_scrub;
923
924 list<OpRequestRef> waiting_for_cache_not_full;
925 list<OpRequestRef> waiting_for_clean_to_primary_repair;
926 map<hobject_t, list<OpRequestRef>> waiting_for_unreadable_object,
927 waiting_for_degraded_object,
928 waiting_for_blocked_object;
929
930 set<hobject_t> objects_blocked_on_cache_full;
931 map<hobject_t,snapid_t> objects_blocked_on_degraded_snap;
932 map<hobject_t,ObjectContextRef> objects_blocked_on_snap_promotion;
933
934 // Callbacks should assume pg (and nothing else) is locked
935 map<hobject_t, list<Context*>> callbacks_for_degraded_object;
936
937 map<eversion_t,
938 list<pair<OpRequestRef, version_t> > > waiting_for_ondisk;
939
940 void requeue_object_waiters(map<hobject_t, list<OpRequestRef>>& m);
941 void requeue_op(OpRequestRef op);
942 void requeue_ops(list<OpRequestRef> &l);
943
944 // stats that persist lazily
945 object_stat_collection_t unstable_stats;
946
947 // publish stats
948 Mutex pg_stats_publish_lock;
949 bool pg_stats_publish_valid;
950 pg_stat_t pg_stats_publish;
951
952 // for ordering writes
953 ceph::shared_ptr<ObjectStore::Sequencer> osr;
954
955 void _update_calc_stats();
956 void _update_blocked_by();
957 void publish_stats_to_osd();
958 void clear_publish_stats();
959
960 public:
961 void clear_primary_state();
962
963 bool is_actingbackfill(pg_shard_t osd) const {
964 return actingbackfill.count(osd);
965 }
966 bool is_acting(pg_shard_t osd) const {
967 return has_shard(pool.info.ec_pool(), acting, osd);
968 }
969 bool is_up(pg_shard_t osd) const {
970 return has_shard(pool.info.ec_pool(), up, osd);
971 }
972 static bool has_shard(bool ec, const vector<int>& v, pg_shard_t osd) {
973 if (ec) {
974 return v.size() > (unsigned)osd.shard && v[osd.shard] == osd.osd;
975 } else {
976 return std::find(v.begin(), v.end(), osd.osd) != v.end();
977 }
978 }
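  /* Example (illustrative): with an erasure-coded PG and acting = [3, 7, 1],
   * has_shard(true, acting, pg_shard_t(7, shard_id_t(1))) is true because
   * acting[1] == 7 (for EC the shard index must match the position in the
   * vector), whereas for a replicated PG (ec == false) the osd id merely has
   * to appear anywhere in the vector.
   */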
979
980 bool needs_recovery() const;
981 bool needs_backfill() const;
982
983 /// clip calculated priority to reasonable range
984 inline int clamp_recovery_priority(int priority);
985 /// get log recovery reservation priority
986 unsigned get_recovery_priority();
987 /// get backfill reservation priority
988 unsigned get_backfill_priority();
989
990 void mark_clean(); ///< mark an active pg clean
991 void _change_recovery_force_mode(int new_mode, bool clear);
992
993 /// return [start,end) bounds for required past_intervals
994 static pair<epoch_t, epoch_t> get_required_past_interval_bounds(
995 const pg_info_t &info,
996 epoch_t oldest_map) {
997 epoch_t start = MAX(
998 info.history.last_epoch_clean ? info.history.last_epoch_clean :
999 info.history.epoch_pool_created,
1000 oldest_map);
1001 epoch_t end = MAX(
1002 info.history.same_interval_since,
1003 info.history.epoch_pool_created);
1004 return make_pair(start, end);
1005 }
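  /* Worked example (illustrative numbers): with last_epoch_clean = 100,
   * epoch_pool_created = 10, oldest_map = 60 and same_interval_since = 150,
   * the result is [MAX(100, 60), MAX(150, 10)) = [100, 150): every interval
   * from the last clean epoch (clamped to the oldest stored map) up to, but
   * not including, the interval the PG is currently in.
   */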
1006 void check_past_interval_bounds() const;
1007 PastIntervals::PriorSet build_prior();
1008
1009 void remove_down_peer_info(const OSDMapRef osdmap);
1010
1011 bool adjust_need_up_thru(const OSDMapRef osdmap);
1012
1013 bool all_unfound_are_queried_or_lost(const OSDMapRef osdmap) const;
1014 virtual void dump_recovery_info(Formatter *f) const = 0;
1015
1016 bool calc_min_last_complete_ondisk() {
1017 eversion_t min = last_complete_ondisk;
1018 assert(!actingbackfill.empty());
1019 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
1020 i != actingbackfill.end();
1021 ++i) {
1022 if (*i == get_primary()) continue;
1023 if (peer_last_complete_ondisk.count(*i) == 0)
1024 return false; // we don't have complete info
1025 eversion_t a = peer_last_complete_ondisk[*i];
1026 if (a < min)
1027 min = a;
1028 }
1029 if (min == min_last_complete_ondisk)
1030 return false;
1031 min_last_complete_ondisk = min;
1032 return true;
1033 }
1034
1035 virtual void calc_trim_to() = 0;
1036
1037 void proc_replica_log(pg_info_t &oinfo, const pg_log_t &olog,
1038 pg_missing_t& omissing, pg_shard_t from);
1039 void proc_master_log(ObjectStore::Transaction& t, pg_info_t &oinfo, pg_log_t &olog,
1040 pg_missing_t& omissing, pg_shard_t from);
1041 bool proc_replica_info(
1042 pg_shard_t from, const pg_info_t &info, epoch_t send_epoch);
1043
1044 struct PGLogEntryHandler : public PGLog::LogEntryHandler {
1045 PG *pg;
1046 ObjectStore::Transaction *t;
1047 PGLogEntryHandler(PG *pg, ObjectStore::Transaction *t) : pg(pg), t(t) {}
1048
1049 // LogEntryHandler
1050 void remove(const hobject_t &hoid) override {
1051 pg->get_pgbackend()->remove(hoid, t);
1052 }
1053 void try_stash(const hobject_t &hoid, version_t v) override {
1054 pg->get_pgbackend()->try_stash(hoid, v, t);
1055 }
1056 void rollback(const pg_log_entry_t &entry) override {
1057 assert(entry.can_rollback());
1058 pg->get_pgbackend()->rollback(entry, t);
1059 }
1060 void rollforward(const pg_log_entry_t &entry) override {
1061 pg->get_pgbackend()->rollforward(entry, t);
1062 }
1063 void trim(const pg_log_entry_t &entry) override {
1064 pg->get_pgbackend()->trim(entry, t);
1065 }
1066 };
1067
1068 void update_object_snap_mapping(
1069 ObjectStore::Transaction *t, const hobject_t &soid,
1070 const set<snapid_t> &snaps);
1071 void clear_object_snap_mapping(
1072 ObjectStore::Transaction *t, const hobject_t &soid);
1073 void remove_snap_mapped_object(
1074 ObjectStore::Transaction& t, const hobject_t& soid);
1075 void merge_log(
1076 ObjectStore::Transaction& t, pg_info_t &oinfo,
1077 pg_log_t &olog, pg_shard_t from);
1078 void rewind_divergent_log(ObjectStore::Transaction& t, eversion_t newhead);
1079 bool search_for_missing(
1080 const pg_info_t &oinfo, const pg_missing_t &omissing,
1081 pg_shard_t fromosd,
1082 RecoveryCtx*);
1083
1084 void check_for_lost_objects();
1085 void forget_lost_objects();
1086
1087 void discover_all_missing(std::map<int, map<spg_t,pg_query_t> > &query_map);
1088
1089 void trim_write_ahead();
1090
1091 map<pg_shard_t, pg_info_t>::const_iterator find_best_info(
1092 const map<pg_shard_t, pg_info_t> &infos,
1093 bool restrict_to_up_acting,
1094 bool *history_les_bound) const;
1095 static void calc_ec_acting(
1096 map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard,
1097 unsigned size,
1098 const vector<int> &acting,
1099 pg_shard_t acting_primary,
1100 const vector<int> &up,
1101 pg_shard_t up_primary,
1102 const map<pg_shard_t, pg_info_t> &all_info,
1103 bool restrict_to_up_acting,
1104 vector<int> *want,
1105 set<pg_shard_t> *backfill,
1106 set<pg_shard_t> *acting_backfill,
1107 pg_shard_t *want_primary,
1108 ostream &ss);
1109 static void calc_replicated_acting(
1110 map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard,
1111 unsigned size,
1112 const vector<int> &acting,
1113 pg_shard_t acting_primary,
1114 const vector<int> &up,
1115 pg_shard_t up_primary,
1116 const map<pg_shard_t, pg_info_t> &all_info,
1117 bool restrict_to_up_acting,
1118 vector<int> *want,
1119 set<pg_shard_t> *backfill,
1120 set<pg_shard_t> *acting_backfill,
1121 pg_shard_t *want_primary,
1122 ostream &ss);
1123 bool choose_acting(pg_shard_t &auth_log_shard,
1124 bool restrict_to_up_acting,
1125 bool *history_les_bound);
1126 void build_might_have_unfound();
1127 void activate(
1128 ObjectStore::Transaction& t,
1129 epoch_t activation_epoch,
1130 list<Context*>& tfin,
1131 map<int, map<spg_t,pg_query_t> >& query_map,
1132 map<int,
1133 vector<pair<pg_notify_t, PastIntervals> > > *activator_map,
1134 RecoveryCtx *ctx);
1135 void _activate_committed(epoch_t epoch, epoch_t activation_epoch);
1136 void all_activated_and_committed();
1137
1138 void proc_primary_info(ObjectStore::Transaction &t, const pg_info_t &info);
1139
1140 bool have_unfound() const {
1141 return missing_loc.have_unfound();
1142 }
1143 uint64_t get_num_unfound() const {
1144 return missing_loc.num_unfound();
1145 }
1146
1147 virtual void check_local() = 0;
1148
1149 /**
1150 * @param ops_begun returns how many recovery ops the function started
1151 * @returns true if any useful work was accomplished; false otherwise
1152 */
1153 virtual bool start_recovery_ops(
1154 uint64_t max,
1155 ThreadPool::TPHandle &handle,
1156 uint64_t *ops_begun) = 0;
1157
1158 void purge_strays();
1159
1160 void update_heartbeat_peers();
1161
1162 Context *finish_sync_event;
1163
1164 void finish_recovery(list<Context*>& tfin);
1165 void _finish_recovery(Context *c);
1166 void cancel_recovery();
1167 void clear_recovery_state();
1168 virtual void _clear_recovery_state() = 0;
1169 virtual void check_recovery_sources(const OSDMapRef& newmap) = 0;
1170 void start_recovery_op(const hobject_t& soid);
1171 void finish_recovery_op(const hobject_t& soid, bool dequeue=false);
1172
1173 void split_into(pg_t child_pgid, PG *child, unsigned split_bits);
1174 virtual void _split_into(pg_t child_pgid, PG *child, unsigned split_bits) = 0;
1175
1176 friend class C_OSD_RepModify_Commit;
1177
1178 // -- backoff --
1179 Mutex backoff_lock; // orders inside Backoff::lock
1180 map<hobject_t,set<BackoffRef>> backoffs;
1181
1182 void add_backoff(SessionRef s, const hobject_t& begin, const hobject_t& end);
1183 void release_backoffs(const hobject_t& begin, const hobject_t& end);
1184 void release_backoffs(const hobject_t& o) {
1185 release_backoffs(o, o);
1186 }
1187 void clear_backoffs();
1188
1189 void add_pg_backoff(SessionRef s) {
1190 hobject_t begin = info.pgid.pgid.get_hobj_start();
1191 hobject_t end = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num());
1192 add_backoff(s, begin, end);
1193 }
1194 void release_pg_backoffs() {
1195 hobject_t begin = info.pgid.pgid.get_hobj_start();
1196 hobject_t end = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num());
1197 release_backoffs(begin, end);
1198 }
1199
1200 void rm_backoff(BackoffRef b);
1201
1202 // -- scrub --
1203 struct Scrubber {
1204 Scrubber();
1205 ~Scrubber();
1206
1207 // metadata
1208 set<pg_shard_t> reserved_peers;
1209 bool reserved, reserve_failed;
1210 epoch_t epoch_start;
1211
1212 // common to both scrubs
1213 bool active;
1214 int waiting_on;
1215 set<pg_shard_t> waiting_on_whom;
1216 int shallow_errors;
1217 int deep_errors;
1218 int fixed;
1219 ScrubMap primary_scrubmap;
1220 map<pg_shard_t, ScrubMap> received_maps;
1221 OpRequestRef active_rep_scrub;
1222 utime_t scrub_reg_stamp; // stamp we registered for
1223
1224 // For async sleep
1225 bool sleeping = false;
1226 bool needs_sleep = true;
1227 utime_t sleep_start;
1228
1229 // flags to indicate explicitly requested scrubs (by admin)
1230 bool must_scrub, must_deep_scrub, must_repair;
1231
1232 // Priority to use for scrub scheduling
1233 unsigned priority;
1234
1235 // this flag indicates whether we would like to do auto-repair of the PG or not
1236 bool auto_repair;
1237
1238 // Maps from objects with errors to missing/inconsistent peers
1239 map<hobject_t, set<pg_shard_t>> missing;
1240 map<hobject_t, set<pg_shard_t>> inconsistent;
1241
1242 // Map from object with errors to good peers
1243 map<hobject_t, list<pair<ScrubMap::object, pg_shard_t> >> authoritative;
1244
1245 // Cleaned map pending snap metadata scrub
1246 ScrubMap cleaned_meta_map;
1247
1248 // digest updates which we are waiting on
1249 int num_digest_updates_pending;
1250
1251 // chunky scrub
1252 hobject_t start, end;
1253 eversion_t subset_last_update;
1254
1255 // chunky scrub state
1256 enum State {
1257 INACTIVE,
1258 NEW_CHUNK,
1259 WAIT_PUSHES,
1260 WAIT_LAST_UPDATE,
1261 BUILD_MAP,
1262 WAIT_REPLICAS,
1263 COMPARE_MAPS,
1264 WAIT_DIGEST_UPDATES,
1265 FINISH,
1266 } state;
1267
1268 std::unique_ptr<Scrub::Store> store;
1269 // deep scrub
1270 bool deep;
1271 uint32_t seed;
1272
1273 list<Context*> callbacks;
1274 void add_callback(Context *context) {
1275 callbacks.push_back(context);
1276 }
1277 void run_callbacks() {
1278 list<Context*> to_run;
1279 to_run.swap(callbacks);
1280 for (list<Context*>::iterator i = to_run.begin();
1281 i != to_run.end();
1282 ++i) {
1283 (*i)->complete(0);
1284 }
1285 }
1286
1287 static const char *state_string(const PG::Scrubber::State& state) {
1288 const char *ret = NULL;
1289 switch( state )
1290 {
1291 case INACTIVE: ret = "INACTIVE"; break;
1292 case NEW_CHUNK: ret = "NEW_CHUNK"; break;
1293 case WAIT_PUSHES: ret = "WAIT_PUSHES"; break;
1294 case WAIT_LAST_UPDATE: ret = "WAIT_LAST_UPDATE"; break;
1295 case BUILD_MAP: ret = "BUILD_MAP"; break;
1296 case WAIT_REPLICAS: ret = "WAIT_REPLICAS"; break;
1297 case COMPARE_MAPS: ret = "COMPARE_MAPS"; break;
1298 case WAIT_DIGEST_UPDATES: ret = "WAIT_DIGEST_UPDATES"; break;
1299 case FINISH: ret = "FINISH"; break;
1300 }
1301 return ret;
1302 }
1303
1304 bool is_chunky_scrub_active() const { return state != INACTIVE; }
1305
1306 // classic (non chunk) scrubs block all writes
1307 // chunky scrubs only block writes to a range
1308 bool write_blocked_by_scrub(const hobject_t &soid) {
1309 return (soid >= start && soid < end);
1310 }
1311
1312 // clear all state
1313 void reset() {
1314 active = false;
1315 waiting_on = 0;
1316 waiting_on_whom.clear();
1317 if (active_rep_scrub) {
1318 active_rep_scrub = OpRequestRef();
1319 }
1320 received_maps.clear();
1321
1322 must_scrub = false;
1323 must_deep_scrub = false;
1324 must_repair = false;
1325 auto_repair = false;
1326
1327 state = PG::Scrubber::INACTIVE;
1328 start = hobject_t();
1329 end = hobject_t();
1330 subset_last_update = eversion_t();
1331 shallow_errors = 0;
1332 deep_errors = 0;
1333 fixed = 0;
1334 deep = false;
1335 seed = 0;
1336 run_callbacks();
1337 inconsistent.clear();
1338 missing.clear();
1339 authoritative.clear();
1340 num_digest_updates_pending = 0;
1341 cleaned_meta_map = ScrubMap();
1342 sleeping = false;
1343 needs_sleep = true;
1344 sleep_start = utime_t();
1345 }
1346
1347 void create_results(const hobject_t& obj);
1348 void cleanup_store(ObjectStore::Transaction *t);
1349 } scrubber;
1350
1351 bool scrub_after_recovery;
1352
1353 int active_pushes;
1354
1355 void repair_object(
1356 const hobject_t& soid, list<pair<ScrubMap::object, pg_shard_t> > *ok_peers,
1357 pg_shard_t bad_peer);
1358
1359 void scrub(epoch_t queued, ThreadPool::TPHandle &handle);
1360 void chunky_scrub(ThreadPool::TPHandle &handle);
1361 void scrub_compare_maps();
1362 /**
1363 * return true if any inconsistency/missing is repaired, false otherwise
1364 */
1365 bool scrub_process_inconsistent();
1366 bool ops_blocked_by_scrub() const;
1367 void scrub_finish();
1368 void scrub_clear_state();
1369 void _scan_snaps(ScrubMap &map);
1370 void _repair_oinfo_oid(ScrubMap &map);
1371 void _scan_rollback_obs(
1372 const vector<ghobject_t> &rollback_obs,
1373 ThreadPool::TPHandle &handle);
1374 void _request_scrub_map(pg_shard_t replica, eversion_t version,
1375 hobject_t start, hobject_t end, bool deep,
1376 uint32_t seed);
1377 int build_scrub_map_chunk(
1378 ScrubMap &map,
1379 hobject_t start, hobject_t end, bool deep, uint32_t seed,
1380 ThreadPool::TPHandle &handle);
1381 /**
1382 * returns true if [begin, end) is good to scrub at this time
1383 * a false return value obliges the implementer to requeue scrub when the
1384 * condition preventing scrub clears
1385 */
1386 virtual bool _range_available_for_scrub(
1387 const hobject_t &begin, const hobject_t &end) = 0;
1388 virtual void scrub_snapshot_metadata(
1389 ScrubMap &map,
1390 const std::map<hobject_t, pair<uint32_t, uint32_t>> &missing_digest) { }
1391 virtual void _scrub_clear_state() { }
1392 virtual void _scrub_finish() { }
1393 virtual void split_colls(
1394 spg_t child,
1395 int split_bits,
1396 int seed,
1397 const pg_pool_t *pool,
1398 ObjectStore::Transaction *t) = 0;
1399 void clear_scrub_reserved();
1400 void scrub_reserve_replicas();
1401 void scrub_unreserve_replicas();
1402 bool scrub_all_replicas_reserved() const;
1403 bool sched_scrub();
1404 void reg_next_scrub();
1405 void unreg_next_scrub();
1406
1407 void replica_scrub(
1408 OpRequestRef op,
1409 ThreadPool::TPHandle &handle);
1410 void do_replica_scrub_map(OpRequestRef op);
1411 void sub_op_scrub_map(OpRequestRef op);
1412
1413 void handle_scrub_reserve_request(OpRequestRef op);
1414 void handle_scrub_reserve_grant(OpRequestRef op, pg_shard_t from);
1415 void handle_scrub_reserve_reject(OpRequestRef op, pg_shard_t from);
1416 void handle_scrub_reserve_release(OpRequestRef op);
1417
1418 void reject_reservation();
1419 void schedule_backfill_retry(float retry);
1420 void schedule_recovery_retry(float retry);
1421
1422 // -- recovery state --
1423
1424 template <class EVT>
1425 struct QueuePeeringEvt : Context {
1426 PGRef pg;
1427 epoch_t epoch;
1428 EVT evt;
1429 QueuePeeringEvt(PG *pg, epoch_t epoch, EVT evt) :
1430 pg(pg), epoch(epoch), evt(evt) {}
1431 void finish(int r) override {
1432 pg->lock();
1433 pg->queue_peering_event(PG::CephPeeringEvtRef(
1434 new PG::CephPeeringEvt(
1435 epoch,
1436 epoch,
1437 evt)));
1438 pg->unlock();
1439 }
1440 };
1441
1442 class CephPeeringEvt {
1443 epoch_t epoch_sent;
1444 epoch_t epoch_requested;
1445 boost::intrusive_ptr< const boost::statechart::event_base > evt;
1446 string desc;
1447 public:
1448 MEMPOOL_CLASS_HELPERS();
1449 template <class T>
1450 CephPeeringEvt(epoch_t epoch_sent,
1451 epoch_t epoch_requested,
1452 const T &evt_) :
1453 epoch_sent(epoch_sent), epoch_requested(epoch_requested),
1454 evt(evt_.intrusive_from_this()) {
1455 stringstream out;
1456 out << "epoch_sent: " << epoch_sent
1457 << " epoch_requested: " << epoch_requested << " ";
1458 evt_.print(&out);
1459 desc = out.str();
1460 }
1461 epoch_t get_epoch_sent() { return epoch_sent; }
1462 epoch_t get_epoch_requested() { return epoch_requested; }
1463 const boost::statechart::event_base &get_event() { return *evt; }
1464 string get_desc() { return desc; }
1465 };
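  /* Usage sketch (illustrative): delivering a peering event through a Context
   * with QueuePeeringEvt, e.g. for a remote backfill reservation grant:
   *
   *   epoch_t e = pg->get_osdmap()->get_epoch();   // requires pg->lock() held
   *   Context *c = new QueuePeeringEvt<RemoteBackfillReserved>(
   *     pg, e, RemoteBackfillReserved());
   *   // when c->complete(0) runs, finish() locks the pg and queues a
   *   // CephPeeringEvt carrying the event stamped with epoch e, so stale
   *   // events can be recognized later by the peering machinery.
   */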
1466 typedef ceph::shared_ptr<CephPeeringEvt> CephPeeringEvtRef;
1467 list<CephPeeringEvtRef> peering_queue; // op queue
1468 list<CephPeeringEvtRef> peering_waiters;
1469
1470 struct QueryState : boost::statechart::event< QueryState > {
1471 Formatter *f;
1472 explicit QueryState(Formatter *f) : f(f) {}
1473 void print(std::ostream *out) const {
1474 *out << "Query";
1475 }
1476 };
1477
1478 struct MInfoRec : boost::statechart::event< MInfoRec > {
1479 pg_shard_t from;
1480 pg_info_t info;
1481 epoch_t msg_epoch;
1482 MInfoRec(pg_shard_t from, const pg_info_t &info, epoch_t msg_epoch) :
1483 from(from), info(info), msg_epoch(msg_epoch) {}
1484 void print(std::ostream *out) const {
1485 *out << "MInfoRec from " << from << " info: " << info;
1486 }
1487 };
1488
1489 struct MLogRec : boost::statechart::event< MLogRec > {
1490 pg_shard_t from;
1491 boost::intrusive_ptr<MOSDPGLog> msg;
1492 MLogRec(pg_shard_t from, MOSDPGLog *msg) :
1493 from(from), msg(msg) {}
1494 void print(std::ostream *out) const {
1495 *out << "MLogRec from " << from;
1496 }
1497 };
1498
1499 struct MNotifyRec : boost::statechart::event< MNotifyRec > {
1500 pg_shard_t from;
1501 pg_notify_t notify;
1502 uint64_t features;
1503 MNotifyRec(pg_shard_t from, const pg_notify_t &notify, uint64_t f) :
1504 from(from), notify(notify), features(f) {}
1505 void print(std::ostream *out) const {
1506 *out << "MNotifyRec from " << from << " notify: " << notify
1507 << " features: 0x" << hex << features << dec;
1508 }
1509 };
1510
1511 struct MQuery : boost::statechart::event< MQuery > {
1512 pg_shard_t from;
1513 pg_query_t query;
1514 epoch_t query_epoch;
1515 MQuery(pg_shard_t from, const pg_query_t &query, epoch_t query_epoch):
1516 from(from), query(query), query_epoch(query_epoch) {}
1517 void print(std::ostream *out) const {
1518 *out << "MQuery from " << from
1519 << " query_epoch " << query_epoch
1520 << " query: " << query;
1521 }
1522 };
1523
1524 struct AdvMap : boost::statechart::event< AdvMap > {
1525 OSDMapRef osdmap;
1526 OSDMapRef lastmap;
1527 vector<int> newup, newacting;
1528 int up_primary, acting_primary;
1529 AdvMap(
1530 OSDMapRef osdmap, OSDMapRef lastmap,
1531 vector<int>& newup, int up_primary,
1532 vector<int>& newacting, int acting_primary):
1533 osdmap(osdmap), lastmap(lastmap),
1534 newup(newup),
1535 newacting(newacting),
1536 up_primary(up_primary),
1537 acting_primary(acting_primary) {}
1538 void print(std::ostream *out) const {
1539 *out << "AdvMap";
1540 }
1541 };
1542
1543 struct ActMap : boost::statechart::event< ActMap > {
1544 ActMap() : boost::statechart::event< ActMap >() {}
1545 void print(std::ostream *out) const {
1546 *out << "ActMap";
1547 }
1548 };
1549 struct Activate : boost::statechart::event< Activate > {
1550 epoch_t activation_epoch;
1551 explicit Activate(epoch_t q) : boost::statechart::event< Activate >(),
1552 activation_epoch(q) {}
1553 void print(std::ostream *out) const {
1554 *out << "Activate from " << activation_epoch;
1555 }
1556 };
1557 struct RequestBackfillPrio : boost::statechart::event< RequestBackfillPrio > {
1558 unsigned priority;
1559 explicit RequestBackfillPrio(unsigned prio) :
1560 boost::statechart::event< RequestBackfillPrio >(),
1561 priority(prio) {}
1562 void print(std::ostream *out) const {
1563 *out << "RequestBackfillPrio: priority " << priority;
1564 }
1565 };
1566 #define TrivialEvent(T) struct T : boost::statechart::event< T > { \
1567 T() : boost::statechart::event< T >() {} \
1568 void print(std::ostream *out) const { \
1569 *out << #T; \
1570 } \
1571 };
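  /* Illustrative expansion: TrivialEvent(RequestBackfill) declares
   *
   *   struct RequestBackfill : boost::statechart::event< RequestBackfill > {
   *     RequestBackfill() : boost::statechart::event< RequestBackfill >() {}
   *     void print(std::ostream *out) const { *out << "RequestBackfill"; }
   *   };
   *
   * i.e. an argument-free statechart event whose print() emits its own name.
   */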
1572 struct DeferBackfill : boost::statechart::event<DeferBackfill> {
1573 float delay;
1574 explicit DeferBackfill(float delay) : delay(delay) {}
1575 void print(std::ostream *out) const {
1576 *out << "DeferBackfill: delay " << delay;
1577 }
1578 };
1579 struct DeferRecovery : boost::statechart::event<DeferRecovery> {
1580 float delay;
1581 explicit DeferRecovery(float delay) : delay(delay) {}
1582 void print(std::ostream *out) const {
1583 *out << "DeferRecovery: delay " << delay;
1584 }
1585 };
1586 struct UnfoundBackfill : boost::statechart::event<UnfoundBackfill> {
1587 explicit UnfoundBackfill() {}
1588 void print(std::ostream *out) const {
1589 *out << "UnfoundBackfill";
1590 }
1591 };
1592 struct UnfoundRecovery : boost::statechart::event<UnfoundRecovery> {
1593 explicit UnfoundRecovery() {}
1594 void print(std::ostream *out) const {
1595 *out << "UnfoundRecovery";
1596 }
1597 };
1598 protected:
1599 TrivialEvent(Initialize)
1600 TrivialEvent(Load)
1601 TrivialEvent(GotInfo)
1602 TrivialEvent(NeedUpThru)
1603 TrivialEvent(NullEvt)
1604 TrivialEvent(FlushedEvt)
1605 TrivialEvent(Backfilled)
1606 TrivialEvent(LocalBackfillReserved)
1607 TrivialEvent(RemoteBackfillReserved)
1608 TrivialEvent(RejectRemoteReservation)
1609 TrivialEvent(RemoteReservationRejected)
1610 TrivialEvent(RemoteReservationCanceled)
1611 TrivialEvent(RequestBackfill)
1612 TrivialEvent(RequestRecovery)
1613 TrivialEvent(RecoveryDone)
1614 TrivialEvent(BackfillTooFull)
1615 TrivialEvent(RecoveryTooFull)
1616
1617 TrivialEvent(MakePrimary)
1618 TrivialEvent(MakeStray)
1619 TrivialEvent(NeedActingChange)
1620 TrivialEvent(IsIncomplete)
1621 TrivialEvent(IsDown)
1622
1623 TrivialEvent(AllReplicasRecovered)
1624 TrivialEvent(DoRecovery)
1625 TrivialEvent(LocalRecoveryReserved)
1626 TrivialEvent(RemoteRecoveryReserved)
1627 TrivialEvent(AllRemotesReserved)
1628 TrivialEvent(AllBackfillsReserved)
1629 TrivialEvent(GoClean)
1630
1631 TrivialEvent(AllReplicasActivated)
1632
1633 TrivialEvent(IntervalFlush)
1634
1635 /* Encapsulates PG recovery process */
1636 class RecoveryState {
1637 void start_handle(RecoveryCtx *new_ctx);
1638 void end_handle();
1639 public:
1640 void begin_block_outgoing();
1641 void end_block_outgoing();
1642 void clear_blocked_outgoing();
1643 private:
1644
1645 /* States */
1646 struct Initial;
1647 class RecoveryMachine : public boost::statechart::state_machine< RecoveryMachine, Initial > {
1648 RecoveryState *state;
1649 public:
1650 PG *pg;
1651
1652 utime_t event_time;
1653 uint64_t event_count;
1654
1655 void clear_event_counters() {
1656 event_time = utime_t();
1657 event_count = 0;
1658 }
1659
1660 void log_enter(const char *state_name);
1661 void log_exit(const char *state_name, utime_t duration);
1662
1663 RecoveryMachine(RecoveryState *state, PG *pg) : state(state), pg(pg), event_count(0) {}
1664
1665 /* Accessor functions for state methods */
1666 ObjectStore::Transaction* get_cur_transaction() {
1667 assert(state->rctx);
1668 assert(state->rctx->transaction);
1669 return state->rctx->transaction;
1670 }
1671
1672 void send_query(pg_shard_t to, const pg_query_t &query) {
1673 assert(state->rctx);
1674 assert(state->rctx->query_map);
1675 (*state->rctx->query_map)[to.osd][spg_t(pg->info.pgid.pgid, to.shard)] =
1676 query;
1677 }
1678
1679 map<int, map<spg_t, pg_query_t> > *get_query_map() {
1680 assert(state->rctx);
1681 assert(state->rctx->query_map);
1682 return state->rctx->query_map;
1683 }
1684
1685 map<int, vector<pair<pg_notify_t, PastIntervals> > > *get_info_map() {
1686 assert(state->rctx);
1687 assert(state->rctx->info_map);
1688 return state->rctx->info_map;
1689 }
1690
1691 list< Context* > *get_on_safe_context_list() {
1692 assert(state->rctx);
1693 assert(state->rctx->on_safe);
1694 return &(state->rctx->on_safe->contexts);
1695 }
1696
1697 list< Context * > *get_on_applied_context_list() {
1698 assert(state->rctx);
1699 assert(state->rctx->on_applied);
1700 return &(state->rctx->on_applied->contexts);
1701 }
1702
1703 RecoveryCtx *get_recovery_ctx() { return &*(state->rctx); }
1704
1705 void send_notify(pg_shard_t to,
1706 const pg_notify_t &info, const PastIntervals &pi) {
1707 assert(state->rctx);
1708 assert(state->rctx->notify_list);
1709 (*state->rctx->notify_list)[to.osd].push_back(make_pair(info, pi));
1710 }
1711 };
1712 friend class RecoveryMachine;
1713
1714 /* States */
1715 // Initial
1716 // Reset
1717 // Start
1718 // Started
1719 // Primary
1720 // WaitActingChange
1721 // Peering
1722 // GetInfo
1723 // GetLog
1724 // GetMissing
1725 // WaitUpThru
1726 // Incomplete
1727 // Active
1728 // Activating
1729 // Clean
1730 // Recovered
1731 // Backfilling
1732 // WaitRemoteBackfillReserved
1733 // WaitLocalBackfillReserved
1734 // NotBackfilling
1735 // NotRecovering
1736 // Recovering
1737 // WaitRemoteRecoveryReserved
1738 // WaitLocalRecoveryReserved
1739 // ReplicaActive
1740 // RepNotRecovering
1741 // RepRecovering
1742 // RepWaitBackfillReserved
1743 // RepWaitRecoveryReserved
1744 // Stray
1745
1746 struct Crashed : boost::statechart::state< Crashed, RecoveryMachine >, NamedState {
1747 explicit Crashed(my_context ctx);
1748 };
1749
1750 struct Reset;
1751
1752 struct Initial : boost::statechart::state< Initial, RecoveryMachine >, NamedState {
1753 explicit Initial(my_context ctx);
1754 void exit();
1755
1756 typedef boost::mpl::list <
1757 boost::statechart::transition< Initialize, Reset >,
1758 boost::statechart::custom_reaction< Load >,
1759 boost::statechart::custom_reaction< NullEvt >,
1760 boost::statechart::transition< boost::statechart::event_base, Crashed >
1761 > reactions;
1762
1763 boost::statechart::result react(const Load&);
1764 boost::statechart::result react(const MNotifyRec&);
1765 boost::statechart::result react(const MInfoRec&);
1766 boost::statechart::result react(const MLogRec&);
1767 boost::statechart::result react(const boost::statechart::event_base&) {
1768 return discard_event();
1769 }
1770 };
1771
1772 struct Reset : boost::statechart::state< Reset, RecoveryMachine >, NamedState {
1773 explicit Reset(my_context ctx);
1774 void exit();
1775
1776 typedef boost::mpl::list <
1777 boost::statechart::custom_reaction< QueryState >,
1778 boost::statechart::custom_reaction< AdvMap >,
1779 boost::statechart::custom_reaction< ActMap >,
1780 boost::statechart::custom_reaction< NullEvt >,
1781 boost::statechart::custom_reaction< FlushedEvt >,
1782 boost::statechart::custom_reaction< IntervalFlush >,
1783 boost::statechart::transition< boost::statechart::event_base, Crashed >
1784 > reactions;
1785 boost::statechart::result react(const QueryState& q);
1786 boost::statechart::result react(const AdvMap&);
1787 boost::statechart::result react(const ActMap&);
1788 boost::statechart::result react(const FlushedEvt&);
1789 boost::statechart::result react(const IntervalFlush&);
1790 boost::statechart::result react(const boost::statechart::event_base&) {
1791 return discard_event();
1792 }
1793 };
1794
1795 struct Start;
1796
1797 struct Started : boost::statechart::state< Started, RecoveryMachine, Start >, NamedState {
1798 explicit Started(my_context ctx);
1799 void exit();
1800
1801 typedef boost::mpl::list <
1802 boost::statechart::custom_reaction< QueryState >,
1803 boost::statechart::custom_reaction< AdvMap >,
1804 boost::statechart::custom_reaction< NullEvt >,
1805 boost::statechart::custom_reaction< FlushedEvt >,
1806 boost::statechart::custom_reaction< IntervalFlush >,
1807 boost::statechart::transition< boost::statechart::event_base, Crashed >
1808 > reactions;
1809 boost::statechart::result react(const QueryState& q);
1810 boost::statechart::result react(const AdvMap&);
1811 boost::statechart::result react(const FlushedEvt&);
1812 boost::statechart::result react(const IntervalFlush&);
1813 boost::statechart::result react(const boost::statechart::event_base&) {
1814 return discard_event();
1815 }
1816 };
1817
1818 struct Primary;
1819 struct Stray;
1820
1821 struct Start : boost::statechart::state< Start, Started >, NamedState {
1822 explicit Start(my_context ctx);
1823 void exit();
1824
1825 typedef boost::mpl::list <
1826 boost::statechart::transition< MakePrimary, Primary >,
1827 boost::statechart::transition< MakeStray, Stray >
1828 > reactions;
1829 };
1830
1831 struct Peering;
1832 struct WaitActingChange;
1833 struct Incomplete;
1834 struct Down;
1835
1836 struct Primary : boost::statechart::state< Primary, Started, Peering >, NamedState {
1837 explicit Primary(my_context ctx);
1838 void exit();
1839
1840 typedef boost::mpl::list <
1841 boost::statechart::custom_reaction< ActMap >,
1842 boost::statechart::custom_reaction< MNotifyRec >,
1843 boost::statechart::transition< NeedActingChange, WaitActingChange >
1844 > reactions;
1845 boost::statechart::result react(const ActMap&);
1846 boost::statechart::result react(const MNotifyRec&);
1847 };
1848
1849 struct WaitActingChange : boost::statechart::state< WaitActingChange, Primary>,
1850 NamedState {
1851 typedef boost::mpl::list <
1852 boost::statechart::custom_reaction< QueryState >,
1853 boost::statechart::custom_reaction< AdvMap >,
1854 boost::statechart::custom_reaction< MLogRec >,
1855 boost::statechart::custom_reaction< MInfoRec >,
1856 boost::statechart::custom_reaction< MNotifyRec >
1857 > reactions;
1858 explicit WaitActingChange(my_context ctx);
1859 boost::statechart::result react(const QueryState& q);
1860 boost::statechart::result react(const AdvMap&);
1861 boost::statechart::result react(const MLogRec&);
1862 boost::statechart::result react(const MInfoRec&);
1863 boost::statechart::result react(const MNotifyRec&);
1864 void exit();
1865 };
1866
1867 struct GetInfo;
1868 struct Active;
1869
1870 struct Peering : boost::statechart::state< Peering, Primary, GetInfo >, NamedState {
1871 PastIntervals::PriorSet prior_set;
1872 bool history_les_bound; ///< need osd_find_best_info_ignore_history_les
1873
1874 explicit Peering(my_context ctx);
1875 void exit();
1876
1877 typedef boost::mpl::list <
1878 boost::statechart::custom_reaction< QueryState >,
1879 boost::statechart::transition< Activate, Active >,
1880 boost::statechart::custom_reaction< AdvMap >
1881 > reactions;
1882 boost::statechart::result react(const QueryState& q);
1883 boost::statechart::result react(const AdvMap &advmap);
1884 };
1885
1886 struct WaitLocalRecoveryReserved;
1887 struct Activating;
1888 struct Active : boost::statechart::state< Active, Primary, Activating >, NamedState {
1889 explicit Active(my_context ctx);
1890 void exit();
1891
1892 const set<pg_shard_t> remote_shards_to_reserve_recovery;
1893 const set<pg_shard_t> remote_shards_to_reserve_backfill;
1894 bool all_replicas_activated;
1895
1896 typedef boost::mpl::list <
1897 boost::statechart::custom_reaction< QueryState >,
1898 boost::statechart::custom_reaction< ActMap >,
1899 boost::statechart::custom_reaction< AdvMap >,
1900 boost::statechart::custom_reaction< MInfoRec >,
1901 boost::statechart::custom_reaction< MNotifyRec >,
1902 boost::statechart::custom_reaction< MLogRec >,
1903 boost::statechart::custom_reaction< Backfilled >,
1904 boost::statechart::custom_reaction< AllReplicasActivated >,
1905 boost::statechart::custom_reaction< DeferRecovery >,
1906 boost::statechart::custom_reaction< DeferBackfill >,
1907 boost::statechart::custom_reaction< UnfoundRecovery >,
1908 boost::statechart::custom_reaction< UnfoundBackfill >,
1909 boost::statechart::custom_reaction< DoRecovery>
1910 > reactions;
1911 boost::statechart::result react(const QueryState& q);
1912 boost::statechart::result react(const ActMap&);
1913 boost::statechart::result react(const AdvMap&);
1914 boost::statechart::result react(const MInfoRec& infoevt);
1915 boost::statechart::result react(const MNotifyRec& notevt);
1916 boost::statechart::result react(const MLogRec& logevt);
1917 boost::statechart::result react(const Backfilled&) {
1918 return discard_event();
1919 }
1920 boost::statechart::result react(const AllReplicasActivated&);
1921 boost::statechart::result react(const DeferRecovery& evt) {
1922 return discard_event();
1923 }
1924 boost::statechart::result react(const DeferBackfill& evt) {
1925 return discard_event();
1926 }
1927 boost::statechart::result react(const UnfoundRecovery& evt) {
1928 return discard_event();
1929 }
1930 boost::statechart::result react(const UnfoundBackfill& evt) {
1931 return discard_event();
1932 }
1933 boost::statechart::result react(const DoRecovery&) {
1934 return discard_event();
1935 }
1936 };
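// Note: the discard_event() reactions above are effectively defaults.
// boost::statechart dispatches to the innermost active state first, so
// substates such as Recovering, Backfilling and NotRecovering can still
// install their own reactions to DeferRecovery/DeferBackfill and the
// Unfound* events; only events that bubble up to Active itself are dropped.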
1937
1938 struct Clean : boost::statechart::state< Clean, Active >, NamedState {
1939 typedef boost::mpl::list<
1940 boost::statechart::transition< DoRecovery, WaitLocalRecoveryReserved >
1941 > reactions;
1942 explicit Clean(my_context ctx);
1943 void exit();
1944 };
1945
1946 struct Recovered : boost::statechart::state< Recovered, Active >, NamedState {
1947 typedef boost::mpl::list<
1948 boost::statechart::transition< GoClean, Clean >,
1949 boost::statechart::transition< DoRecovery, WaitLocalRecoveryReserved >,
1950 boost::statechart::custom_reaction< AllReplicasActivated >
1951 > reactions;
1952 explicit Recovered(my_context ctx);
1953 void exit();
1954 boost::statechart::result react(const AllReplicasActivated&) {
1955 post_event(GoClean());
1956 return forward_event();
1957 }
1958 };
1959
1960 struct Backfilling : boost::statechart::state< Backfilling, Active >, NamedState {
1961 typedef boost::mpl::list<
1962 boost::statechart::transition< Backfilled, Recovered >,
1963 boost::statechart::custom_reaction< DeferBackfill >,
1964 boost::statechart::custom_reaction< UnfoundBackfill >,
1965 boost::statechart::custom_reaction< RemoteReservationRejected >
1966 > reactions;
1967 explicit Backfilling(my_context ctx);
1968 boost::statechart::result react(const RemoteReservationRejected& evt);
1969 boost::statechart::result react(const DeferBackfill& evt);
1970 boost::statechart::result react(const UnfoundBackfill& evt);
1971 void exit();
1972 };
1973
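// Backfill reservation flow (as declared below): WaitLocalBackfillReserved
// first takes the local reservation, WaitRemoteBackfillReserved then walks
// remote_shards_to_reserve_backfill one shard at a time via backfill_osd_it,
// and Backfilling is entered only once AllBackfillsReserved fires.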
1974 struct WaitRemoteBackfillReserved : boost::statechart::state< WaitRemoteBackfillReserved, Active >, NamedState {
1975 typedef boost::mpl::list<
1976 boost::statechart::custom_reaction< RemoteBackfillReserved >,
1977 boost::statechart::custom_reaction< RemoteReservationRejected >,
1978 boost::statechart::transition< AllBackfillsReserved, Backfilling >
1979 > reactions;
1980 set<pg_shard_t>::const_iterator backfill_osd_it;
1981 explicit WaitRemoteBackfillReserved(my_context ctx);
1982 void exit();
1983 boost::statechart::result react(const RemoteBackfillReserved& evt);
1984 boost::statechart::result react(const RemoteReservationRejected& evt);
1985 };
1986
1987 struct WaitLocalBackfillReserved : boost::statechart::state< WaitLocalBackfillReserved, Active >, NamedState {
1988 typedef boost::mpl::list<
1989 boost::statechart::transition< LocalBackfillReserved, WaitRemoteBackfillReserved >
1990 > reactions;
1991 explicit WaitLocalBackfillReserved(my_context ctx);
1992 void exit();
1993 };
1994
1995 struct NotBackfilling : boost::statechart::state< NotBackfilling, Active>, NamedState {
1996 typedef boost::mpl::list<
1997 boost::statechart::transition< RequestBackfill, WaitLocalBackfillReserved>,
1998 boost::statechart::custom_reaction< RemoteBackfillReserved >,
1999 boost::statechart::custom_reaction< RemoteReservationRejected >
2000 > reactions;
2001 explicit NotBackfilling(my_context ctx);
2002 void exit();
2003 boost::statechart::result react(const RemoteBackfillReserved& evt);
2004 boost::statechart::result react(const RemoteReservationRejected& evt);
2005 };
2006
2007 struct NotRecovering : boost::statechart::state< NotRecovering, Active>, NamedState {
2008 typedef boost::mpl::list<
2009 boost::statechart::transition< DoRecovery, WaitLocalRecoveryReserved >,
2010 boost::statechart::custom_reaction< DeferRecovery >,
2011 boost::statechart::custom_reaction< UnfoundRecovery >
2012 > reactions;
2013 explicit NotRecovering(my_context ctx);
2014 boost::statechart::result react(const DeferRecovery& evt) {
2015 /* no-op */
2016 return discard_event();
2017 }
2018 boost::statechart::result react(const UnfoundRecovery& evt) {
2019 /* no-op */
2020 return discard_event();
2021 }
2022 void exit();
2023 };
2024
2025 struct RepNotRecovering;
2026 struct ReplicaActive : boost::statechart::state< ReplicaActive, Started, RepNotRecovering >, NamedState {
2027 explicit ReplicaActive(my_context ctx);
2028 void exit();
2029
2030 typedef boost::mpl::list <
2031 boost::statechart::custom_reaction< QueryState >,
2032 boost::statechart::custom_reaction< ActMap >,
2033 boost::statechart::custom_reaction< MQuery >,
2034 boost::statechart::custom_reaction< MInfoRec >,
2035 boost::statechart::custom_reaction< MLogRec >,
2036 boost::statechart::custom_reaction< Activate >,
2037 boost::statechart::custom_reaction< DeferRecovery >,
2038 boost::statechart::custom_reaction< DeferBackfill >,
2039 boost::statechart::custom_reaction< UnfoundRecovery >,
2040 boost::statechart::custom_reaction< UnfoundBackfill >
2041 > reactions;
2042 boost::statechart::result react(const QueryState& q);
2043 boost::statechart::result react(const MInfoRec& infoevt);
2044 boost::statechart::result react(const MLogRec& logevt);
2045 boost::statechart::result react(const ActMap&);
2046 boost::statechart::result react(const MQuery&);
2047 boost::statechart::result react(const Activate&);
2048 boost::statechart::result react(const DeferRecovery& evt) {
2049 return discard_event();
2050 }
2051 boost::statechart::result react(const DeferBackfill& evt) {
2052 return discard_event();
2053 }
2054 boost::statechart::result react(const UnfoundRecovery& evt) {
2055 return discard_event();
2056 }
2057 boost::statechart::result react(const UnfoundBackfill& evt) {
2058 return discard_event();
2059 }
2060 };
2061
2062 struct RepRecovering : boost::statechart::state< RepRecovering, ReplicaActive >, NamedState {
2063 typedef boost::mpl::list<
2064 boost::statechart::transition< RecoveryDone, RepNotRecovering >,
2065 // for compat with old peers
2066 boost::statechart::transition< RemoteReservationRejected, RepNotRecovering >,
2067 boost::statechart::transition< RemoteReservationCanceled, RepNotRecovering >,
2068 boost::statechart::custom_reaction< BackfillTooFull >
2069 > reactions;
2070 explicit RepRecovering(my_context ctx);
2071 boost::statechart::result react(const BackfillTooFull &evt);
2072 void exit();
2073 };
2074
2075 struct RepWaitBackfillReserved : boost::statechart::state< RepWaitBackfillReserved, ReplicaActive >, NamedState {
2076 typedef boost::mpl::list<
2077 boost::statechart::custom_reaction< RemoteBackfillReserved >,
2078 boost::statechart::custom_reaction< RejectRemoteReservation >,
2079 boost::statechart::custom_reaction< RemoteReservationRejected >,
2080 boost::statechart::custom_reaction< RemoteReservationCanceled >
2081 > reactions;
2082 explicit RepWaitBackfillReserved(my_context ctx);
2083 void exit();
2084 boost::statechart::result react(const RemoteBackfillReserved &evt);
2085 boost::statechart::result react(const RejectRemoteReservation &evt);
2086 boost::statechart::result react(const RemoteReservationRejected &evt);
2087 boost::statechart::result react(const RemoteReservationCanceled &evt);
2088 };
2089
2090 struct RepWaitRecoveryReserved : boost::statechart::state< RepWaitRecoveryReserved, ReplicaActive >, NamedState {
2091 typedef boost::mpl::list<
2092 boost::statechart::custom_reaction< RemoteRecoveryReserved >,
2093 // for compat with old peers
2094 boost::statechart::custom_reaction< RemoteReservationRejected >,
2095 boost::statechart::custom_reaction< RemoteReservationCanceled >
2096 > reactions;
2097 explicit RepWaitRecoveryReserved(my_context ctx);
2098 void exit();
2099 boost::statechart::result react(const RemoteRecoveryReserved &evt);
2100 boost::statechart::result react(const RemoteReservationRejected &evt) {
2101 // for compat with old peers
2102 post_event(RemoteReservationCanceled());
2103 return discard_event();
2104 }
2105 boost::statechart::result react(const RemoteReservationCanceled &evt);
2106 };
2107
2108 struct RepNotRecovering : boost::statechart::state< RepNotRecovering, ReplicaActive>, NamedState {
2109 typedef boost::mpl::list<
2110 boost::statechart::custom_reaction< RequestBackfillPrio >,
2111 boost::statechart::transition< RequestRecovery, RepWaitRecoveryReserved >,
2112 boost::statechart::custom_reaction< RejectRemoteReservation >,
2113 boost::statechart::transition< RemoteReservationRejected, RepNotRecovering >,
2114 boost::statechart::transition< RemoteReservationCanceled, RepNotRecovering >,
2115 boost::statechart::transition< RecoveryDone, RepNotRecovering > // for compat with pre-reservation peers
2116 > reactions;
2117 explicit RepNotRecovering(my_context ctx);
2118 boost::statechart::result react(const RequestBackfillPrio &evt);
2119 boost::statechart::result react(const RejectRemoteReservation &evt);
2120 void exit();
2121 };
2122
2123 struct Recovering : boost::statechart::state< Recovering, Active >, NamedState {
2124 typedef boost::mpl::list <
2125 boost::statechart::custom_reaction< AllReplicasRecovered >,
2126 boost::statechart::custom_reaction< DeferRecovery >,
2127 boost::statechart::custom_reaction< UnfoundRecovery >,
2128 boost::statechart::custom_reaction< RequestBackfill >
2129 > reactions;
2130 explicit Recovering(my_context ctx);
2131 void exit();
2132 void release_reservations(bool cancel = false);
2133 boost::statechart::result react(const AllReplicasRecovered &evt);
2134 boost::statechart::result react(const DeferRecovery& evt);
2135 boost::statechart::result react(const UnfoundRecovery& evt);
2136 boost::statechart::result react(const RequestBackfill &evt);
2137 };
2138
2139 struct WaitRemoteRecoveryReserved : boost::statechart::state< WaitRemoteRecoveryReserved, Active >, NamedState {
2140 typedef boost::mpl::list <
2141 boost::statechart::custom_reaction< RemoteRecoveryReserved >,
2142 boost::statechart::transition< AllRemotesReserved, Recovering >
2143 > reactions;
2144 set<pg_shard_t>::const_iterator remote_recovery_reservation_it;
2145 explicit WaitRemoteRecoveryReserved(my_context ctx);
2146 boost::statechart::result react(const RemoteRecoveryReserved &evt);
2147 void exit();
2148 };
2149
2150 struct WaitLocalRecoveryReserved : boost::statechart::state< WaitLocalRecoveryReserved, Active >, NamedState {
2151 typedef boost::mpl::list <
2152 boost::statechart::transition< LocalRecoveryReserved, WaitRemoteRecoveryReserved >,
2153 boost::statechart::custom_reaction< RecoveryTooFull >
2154 > reactions;
2155 explicit WaitLocalRecoveryReserved(my_context ctx);
2156 void exit();
2157 boost::statechart::result react(const RecoveryTooFull &evt);
2158 };
2159
2160 struct Activating : boost::statechart::state< Activating, Active >, NamedState {
2161 typedef boost::mpl::list <
2162 boost::statechart::transition< AllReplicasRecovered, Recovered >,
2163 boost::statechart::transition< DoRecovery, WaitLocalRecoveryReserved >,
2164 boost::statechart::transition< RequestBackfill, WaitLocalBackfillReserved >
2165 > reactions;
2166 explicit Activating(my_context ctx);
2167 void exit();
2168 };
2169
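// Stray: this OSD still holds (or may hold) data for the PG but is not in
// the acting set.  It answers the primary's MQuery/MLogRec/MInfoRec and,
// via ActMap, sends notifies so the primary knows this copy exists and can
// eventually recover from it or purge it.  (Descriptive summary only.)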
2170 struct Stray : boost::statechart::state< Stray, Started >, NamedState {
2171 map<int, pair<pg_query_t, epoch_t> > pending_queries;
2172
2173 explicit Stray(my_context ctx);
2174 void exit();
2175
2176 typedef boost::mpl::list <
2177 boost::statechart::custom_reaction< MQuery >,
2178 boost::statechart::custom_reaction< MLogRec >,
2179 boost::statechart::custom_reaction< MInfoRec >,
2180 boost::statechart::custom_reaction< ActMap >,
2181 boost::statechart::custom_reaction< RecoveryDone >
2182 > reactions;
2183 boost::statechart::result react(const MQuery& query);
2184 boost::statechart::result react(const MLogRec& logevt);
2185 boost::statechart::result react(const MInfoRec& infoevt);
2186 boost::statechart::result react(const ActMap&);
2187 boost::statechart::result react(const RecoveryDone&) {
2188 return discard_event();
2189 }
2190 };
2191
2192 struct GetLog;
2193
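// Peering sub-states, roughly in the order they run on the primary:
// GetInfo gathers pg_info_t from OSDs of prior intervals, GetLog pulls the
// authoritative log from the chosen auth_log_shard, GetMissing collects the
// missing sets of the remaining peers, and WaitUpThru waits for our up_thru
// to appear in the OSDMap before Activate is delivered.
// (Summary comment only; see PG.cc for the actual logic.)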
2194 struct GetInfo : boost::statechart::state< GetInfo, Peering >, NamedState {
2195 set<pg_shard_t> peer_info_requested;
2196
2197 explicit GetInfo(my_context ctx);
2198 void exit();
2199 void get_infos();
2200
2201 typedef boost::mpl::list <
2202 boost::statechart::custom_reaction< QueryState >,
2203 boost::statechart::transition< GotInfo, GetLog >,
2204 boost::statechart::custom_reaction< MNotifyRec >,
2205 boost::statechart::transition< IsDown, Down >
2206 > reactions;
2207 boost::statechart::result react(const QueryState& q);
2208 boost::statechart::result react(const MNotifyRec& infoevt);
2209 };
2210
2211 struct GotLog : boost::statechart::event< GotLog > {
2212 GotLog() : boost::statechart::event< GotLog >() {}
2213 };
2214
2215 struct GetLog : boost::statechart::state< GetLog, Peering >, NamedState {
2216 pg_shard_t auth_log_shard;
2217 boost::intrusive_ptr<MOSDPGLog> msg;
2218
2219 explicit GetLog(my_context ctx);
2220 void exit();
2221
2222 typedef boost::mpl::list <
2223 boost::statechart::custom_reaction< QueryState >,
2224 boost::statechart::custom_reaction< MLogRec >,
2225 boost::statechart::custom_reaction< GotLog >,
2226 boost::statechart::custom_reaction< AdvMap >,
2227 boost::statechart::transition< IsIncomplete, Incomplete >
2228 > reactions;
2229 boost::statechart::result react(const AdvMap&);
2230 boost::statechart::result react(const QueryState& q);
2231 boost::statechart::result react(const MLogRec& logevt);
2232 boost::statechart::result react(const GotLog&);
2233 };
2234
2235 struct WaitUpThru;
2236
2237 struct GetMissing : boost::statechart::state< GetMissing, Peering >, NamedState {
2238 set<pg_shard_t> peer_missing_requested;
2239
2240 explicit GetMissing(my_context ctx);
2241 void exit();
2242
2243 typedef boost::mpl::list <
2244 boost::statechart::custom_reaction< QueryState >,
2245 boost::statechart::custom_reaction< MLogRec >,
2246 boost::statechart::transition< NeedUpThru, WaitUpThru >
2247 > reactions;
2248 boost::statechart::result react(const QueryState& q);
2249 boost::statechart::result react(const MLogRec& logevt);
2250 };
2251
2252 struct WaitUpThru : boost::statechart::state< WaitUpThru, Peering >, NamedState {
2253 explicit WaitUpThru(my_context ctx);
2254 void exit();
2255
2256 typedef boost::mpl::list <
2257 boost::statechart::custom_reaction< QueryState >,
2258 boost::statechart::custom_reaction< ActMap >,
2259 boost::statechart::custom_reaction< MLogRec >
2260 > reactions;
2261 boost::statechart::result react(const QueryState& q);
2262 boost::statechart::result react(const ActMap& am);
2263 boost::statechart::result react(const MLogRec& logrec);
2264 };
2265
2266 struct Down : boost::statechart::state< Down, Peering>, NamedState {
2267 explicit Down(my_context ctx);
2268 typedef boost::mpl::list <
2269 boost::statechart::custom_reaction< QueryState >
2270 > reactions;
2271 boost::statechart::result react(const QueryState& infoevt);
2272 void exit();
2273 };
2274
2275 struct Incomplete : boost::statechart::state< Incomplete, Peering>, NamedState {
2276 typedef boost::mpl::list <
2277 boost::statechart::custom_reaction< AdvMap >,
2278 boost::statechart::custom_reaction< MNotifyRec >,
2279 boost::statechart::custom_reaction< QueryState >
2280 > reactions;
2281 explicit Incomplete(my_context ctx);
2282 boost::statechart::result react(const AdvMap &advmap);
2283 boost::statechart::result react(const MNotifyRec& infoevt);
2284 boost::statechart::result react(const QueryState& infoevt);
2285 void exit();
2286 };
2287
2288
2289 RecoveryMachine machine;
2290 PG *pg;
2291
2292 /// context passed in by state machine caller
2293 RecoveryCtx *orig_ctx;
2294
2295 /// populated if we are buffering messages pending a flush
2296 boost::optional<BufferedRecoveryMessages> messages_pending_flush;
2297
2298 /**
2299 * populated between start_handle() and end_handle(), points into
2300 * the message lists for messages_pending_flush while blocking messages
2301 * or into orig_ctx otherwise
2302 */
2303 boost::optional<RecoveryCtx> rctx;
2304
2305 public:
2306 explicit RecoveryState(PG *pg)
2307 : machine(this, pg), pg(pg), orig_ctx(0) {
2308 machine.initiate();
2309 }
2310
2311 void handle_event(const boost::statechart::event_base &evt,
2312 RecoveryCtx *rctx) {
2313 start_handle(rctx);
2314 machine.process_event(evt);
2315 end_handle();
2316 }
2317
2318 void handle_event(CephPeeringEvtRef evt,
2319 RecoveryCtx *rctx) {
2320 start_handle(rctx);
2321 machine.process_event(evt->get_event());
2322 end_handle();
2323 }
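// Typical driver, sketched for illustration only (the real call sites live
// in PG.cc/OSD.cc and hold the PG lock):
//
//   RecoveryCtx rctx = ...;                  // supplied by the OSD
//   recovery_state.handle_event(evt, &rctx); // evt: CephPeeringEvtRef
//
// start_handle()/end_handle() bracket each event so that messages produced
// while an interval flush is pending are buffered in messages_pending_flush
// rather than sent immediately (see the rctx/orig_ctx members above).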
2324
2325 } recovery_state;
2326
2327
2328 public:
2329 PG(OSDService *o, OSDMapRef curmap,
2330 const PGPool &pool, spg_t p);
2331 ~PG() override;
2332
2333 private:
2334 // Prevent copying
2335 explicit PG(const PG& rhs);
2336 PG& operator=(const PG& rhs);
2337 const spg_t pg_id;
2338 uint64_t peer_features;
2339 uint64_t acting_features;
2340 uint64_t upacting_features;
2341
2342 epoch_t last_epoch;
2343
2344 public:
2345 const spg_t& get_pgid() const { return pg_id; }
2346
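// peer_features tracks the intersection of feature bits across everyone we
// have heard from: reset_min_peer_features() starts from the full supported
// set and apply_peer_features() ANDs in each peer's bits, so
// get_min_peer_features() returns the features common to all known peers.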
2347 void reset_min_peer_features() {
2348 peer_features = CEPH_FEATURES_SUPPORTED_DEFAULT;
2349 }
2350 uint64_t get_min_peer_features() const { return peer_features; }
2351 void apply_peer_features(uint64_t f) { peer_features &= f; }
2352
2353 uint64_t get_min_acting_features() const { return acting_features; }
2354 uint64_t get_min_upacting_features() const { return upacting_features; }
2355 bool perform_deletes_during_peering() const {
2356 return !(get_osdmap()->test_flag(CEPH_OSDMAP_RECOVERY_DELETES));
2357 }
2358
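// Rebuild the up/acting shard sets from the raw OSD id vectors in the
// OSDMap.  For erasure-coded pools the index in the vector is the shard id;
// for replicated pools every entry uses NO_SHARD, so the primaries are
// identified by OSD id alone.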
2359 void init_primary_up_acting(
2360 const vector<int> &newup,
2361 const vector<int> &newacting,
2362 int new_up_primary,
2363 int new_acting_primary) {
2364 actingset.clear();
2365 acting = newacting;
2366 for (uint8_t i = 0; i < acting.size(); ++i) {
2367 if (acting[i] != CRUSH_ITEM_NONE)
2368 actingset.insert(
2369 pg_shard_t(
2370 acting[i],
2371 pool.info.ec_pool() ? shard_id_t(i) : shard_id_t::NO_SHARD));
2372 }
2373 upset.clear();
2374 up = newup;
2375 for (uint8_t i = 0; i < up.size(); ++i) {
2376 if (up[i] != CRUSH_ITEM_NONE)
2377 upset.insert(
2378 pg_shard_t(
2379 up[i],
2380 pool.info.ec_pool() ? shard_id_t(i) : shard_id_t::NO_SHARD));
2381 }
2382 if (!pool.info.ec_pool()) {
2383 up_primary = pg_shard_t(new_up_primary, shard_id_t::NO_SHARD);
2384 primary = pg_shard_t(new_acting_primary, shard_id_t::NO_SHARD);
2385 return;
2386 }
2387 up_primary = pg_shard_t();
2388 primary = pg_shard_t();
2389 for (uint8_t i = 0; i < up.size(); ++i) {
2390 if (up[i] == new_up_primary) {
2391 up_primary = pg_shard_t(up[i], shard_id_t(i));
2392 break;
2393 }
2394 }
2395 for (uint8_t i = 0; i < acting.size(); ++i) {
2396 if (acting[i] == new_acting_primary) {
2397 primary = pg_shard_t(acting[i], shard_id_t(i));
2398 break;
2399 }
2400 }
2401 assert(up_primary.osd == new_up_primary);
2402 assert(primary.osd == new_acting_primary);
2403 }
2404 pg_shard_t get_primary() const { return primary; }
2405
2406 int get_role() const { return role; }
2407 void set_role(int r) { role = r; }
2408
2409 bool is_primary() const { return pg_whoami == primary; }
2410 bool is_replica() const { return role > 0; }
2411
2412 epoch_t get_last_peering_reset() const { return last_peering_reset; }
2413
2414 //int get_state() const { return state; }
2415 bool state_test(int m) const { return (state & m) != 0; }
2416 void state_set(int m) { state |= m; }
2417 void state_clear(int m) { state &= ~m; }
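// `state` is a bitmask of PG_STATE_* flags, so state_test() asks whether
// any of the given bits are set.  Illustrative use only:
//
//   if (state_test(PG_STATE_ACTIVE | PG_STATE_PEERED))
//     ...  // equivalent to is_peered() below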
2418
2419 bool is_complete() const { return info.last_complete == info.last_update; }
2420 bool should_send_notify() const { return send_notify; }
2421
2422 int get_state() const { return state; }
2423 bool is_active() const { return state_test(PG_STATE_ACTIVE); }
2424 bool is_activating() const { return state_test(PG_STATE_ACTIVATING); }
2425 bool is_peering() const { return state_test(PG_STATE_PEERING); }
2426 bool is_down() const { return state_test(PG_STATE_DOWN); }
2427 bool is_recovery_unfound() const { return state_test(PG_STATE_RECOVERY_UNFOUND); }
2428 bool is_backfill_unfound() const { return state_test(PG_STATE_BACKFILL_UNFOUND); }
2429 bool is_incomplete() const { return state_test(PG_STATE_INCOMPLETE); }
2430 bool is_clean() const { return state_test(PG_STATE_CLEAN); }
2431 bool is_degraded() const { return state_test(PG_STATE_DEGRADED); }
2432 bool is_undersized() const { return state_test(PG_STATE_UNDERSIZED); }
2433
2434 bool is_scrubbing() const { return state_test(PG_STATE_SCRUBBING); }
2435 bool is_remapped() const { return state_test(PG_STATE_REMAPPED); }
2436 bool is_peered() const {
2437 return state_test(PG_STATE_ACTIVE) || state_test(PG_STATE_PEERED);
2438 }
2439
2440 bool is_empty() const { return info.last_update == eversion_t(0,0); }
2441
2442 void init(
2443 int role,
2444 const vector<int>& up,
2445 int up_primary,
2446 const vector<int>& acting,
2447 int acting_primary,
2448 const pg_history_t& history,
2449 const PastIntervals& pim,
2450 bool backfill,
2451 ObjectStore::Transaction *t);
2452
2453 // pg on-disk state
2454 void do_pending_flush();
2455
2456 static void _create(ObjectStore::Transaction& t, spg_t pgid, int bits);
2457 static void _init(ObjectStore::Transaction& t,
2458 spg_t pgid, const pg_pool_t *pool);
2459
2460 private:
2461 void prepare_write_info(map<string,bufferlist> *km);
2462
2463 void update_store_with_options();
2464 void update_store_on_load();
2465
2466 public:
2467 static int _prepare_write_info(
2468 CephContext* cct,
2469 map<string,bufferlist> *km,
2470 epoch_t epoch,
2471 pg_info_t &info,
2472 pg_info_t &last_written_info,
2473 PastIntervals &past_intervals,
2474 bool dirty_big_info,
2475 bool dirty_epoch,
2476 bool try_fast_info,
2477 PerfCounters *logger = nullptr);
2478 void write_if_dirty(ObjectStore::Transaction& t);
2479
2480 PGLog::IndexedLog projected_log;
2481 bool check_in_progress_op(
2482 const osd_reqid_t &r,
2483 eversion_t *version,
2484 version_t *user_version,
2485 int *return_code) const;
2486 eversion_t projected_last_update;
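// The next version to assign to a new log entry: the current OSDMap epoch
// paired with projected_last_update.version + 1.  The asserts guard the
// invariant that versions only move forward relative to both the persisted
// log head and the projected log.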
2487 eversion_t get_next_version() const {
2488 eversion_t at_version(
2489 get_osdmap()->get_epoch(),
2490 projected_last_update.version+1);
2491 assert(at_version > info.last_update);
2492 assert(at_version > pg_log.get_head());
2493 assert(at_version > projected_last_update);
2494 return at_version;
2495 }
2496
2497 void add_log_entry(const pg_log_entry_t& e, bool applied);
2498 void append_log(
2499 const vector<pg_log_entry_t>& logv,
2500 eversion_t trim_to,
2501 eversion_t roll_forward_to,
2502 ObjectStore::Transaction &t,
2503 bool transaction_applied = true);
2504 bool check_log_for_corruption(ObjectStore *store);
2505 void trim_log();
2506
2507 std::string get_corrupt_pg_log_name() const;
2508 static int read_info(
2509 ObjectStore *store, spg_t pgid, const coll_t &coll,
2510 bufferlist &bl, pg_info_t &info, PastIntervals &past_intervals,
2511 __u8 &);
2512 void read_state(ObjectStore *store, bufferlist &bl);
2513 static bool _has_removal_flag(ObjectStore *store, spg_t pgid);
2514 static int peek_map_epoch(ObjectStore *store, spg_t pgid,
2515 epoch_t *pepoch, bufferlist *bl);
2516 void update_snap_map(
2517 const vector<pg_log_entry_t> &log_entries,
2518 ObjectStore::Transaction& t);
2519
2520 void filter_snapc(vector<snapid_t> &snaps);
2521
2522 void log_weirdness();
2523
2524 virtual void kick_snap_trim() = 0;
2525 virtual void snap_trimmer_scrub_complete() = 0;
2526 bool requeue_scrub(bool high_priority = false);
2527 void queue_recovery();
2528 bool queue_scrub();
2529 unsigned get_scrub_priority();
2530
2531 /// share pg info after a pg is active
2532 void share_pg_info();
2533
2534
2535 bool append_log_entries_update_missing(
2536 const mempool::osd_pglog::list<pg_log_entry_t> &entries,
2537 ObjectStore::Transaction &t,
2538 boost::optional<eversion_t> trim_to,
2539 boost::optional<eversion_t> roll_forward_to);
2540
2541 /**
2542 * Merge entries updating missing as necessary on all
2543 * actingbackfill logs and missings (also missing_loc)
2544 */
2545 void merge_new_log_entries(
2546 const mempool::osd_pglog::list<pg_log_entry_t> &entries,
2547 ObjectStore::Transaction &t,
2548 boost::optional<eversion_t> trim_to,
2549 boost::optional<eversion_t> roll_forward_to);
2550
2551 void reset_interval_flush();
2552 void start_peering_interval(
2553 const OSDMapRef lastmap,
2554 const vector<int>& newup, int up_primary,
2555 const vector<int>& newacting, int acting_primary,
2556 ObjectStore::Transaction *t);
2557 void on_new_interval();
2558 virtual void _on_new_interval() = 0;
2559 void start_flush(ObjectStore::Transaction *t,
2560 list<Context *> *on_applied,
2561 list<Context *> *on_safe);
2562 void set_last_peering_reset();
2563 bool pg_has_reset_since(epoch_t e) {
2564 assert(is_locked());
2565 return deleting || e < get_last_peering_reset();
2566 }
2567
2568 void update_history(const pg_history_t& history);
2569 void fulfill_info(pg_shard_t from, const pg_query_t &query,
2570 pair<pg_shard_t, pg_info_t> &notify_info);
2571 void fulfill_log(pg_shard_t from, const pg_query_t &query, epoch_t query_epoch);
2572
2573 void check_full_transition(OSDMapRef lastmap, OSDMapRef osdmap);
2574
2575 bool should_restart_peering(
2576 int newupprimary,
2577 int newactingprimary,
2578 const vector<int>& newup,
2579 const vector<int>& newacting,
2580 OSDMapRef lastmap,
2581 OSDMapRef osdmap);
2582
2583 // OpRequest queueing
2584 bool can_discard_op(OpRequestRef& op);
2585 bool can_discard_scan(OpRequestRef op);
2586 bool can_discard_backfill(OpRequestRef op);
2587 bool can_discard_request(OpRequestRef& op);
2588
2589 template<typename T, int MSGTYPE>
2590 bool can_discard_replica_op(OpRequestRef& op);
2591
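// Peering messages and queued events carry the epoch they were generated
// in; old_peering_msg()/old_peering_evt() compare that against
// last_peering_reset so that anything predating the most recent interval
// change can simply be dropped.  (Descriptive comment only.)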
2592 bool old_peering_msg(epoch_t reply_epoch, epoch_t query_epoch);
2593 bool old_peering_evt(CephPeeringEvtRef evt) {
2594 return old_peering_msg(evt->get_epoch_sent(), evt->get_epoch_requested());
2595 }
2596 static bool have_same_or_newer_map(epoch_t cur_epoch, epoch_t e) {
2597 return e <= cur_epoch;
2598 }
2599 bool have_same_or_newer_map(epoch_t e) {
2600 return e <= get_osdmap()->get_epoch();
2601 }
2602
2603 bool op_has_sufficient_caps(OpRequestRef& op);
2604
2605
2606 // recovery bits
2607 void take_waiters();
2608 void queue_peering_event(CephPeeringEvtRef evt);
2609 void handle_peering_event(CephPeeringEvtRef evt, RecoveryCtx *rctx);
2610 void queue_query(epoch_t msg_epoch, epoch_t query_epoch,
2611 pg_shard_t from, const pg_query_t& q);
2612 void queue_null(epoch_t msg_epoch, epoch_t query_epoch);
2613 void queue_flushed(epoch_t started_at);
2614 void handle_advance_map(
2615 OSDMapRef osdmap, OSDMapRef lastmap,
2616 vector<int>& newup, int up_primary,
2617 vector<int>& newacting, int acting_primary,
2618 RecoveryCtx *rctx);
2619 void handle_activate_map(RecoveryCtx *rctx);
2620 void handle_create(RecoveryCtx *rctx);
2621 void handle_loaded(RecoveryCtx *rctx);
2622 void handle_query_state(Formatter *f);
2623
2624 virtual void on_removal(ObjectStore::Transaction *t) = 0;
2625
2626
2627 // abstract bits
2628 virtual void do_request(
2629 OpRequestRef& op,
2630 ThreadPool::TPHandle &handle
2631 ) = 0;
2632
2633 virtual void do_op(OpRequestRef& op) = 0;
2634 virtual void do_sub_op(OpRequestRef op) = 0;
2635 virtual void do_sub_op_reply(OpRequestRef op) = 0;
2636 virtual void do_scan(
2637 OpRequestRef op,
2638 ThreadPool::TPHandle &handle
2639 ) = 0;
2640 virtual void do_backfill(OpRequestRef op) = 0;
2641 virtual void snap_trimmer(epoch_t epoch_queued) = 0;
2642
2643 virtual int do_command(
2644 cmdmap_t cmdmap,
2645 ostream& ss,
2646 bufferlist& idata,
2647 bufferlist& odata,
2648 ConnectionRef conn,
2649 ceph_tid_t tid) = 0;
2650
2651 virtual void on_role_change() = 0;
2652 virtual void on_pool_change() = 0;
2653 virtual void on_change(ObjectStore::Transaction *t) = 0;
2654 virtual void on_activate() = 0;
2655 virtual void on_flushed() = 0;
2656 virtual void on_shutdown() = 0;
2657 virtual void check_blacklisted_watchers() = 0;
2658 virtual void get_watchers(std::list<obj_watch_item_t>&) = 0;
2659
2660 virtual bool agent_work(int max) = 0;
2661 virtual bool agent_work(int max, int agent_flush_quota) = 0;
2662 virtual void agent_stop() = 0;
2663 virtual void agent_delay() = 0;
2664 virtual void agent_clear() = 0;
2665 virtual void agent_choose_mode_restart() = 0;
2666 };
2667
2668 ostream& operator<<(ostream& out, const PG& pg);
2669
2670 ostream& operator<<(ostream& out, const PG::BackfillInterval& bi);
2671
2672 #endif