]> git.proxmox.com Git - ceph.git/blob - ceph/src/osd/PG.h
update sources to 12.2.8
[ceph.git] / ceph / src / osd / PG.h
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15 #ifndef CEPH_PG_H
16 #define CEPH_PG_H
17
18 #include <boost/statechart/custom_reaction.hpp>
19 #include <boost/statechart/event.hpp>
20 #include <boost/statechart/simple_state.hpp>
21 #include <boost/statechart/state.hpp>
22 #include <boost/statechart/state_machine.hpp>
23 #include <boost/statechart/transition.hpp>
24 #include <boost/statechart/event_base.hpp>
25 #include <boost/scoped_ptr.hpp>
26 #include <boost/circular_buffer.hpp>
27 #include "include/memory.h"
28 #include "include/mempool.h"
29
30 // re-include our assert to clobber boost's
31 #include "include/assert.h"
32
33 #include "include/types.h"
34 #include "include/stringify.h"
35 #include "osd_types.h"
36 #include "include/xlist.h"
37 #include "SnapMapper.h"
38 #include "Session.h"
39 #include "common/Timer.h"
40
41 #include "PGLog.h"
42 #include "OSDMap.h"
43 #include "messages/MOSDPGLog.h"
44 #include "include/str_list.h"
45 #include "PGBackend.h"
46
47 #include <atomic>
48 #include <list>
49 #include <memory>
50 #include <stack>
51 #include <string>
52 #include <tuple>
53 using namespace std;
54
55 // #include "include/unordered_map.h"
56 // #include "include/unordered_set.h"
57
58 //#define DEBUG_RECOVERY_OIDS // track set of recovering oids explicitly, to find counting bugs
59
60 class OSD;
61 class OSDService;
62 class MOSDOp;
63 class MOSDPGScan;
64 class MOSDPGBackfill;
65 class MOSDPGInfo;
66
67 class PG;
68 struct OpRequest;
69 typedef OpRequest::Ref OpRequestRef;
70 class MOSDPGLog;
71 class CephContext;
72
73 namespace Scrub {
74 class Store;
75 }
76
77 void intrusive_ptr_add_ref(PG *pg);
78 void intrusive_ptr_release(PG *pg);
79
80 using state_history_entry = std::tuple<utime_t, utime_t, const char*>;
81 using embedded_state = std::pair<utime_t, const char*>;
82
83 struct PGStateInstance {
84 // Time spent in pg states
85
86 void setepoch(const epoch_t current_epoch) {
87 this_epoch = current_epoch;
88 }
89
90 void enter_state(const utime_t entime, const char* state) {
91 embedded_states.push(std::make_pair(entime, state));
92 }
93
94 void exit_state(const utime_t extime) {
95 embedded_state this_state = embedded_states.top();
96 state_history.push_back(state_history_entry{
97 this_state.first, extime, this_state.second});
98 embedded_states.pop();
99 }
100
101 epoch_t this_epoch;
102 utime_t enter_time;
103 std::vector<state_history_entry> state_history;
104 std::stack<embedded_state> embedded_states;
105 };
106
107 class PGStateHistory {
108 // Member access protected with the PG lock
109 public:
110 PGStateHistory() : buffer(10) {}
111
112 void enter(PG* pg, const utime_t entime, const char* state);
113
114 void exit(const char* state);
115
116 void reset() {
117 pi = nullptr;
118 }
119
120 void set_pg_in_destructor() { pg_in_destructor = true; }
121
122 void dump(Formatter* f) const;
123
124 private:
125 bool pg_in_destructor = false;
126 PG* thispg = nullptr;
127 std::unique_ptr<PGStateInstance> tmppi;
128 PGStateInstance* pi = nullptr;
129 boost::circular_buffer<std::unique_ptr<PGStateInstance>> buffer;
130
131 };
132
133 #ifdef PG_DEBUG_REFS
134 #include "common/tracked_int_ptr.hpp"
135 uint64_t get_with_id(PG *pg);
136 void put_with_id(PG *pg, uint64_t id);
137 typedef TrackedIntPtr<PG> PGRef;
138 #else
139 typedef boost::intrusive_ptr<PG> PGRef;
140 #endif
141
142 class PGRecoveryStats {
143 struct per_state_info {
144 uint64_t enter, exit; // enter/exit counts
145 uint64_t events;
146 utime_t event_time; // time spent processing events
147 utime_t total_time; // total time in state
148 utime_t min_time, max_time;
149
150 // cppcheck-suppress unreachableCode
151 per_state_info() : enter(0), exit(0), events(0) {}
152 };
153 map<const char *,per_state_info> info;
154 Mutex lock;
155
156 public:
157 PGRecoveryStats() : lock("PGRecoverStats::lock") {}
158
159 void reset() {
160 Mutex::Locker l(lock);
161 info.clear();
162 }
163 void dump(ostream& out) {
164 Mutex::Locker l(lock);
165 for (map<const char *,per_state_info>::iterator p = info.begin(); p != info.end(); ++p) {
166 per_state_info& i = p->second;
167 out << i.enter << "\t" << i.exit << "\t"
168 << i.events << "\t" << i.event_time << "\t"
169 << i.total_time << "\t"
170 << i.min_time << "\t" << i.max_time << "\t"
171 << p->first << "\n";
172 }
173 }
174
175 void dump_formatted(Formatter *f) {
176 Mutex::Locker l(lock);
177 f->open_array_section("pg_recovery_stats");
178 for (map<const char *,per_state_info>::iterator p = info.begin();
179 p != info.end(); ++p) {
180 per_state_info& i = p->second;
181 f->open_object_section("recovery_state");
182 f->dump_int("enter", i.enter);
183 f->dump_int("exit", i.exit);
184 f->dump_int("events", i.events);
185 f->dump_stream("event_time") << i.event_time;
186 f->dump_stream("total_time") << i.total_time;
187 f->dump_stream("min_time") << i.min_time;
188 f->dump_stream("max_time") << i.max_time;
189 vector<string> states;
190 get_str_vec(p->first, "/", states);
191 f->open_array_section("nested_states");
192 for (vector<string>::iterator st = states.begin();
193 st != states.end(); ++st) {
194 f->dump_string("state", *st);
195 }
196 f->close_section();
197 f->close_section();
198 }
199 f->close_section();
200 }
201
202 void log_enter(const char *s) {
203 Mutex::Locker l(lock);
204 info[s].enter++;
205 }
206 void log_exit(const char *s, utime_t dur, uint64_t events, utime_t event_dur) {
207 Mutex::Locker l(lock);
208 per_state_info &i = info[s];
209 i.exit++;
210 i.total_time += dur;
211 if (dur > i.max_time)
212 i.max_time = dur;
213 if (dur < i.min_time || i.min_time == utime_t())
214 i.min_time = dur;
215 i.events += events;
216 i.event_time += event_dur;
217 }
218 };
219
220 struct PGPool {
221 CephContext* cct;
222 epoch_t cached_epoch;
223 int64_t id;
224 string name;
225 uint64_t auid;
226
227 pg_pool_t info;
228 SnapContext snapc; // the default pool snapc, ready to go.
229
230 interval_set<snapid_t> cached_removed_snaps; // current removed_snaps set
231 interval_set<snapid_t> newly_removed_snaps; // newly removed in the last epoch
232
233 PGPool(CephContext* cct, OSDMapRef map, int64_t i)
234 : cct(cct),
235 cached_epoch(map->get_epoch()),
236 id(i),
237 name(map->get_pool_name(id)),
238 auid(map->get_pg_pool(id)->auid) {
239 const pg_pool_t *pi = map->get_pg_pool(id);
240 assert(pi);
241 info = *pi;
242 snapc = pi->get_snap_context();
243 pi->build_removed_snaps(cached_removed_snaps);
244 }
245
246 void update(OSDMapRef map);
247 };
248
249 /** PG - Replica Placement Group
250 *
251 */
252
253 class PG : public DoutPrefixProvider {
254 protected:
255 OSDService *osd;
256 CephContext *cct;
257 OSDriver osdriver;
258 SnapMapper snap_mapper;
259 bool eio_errors_to_process = false;
260
261 virtual PGBackend *get_pgbackend() = 0;
262 public:
263 std::string gen_prefix() const override;
264 CephContext *get_cct() const override { return cct; }
265 unsigned get_subsys() const override { return ceph_subsys_osd; }
266
267 /*** PG ****/
268 void update_snap_mapper_bits(uint32_t bits) {
269 snap_mapper.update_bits(bits);
270 }
271 /// get_is_recoverable_predicate: caller owns returned pointer and must delete when done
272 IsPGRecoverablePredicate *get_is_recoverable_predicate() {
273 return get_pgbackend()->get_is_recoverable_predicate();
274 }
275 protected:
276 OSDMapRef osdmap_ref;
277 OSDMapRef last_persisted_osdmap_ref;
278 PGPool pool;
279
280 void requeue_map_waiters();
281
282 void update_osdmap_ref(OSDMapRef newmap) {
283 assert(_lock.is_locked_by_me());
284 osdmap_ref = std::move(newmap);
285 }
286
287 public:
288 OSDMapRef get_osdmap() const {
289 assert(is_locked());
290 assert(osdmap_ref);
291 return osdmap_ref;
292 }
293 protected:
294
295 /** locking and reference counting.
296 * I destroy myself when the reference count hits zero.
297 * lock() should be called before doing anything.
298 * get() should be called on pointer copy (to another thread, etc.).
299 * put() should be called on destruction of some previously copied pointer.
300 * unlock() when done with the current pointer (_most common_).
301 */
302 mutable Mutex _lock;
303 std::atomic_uint ref{0};
304
305 #ifdef PG_DEBUG_REFS
306 Mutex _ref_id_lock;
307 map<uint64_t, string> _live_ids;
308 map<string, uint64_t> _tag_counts;
309 uint64_t _ref_id;
310 #endif
311
312 public:
313 bool deleting; // true while in removing or OSD is shutting down
314
315 ZTracer::Endpoint trace_endpoint;
316
317 void lock_suspend_timeout(ThreadPool::TPHandle &handle);
318 void lock(bool no_lockdep = false) const;
319 void unlock() const {
320 //generic_dout(0) << this << " " << info.pgid << " unlock" << dendl;
321 assert(!dirty_info);
322 assert(!dirty_big_info);
323 _lock.Unlock();
324 }
325
326 bool is_locked() const {
327 return _lock.is_locked();
328 }
329
330 #ifdef PG_DEBUG_REFS
331 uint64_t get_with_id();
332 void put_with_id(uint64_t);
333 void dump_live_ids();
334 #endif
335 void get(const char* tag);
336 void put(const char* tag);
337
338 bool dirty_info, dirty_big_info;
339
340 public:
341 bool is_ec_pg() const {
342 return pool.info.ec_pool();
343 }
344 // pg state
345 pg_info_t info; ///< current pg info
346 pg_info_t last_written_info; ///< last written info
347 __u8 info_struct_v;
348 static const __u8 cur_struct_v = 10;
349 // v10 is the new past_intervals encoding
350 // v9 was fastinfo_key addition
351 // v8 was the move to a per-pg pgmeta object
352 // v7 was SnapMapper addition in 86658392516d5175b2756659ef7ffaaf95b0f8ad
353 // (first appeared in cuttlefish).
354 static const __u8 compat_struct_v = 7;
355 bool must_upgrade() {
356 return info_struct_v < cur_struct_v;
357 }
358 bool can_upgrade() {
359 return info_struct_v >= compat_struct_v;
360 }
361 void upgrade(ObjectStore *store);
362
363 const coll_t coll;
364 ObjectStore::CollectionHandle ch;
365 PGLog pg_log;
366 static string get_info_key(spg_t pgid) {
367 return stringify(pgid) + "_info";
368 }
369 static string get_biginfo_key(spg_t pgid) {
370 return stringify(pgid) + "_biginfo";
371 }
372 static string get_epoch_key(spg_t pgid) {
373 return stringify(pgid) + "_epoch";
374 }
375 ghobject_t pgmeta_oid;
376
377 class MissingLoc {
378 map<hobject_t, pg_missing_item> needs_recovery_map;
379 map<hobject_t, set<pg_shard_t> > missing_loc;
380 set<pg_shard_t> missing_loc_sources;
381 PG *pg;
382 set<pg_shard_t> empty_set;
383 public:
384 boost::scoped_ptr<IsPGReadablePredicate> is_readable;
385 boost::scoped_ptr<IsPGRecoverablePredicate> is_recoverable;
386 explicit MissingLoc(PG *pg)
387 : pg(pg) {}
388 void set_backend_predicates(
389 IsPGReadablePredicate *_is_readable,
390 IsPGRecoverablePredicate *_is_recoverable) {
391 is_readable.reset(_is_readable);
392 is_recoverable.reset(_is_recoverable);
393 }
394 string gen_prefix() const { return pg->gen_prefix(); }
395 bool needs_recovery(
396 const hobject_t &hoid,
397 eversion_t *v = 0) const {
398 map<hobject_t, pg_missing_item>::const_iterator i =
399 needs_recovery_map.find(hoid);
400 if (i == needs_recovery_map.end())
401 return false;
402 if (v)
403 *v = i->second.need;
404 return true;
405 }
406 bool is_deleted(const hobject_t &hoid) const {
407 auto i = needs_recovery_map.find(hoid);
408 if (i == needs_recovery_map.end())
409 return false;
410 return i->second.is_delete();
411 }
412 bool is_unfound(const hobject_t &hoid) const {
413 return needs_recovery(hoid) && !is_deleted(hoid) && (
414 !missing_loc.count(hoid) ||
415 !(*is_recoverable)(missing_loc.find(hoid)->second));
416 }
417 bool readable_with_acting(
418 const hobject_t &hoid,
419 const set<pg_shard_t> &acting) const;
420 uint64_t num_unfound() const {
421 uint64_t ret = 0;
422 for (map<hobject_t, pg_missing_item>::const_iterator i =
423 needs_recovery_map.begin();
424 i != needs_recovery_map.end();
425 ++i) {
426 if (is_unfound(i->first))
427 ++ret;
428 }
429 return ret;
430 }
431
432 bool have_unfound() const {
433 for (map<hobject_t, pg_missing_item>::const_iterator i =
434 needs_recovery_map.begin();
435 i != needs_recovery_map.end();
436 ++i) {
437 if (is_unfound(i->first))
438 return true;
439 }
440 return false;
441 }
442 void clear() {
443 needs_recovery_map.clear();
444 missing_loc.clear();
445 missing_loc_sources.clear();
446 }
447
448 void add_location(const hobject_t &hoid, pg_shard_t location) {
449 missing_loc[hoid].insert(location);
450 }
451 void remove_location(const hobject_t &hoid, pg_shard_t location) {
452 missing_loc[hoid].erase(location);
453 }
454 void add_active_missing(const pg_missing_t &missing) {
455 for (map<hobject_t, pg_missing_item>::const_iterator i =
456 missing.get_items().begin();
457 i != missing.get_items().end();
458 ++i) {
459 map<hobject_t, pg_missing_item>::const_iterator j =
460 needs_recovery_map.find(i->first);
461 if (j == needs_recovery_map.end()) {
462 needs_recovery_map.insert(*i);
463 } else {
464 lgeneric_dout(pg->cct, 0) << this << " " << pg->info.pgid << " unexpected need for "
465 << i->first << " have " << j->second
466 << " tried to add " << i->second << dendl;
467 assert(i->second.need == j->second.need);
468 }
469 }
470 }
471
472 void add_missing(const hobject_t &hoid, eversion_t need, eversion_t have) {
473 needs_recovery_map[hoid] = pg_missing_item(need, have);
474 }
475 void revise_need(const hobject_t &hoid, eversion_t need) {
476 assert(needs_recovery(hoid));
477 needs_recovery_map[hoid].need = need;
478 }
479
480 /// Adds info about a possible recovery source
481 bool add_source_info(
482 pg_shard_t source, ///< [in] source
483 const pg_info_t &oinfo, ///< [in] info
484 const pg_missing_t &omissing, ///< [in] (optional) missing
485 ThreadPool::TPHandle* handle ///< [in] ThreadPool handle
486 ); ///< @return whether a new object location was discovered
487
488 /// Adds recovery sources in batch
489 void add_batch_sources_info(
490 const set<pg_shard_t> &sources, ///< [in] a set of resources which can be used for all objects
491 ThreadPool::TPHandle* handle ///< [in] ThreadPool handle
492 );
493
494 /// Uses osdmap to update structures for now down sources
495 void check_recovery_sources(const OSDMapRef& osdmap);
496
497 /// Call when hoid is no longer missing in acting set
498 void recovered(const hobject_t &hoid) {
499 needs_recovery_map.erase(hoid);
500 missing_loc.erase(hoid);
501 }
502
503 /// Call to update structures for hoid after a change
504 void rebuild(
505 const hobject_t &hoid,
506 pg_shard_t self,
507 const set<pg_shard_t> to_recover,
508 const pg_info_t &info,
509 const pg_missing_t &missing,
510 const map<pg_shard_t, pg_missing_t> &pmissing,
511 const map<pg_shard_t, pg_info_t> &pinfo) {
512 recovered(hoid);
513 boost::optional<pg_missing_item> item;
514 auto miter = missing.get_items().find(hoid);
515 if (miter != missing.get_items().end()) {
516 item = miter->second;
517 } else {
518 for (auto &&i: to_recover) {
519 if (i == self)
520 continue;
521 auto pmiter = pmissing.find(i);
522 assert(pmiter != pmissing.end());
523 miter = pmiter->second.get_items().find(hoid);
524 if (miter != pmiter->second.get_items().end()) {
525 item = miter->second;
526 break;
527 }
528 }
529 }
530 if (!item)
531 return; // recovered!
532
533 needs_recovery_map[hoid] = *item;
534 if (item->is_delete())
535 return;
536 auto mliter =
537 missing_loc.insert(make_pair(hoid, set<pg_shard_t>())).first;
538 assert(info.last_backfill.is_max());
539 assert(info.last_update >= item->need);
540 if (!missing.is_missing(hoid))
541 mliter->second.insert(self);
542 for (auto &&i: pmissing) {
543 auto pinfoiter = pinfo.find(i.first);
544 assert(pinfoiter != pinfo.end());
545 if (item->need <= pinfoiter->second.last_update &&
546 hoid <= pinfoiter->second.last_backfill &&
547 !i.second.is_missing(hoid))
548 mliter->second.insert(i.first);
549 }
550 }
551
552 const set<pg_shard_t> &get_locations(const hobject_t &hoid) const {
553 return missing_loc.count(hoid) ?
554 missing_loc.find(hoid)->second : empty_set;
555 }
556 const map<hobject_t, set<pg_shard_t>> &get_missing_locs() const {
557 return missing_loc;
558 }
559 const map<hobject_t, pg_missing_item> &get_needs_recovery() const {
560 return needs_recovery_map;
561 }
562 } missing_loc;
563
564 PastIntervals past_intervals;
565
566 interval_set<snapid_t> snap_trimq;
567
568 /* You should not use these items without taking their respective queue locks
569 * (if they have one) */
570 xlist<PG*>::item stat_queue_item;
571 bool scrub_queued;
572 bool recovery_queued;
573
574 int recovery_ops_active;
575 set<pg_shard_t> waiting_on_backfill;
576 #ifdef DEBUG_RECOVERY_OIDS
577 set<hobject_t> recovering_oids;
578 #endif
579
580 protected:
581 int role; // 0 = primary, 1 = replica, -1=none.
582 unsigned state; // PG_STATE_*
583
584 bool send_notify; ///< true if we are non-primary and should notify the primary
585
586 public:
587 eversion_t last_update_ondisk; // last_update that has committed; ONLY DEFINED WHEN is_active()
588 eversion_t last_complete_ondisk; // last_complete that has committed.
589 eversion_t last_update_applied;
590
591
592 struct C_UpdateLastRollbackInfoTrimmedToApplied : Context {
593 PGRef pg;
594 epoch_t e;
595 eversion_t v;
596 C_UpdateLastRollbackInfoTrimmedToApplied(PG *pg, epoch_t e, eversion_t v)
597 : pg(pg), e(e), v(v) {}
598 void finish(int) override {
599 pg->lock();
600 if (!pg->pg_has_reset_since(e)) {
601 pg->last_rollback_info_trimmed_to_applied = v;
602 }
603 pg->unlock();
604 }
605 };
606 // entries <= last_rollback_info_trimmed_to_applied have been trimmed,
607 // and the transaction has applied
608 eversion_t last_rollback_info_trimmed_to_applied;
609
610 // primary state
611 public:
612 pg_shard_t primary;
613 pg_shard_t pg_whoami;
614 pg_shard_t up_primary;
615 vector<int> up, acting, want_acting;
616 set<pg_shard_t> actingbackfill, actingset, upset;
617 map<pg_shard_t,eversion_t> peer_last_complete_ondisk;
618 eversion_t min_last_complete_ondisk; // up: min over last_complete_ondisk, peer_last_complete_ondisk
619 eversion_t pg_trim_to;
620
621 set<int> blocked_by; ///< osds we are blocked by (for pg stats)
622
623 // [primary only] content recovery state
624
625 public:
626 struct BufferedRecoveryMessages {
627 map<int, map<spg_t, pg_query_t> > query_map;
628 map<int, vector<pair<pg_notify_t, PastIntervals> > > info_map;
629 map<int, vector<pair<pg_notify_t, PastIntervals> > > notify_list;
630 };
631
632 public:
633 bool dne() { return info.dne(); }
634 struct RecoveryCtx {
635 utime_t start_time;
636 map<int, map<spg_t, pg_query_t> > *query_map;
637 map<int, vector<pair<pg_notify_t, PastIntervals> > > *info_map;
638 map<int, vector<pair<pg_notify_t, PastIntervals> > > *notify_list;
639 set<PGRef> created_pgs;
640 C_Contexts *on_applied;
641 C_Contexts *on_safe;
642 ObjectStore::Transaction *transaction;
643 ThreadPool::TPHandle* handle;
644 RecoveryCtx(map<int, map<spg_t, pg_query_t> > *query_map,
645 map<int,
646 vector<pair<pg_notify_t, PastIntervals> > > *info_map,
647 map<int,
648 vector<pair<pg_notify_t, PastIntervals> > > *notify_list,
649 C_Contexts *on_applied,
650 C_Contexts *on_safe,
651 ObjectStore::Transaction *transaction)
652 : query_map(query_map), info_map(info_map),
653 notify_list(notify_list),
654 on_applied(on_applied),
655 on_safe(on_safe),
656 transaction(transaction),
657 handle(NULL) {}
658
659 RecoveryCtx(BufferedRecoveryMessages &buf, RecoveryCtx &rctx)
660 : query_map(&(buf.query_map)),
661 info_map(&(buf.info_map)),
662 notify_list(&(buf.notify_list)),
663 on_applied(rctx.on_applied),
664 on_safe(rctx.on_safe),
665 transaction(rctx.transaction),
666 handle(rctx.handle) {}
667
668 void accept_buffered_messages(BufferedRecoveryMessages &m) {
669 assert(query_map);
670 assert(info_map);
671 assert(notify_list);
672 for (map<int, map<spg_t, pg_query_t> >::iterator i = m.query_map.begin();
673 i != m.query_map.end();
674 ++i) {
675 map<spg_t, pg_query_t> &omap = (*query_map)[i->first];
676 for (map<spg_t, pg_query_t>::iterator j = i->second.begin();
677 j != i->second.end();
678 ++j) {
679 omap[j->first] = j->second;
680 }
681 }
682 for (map<int, vector<pair<pg_notify_t, PastIntervals> > >::iterator i
683 = m.info_map.begin();
684 i != m.info_map.end();
685 ++i) {
686 vector<pair<pg_notify_t, PastIntervals> > &ovec =
687 (*info_map)[i->first];
688 ovec.reserve(ovec.size() + i->second.size());
689 ovec.insert(ovec.end(), i->second.begin(), i->second.end());
690 }
691 for (map<int, vector<pair<pg_notify_t, PastIntervals> > >::iterator i
692 = m.notify_list.begin();
693 i != m.notify_list.end();
694 ++i) {
695 vector<pair<pg_notify_t, PastIntervals> > &ovec =
696 (*notify_list)[i->first];
697 ovec.reserve(ovec.size() + i->second.size());
698 ovec.insert(ovec.end(), i->second.begin(), i->second.end());
699 }
700 }
701
702 void send_notify(pg_shard_t to,
703 const pg_notify_t &info, const PastIntervals &pi) {
704 assert(notify_list);
705 (*notify_list)[to.osd].push_back(make_pair(info, pi));
706 }
707 };
708
709
710 PGStateHistory pgstate_history;
711
712 struct NamedState {
713 const char *state_name;
714 utime_t enter_time;
715 PG* pg;
716 const char *get_state_name() { return state_name; }
717 NamedState(PG *pg_, const char *state_name_)
718 : state_name(state_name_), enter_time(ceph_clock_now()), pg(pg_) {
719 pg->pgstate_history.enter(pg, enter_time, state_name);
720 }
721 virtual ~NamedState() { pg->pgstate_history.exit(state_name); }
722 };
723
724
725
726 protected:
727
728 /*
729 * peer_info -- projected (updates _before_ replicas ack)
730 * peer_missing -- committed (updates _after_ replicas ack)
731 */
732
733 bool need_up_thru;
734 set<pg_shard_t> stray_set; // non-acting osds that have PG data.
735 eversion_t oldest_update; // acting: lowest (valid) last_update in active set
736 map<pg_shard_t, pg_info_t> peer_info; // info from peers (stray or prior)
737 set<pg_shard_t> peer_purged; // peers purged
738 map<pg_shard_t, pg_missing_t> peer_missing;
739 set<pg_shard_t> peer_log_requested; // logs i've requested (and start stamps)
740 set<pg_shard_t> peer_missing_requested;
741
742 // i deleted these strays; ignore racing PGInfo from them
743 set<pg_shard_t> peer_activated;
744
745 // primary-only, recovery-only state
746 set<pg_shard_t> might_have_unfound; // These osds might have objects on them
747 // which are unfound on the primary
748 epoch_t last_peering_reset;
749
750
751 /* heartbeat peers */
752 void set_probe_targets(const set<pg_shard_t> &probe_set);
753 void clear_probe_targets();
754 public:
755 Mutex heartbeat_peer_lock;
756 set<int> heartbeat_peers;
757 set<int> probe_targets;
758
759 /**
760 * BackfillInterval
761 *
762 * Represents the objects in a range [begin, end)
763 *
764 * Possible states:
765 * 1) begin == end == hobject_t() indicates the the interval is unpopulated
766 * 2) Else, objects contains all objects in [begin, end)
767 */
768 struct BackfillInterval {
769 // info about a backfill interval on a peer
770 eversion_t version; /// version at which the scan occurred
771 map<hobject_t,eversion_t> objects;
772 hobject_t begin;
773 hobject_t end;
774
775 /// clear content
776 void clear() {
777 *this = BackfillInterval();
778 }
779
780 /// clear objects list only
781 void clear_objects() {
782 objects.clear();
783 }
784
785 /// reinstantiate with a new start+end position and sort order
786 void reset(hobject_t start) {
787 clear();
788 begin = end = start;
789 }
790
791 /// true if there are no objects in this interval
792 bool empty() const {
793 return objects.empty();
794 }
795
796 /// true if interval extends to the end of the range
797 bool extends_to_end() const {
798 return end.is_max();
799 }
800
801 /// removes items <= soid and adjusts begin to the first object
802 void trim_to(const hobject_t &soid) {
803 trim();
804 while (!objects.empty() &&
805 objects.begin()->first <= soid) {
806 pop_front();
807 }
808 }
809
810 /// Adjusts begin to the first object
811 void trim() {
812 if (!objects.empty())
813 begin = objects.begin()->first;
814 else
815 begin = end;
816 }
817
818 /// drop first entry, and adjust @begin accordingly
819 void pop_front() {
820 assert(!objects.empty());
821 objects.erase(objects.begin());
822 trim();
823 }
824
825 /// dump
826 void dump(Formatter *f) const {
827 f->dump_stream("begin") << begin;
828 f->dump_stream("end") << end;
829 f->open_array_section("objects");
830 for (map<hobject_t, eversion_t>::const_iterator i =
831 objects.begin();
832 i != objects.end();
833 ++i) {
834 f->open_object_section("object");
835 f->dump_stream("object") << i->first;
836 f->dump_stream("version") << i->second;
837 f->close_section();
838 }
839 f->close_section();
840 }
841 };
842
843 protected:
844 BackfillInterval backfill_info;
845 map<pg_shard_t, BackfillInterval> peer_backfill_info;
846 bool backfill_reserved;
847 bool backfill_reserving;
848
849 friend class OSD;
850
851 public:
852 set<pg_shard_t> backfill_targets;
853
854 bool is_backfill_targets(pg_shard_t osd) {
855 return backfill_targets.count(osd);
856 }
857
858 protected:
859
860 /*
861 * blocked request wait hierarchy
862 *
863 * In order to preserve request ordering we need to be careful about the
864 * order in which blocked requests get requeued. Generally speaking, we
865 * push the requests back up to the op_wq in reverse order (most recent
866 * request first) so that they come back out again in the original order.
867 * However, because there are multiple wait queues, we need to requeue
868 * waitlists in order. Generally speaking, we requeue the wait lists
869 * that are checked first.
870 *
871 * Here are the various wait lists, in the order they are used during
872 * request processing, with notes:
873 *
874 * - waiting_for_map
875 * - may start or stop blocking at any time (depending on client epoch)
876 * - waiting_for_peered
877 * - !is_peered() or flushes_in_progress
878 * - only starts blocking on interval change; never restarts
879 * - waiting_for_active
880 * - !is_active()
881 * - only starts blocking on interval change; never restarts
882 * - waiting_for_flush
883 * - is_active() and flushes_in_progress
884 * - waiting for final flush during activate
885 * - waiting_for_scrub
886 * - starts and stops blocking for varying intervals during scrub
887 * - waiting_for_unreadable_object
888 * - never restarts once object is readable (* except for EIO?)
889 * - waiting_for_degraded_object
890 * - never restarts once object is writeable (* except for EIO?)
891 * - waiting_for_blocked_object
892 * - starts and stops based on proxied op activity
893 * - obc rwlocks
894 * - starts and stops based on read/write activity
895 *
896 * Notes:
897 *
898 * 1. During and interval change, we requeue *everything* in the above order.
899 *
900 * 2. When an obc rwlock is released, we check for a scrub block and requeue
901 * the op there if it applies. We ignore the unreadable/degraded/blocked
902 * queues because we assume they cannot apply at that time (this is
903 * probably mostly true).
904 *
905 * 3. The requeue_ops helper will push ops onto the waiting_for_map list if
906 * it is non-empty.
907 *
908 * These three behaviors are generally sufficient to maintain ordering, with
909 * the possible exception of cases where we make an object degraded or
910 * unreadable that was previously okay, e.g. when scrub or op processing
911 * encounter an unexpected error. FIXME.
912 */
913
914 // pg waiters
915 unsigned flushes_in_progress;
916
917 // ops with newer maps than our (or blocked behind them)
918 // track these by client, since inter-request ordering doesn't otherwise
919 // matter.
920 unordered_map<entity_name_t,list<OpRequestRef>> waiting_for_map;
921
922 // ops waiting on peered
923 list<OpRequestRef> waiting_for_peered;
924
925 // ops waiting on active (require peered as well)
926 list<OpRequestRef> waiting_for_active;
927 list<OpRequestRef> waiting_for_flush;
928 list<OpRequestRef> waiting_for_scrub;
929
930 list<OpRequestRef> waiting_for_cache_not_full;
931 list<OpRequestRef> waiting_for_clean_to_primary_repair;
932 map<hobject_t, list<OpRequestRef>> waiting_for_unreadable_object,
933 waiting_for_degraded_object,
934 waiting_for_blocked_object;
935
936 set<hobject_t> objects_blocked_on_cache_full;
937 map<hobject_t,snapid_t> objects_blocked_on_degraded_snap;
938 map<hobject_t,ObjectContextRef> objects_blocked_on_snap_promotion;
939
940 // Callbacks should assume pg (and nothing else) is locked
941 map<hobject_t, list<Context*>> callbacks_for_degraded_object;
942
943 map<eversion_t,
944 list<pair<OpRequestRef, version_t> > > waiting_for_ondisk;
945
946 void requeue_object_waiters(map<hobject_t, list<OpRequestRef>>& m);
947 void requeue_op(OpRequestRef op);
948 void requeue_ops(list<OpRequestRef> &l);
949
950 // stats that persist lazily
951 object_stat_collection_t unstable_stats;
952
953 // publish stats
954 Mutex pg_stats_publish_lock;
955 bool pg_stats_publish_valid;
956 pg_stat_t pg_stats_publish;
957
958 // for ordering writes
959 ceph::shared_ptr<ObjectStore::Sequencer> osr;
960
961 void _update_calc_stats();
962 void _update_blocked_by();
963 void publish_stats_to_osd();
964 void clear_publish_stats();
965
966 public:
967 void clear_primary_state();
968
969 bool is_actingbackfill(pg_shard_t osd) const {
970 return actingbackfill.count(osd);
971 }
972 bool is_acting(pg_shard_t osd) const {
973 return has_shard(pool.info.ec_pool(), acting, osd);
974 }
975 bool is_up(pg_shard_t osd) const {
976 return has_shard(pool.info.ec_pool(), up, osd);
977 }
978 static bool has_shard(bool ec, const vector<int>& v, pg_shard_t osd) {
979 if (ec) {
980 return v.size() > (unsigned)osd.shard && v[osd.shard] == osd.osd;
981 } else {
982 return std::find(v.begin(), v.end(), osd.osd) != v.end();
983 }
984 }
985
986 bool needs_recovery() const;
987 bool needs_backfill() const;
988
989 /// clip calculated priority to reasonable range
990 inline int clamp_recovery_priority(int priority);
991 /// get log recovery reservation priority
992 unsigned get_recovery_priority();
993 /// get backfill reservation priority
994 unsigned get_backfill_priority();
995
996 void mark_clean(); ///< mark an active pg clean
997 void _change_recovery_force_mode(int new_mode, bool clear);
998
999 /// return [start,end) bounds for required past_intervals
1000 static pair<epoch_t, epoch_t> get_required_past_interval_bounds(
1001 const pg_info_t &info,
1002 epoch_t oldest_map) {
1003 epoch_t start = MAX(
1004 info.history.last_epoch_clean ? info.history.last_epoch_clean :
1005 info.history.epoch_pool_created,
1006 oldest_map);
1007 epoch_t end = MAX(
1008 info.history.same_interval_since,
1009 info.history.epoch_pool_created);
1010 return make_pair(start, end);
1011 }
1012 void check_past_interval_bounds() const;
1013 PastIntervals::PriorSet build_prior();
1014
1015 void remove_down_peer_info(const OSDMapRef osdmap);
1016
1017 bool adjust_need_up_thru(const OSDMapRef osdmap);
1018
1019 bool all_unfound_are_queried_or_lost(const OSDMapRef osdmap) const;
1020 virtual void dump_recovery_info(Formatter *f) const = 0;
1021
1022 bool calc_min_last_complete_ondisk() {
1023 eversion_t min = last_complete_ondisk;
1024 assert(!actingbackfill.empty());
1025 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
1026 i != actingbackfill.end();
1027 ++i) {
1028 if (*i == get_primary()) continue;
1029 if (peer_last_complete_ondisk.count(*i) == 0)
1030 return false; // we don't have complete info
1031 eversion_t a = peer_last_complete_ondisk[*i];
1032 if (a < min)
1033 min = a;
1034 }
1035 if (min == min_last_complete_ondisk)
1036 return false;
1037 min_last_complete_ondisk = min;
1038 return true;
1039 }
1040
1041 virtual void calc_trim_to() = 0;
1042
1043 void proc_replica_log(pg_info_t &oinfo, const pg_log_t &olog,
1044 pg_missing_t& omissing, pg_shard_t from);
1045 void proc_master_log(ObjectStore::Transaction& t, pg_info_t &oinfo, pg_log_t &olog,
1046 pg_missing_t& omissing, pg_shard_t from);
1047 bool proc_replica_info(
1048 pg_shard_t from, const pg_info_t &info, epoch_t send_epoch);
1049
1050 struct PGLogEntryHandler : public PGLog::LogEntryHandler {
1051 PG *pg;
1052 ObjectStore::Transaction *t;
1053 PGLogEntryHandler(PG *pg, ObjectStore::Transaction *t) : pg(pg), t(t) {}
1054
1055 // LogEntryHandler
1056 void remove(const hobject_t &hoid) override {
1057 pg->get_pgbackend()->remove(hoid, t);
1058 }
1059 void try_stash(const hobject_t &hoid, version_t v) override {
1060 pg->get_pgbackend()->try_stash(hoid, v, t);
1061 }
1062 void rollback(const pg_log_entry_t &entry) override {
1063 assert(entry.can_rollback());
1064 pg->get_pgbackend()->rollback(entry, t);
1065 }
1066 void rollforward(const pg_log_entry_t &entry) override {
1067 pg->get_pgbackend()->rollforward(entry, t);
1068 }
1069 void trim(const pg_log_entry_t &entry) override {
1070 pg->get_pgbackend()->trim(entry, t);
1071 }
1072 };
1073
1074 void update_object_snap_mapping(
1075 ObjectStore::Transaction *t, const hobject_t &soid,
1076 const set<snapid_t> &snaps);
1077 void clear_object_snap_mapping(
1078 ObjectStore::Transaction *t, const hobject_t &soid);
1079 void remove_snap_mapped_object(
1080 ObjectStore::Transaction& t, const hobject_t& soid);
1081 void merge_log(
1082 ObjectStore::Transaction& t, pg_info_t &oinfo,
1083 pg_log_t &olog, pg_shard_t from);
1084 void rewind_divergent_log(ObjectStore::Transaction& t, eversion_t newhead);
1085 bool search_for_missing(
1086 const pg_info_t &oinfo, const pg_missing_t &omissing,
1087 pg_shard_t fromosd,
1088 RecoveryCtx*);
1089
1090 void check_for_lost_objects();
1091 void forget_lost_objects();
1092
1093 void discover_all_missing(std::map<int, map<spg_t,pg_query_t> > &query_map);
1094
1095 void trim_write_ahead();
1096
1097 map<pg_shard_t, pg_info_t>::const_iterator find_best_info(
1098 const map<pg_shard_t, pg_info_t> &infos,
1099 bool restrict_to_up_acting,
1100 bool *history_les_bound) const;
1101 static void calc_ec_acting(
1102 map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard,
1103 unsigned size,
1104 const vector<int> &acting,
1105 pg_shard_t acting_primary,
1106 const vector<int> &up,
1107 pg_shard_t up_primary,
1108 const map<pg_shard_t, pg_info_t> &all_info,
1109 bool restrict_to_up_acting,
1110 vector<int> *want,
1111 set<pg_shard_t> *backfill,
1112 set<pg_shard_t> *acting_backfill,
1113 pg_shard_t *want_primary,
1114 ostream &ss);
1115 static void calc_replicated_acting(
1116 map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard,
1117 unsigned size,
1118 const vector<int> &acting,
1119 pg_shard_t acting_primary,
1120 const vector<int> &up,
1121 pg_shard_t up_primary,
1122 const map<pg_shard_t, pg_info_t> &all_info,
1123 bool restrict_to_up_acting,
1124 vector<int> *want,
1125 set<pg_shard_t> *backfill,
1126 set<pg_shard_t> *acting_backfill,
1127 pg_shard_t *want_primary,
1128 ostream &ss);
1129 bool choose_acting(pg_shard_t &auth_log_shard,
1130 bool restrict_to_up_acting,
1131 bool *history_les_bound);
1132 void build_might_have_unfound();
1133 void activate(
1134 ObjectStore::Transaction& t,
1135 epoch_t activation_epoch,
1136 list<Context*>& tfin,
1137 map<int, map<spg_t,pg_query_t> >& query_map,
1138 map<int,
1139 vector<pair<pg_notify_t, PastIntervals> > > *activator_map,
1140 RecoveryCtx *ctx);
1141 void _activate_committed(epoch_t epoch, epoch_t activation_epoch);
1142 void all_activated_and_committed();
1143
1144 void proc_primary_info(ObjectStore::Transaction &t, const pg_info_t &info);
1145
1146 bool have_unfound() const {
1147 return missing_loc.have_unfound();
1148 }
1149 uint64_t get_num_unfound() const {
1150 return missing_loc.num_unfound();
1151 }
1152
1153 virtual void check_local() = 0;
1154
1155 /**
1156 * @param ops_begun returns how many recovery ops the function started
1157 * @returns true if any useful work was accomplished; false otherwise
1158 */
1159 virtual bool start_recovery_ops(
1160 uint64_t max,
1161 ThreadPool::TPHandle &handle,
1162 uint64_t *ops_begun) = 0;
1163
1164 void purge_strays();
1165
1166 void update_heartbeat_peers();
1167
1168 Context *finish_sync_event;
1169
1170 void finish_recovery(list<Context*>& tfin);
1171 void _finish_recovery(Context *c);
1172 void cancel_recovery();
1173 void clear_recovery_state();
1174 virtual void _clear_recovery_state() = 0;
1175 virtual void check_recovery_sources(const OSDMapRef& newmap) = 0;
1176 void start_recovery_op(const hobject_t& soid);
1177 void finish_recovery_op(const hobject_t& soid, bool dequeue=false);
1178
1179 void split_into(pg_t child_pgid, PG *child, unsigned split_bits);
1180 virtual void _split_into(pg_t child_pgid, PG *child, unsigned split_bits) = 0;
1181
1182 friend class C_OSD_RepModify_Commit;
1183
1184 // -- backoff --
1185 Mutex backoff_lock; // orders inside Backoff::lock
1186 map<hobject_t,set<BackoffRef>> backoffs;
1187
1188 void add_backoff(SessionRef s, const hobject_t& begin, const hobject_t& end);
1189 void release_backoffs(const hobject_t& begin, const hobject_t& end);
1190 void release_backoffs(const hobject_t& o) {
1191 release_backoffs(o, o);
1192 }
1193 void clear_backoffs();
1194
1195 void add_pg_backoff(SessionRef s) {
1196 hobject_t begin = info.pgid.pgid.get_hobj_start();
1197 hobject_t end = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num());
1198 add_backoff(s, begin, end);
1199 }
1200 void release_pg_backoffs() {
1201 hobject_t begin = info.pgid.pgid.get_hobj_start();
1202 hobject_t end = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num());
1203 release_backoffs(begin, end);
1204 }
1205
1206 void rm_backoff(BackoffRef b);
1207
1208 // -- scrub --
1209 struct Scrubber {
1210 Scrubber();
1211 ~Scrubber();
1212
1213 // metadata
1214 set<pg_shard_t> reserved_peers;
1215 bool reserved, reserve_failed;
1216 epoch_t epoch_start;
1217
1218 // common to both scrubs
1219 bool active;
1220 set<pg_shard_t> waiting_on_whom;
1221 int shallow_errors;
1222 int deep_errors;
1223 int large_omap_objects = 0;
1224 int fixed;
1225 ScrubMap primary_scrubmap;
1226 ScrubMapBuilder primary_scrubmap_pos;
1227 epoch_t replica_scrub_start = 0;
1228 ScrubMap replica_scrubmap;
1229 ScrubMapBuilder replica_scrubmap_pos;
1230 map<pg_shard_t, ScrubMap> received_maps;
1231 OpRequestRef active_rep_scrub;
1232 utime_t scrub_reg_stamp; // stamp we registered for
1233
1234 // For async sleep
1235 bool sleeping = false;
1236 bool needs_sleep = true;
1237 utime_t sleep_start;
1238
1239 // flags to indicate explicitly requested scrubs (by admin)
1240 bool must_scrub, must_deep_scrub, must_repair;
1241
1242 // Priority to use for scrub scheduling
1243 unsigned priority;
1244
1245 // this flag indicates whether we would like to do auto-repair of the PG or not
1246 bool auto_repair;
1247
1248 // Maps from objects with errors to missing/inconsistent peers
1249 map<hobject_t, set<pg_shard_t>> missing;
1250 map<hobject_t, set<pg_shard_t>> inconsistent;
1251
1252 // Map from object with errors to good peers
1253 map<hobject_t, list<pair<ScrubMap::object, pg_shard_t> >> authoritative;
1254
1255 // Cleaned map pending snap metadata scrub
1256 ScrubMap cleaned_meta_map;
1257
1258 void clean_meta_map(ScrubMap &for_meta_scrub) {
1259 if (end.is_max() ||
1260 cleaned_meta_map.objects.empty()) {
1261 cleaned_meta_map.swap(for_meta_scrub);
1262 } else {
1263 auto iter = cleaned_meta_map.objects.end();
1264 --iter; // not empty, see if clause
1265 auto begin = cleaned_meta_map.objects.begin();
1266 if (iter->first.has_snapset()) {
1267 ++iter;
1268 } else {
1269 while (iter != begin) {
1270 auto next = iter--;
1271 if (next->first.get_head() != iter->first.get_head()) {
1272 ++iter;
1273 break;
1274 }
1275 }
1276 }
1277 for_meta_scrub.objects.insert(begin, iter);
1278 cleaned_meta_map.objects.erase(begin, iter);
1279 }
1280 }
1281
1282 // digest updates which we are waiting on
1283 int num_digest_updates_pending;
1284
1285 // chunky scrub
1286 hobject_t start, end; // [start,end)
1287 hobject_t max_end; // Largest end that may have been sent to replicas
1288 eversion_t subset_last_update;
1289
1290 // chunky scrub state
1291 enum State {
1292 INACTIVE,
1293 NEW_CHUNK,
1294 WAIT_PUSHES,
1295 WAIT_LAST_UPDATE,
1296 BUILD_MAP,
1297 BUILD_MAP_DONE,
1298 WAIT_REPLICAS,
1299 COMPARE_MAPS,
1300 WAIT_DIGEST_UPDATES,
1301 FINISH,
1302 BUILD_MAP_REPLICA,
1303 } state;
1304
1305 std::unique_ptr<Scrub::Store> store;
1306 // deep scrub
1307 bool deep;
1308 int preempt_left;
1309 int preempt_divisor;
1310
1311 list<Context*> callbacks;
1312 void add_callback(Context *context) {
1313 callbacks.push_back(context);
1314 }
1315 void run_callbacks() {
1316 list<Context*> to_run;
1317 to_run.swap(callbacks);
1318 for (list<Context*>::iterator i = to_run.begin();
1319 i != to_run.end();
1320 ++i) {
1321 (*i)->complete(0);
1322 }
1323 }
1324
1325 static const char *state_string(const PG::Scrubber::State& state) {
1326 const char *ret = NULL;
1327 switch( state )
1328 {
1329 case INACTIVE: ret = "INACTIVE"; break;
1330 case NEW_CHUNK: ret = "NEW_CHUNK"; break;
1331 case WAIT_PUSHES: ret = "WAIT_PUSHES"; break;
1332 case WAIT_LAST_UPDATE: ret = "WAIT_LAST_UPDATE"; break;
1333 case BUILD_MAP: ret = "BUILD_MAP"; break;
1334 case BUILD_MAP_DONE: ret = "BUILD_MAP_DONE"; break;
1335 case WAIT_REPLICAS: ret = "WAIT_REPLICAS"; break;
1336 case COMPARE_MAPS: ret = "COMPARE_MAPS"; break;
1337 case WAIT_DIGEST_UPDATES: ret = "WAIT_DIGEST_UPDATES"; break;
1338 case FINISH: ret = "FINISH"; break;
1339 case BUILD_MAP_REPLICA: ret = "BUILD_MAP_REPLICA"; break;
1340 }
1341 return ret;
1342 }
1343
1344 bool is_chunky_scrub_active() const { return state != INACTIVE; }
1345
1346 // clear all state
1347 void reset() {
1348 active = false;
1349 waiting_on_whom.clear();
1350 if (active_rep_scrub) {
1351 active_rep_scrub = OpRequestRef();
1352 }
1353 received_maps.clear();
1354
1355 must_scrub = false;
1356 must_deep_scrub = false;
1357 must_repair = false;
1358 auto_repair = false;
1359
1360 state = PG::Scrubber::INACTIVE;
1361 start = hobject_t();
1362 end = hobject_t();
1363 max_end = hobject_t();
1364 subset_last_update = eversion_t();
1365 shallow_errors = 0;
1366 deep_errors = 0;
1367 large_omap_objects = 0;
1368 fixed = 0;
1369 deep = false;
1370 run_callbacks();
1371 inconsistent.clear();
1372 missing.clear();
1373 authoritative.clear();
1374 num_digest_updates_pending = 0;
1375 primary_scrubmap = ScrubMap();
1376 primary_scrubmap_pos.reset();
1377 replica_scrubmap = ScrubMap();
1378 replica_scrubmap_pos.reset();
1379 cleaned_meta_map = ScrubMap();
1380 sleeping = false;
1381 needs_sleep = true;
1382 sleep_start = utime_t();
1383 }
1384
1385 void create_results(const hobject_t& obj);
1386 void cleanup_store(ObjectStore::Transaction *t);
1387 } scrubber;
1388
1389 bool scrub_after_recovery;
1390
1391 int active_pushes;
1392
1393 bool scrub_can_preempt = false;
1394 bool scrub_preempted = false;
1395
1396 // we allow some number of preemptions of the scrub, which mean we do
1397 // not block. then we start to block. once we start blocking, we do
1398 // not stop until the scrub range is completed.
1399 bool write_blocked_by_scrub(const hobject_t &soid);
1400
1401 /// true if the given range intersects the scrub interval in any way
1402 bool range_intersects_scrub(const hobject_t &start, const hobject_t& end);
1403
1404 void repair_object(
1405 const hobject_t& soid, list<pair<ScrubMap::object, pg_shard_t> > *ok_peers,
1406 pg_shard_t bad_peer);
1407
1408 void scrub(epoch_t queued, ThreadPool::TPHandle &handle);
1409 void chunky_scrub(ThreadPool::TPHandle &handle);
1410 void scrub_compare_maps();
1411 /**
1412 * return true if any inconsistency/missing is repaired, false otherwise
1413 */
1414 bool scrub_process_inconsistent();
1415 bool ops_blocked_by_scrub() const;
1416 void scrub_finish();
1417 void scrub_clear_state();
1418 void _scan_snaps(ScrubMap &map);
1419 void _repair_oinfo_oid(ScrubMap &map);
1420 void _scan_rollback_obs(
1421 const vector<ghobject_t> &rollback_obs,
1422 ThreadPool::TPHandle &handle);
1423 void _request_scrub_map(pg_shard_t replica, eversion_t version,
1424 hobject_t start, hobject_t end, bool deep,
1425 bool allow_preemption);
1426 int build_scrub_map_chunk(
1427 ScrubMap &map,
1428 ScrubMapBuilder &pos,
1429 hobject_t start, hobject_t end, bool deep,
1430 ThreadPool::TPHandle &handle);
1431 /**
1432 * returns true if [begin, end) is good to scrub at this time
1433 * a false return value obliges the implementer to requeue scrub when the
1434 * condition preventing scrub clears
1435 */
1436 virtual bool _range_available_for_scrub(
1437 const hobject_t &begin, const hobject_t &end) = 0;
1438 virtual void scrub_snapshot_metadata(
1439 ScrubMap &map,
1440 const std::map<hobject_t,
1441 pair<boost::optional<uint32_t>,
1442 boost::optional<uint32_t>>> &missing_digest) { }
1443 virtual void _scrub_clear_state() { }
1444 virtual void _scrub_finish() { }
1445 virtual void split_colls(
1446 spg_t child,
1447 int split_bits,
1448 int seed,
1449 const pg_pool_t *pool,
1450 ObjectStore::Transaction *t) = 0;
1451 void clear_scrub_reserved();
1452 void scrub_reserve_replicas();
1453 void scrub_unreserve_replicas();
1454 bool scrub_all_replicas_reserved() const;
1455 bool sched_scrub();
1456 void reg_next_scrub();
1457 void unreg_next_scrub();
1458
1459 void replica_scrub(
1460 OpRequestRef op,
1461 ThreadPool::TPHandle &handle);
1462 void do_replica_scrub_map(OpRequestRef op);
1463 void sub_op_scrub_map(OpRequestRef op);
1464
1465 void handle_scrub_reserve_request(OpRequestRef op);
1466 void handle_scrub_reserve_grant(OpRequestRef op, pg_shard_t from);
1467 void handle_scrub_reserve_reject(OpRequestRef op, pg_shard_t from);
1468 void handle_scrub_reserve_release(OpRequestRef op);
1469
1470 void reject_reservation();
1471 void schedule_backfill_retry(float retry);
1472 void schedule_recovery_retry(float retry);
1473
1474 // -- recovery state --
1475
1476 template <class EVT>
1477 struct QueuePeeringEvt : Context {
1478 PGRef pg;
1479 epoch_t epoch;
1480 EVT evt;
1481 QueuePeeringEvt(PG *pg, epoch_t epoch, EVT evt) :
1482 pg(pg), epoch(epoch), evt(evt) {}
1483 void finish(int r) override {
1484 pg->lock();
1485 pg->queue_peering_event(PG::CephPeeringEvtRef(
1486 new PG::CephPeeringEvt(
1487 epoch,
1488 epoch,
1489 evt)));
1490 pg->unlock();
1491 }
1492 };
1493
1494 class CephPeeringEvt {
1495 epoch_t epoch_sent;
1496 epoch_t epoch_requested;
1497 boost::intrusive_ptr< const boost::statechart::event_base > evt;
1498 string desc;
1499 public:
1500 MEMPOOL_CLASS_HELPERS();
1501 template <class T>
1502 CephPeeringEvt(epoch_t epoch_sent,
1503 epoch_t epoch_requested,
1504 const T &evt_) :
1505 epoch_sent(epoch_sent), epoch_requested(epoch_requested),
1506 evt(evt_.intrusive_from_this()) {
1507 stringstream out;
1508 out << "epoch_sent: " << epoch_sent
1509 << " epoch_requested: " << epoch_requested << " ";
1510 evt_.print(&out);
1511 desc = out.str();
1512 }
1513 epoch_t get_epoch_sent() { return epoch_sent; }
1514 epoch_t get_epoch_requested() { return epoch_requested; }
1515 const boost::statechart::event_base &get_event() { return *evt; }
1516 string get_desc() { return desc; }
1517 };
1518 typedef ceph::shared_ptr<CephPeeringEvt> CephPeeringEvtRef;
1519 list<CephPeeringEvtRef> peering_queue; // op queue
1520 list<CephPeeringEvtRef> peering_waiters;
1521
1522 struct QueryState : boost::statechart::event< QueryState > {
1523 Formatter *f;
1524 explicit QueryState(Formatter *f) : f(f) {}
1525 void print(std::ostream *out) const {
1526 *out << "Query";
1527 }
1528 };
1529
1530 struct MInfoRec : boost::statechart::event< MInfoRec > {
1531 pg_shard_t from;
1532 pg_info_t info;
1533 epoch_t msg_epoch;
1534 MInfoRec(pg_shard_t from, const pg_info_t &info, epoch_t msg_epoch) :
1535 from(from), info(info), msg_epoch(msg_epoch) {}
1536 void print(std::ostream *out) const {
1537 *out << "MInfoRec from " << from << " info: " << info;
1538 }
1539 };
1540
1541 struct MLogRec : boost::statechart::event< MLogRec > {
1542 pg_shard_t from;
1543 boost::intrusive_ptr<MOSDPGLog> msg;
1544 MLogRec(pg_shard_t from, MOSDPGLog *msg) :
1545 from(from), msg(msg) {}
1546 void print(std::ostream *out) const {
1547 *out << "MLogRec from " << from;
1548 }
1549 };
1550
1551 struct MNotifyRec : boost::statechart::event< MNotifyRec > {
1552 pg_shard_t from;
1553 pg_notify_t notify;
1554 uint64_t features;
1555 MNotifyRec(pg_shard_t from, const pg_notify_t &notify, uint64_t f) :
1556 from(from), notify(notify), features(f) {}
1557 void print(std::ostream *out) const {
1558 *out << "MNotifyRec from " << from << " notify: " << notify
1559 << " features: 0x" << hex << features << dec;
1560 }
1561 };
1562
1563 struct MQuery : boost::statechart::event< MQuery > {
1564 pg_shard_t from;
1565 pg_query_t query;
1566 epoch_t query_epoch;
1567 MQuery(pg_shard_t from, const pg_query_t &query, epoch_t query_epoch):
1568 from(from), query(query), query_epoch(query_epoch) {}
1569 void print(std::ostream *out) const {
1570 *out << "MQuery from " << from
1571 << " query_epoch " << query_epoch
1572 << " query: " << query;
1573 }
1574 };
1575
1576 struct AdvMap : boost::statechart::event< AdvMap > {
1577 OSDMapRef osdmap;
1578 OSDMapRef lastmap;
1579 vector<int> newup, newacting;
1580 int up_primary, acting_primary;
1581 AdvMap(
1582 OSDMapRef osdmap, OSDMapRef lastmap,
1583 vector<int>& newup, int up_primary,
1584 vector<int>& newacting, int acting_primary):
1585 osdmap(osdmap), lastmap(lastmap),
1586 newup(newup),
1587 newacting(newacting),
1588 up_primary(up_primary),
1589 acting_primary(acting_primary) {}
1590 void print(std::ostream *out) const {
1591 *out << "AdvMap";
1592 }
1593 };
1594
1595 struct ActMap : boost::statechart::event< ActMap > {
1596 ActMap() : boost::statechart::event< ActMap >() {}
1597 void print(std::ostream *out) const {
1598 *out << "ActMap";
1599 }
1600 };
1601 struct Activate : boost::statechart::event< Activate > {
1602 epoch_t activation_epoch;
1603 explicit Activate(epoch_t q) : boost::statechart::event< Activate >(),
1604 activation_epoch(q) {}
1605 void print(std::ostream *out) const {
1606 *out << "Activate from " << activation_epoch;
1607 }
1608 };
1609 struct RequestBackfillPrio : boost::statechart::event< RequestBackfillPrio > {
1610 unsigned priority;
1611 explicit RequestBackfillPrio(unsigned prio) :
1612 boost::statechart::event< RequestBackfillPrio >(),
1613 priority(prio) {}
1614 void print(std::ostream *out) const {
1615 *out << "RequestBackfillPrio: priority " << priority;
1616 }
1617 };
1618 #define TrivialEvent(T) struct T : boost::statechart::event< T > { \
1619 T() : boost::statechart::event< T >() {} \
1620 void print(std::ostream *out) const { \
1621 *out << #T; \
1622 } \
1623 };
1624 struct DeferBackfill : boost::statechart::event<DeferBackfill> {
1625 float delay;
1626 explicit DeferBackfill(float delay) : delay(delay) {}
1627 void print(std::ostream *out) const {
1628 *out << "DeferBackfill: delay " << delay;
1629 }
1630 };
1631 struct DeferRecovery : boost::statechart::event<DeferRecovery> {
1632 float delay;
1633 explicit DeferRecovery(float delay) : delay(delay) {}
1634 void print(std::ostream *out) const {
1635 *out << "DeferRecovery: delay " << delay;
1636 }
1637 };
1638 struct UnfoundBackfill : boost::statechart::event<UnfoundBackfill> {
1639 explicit UnfoundBackfill() {}
1640 void print(std::ostream *out) const {
1641 *out << "UnfoundBackfill";
1642 }
1643 };
1644 struct UnfoundRecovery : boost::statechart::event<UnfoundRecovery> {
1645 explicit UnfoundRecovery() {}
1646 void print(std::ostream *out) const {
1647 *out << "UnfoundRecovery";
1648 }
1649 };
1650 protected:
1651 TrivialEvent(Initialize)
1652 TrivialEvent(Load)
1653 TrivialEvent(GotInfo)
1654 TrivialEvent(NeedUpThru)
1655 TrivialEvent(NullEvt)
1656 TrivialEvent(FlushedEvt)
1657 TrivialEvent(Backfilled)
1658 TrivialEvent(LocalBackfillReserved)
1659 TrivialEvent(RemoteBackfillReserved)
1660 TrivialEvent(RejectRemoteReservation)
1661 TrivialEvent(RemoteReservationRejected)
1662 TrivialEvent(RemoteReservationCanceled)
1663 TrivialEvent(RequestBackfill)
1664 TrivialEvent(RequestRecovery)
1665 TrivialEvent(RecoveryDone)
1666 TrivialEvent(BackfillTooFull)
1667 TrivialEvent(RecoveryTooFull)
1668
1669 TrivialEvent(MakePrimary)
1670 TrivialEvent(MakeStray)
1671 TrivialEvent(NeedActingChange)
1672 TrivialEvent(IsIncomplete)
1673 TrivialEvent(IsDown)
1674
1675 TrivialEvent(AllReplicasRecovered)
1676 TrivialEvent(DoRecovery)
1677 TrivialEvent(LocalRecoveryReserved)
1678 TrivialEvent(RemoteRecoveryReserved)
1679 TrivialEvent(AllRemotesReserved)
1680 TrivialEvent(AllBackfillsReserved)
1681 TrivialEvent(GoClean)
1682
1683 TrivialEvent(AllReplicasActivated)
1684
1685 TrivialEvent(IntervalFlush)
1686
1687 /* Encapsulates PG recovery process */
1688 class RecoveryState {
1689 void start_handle(RecoveryCtx *new_ctx);
1690 void end_handle();
1691 public:
1692 void begin_block_outgoing();
1693 void end_block_outgoing();
1694 void clear_blocked_outgoing();
1695 private:
1696
1697 /* States */
1698 struct Initial;
1699 class RecoveryMachine : public boost::statechart::state_machine< RecoveryMachine, Initial > {
1700 RecoveryState *state;
1701 public:
1702 PG *pg;
1703
1704 utime_t event_time;
1705 uint64_t event_count;
1706
1707 void clear_event_counters() {
1708 event_time = utime_t();
1709 event_count = 0;
1710 }
1711
1712 void log_enter(const char *state_name);
1713 void log_exit(const char *state_name, utime_t duration);
1714
1715 RecoveryMachine(RecoveryState *state, PG *pg) : state(state), pg(pg), event_count(0) {}
1716
1717 /* Accessor functions for state methods */
1718 ObjectStore::Transaction* get_cur_transaction() {
1719 assert(state->rctx);
1720 assert(state->rctx->transaction);
1721 return state->rctx->transaction;
1722 }
1723
1724 void send_query(pg_shard_t to, const pg_query_t &query) {
1725 assert(state->rctx);
1726 assert(state->rctx->query_map);
1727 (*state->rctx->query_map)[to.osd][spg_t(pg->info.pgid.pgid, to.shard)] =
1728 query;
1729 }
1730
1731 map<int, map<spg_t, pg_query_t> > *get_query_map() {
1732 assert(state->rctx);
1733 assert(state->rctx->query_map);
1734 return state->rctx->query_map;
1735 }
1736
1737 map<int, vector<pair<pg_notify_t, PastIntervals> > > *get_info_map() {
1738 assert(state->rctx);
1739 assert(state->rctx->info_map);
1740 return state->rctx->info_map;
1741 }
1742
1743 list< Context* > *get_on_safe_context_list() {
1744 assert(state->rctx);
1745 assert(state->rctx->on_safe);
1746 return &(state->rctx->on_safe->contexts);
1747 }
1748
1749 list< Context * > *get_on_applied_context_list() {
1750 assert(state->rctx);
1751 assert(state->rctx->on_applied);
1752 return &(state->rctx->on_applied->contexts);
1753 }
1754
1755 RecoveryCtx *get_recovery_ctx() { return &*(state->rctx); }
1756
1757 void send_notify(pg_shard_t to,
1758 const pg_notify_t &info, const PastIntervals &pi) {
1759 assert(state->rctx);
1760 state->rctx->send_notify(to, info, pi);
1761 }
1762 };
1763 friend class RecoveryMachine;
1764
1765 /* States */
1766 // Initial
1767 // Reset
1768 // Start
1769 // Started
1770 // Primary
1771 // WaitActingChange
1772 // Peering
1773 // GetInfo
1774 // GetLog
1775 // GetMissing
1776 // WaitUpThru
1777 // Incomplete
1778 // Active
1779 // Activating
1780 // Clean
1781 // Recovered
1782 // Backfilling
1783 // WaitRemoteBackfillReserved
1784 // WaitLocalBackfillReserved
1785 // NotBackfilling
1786 // NotRecovering
1787 // Recovering
1788 // WaitRemoteRecoveryReserved
1789 // WaitLocalRecoveryReserved
1790 // ReplicaActive
1791 // RepNotRecovering
1792 // RepRecovering
1793 // RepWaitBackfillReserved
1794 // RepWaitRecoveryReserved
1795 // Stray
1796
1797 struct Crashed : boost::statechart::state< Crashed, RecoveryMachine >, NamedState {
1798 explicit Crashed(my_context ctx);
1799 };
1800
1801 struct Reset;
1802
1803 struct Initial : boost::statechart::state< Initial, RecoveryMachine >, NamedState {
1804 explicit Initial(my_context ctx);
1805 void exit();
1806
1807 typedef boost::mpl::list <
1808 boost::statechart::transition< Initialize, Reset >,
1809 boost::statechart::custom_reaction< Load >,
1810 boost::statechart::custom_reaction< NullEvt >,
1811 boost::statechart::transition< boost::statechart::event_base, Crashed >
1812 > reactions;
1813
1814 boost::statechart::result react(const Load&);
1815 boost::statechart::result react(const MNotifyRec&);
1816 boost::statechart::result react(const MInfoRec&);
1817 boost::statechart::result react(const MLogRec&);
1818 boost::statechart::result react(const boost::statechart::event_base&) {
1819 return discard_event();
1820 }
1821 };
1822
1823 struct Reset : boost::statechart::state< Reset, RecoveryMachine >, NamedState {
1824 explicit Reset(my_context ctx);
1825 void exit();
1826
1827 typedef boost::mpl::list <
1828 boost::statechart::custom_reaction< QueryState >,
1829 boost::statechart::custom_reaction< AdvMap >,
1830 boost::statechart::custom_reaction< ActMap >,
1831 boost::statechart::custom_reaction< NullEvt >,
1832 boost::statechart::custom_reaction< FlushedEvt >,
1833 boost::statechart::custom_reaction< IntervalFlush >,
1834 boost::statechart::transition< boost::statechart::event_base, Crashed >
1835 > reactions;
1836 boost::statechart::result react(const QueryState& q);
1837 boost::statechart::result react(const AdvMap&);
1838 boost::statechart::result react(const ActMap&);
1839 boost::statechart::result react(const FlushedEvt&);
1840 boost::statechart::result react(const IntervalFlush&);
1841 boost::statechart::result react(const boost::statechart::event_base&) {
1842 return discard_event();
1843 }
1844 };
1845
1846 struct Start;
1847
1848 struct Started : boost::statechart::state< Started, RecoveryMachine, Start >, NamedState {
1849 explicit Started(my_context ctx);
1850 void exit();
1851
1852 typedef boost::mpl::list <
1853 boost::statechart::custom_reaction< QueryState >,
1854 boost::statechart::custom_reaction< AdvMap >,
1855 boost::statechart::custom_reaction< NullEvt >,
1856 boost::statechart::custom_reaction< FlushedEvt >,
1857 boost::statechart::custom_reaction< IntervalFlush >,
1858 boost::statechart::transition< boost::statechart::event_base, Crashed >
1859 > reactions;
1860 boost::statechart::result react(const QueryState& q);
1861 boost::statechart::result react(const AdvMap&);
1862 boost::statechart::result react(const FlushedEvt&);
1863 boost::statechart::result react(const IntervalFlush&);
1864 boost::statechart::result react(const boost::statechart::event_base&) {
1865 return discard_event();
1866 }
1867 };
1868
1869 struct Primary;
1870 struct Stray;
1871
1872 struct Start : boost::statechart::state< Start, Started >, NamedState {
1873 explicit Start(my_context ctx);
1874 void exit();
1875
1876 typedef boost::mpl::list <
1877 boost::statechart::transition< MakePrimary, Primary >,
1878 boost::statechart::transition< MakeStray, Stray >
1879 > reactions;
1880 };
1881
1882 struct Peering;
1883 struct WaitActingChange;
1884 struct Incomplete;
1885 struct Down;
1886
1887 struct Primary : boost::statechart::state< Primary, Started, Peering >, NamedState {
1888 explicit Primary(my_context ctx);
1889 void exit();
1890
1891 typedef boost::mpl::list <
1892 boost::statechart::custom_reaction< ActMap >,
1893 boost::statechart::custom_reaction< MNotifyRec >,
1894 boost::statechart::transition< NeedActingChange, WaitActingChange >
1895 > reactions;
1896 boost::statechart::result react(const ActMap&);
1897 boost::statechart::result react(const MNotifyRec&);
1898 };
1899
1900 struct WaitActingChange : boost::statechart::state< WaitActingChange, Primary>,
1901 NamedState {
1902 typedef boost::mpl::list <
1903 boost::statechart::custom_reaction< QueryState >,
1904 boost::statechart::custom_reaction< AdvMap >,
1905 boost::statechart::custom_reaction< MLogRec >,
1906 boost::statechart::custom_reaction< MInfoRec >,
1907 boost::statechart::custom_reaction< MNotifyRec >
1908 > reactions;
1909 explicit WaitActingChange(my_context ctx);
1910 boost::statechart::result react(const QueryState& q);
1911 boost::statechart::result react(const AdvMap&);
1912 boost::statechart::result react(const MLogRec&);
1913 boost::statechart::result react(const MInfoRec&);
1914 boost::statechart::result react(const MNotifyRec&);
1915 void exit();
1916 };
1917
1918 struct GetInfo;
1919 struct Active;
1920
1921 struct Peering : boost::statechart::state< Peering, Primary, GetInfo >, NamedState {
1922 PastIntervals::PriorSet prior_set;
1923 bool history_les_bound; //< need osd_find_best_info_ignore_history_les
1924
1925 explicit Peering(my_context ctx);
1926 void exit();
1927
1928 typedef boost::mpl::list <
1929 boost::statechart::custom_reaction< QueryState >,
1930 boost::statechart::transition< Activate, Active >,
1931 boost::statechart::custom_reaction< AdvMap >
1932 > reactions;
1933 boost::statechart::result react(const QueryState& q);
1934 boost::statechart::result react(const AdvMap &advmap);
1935 };
1936
1937 struct WaitLocalRecoveryReserved;
1938 struct Activating;
1939 struct Active : boost::statechart::state< Active, Primary, Activating >, NamedState {
1940 explicit Active(my_context ctx);
1941 void exit();
1942
1943 const set<pg_shard_t> remote_shards_to_reserve_recovery;
1944 const set<pg_shard_t> remote_shards_to_reserve_backfill;
1945 bool all_replicas_activated;
1946
1947 typedef boost::mpl::list <
1948 boost::statechart::custom_reaction< QueryState >,
1949 boost::statechart::custom_reaction< ActMap >,
1950 boost::statechart::custom_reaction< AdvMap >,
1951 boost::statechart::custom_reaction< MInfoRec >,
1952 boost::statechart::custom_reaction< MNotifyRec >,
1953 boost::statechart::custom_reaction< MLogRec >,
1954 boost::statechart::custom_reaction< Backfilled >,
1955 boost::statechart::custom_reaction< AllReplicasActivated >,
1956 boost::statechart::custom_reaction< DeferRecovery >,
1957 boost::statechart::custom_reaction< DeferBackfill >,
1958 boost::statechart::custom_reaction< UnfoundRecovery >,
1959 boost::statechart::custom_reaction< UnfoundBackfill >,
1960 boost::statechart::custom_reaction< DoRecovery>
1961 > reactions;
1962 boost::statechart::result react(const QueryState& q);
1963 boost::statechart::result react(const ActMap&);
1964 boost::statechart::result react(const AdvMap&);
1965 boost::statechart::result react(const MInfoRec& infoevt);
1966 boost::statechart::result react(const MNotifyRec& notevt);
1967 boost::statechart::result react(const MLogRec& logevt);
1968 boost::statechart::result react(const Backfilled&) {
1969 return discard_event();
1970 }
1971 boost::statechart::result react(const AllReplicasActivated&);
1972 boost::statechart::result react(const DeferRecovery& evt) {
1973 return discard_event();
1974 }
1975 boost::statechart::result react(const DeferBackfill& evt) {
1976 return discard_event();
1977 }
1978 boost::statechart::result react(const UnfoundRecovery& evt) {
1979 return discard_event();
1980 }
1981 boost::statechart::result react(const UnfoundBackfill& evt) {
1982 return discard_event();
1983 }
1984 boost::statechart::result react(const DoRecovery&) {
1985 return discard_event();
1986 }
1987 };
1988
1989 struct Clean : boost::statechart::state< Clean, Active >, NamedState {
1990 typedef boost::mpl::list<
1991 boost::statechart::transition< DoRecovery, WaitLocalRecoveryReserved >
1992 > reactions;
1993 explicit Clean(my_context ctx);
1994 void exit();
1995 };
1996
1997 struct Recovered : boost::statechart::state< Recovered, Active >, NamedState {
1998 typedef boost::mpl::list<
1999 boost::statechart::transition< GoClean, Clean >,
2000 boost::statechart::transition< DoRecovery, WaitLocalRecoveryReserved >,
2001 boost::statechart::custom_reaction< AllReplicasActivated >
2002 > reactions;
2003 explicit Recovered(my_context ctx);
2004 void exit();
2005 boost::statechart::result react(const AllReplicasActivated&) {
2006 post_event(GoClean());
2007 return forward_event();
2008 }
2009 };
2010
2011 struct Backfilling : boost::statechart::state< Backfilling, Active >, NamedState {
2012 typedef boost::mpl::list<
2013 boost::statechart::transition< Backfilled, Recovered >,
2014 boost::statechart::custom_reaction< DeferBackfill >,
2015 boost::statechart::custom_reaction< UnfoundBackfill >,
2016 boost::statechart::custom_reaction< RemoteReservationRejected >
2017 > reactions;
2018 explicit Backfilling(my_context ctx);
2019 boost::statechart::result react(const RemoteReservationRejected& evt);
2020 boost::statechart::result react(const DeferBackfill& evt);
2021 boost::statechart::result react(const UnfoundBackfill& evt);
2022 void exit();
2023 };
2024
2025 struct WaitRemoteBackfillReserved : boost::statechart::state< WaitRemoteBackfillReserved, Active >, NamedState {
2026 typedef boost::mpl::list<
2027 boost::statechart::custom_reaction< RemoteBackfillReserved >,
2028 boost::statechart::custom_reaction< RemoteReservationRejected >,
2029 boost::statechart::transition< AllBackfillsReserved, Backfilling >
2030 > reactions;
2031 set<pg_shard_t>::const_iterator backfill_osd_it;
2032 explicit WaitRemoteBackfillReserved(my_context ctx);
2033 void exit();
2034 boost::statechart::result react(const RemoteBackfillReserved& evt);
2035 boost::statechart::result react(const RemoteReservationRejected& evt);
2036 };
2037
2038 struct WaitLocalBackfillReserved : boost::statechart::state< WaitLocalBackfillReserved, Active >, NamedState {
2039 typedef boost::mpl::list<
2040 boost::statechart::transition< LocalBackfillReserved, WaitRemoteBackfillReserved >
2041 > reactions;
2042 explicit WaitLocalBackfillReserved(my_context ctx);
2043 void exit();
2044 };
2045
2046 struct NotBackfilling : boost::statechart::state< NotBackfilling, Active>, NamedState {
2047 typedef boost::mpl::list<
2048 boost::statechart::transition< RequestBackfill, WaitLocalBackfillReserved>,
2049 boost::statechart::custom_reaction< RemoteBackfillReserved >,
2050 boost::statechart::custom_reaction< RemoteReservationRejected >
2051 > reactions;
2052 explicit NotBackfilling(my_context ctx);
2053 void exit();
2054 boost::statechart::result react(const RemoteBackfillReserved& evt);
2055 boost::statechart::result react(const RemoteReservationRejected& evt);
2056 };
2057
2058 struct NotRecovering : boost::statechart::state< NotRecovering, Active>, NamedState {
2059 typedef boost::mpl::list<
2060 boost::statechart::transition< DoRecovery, WaitLocalRecoveryReserved >,
2061 boost::statechart::custom_reaction< DeferRecovery >,
2062 boost::statechart::custom_reaction< UnfoundRecovery >
2063 > reactions;
2064 explicit NotRecovering(my_context ctx);
2065 boost::statechart::result react(const DeferRecovery& evt) {
2066 /* no-op */
2067 return discard_event();
2068 }
2069 boost::statechart::result react(const UnfoundRecovery& evt) {
2070 /* no-op */
2071 return discard_event();
2072 }
2073 void exit();
2074 };
2075
2076 struct RepNotRecovering;
2077 struct ReplicaActive : boost::statechart::state< ReplicaActive, Started, RepNotRecovering >, NamedState {
2078 explicit ReplicaActive(my_context ctx);
2079 void exit();
2080
2081 typedef boost::mpl::list <
2082 boost::statechart::custom_reaction< QueryState >,
2083 boost::statechart::custom_reaction< ActMap >,
2084 boost::statechart::custom_reaction< MQuery >,
2085 boost::statechart::custom_reaction< MInfoRec >,
2086 boost::statechart::custom_reaction< MLogRec >,
2087 boost::statechart::custom_reaction< Activate >,
2088 boost::statechart::custom_reaction< DeferRecovery >,
2089 boost::statechart::custom_reaction< DeferBackfill >,
2090 boost::statechart::custom_reaction< UnfoundRecovery >,
2091 boost::statechart::custom_reaction< UnfoundBackfill >
2092 > reactions;
2093 boost::statechart::result react(const QueryState& q);
2094 boost::statechart::result react(const MInfoRec& infoevt);
2095 boost::statechart::result react(const MLogRec& logevt);
2096 boost::statechart::result react(const ActMap&);
2097 boost::statechart::result react(const MQuery&);
2098 boost::statechart::result react(const Activate&);
2099 boost::statechart::result react(const DeferRecovery& evt) {
2100 return discard_event();
2101 }
2102 boost::statechart::result react(const DeferBackfill& evt) {
2103 return discard_event();
2104 }
2105 boost::statechart::result react(const UnfoundRecovery& evt) {
2106 return discard_event();
2107 }
2108 boost::statechart::result react(const UnfoundBackfill& evt) {
2109 return discard_event();
2110 }
2111 };
2112
2113 struct RepRecovering : boost::statechart::state< RepRecovering, ReplicaActive >, NamedState {
2114 typedef boost::mpl::list<
2115 boost::statechart::transition< RecoveryDone, RepNotRecovering >,
2116 // for compat with old peers
2117 boost::statechart::transition< RemoteReservationRejected, RepNotRecovering >,
2118 boost::statechart::transition< RemoteReservationCanceled, RepNotRecovering >,
2119 boost::statechart::custom_reaction< BackfillTooFull >
2120 > reactions;
2121 explicit RepRecovering(my_context ctx);
2122 boost::statechart::result react(const BackfillTooFull &evt);
2123 void exit();
2124 };
2125
2126 struct RepWaitBackfillReserved : boost::statechart::state< RepWaitBackfillReserved, ReplicaActive >, NamedState {
2127 typedef boost::mpl::list<
2128 boost::statechart::custom_reaction< RemoteBackfillReserved >,
2129 boost::statechart::custom_reaction< RejectRemoteReservation >,
2130 boost::statechart::custom_reaction< RemoteReservationRejected >,
2131 boost::statechart::custom_reaction< RemoteReservationCanceled >
2132 > reactions;
2133 explicit RepWaitBackfillReserved(my_context ctx);
2134 void exit();
2135 boost::statechart::result react(const RemoteBackfillReserved &evt);
2136 boost::statechart::result react(const RejectRemoteReservation &evt);
2137 boost::statechart::result react(const RemoteReservationRejected &evt);
2138 boost::statechart::result react(const RemoteReservationCanceled &evt);
2139 };
2140
2141 struct RepWaitRecoveryReserved : boost::statechart::state< RepWaitRecoveryReserved, ReplicaActive >, NamedState {
2142 typedef boost::mpl::list<
2143 boost::statechart::custom_reaction< RemoteRecoveryReserved >,
2144 // for compat with old peers
2145 boost::statechart::custom_reaction< RemoteReservationRejected >,
2146 boost::statechart::custom_reaction< RemoteReservationCanceled >
2147 > reactions;
2148 explicit RepWaitRecoveryReserved(my_context ctx);
2149 void exit();
2150 boost::statechart::result react(const RemoteRecoveryReserved &evt);
2151 boost::statechart::result react(const RemoteReservationRejected &evt) {
2152 // for compat with old peers
2153 post_event(RemoteReservationCanceled());
2154 return discard_event();
2155 }
2156 boost::statechart::result react(const RemoteReservationCanceled &evt);
2157 };
2158
2159 struct RepNotRecovering : boost::statechart::state< RepNotRecovering, ReplicaActive>, NamedState {
2160 typedef boost::mpl::list<
2161 boost::statechart::custom_reaction< RequestBackfillPrio >,
2162 boost::statechart::transition< RequestRecovery, RepWaitRecoveryReserved >,
2163 boost::statechart::custom_reaction< RejectRemoteReservation >,
2164 boost::statechart::transition< RemoteReservationRejected, RepNotRecovering >,
2165 boost::statechart::transition< RemoteReservationCanceled, RepNotRecovering >,
2166 boost::statechart::transition< RecoveryDone, RepNotRecovering > // for compat with pre-reservation peers
2167 > reactions;
2168 explicit RepNotRecovering(my_context ctx);
2169 boost::statechart::result react(const RequestBackfillPrio &evt);
2170 boost::statechart::result react(const RejectRemoteReservation &evt);
2171 void exit();
2172 };
2173
2174 struct Recovering : boost::statechart::state< Recovering, Active >, NamedState {
2175 typedef boost::mpl::list <
2176 boost::statechart::custom_reaction< AllReplicasRecovered >,
2177 boost::statechart::custom_reaction< DeferRecovery >,
2178 boost::statechart::custom_reaction< UnfoundRecovery >,
2179 boost::statechart::custom_reaction< RequestBackfill >
2180 > reactions;
2181 explicit Recovering(my_context ctx);
2182 void exit();
2183 void release_reservations(bool cancel = false);
2184 boost::statechart::result react(const AllReplicasRecovered &evt);
2185 boost::statechart::result react(const DeferRecovery& evt);
2186 boost::statechart::result react(const UnfoundRecovery& evt);
2187 boost::statechart::result react(const RequestBackfill &evt);
2188 };
2189
2190 struct WaitRemoteRecoveryReserved : boost::statechart::state< WaitRemoteRecoveryReserved, Active >, NamedState {
2191 typedef boost::mpl::list <
2192 boost::statechart::custom_reaction< RemoteRecoveryReserved >,
2193 boost::statechart::transition< AllRemotesReserved, Recovering >
2194 > reactions;
2195 set<pg_shard_t>::const_iterator remote_recovery_reservation_it;
2196 explicit WaitRemoteRecoveryReserved(my_context ctx);
2197 boost::statechart::result react(const RemoteRecoveryReserved &evt);
2198 void exit();
2199 };
2200
2201 struct WaitLocalRecoveryReserved : boost::statechart::state< WaitLocalRecoveryReserved, Active >, NamedState {
2202 typedef boost::mpl::list <
2203 boost::statechart::transition< LocalRecoveryReserved, WaitRemoteRecoveryReserved >,
2204 boost::statechart::custom_reaction< RecoveryTooFull >
2205 > reactions;
2206 explicit WaitLocalRecoveryReserved(my_context ctx);
2207 void exit();
2208 boost::statechart::result react(const RecoveryTooFull &evt);
2209 };
2210
2211 struct Activating : boost::statechart::state< Activating, Active >, NamedState {
2212 typedef boost::mpl::list <
2213 boost::statechart::transition< AllReplicasRecovered, Recovered >,
2214 boost::statechart::transition< DoRecovery, WaitLocalRecoveryReserved >,
2215 boost::statechart::transition< RequestBackfill, WaitLocalBackfillReserved >
2216 > reactions;
2217 explicit Activating(my_context ctx);
2218 void exit();
2219 };
2220
2221 struct Stray : boost::statechart::state< Stray, Started >, NamedState {
2222 map<int, pair<pg_query_t, epoch_t> > pending_queries;
2223
2224 explicit Stray(my_context ctx);
2225 void exit();
2226
2227 typedef boost::mpl::list <
2228 boost::statechart::custom_reaction< MQuery >,
2229 boost::statechart::custom_reaction< MLogRec >,
2230 boost::statechart::custom_reaction< MInfoRec >,
2231 boost::statechart::custom_reaction< ActMap >,
2232 boost::statechart::custom_reaction< RecoveryDone >
2233 > reactions;
2234 boost::statechart::result react(const MQuery& query);
2235 boost::statechart::result react(const MLogRec& logevt);
2236 boost::statechart::result react(const MInfoRec& infoevt);
2237 boost::statechart::result react(const ActMap&);
2238 boost::statechart::result react(const RecoveryDone&) {
2239 return discard_event();
2240 }
2241 };
2242
2243 struct GetLog;
2244
2245 struct GetInfo : boost::statechart::state< GetInfo, Peering >, NamedState {
2246 set<pg_shard_t> peer_info_requested;
2247
2248 explicit GetInfo(my_context ctx);
2249 void exit();
2250 void get_infos();
2251
2252 typedef boost::mpl::list <
2253 boost::statechart::custom_reaction< QueryState >,
2254 boost::statechart::transition< GotInfo, GetLog >,
2255 boost::statechart::custom_reaction< MNotifyRec >,
2256 boost::statechart::transition< IsDown, Down >
2257 > reactions;
2258 boost::statechart::result react(const QueryState& q);
2259 boost::statechart::result react(const MNotifyRec& infoevt);
2260 };
2261
2262 struct GotLog : boost::statechart::event< GotLog > {
2263 GotLog() : boost::statechart::event< GotLog >() {}
2264 };
2265
2266 struct GetLog : boost::statechart::state< GetLog, Peering >, NamedState {
2267 pg_shard_t auth_log_shard;
2268 boost::intrusive_ptr<MOSDPGLog> msg;
2269
2270 explicit GetLog(my_context ctx);
2271 void exit();
2272
2273 typedef boost::mpl::list <
2274 boost::statechart::custom_reaction< QueryState >,
2275 boost::statechart::custom_reaction< MLogRec >,
2276 boost::statechart::custom_reaction< GotLog >,
2277 boost::statechart::custom_reaction< AdvMap >,
2278 boost::statechart::transition< IsIncomplete, Incomplete >
2279 > reactions;
2280 boost::statechart::result react(const AdvMap&);
2281 boost::statechart::result react(const QueryState& q);
2282 boost::statechart::result react(const MLogRec& logevt);
2283 boost::statechart::result react(const GotLog&);
2284 };
2285
2286 struct WaitUpThru;
2287
2288 struct GetMissing : boost::statechart::state< GetMissing, Peering >, NamedState {
2289 set<pg_shard_t> peer_missing_requested;
2290
2291 explicit GetMissing(my_context ctx);
2292 void exit();
2293
2294 typedef boost::mpl::list <
2295 boost::statechart::custom_reaction< QueryState >,
2296 boost::statechart::custom_reaction< MLogRec >,
2297 boost::statechart::transition< NeedUpThru, WaitUpThru >
2298 > reactions;
2299 boost::statechart::result react(const QueryState& q);
2300 boost::statechart::result react(const MLogRec& logevt);
2301 };
2302
2303 struct WaitUpThru : boost::statechart::state< WaitUpThru, Peering >, NamedState {
2304 explicit WaitUpThru(my_context ctx);
2305 void exit();
2306
2307 typedef boost::mpl::list <
2308 boost::statechart::custom_reaction< QueryState >,
2309 boost::statechart::custom_reaction< ActMap >,
2310 boost::statechart::custom_reaction< MLogRec >
2311 > reactions;
2312 boost::statechart::result react(const QueryState& q);
2313 boost::statechart::result react(const ActMap& am);
2314 boost::statechart::result react(const MLogRec& logrec);
2315 };
2316
2317 struct Down : boost::statechart::state< Down, Peering>, NamedState {
2318 explicit Down(my_context ctx);
2319 typedef boost::mpl::list <
2320 boost::statechart::custom_reaction< QueryState >
2321 > reactions;
2322 boost::statechart::result react(const QueryState& infoevt);
2323 void exit();
2324 };
2325
2326 struct Incomplete : boost::statechart::state< Incomplete, Peering>, NamedState {
2327 typedef boost::mpl::list <
2328 boost::statechart::custom_reaction< AdvMap >,
2329 boost::statechart::custom_reaction< MNotifyRec >,
2330 boost::statechart::custom_reaction< QueryState >
2331 > reactions;
2332 explicit Incomplete(my_context ctx);
2333 boost::statechart::result react(const AdvMap &advmap);
2334 boost::statechart::result react(const MNotifyRec& infoevt);
2335 boost::statechart::result react(const QueryState& infoevt);
2336 void exit();
2337 };
2338
2339
2340 RecoveryMachine machine;
2341 PG *pg;
2342
2343 /// context passed in by state machine caller
2344 RecoveryCtx *orig_ctx;
2345
2346 /// populated if we are buffering messages pending a flush
2347 boost::optional<BufferedRecoveryMessages> messages_pending_flush;
2348
2349 /**
2350 * populated between start_handle() and end_handle(), points into
2351 * the message lists for messages_pending_flush while blocking messages
2352 * or into orig_ctx otherwise
2353 */
2354 boost::optional<RecoveryCtx> rctx;
2355
2356 public:
2357 explicit RecoveryState(PG *pg)
2358 : machine(this, pg), pg(pg), orig_ctx(0) {
2359 machine.initiate();
2360 }
2361
2362 void handle_event(const boost::statechart::event_base &evt,
2363 RecoveryCtx *rctx) {
2364 start_handle(rctx);
2365 machine.process_event(evt);
2366 end_handle();
2367 }
2368
2369 void handle_event(CephPeeringEvtRef evt,
2370 RecoveryCtx *rctx) {
2371 start_handle(rctx);
2372 machine.process_event(evt->get_event());
2373 end_handle();
2374 }
2375
2376 } recovery_state;
2377
2378
2379 public:
2380 PG(OSDService *o, OSDMapRef curmap,
2381 const PGPool &pool, spg_t p);
2382 ~PG() override;
2383
2384 private:
2385 // Prevent copying
2386 explicit PG(const PG& rhs);
2387 PG& operator=(const PG& rhs);
2388 const spg_t pg_id;
2389 uint64_t peer_features;
2390 uint64_t acting_features;
2391 uint64_t upacting_features;
2392
2393 epoch_t last_epoch;
2394
2395 public:
2396 const spg_t& get_pgid() const { return pg_id; }
2397
2398 void reset_min_peer_features() {
2399 peer_features = CEPH_FEATURES_SUPPORTED_DEFAULT;
2400 }
2401 uint64_t get_min_peer_features() const { return peer_features; }
2402 void apply_peer_features(uint64_t f) { peer_features &= f; }
2403
2404 uint64_t get_min_acting_features() const { return acting_features; }
2405 uint64_t get_min_upacting_features() const { return upacting_features; }
2406 bool perform_deletes_during_peering() const {
2407 return !(get_osdmap()->test_flag(CEPH_OSDMAP_RECOVERY_DELETES));
2408 }
2409
2410 void init_primary_up_acting(
2411 const vector<int> &newup,
2412 const vector<int> &newacting,
2413 int new_up_primary,
2414 int new_acting_primary) {
2415 actingset.clear();
2416 acting = newacting;
2417 for (uint8_t i = 0; i < acting.size(); ++i) {
2418 if (acting[i] != CRUSH_ITEM_NONE)
2419 actingset.insert(
2420 pg_shard_t(
2421 acting[i],
2422 pool.info.ec_pool() ? shard_id_t(i) : shard_id_t::NO_SHARD));
2423 }
2424 upset.clear();
2425 up = newup;
2426 for (uint8_t i = 0; i < up.size(); ++i) {
2427 if (up[i] != CRUSH_ITEM_NONE)
2428 upset.insert(
2429 pg_shard_t(
2430 up[i],
2431 pool.info.ec_pool() ? shard_id_t(i) : shard_id_t::NO_SHARD));
2432 }
2433 if (!pool.info.ec_pool()) {
2434 up_primary = pg_shard_t(new_up_primary, shard_id_t::NO_SHARD);
2435 primary = pg_shard_t(new_acting_primary, shard_id_t::NO_SHARD);
2436 return;
2437 }
2438 up_primary = pg_shard_t();
2439 primary = pg_shard_t();
2440 for (uint8_t i = 0; i < up.size(); ++i) {
2441 if (up[i] == new_up_primary) {
2442 up_primary = pg_shard_t(up[i], shard_id_t(i));
2443 break;
2444 }
2445 }
2446 for (uint8_t i = 0; i < acting.size(); ++i) {
2447 if (acting[i] == new_acting_primary) {
2448 primary = pg_shard_t(acting[i], shard_id_t(i));
2449 break;
2450 }
2451 }
2452 assert(up_primary.osd == new_up_primary);
2453 assert(primary.osd == new_acting_primary);
2454 }
2455 pg_shard_t get_primary() const { return primary; }
2456
2457 int get_role() const { return role; }
2458 void set_role(int r) { role = r; }
2459
2460 bool is_primary() const { return pg_whoami == primary; }
2461 bool is_replica() const { return role > 0; }
2462
2463 epoch_t get_last_peering_reset() const { return last_peering_reset; }
2464
2465 //int get_state() const { return state; }
2466 bool state_test(int m) const { return (state & m) != 0; }
2467 void state_set(int m) { state |= m; }
2468 void state_clear(int m) { state &= ~m; }
2469
2470 bool is_complete() const { return info.last_complete == info.last_update; }
2471 bool should_send_notify() const { return send_notify; }
2472
2473 int get_state() const { return state; }
2474 bool is_active() const { return state_test(PG_STATE_ACTIVE); }
2475 bool is_activating() const { return state_test(PG_STATE_ACTIVATING); }
2476 bool is_peering() const { return state_test(PG_STATE_PEERING); }
2477 bool is_down() const { return state_test(PG_STATE_DOWN); }
2478 bool is_recovery_unfound() const { return state_test(PG_STATE_RECOVERY_UNFOUND); }
2479 bool is_backfill_unfound() const { return state_test(PG_STATE_BACKFILL_UNFOUND); }
2480 bool is_incomplete() const { return state_test(PG_STATE_INCOMPLETE); }
2481 bool is_clean() const { return state_test(PG_STATE_CLEAN); }
2482 bool is_degraded() const { return state_test(PG_STATE_DEGRADED); }
2483 bool is_undersized() const { return state_test(PG_STATE_UNDERSIZED); }
2484
2485 bool is_scrubbing() const { return state_test(PG_STATE_SCRUBBING); }
2486 bool is_remapped() const { return state_test(PG_STATE_REMAPPED); }
2487 bool is_peered() const {
2488 return state_test(PG_STATE_ACTIVE) || state_test(PG_STATE_PEERED);
2489 }
2490
2491 bool is_empty() const { return info.last_update == eversion_t(0,0); }
2492
2493 void init(
2494 int role,
2495 const vector<int>& up,
2496 int up_primary,
2497 const vector<int>& acting,
2498 int acting_primary,
2499 const pg_history_t& history,
2500 const PastIntervals& pim,
2501 bool backfill,
2502 ObjectStore::Transaction *t);
2503
2504 // pg on-disk state
2505 void do_pending_flush();
2506
2507 static void _create(ObjectStore::Transaction& t, spg_t pgid, int bits);
2508 static void _init(ObjectStore::Transaction& t,
2509 spg_t pgid, const pg_pool_t *pool);
2510
2511 private:
2512 void prepare_write_info(map<string,bufferlist> *km);
2513
2514 void update_store_with_options();
2515 void update_store_on_load();
2516
2517 public:
2518 static int _prepare_write_info(
2519 CephContext* cct,
2520 map<string,bufferlist> *km,
2521 epoch_t epoch,
2522 pg_info_t &info,
2523 pg_info_t &last_written_info,
2524 PastIntervals &past_intervals,
2525 bool dirty_big_info,
2526 bool dirty_epoch,
2527 bool try_fast_info,
2528 PerfCounters *logger = nullptr);
2529 void write_if_dirty(ObjectStore::Transaction& t);
2530
2531 PGLog::IndexedLog projected_log;
2532 bool check_in_progress_op(
2533 const osd_reqid_t &r,
2534 eversion_t *version,
2535 version_t *user_version,
2536 int *return_code) const;
2537 eversion_t projected_last_update;
2538 eversion_t get_next_version() const {
2539 eversion_t at_version(
2540 get_osdmap()->get_epoch(),
2541 projected_last_update.version+1);
2542 assert(at_version > info.last_update);
2543 assert(at_version > pg_log.get_head());
2544 assert(at_version > projected_last_update);
2545 return at_version;
2546 }
2547
2548 void add_log_entry(const pg_log_entry_t& e, bool applied);
2549 void append_log(
2550 const vector<pg_log_entry_t>& logv,
2551 eversion_t trim_to,
2552 eversion_t roll_forward_to,
2553 ObjectStore::Transaction &t,
2554 bool transaction_applied = true);
2555 bool check_log_for_corruption(ObjectStore *store);
2556 void trim_log();
2557
2558 std::string get_corrupt_pg_log_name() const;
2559 static int read_info(
2560 ObjectStore *store, spg_t pgid, const coll_t &coll,
2561 bufferlist &bl, pg_info_t &info, PastIntervals &past_intervals,
2562 __u8 &);
2563 void read_state(ObjectStore *store, bufferlist &bl);
2564 static bool _has_removal_flag(ObjectStore *store, spg_t pgid);
2565 static int peek_map_epoch(ObjectStore *store, spg_t pgid,
2566 epoch_t *pepoch, bufferlist *bl);
2567 void update_snap_map(
2568 const vector<pg_log_entry_t> &log_entries,
2569 ObjectStore::Transaction& t);
2570
2571 void filter_snapc(vector<snapid_t> &snaps);
2572
2573 void log_weirdness();
2574
2575 virtual void kick_snap_trim() = 0;
2576 virtual void snap_trimmer_scrub_complete() = 0;
2577 bool requeue_scrub(bool high_priority = false);
2578 void queue_recovery();
2579 bool queue_scrub();
2580 unsigned get_scrub_priority();
2581
2582 /// share pg info after a pg is active
2583 void share_pg_info();
2584
2585
2586 bool append_log_entries_update_missing(
2587 const mempool::osd_pglog::list<pg_log_entry_t> &entries,
2588 ObjectStore::Transaction &t,
2589 boost::optional<eversion_t> trim_to,
2590 boost::optional<eversion_t> roll_forward_to);
2591
2592 /**
2593 * Merge entries updating missing as necessary on all
2594 * actingbackfill logs and missings (also missing_loc)
2595 */
2596 void merge_new_log_entries(
2597 const mempool::osd_pglog::list<pg_log_entry_t> &entries,
2598 ObjectStore::Transaction &t,
2599 boost::optional<eversion_t> trim_to,
2600 boost::optional<eversion_t> roll_forward_to);
2601
2602 void reset_interval_flush();
2603 void start_peering_interval(
2604 const OSDMapRef lastmap,
2605 const vector<int>& newup, int up_primary,
2606 const vector<int>& newacting, int acting_primary,
2607 ObjectStore::Transaction *t);
2608 void on_new_interval();
2609 virtual void _on_new_interval() = 0;
2610 void start_flush(ObjectStore::Transaction *t,
2611 list<Context *> *on_applied,
2612 list<Context *> *on_safe);
2613 void set_last_peering_reset();
2614 bool pg_has_reset_since(epoch_t e) {
2615 assert(is_locked());
2616 return deleting || e < get_last_peering_reset();
2617 }
2618
2619 void update_history(const pg_history_t& history);
2620 void fulfill_info(pg_shard_t from, const pg_query_t &query,
2621 pair<pg_shard_t, pg_info_t> &notify_info);
2622 void fulfill_log(pg_shard_t from, const pg_query_t &query, epoch_t query_epoch);
2623 void fulfill_query(const MQuery& q, RecoveryCtx *rctx);
2624 void check_full_transition(OSDMapRef lastmap, OSDMapRef osdmap);
2625
2626 bool should_restart_peering(
2627 int newupprimary,
2628 int newactingprimary,
2629 const vector<int>& newup,
2630 const vector<int>& newacting,
2631 OSDMapRef lastmap,
2632 OSDMapRef osdmap);
2633
2634 // OpRequest queueing
2635 bool can_discard_op(OpRequestRef& op);
2636 bool can_discard_scan(OpRequestRef op);
2637 bool can_discard_backfill(OpRequestRef op);
2638 bool can_discard_request(OpRequestRef& op);
2639
2640 template<typename T, int MSGTYPE>
2641 bool can_discard_replica_op(OpRequestRef& op);
2642
2643 bool old_peering_msg(epoch_t reply_epoch, epoch_t query_epoch);
2644 bool old_peering_evt(CephPeeringEvtRef evt) {
2645 return old_peering_msg(evt->get_epoch_sent(), evt->get_epoch_requested());
2646 }
2647 static bool have_same_or_newer_map(epoch_t cur_epoch, epoch_t e) {
2648 return e <= cur_epoch;
2649 }
2650 bool have_same_or_newer_map(epoch_t e) {
2651 return e <= get_osdmap()->get_epoch();
2652 }
2653
2654 bool op_has_sufficient_caps(OpRequestRef& op);
2655
2656
2657 // recovery bits
2658 void take_waiters();
2659 void queue_peering_event(CephPeeringEvtRef evt);
2660 void handle_peering_event(CephPeeringEvtRef evt, RecoveryCtx *rctx);
2661 void queue_query(epoch_t msg_epoch, epoch_t query_epoch,
2662 pg_shard_t from, const pg_query_t& q);
2663 void queue_null(epoch_t msg_epoch, epoch_t query_epoch);
2664 void queue_flushed(epoch_t started_at);
2665 void handle_advance_map(
2666 OSDMapRef osdmap, OSDMapRef lastmap,
2667 vector<int>& newup, int up_primary,
2668 vector<int>& newacting, int acting_primary,
2669 RecoveryCtx *rctx);
2670 void handle_activate_map(RecoveryCtx *rctx);
2671 void handle_create(RecoveryCtx *rctx);
2672 void handle_loaded(RecoveryCtx *rctx);
2673 void handle_query_state(Formatter *f);
2674
2675 virtual void on_removal(ObjectStore::Transaction *t) = 0;
2676
2677
2678 // abstract bits
2679 virtual void do_request(
2680 OpRequestRef& op,
2681 ThreadPool::TPHandle &handle
2682 ) = 0;
2683
2684 virtual void do_op(OpRequestRef& op) = 0;
2685 virtual void do_sub_op(OpRequestRef op) = 0;
2686 virtual void do_sub_op_reply(OpRequestRef op) = 0;
2687 virtual void do_scan(
2688 OpRequestRef op,
2689 ThreadPool::TPHandle &handle
2690 ) = 0;
2691 virtual void do_backfill(OpRequestRef op) = 0;
2692 virtual void snap_trimmer(epoch_t epoch_queued) = 0;
2693
2694 virtual int do_command(
2695 cmdmap_t cmdmap,
2696 ostream& ss,
2697 bufferlist& idata,
2698 bufferlist& odata,
2699 ConnectionRef conn,
2700 ceph_tid_t tid) = 0;
2701
2702 virtual void on_role_change() = 0;
2703 virtual void on_pool_change() = 0;
2704 virtual void on_change(ObjectStore::Transaction *t) = 0;
2705 virtual void on_activate() = 0;
2706 virtual void on_flushed() = 0;
2707 virtual void on_shutdown() = 0;
2708 virtual void check_blacklisted_watchers() = 0;
2709 virtual void get_watchers(std::list<obj_watch_item_t>&) = 0;
2710
2711 virtual bool agent_work(int max) = 0;
2712 virtual bool agent_work(int max, int agent_flush_quota) = 0;
2713 virtual void agent_stop() = 0;
2714 virtual void agent_delay() = 0;
2715 virtual void agent_clear() = 0;
2716 virtual void agent_choose_mode_restart() = 0;
2717 };
2718
2719 ostream& operator<<(ostream& out, const PG& pg);
2720
2721 ostream& operator<<(ostream& out, const PG::BackfillInterval& bi);
2722
2723 #endif