// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation. See file COPYING.
 *
 */

#ifndef CEPH_PG_H
#define CEPH_PG_H

#include <boost/statechart/custom_reaction.hpp>
#include <boost/statechart/event.hpp>
#include <boost/statechart/simple_state.hpp>
#include <boost/statechart/state.hpp>
#include <boost/statechart/state_machine.hpp>
#include <boost/statechart/transition.hpp>
#include <boost/statechart/event_base.hpp>
#include <boost/scoped_ptr.hpp>
#include <boost/circular_buffer.hpp>
#include <boost/container/flat_set.hpp>
#include "include/memory.h"
#include "include/mempool.h"

// re-include our assert to clobber boost's
#include "include/assert.h"

#include "include/types.h"
#include "include/stringify.h"
#include "osd_types.h"
#include "include/xlist.h"
#include "SnapMapper.h"
#include "Session.h"
#include "common/Timer.h"

#include "PGLog.h"
#include "OSDMap.h"
#include "messages/MOSDPGLog.h"
#include "include/str_list.h"
#include "PGBackend.h"

#include <atomic>
#include <list>
#include <memory>
#include <stack>
#include <string>
#include <tuple>
using namespace std;

// #include "include/unordered_map.h"
// #include "include/unordered_set.h"

//#define DEBUG_RECOVERY_OIDS   // track set of recovering oids explicitly, to find counting bugs

class OSD;
class OSDService;
class MOSDOp;
class MOSDPGScan;
class MOSDPGBackfill;
class MOSDPGInfo;

class PG;
struct OpRequest;
typedef OpRequest::Ref OpRequestRef;
class MOSDPGLog;
class CephContext;

namespace Scrub {
  class Store;
}

void intrusive_ptr_add_ref(PG *pg);
void intrusive_ptr_release(PG *pg);

using state_history_entry = std::tuple<utime_t, utime_t, const char*>;
using embedded_state = std::pair<utime_t, const char*>;

struct PGStateInstance {
  // Time spent in pg states

  void setepoch(const epoch_t current_epoch) {
    this_epoch = current_epoch;
  }

  void enter_state(const utime_t entime, const char* state) {
    embedded_states.push(std::make_pair(entime, state));
  }

  void exit_state(const utime_t extime) {
    embedded_state this_state = embedded_states.top();
    state_history.push_back(state_history_entry{
      this_state.first, extime, this_state.second});
    embedded_states.pop();
  }

  epoch_t this_epoch;
  utime_t enter_time;
  std::vector<state_history_entry> state_history;
  std::stack<embedded_state> embedded_states;
};
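
// Illustrative sketch (added for clarity, not part of the original header):
// how nested state entries unwind through the embedded_states stack.
// Timestamps, epoch, and state names below are hypothetical.
//
//   PGStateInstance pi;
//   pi.setepoch(100);
//   pi.enter_state(t0, "Started");          // stack: [Started]
//   pi.enter_state(t1, "Started/Primary");  // stack: [Started, Started/Primary]
//   pi.exit_state(t2);  // records (t1, t2, "Started/Primary") and pops it
//   pi.exit_state(t3);  // records (t0, t3, "Started"); stack now empty
//
// state_history thus ends up ordered by exit time, innermost state first.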

class PGStateHistory {
  // Member access protected with the PG lock
public:
  PGStateHistory() : buffer(10) {}

  void enter(PG* pg, const utime_t entime, const char* state);

  void exit(const char* state);

  void reset() {
    pi = nullptr;
  }

  void set_pg_in_destructor() { pg_in_destructor = true; }

  void dump(Formatter* f) const;

private:
  bool pg_in_destructor = false;
  PG* thispg = nullptr;
  std::unique_ptr<PGStateInstance> tmppi;
  PGStateInstance* pi = nullptr;
  boost::circular_buffer<std::unique_ptr<PGStateInstance>> buffer;

};

#ifdef PG_DEBUG_REFS
#include "common/tracked_int_ptr.hpp"
  uint64_t get_with_id(PG *pg);
  void put_with_id(PG *pg, uint64_t id);
  typedef TrackedIntPtr<PG> PGRef;
#else
  typedef boost::intrusive_ptr<PG> PGRef;
#endif

class PGRecoveryStats {
  struct per_state_info {
    uint64_t enter, exit;  // enter/exit counts
    uint64_t events;
    utime_t event_time;    // time spent processing events
    utime_t total_time;    // total time in state
    utime_t min_time, max_time;

    // cppcheck-suppress unreachableCode
    per_state_info() : enter(0), exit(0), events(0) {}
  };
  map<const char *,per_state_info> info;
  Mutex lock;

public:
  PGRecoveryStats() : lock("PGRecoveryStats::lock") {}

  void reset() {
    Mutex::Locker l(lock);
    info.clear();
  }
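
  // Note (added for clarity; not in the original header): dump() below emits
  // one tab-separated row per state, with columns
  //   enter  exit  events  event_time  total_time  min_time  max_time  state
  // matching the fields of per_state_info above.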
  void dump(ostream& out) {
    Mutex::Locker l(lock);
    for (map<const char *,per_state_info>::iterator p = info.begin(); p != info.end(); ++p) {
      per_state_info& i = p->second;
      out << i.enter << "\t" << i.exit << "\t"
          << i.events << "\t" << i.event_time << "\t"
          << i.total_time << "\t"
          << i.min_time << "\t" << i.max_time << "\t"
          << p->first << "\n";
    }
  }

  void dump_formatted(Formatter *f) {
    Mutex::Locker l(lock);
    f->open_array_section("pg_recovery_stats");
    for (map<const char *,per_state_info>::iterator p = info.begin();
         p != info.end(); ++p) {
      per_state_info& i = p->second;
      f->open_object_section("recovery_state");
      f->dump_int("enter", i.enter);
      f->dump_int("exit", i.exit);
      f->dump_int("events", i.events);
      f->dump_stream("event_time") << i.event_time;
      f->dump_stream("total_time") << i.total_time;
      f->dump_stream("min_time") << i.min_time;
      f->dump_stream("max_time") << i.max_time;
      vector<string> states;
      get_str_vec(p->first, "/", states);
      f->open_array_section("nested_states");
      for (vector<string>::iterator st = states.begin();
           st != states.end(); ++st) {
        f->dump_string("state", *st);
      }
      f->close_section();
      f->close_section();
    }
    f->close_section();
  }

  void log_enter(const char *s) {
    Mutex::Locker l(lock);
    info[s].enter++;
  }
  void log_exit(const char *s, utime_t dur, uint64_t events, utime_t event_dur) {
    Mutex::Locker l(lock);
    per_state_info &i = info[s];
    i.exit++;
    i.total_time += dur;
    if (dur > i.max_time)
      i.max_time = dur;
    if (dur < i.min_time || i.min_time == utime_t())
      i.min_time = dur;
    i.events += events;
    i.event_time += event_dur;
  }
};

struct PGPool {
  CephContext* cct;
  epoch_t cached_epoch;
  int64_t id;
  string name;
  uint64_t auid;

  pg_pool_t info;
  SnapContext snapc;  // the default pool snapc, ready to go.

  interval_set<snapid_t> cached_removed_snaps;  // current removed_snaps set
  interval_set<snapid_t> newly_removed_snaps;   // newly removed in the last epoch

  PGPool(CephContext* cct, OSDMapRef map, int64_t i)
    : cct(cct),
      cached_epoch(map->get_epoch()),
      id(i),
      name(map->get_pool_name(id)),
      auid(map->get_pg_pool(id)->auid) {
    const pg_pool_t *pi = map->get_pg_pool(id);
    assert(pi);
    info = *pi;
    snapc = pi->get_snap_context();
    pi->build_removed_snaps(cached_removed_snaps);
  }

  void update(OSDMapRef map);
};

/** PG - Replica Placement Group
 *
 */

class PG : public DoutPrefixProvider {
protected:
  OSDService *osd;
  CephContext *cct;
  OSDriver osdriver;
  SnapMapper snap_mapper;
  bool eio_errors_to_process = false;

  virtual PGBackend *get_pgbackend() = 0;
public:
  std::string gen_prefix() const override;
  CephContext *get_cct() const override { return cct; }
  unsigned get_subsys() const override { return ceph_subsys_osd; }

  /*** PG ****/
  void update_snap_mapper_bits(uint32_t bits) {
    snap_mapper.update_bits(bits);
  }
  /// get_is_recoverable_predicate: caller owns returned pointer and must delete when done
  IsPGRecoverablePredicate *get_is_recoverable_predicate() {
    return get_pgbackend()->get_is_recoverable_predicate();
  }
protected:
  OSDMapRef osdmap_ref;
  OSDMapRef last_persisted_osdmap_ref;
  PGPool pool;

  void requeue_map_waiters();

  void update_osdmap_ref(OSDMapRef newmap) {
    assert(_lock.is_locked_by_me());
    osdmap_ref = std::move(newmap);
  }

public:
  OSDMapRef get_osdmap() const {
    assert(is_locked());
    assert(osdmap_ref);
    return osdmap_ref;
  }
protected:

  /** locking and reference counting.
   * I destroy myself when the reference count hits zero.
   * lock() should be called before doing anything.
   * get() should be called on pointer copy (to another thread, etc.).
   * put() should be called on destruction of some previously copied pointer.
   * unlock() when done with the current pointer (_most common_).
   */
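  // Illustrative sketch (added for clarity; not in the original header) of
  // the lock/reference discipline described above.  `queue` and
  // `do_something_with` are hypothetical; `pg` is a PG* we already hold a
  // reference on:
  //
  //   pg->get("my-tag");    // taking a new copy of the pointer
  //   queue->push(pg);      // e.g. hand it to another thread
  //   ...
  //   pg->lock();           // before touching any PG state
  //   do_something_with(pg);
  //   pg->unlock();         // done with this use of the pointer
  //   pg->put("my-tag");    // drop the copy; PG deletes itself at ref == 0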
  mutable Mutex _lock;
  std::atomic_uint ref{0};

#ifdef PG_DEBUG_REFS
  Mutex _ref_id_lock;
  map<uint64_t, string> _live_ids;
  map<string, uint64_t> _tag_counts;
  uint64_t _ref_id;
#endif

public:
  bool deleting;  // true while in removing or OSD is shutting down

  ZTracer::Endpoint trace_endpoint;

  void lock_suspend_timeout(ThreadPool::TPHandle &handle);
  void lock(bool no_lockdep = false) const;
  void unlock() const {
    //generic_dout(0) << this << " " << info.pgid << " unlock" << dendl;
    assert(!dirty_info);
    assert(!dirty_big_info);
    _lock.Unlock();
  }

  bool is_locked() const {
    return _lock.is_locked();
  }

#ifdef PG_DEBUG_REFS
  uint64_t get_with_id();
  void put_with_id(uint64_t);
  void dump_live_ids();
#endif
  void get(const char* tag);
  void put(const char* tag);

  bool dirty_info, dirty_big_info;

public:
  bool is_ec_pg() const {
    return pool.info.ec_pool();
  }
  // pg state
  pg_info_t info;               ///< current pg info
  pg_info_t last_written_info;  ///< last written info
  __u8 info_struct_v;
  static const __u8 cur_struct_v = 10;
  // v10 is the new past_intervals encoding
  // v9 was fastinfo_key addition
  // v8 was the move to a per-pg pgmeta object
  // v7 was SnapMapper addition in 86658392516d5175b2756659ef7ffaaf95b0f8ad
  //     (first appeared in cuttlefish).
  static const __u8 compat_struct_v = 7;
  bool must_upgrade() {
    return info_struct_v < cur_struct_v;
  }
  bool can_upgrade() {
    return info_struct_v >= compat_struct_v;
  }
  void upgrade(ObjectStore *store);
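
  // Worked example (added for clarity): with cur_struct_v = 10 and
  // compat_struct_v = 7, a PG loaded with info_struct_v == 7 satisfies
  // can_upgrade() (7 >= 7) and must_upgrade() (7 < 10), so upgrade() is
  // run; info_struct_v == 6 does not satisfy can_upgrade().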

  const coll_t coll;
  ObjectStore::CollectionHandle ch;
  PGLog pg_log;
  static string get_info_key(spg_t pgid) {
    return stringify(pgid) + "_info";
  }
  static string get_biginfo_key(spg_t pgid) {
    return stringify(pgid) + "_biginfo";
  }
  static string get_epoch_key(spg_t pgid) {
    return stringify(pgid) + "_epoch";
  }
  ghobject_t pgmeta_oid;

  // ------------------
  // MissingLoc

  class MissingLoc {
  public:
    // a loc_count indicates how many locations we know in each of
    // these distinct sets
    struct loc_count_t {
      int up = 0;     ///< up
      int other = 0;  ///< other

      friend bool operator<(const loc_count_t& l,
                            const loc_count_t& r) {
        return (l.up < r.up ||
                (l.up == r.up &&
                 (l.other < r.other)));
      }
      friend ostream& operator<<(ostream& out, const loc_count_t& l) {
        assert(l.up >= 0);
        assert(l.other >= 0);
        return out << "(" << l.up << "+" << l.other << ")";
      }
    };


  private:

    loc_count_t _get_count(const set<pg_shard_t>& shards) {
      loc_count_t r;
      for (auto s : shards) {
        if (pg->upset.count(s)) {
          r.up++;
        } else {
          r.other++;
        }
      }
      return r;
    }

    map<hobject_t, pg_missing_item> needs_recovery_map;
    map<hobject_t, set<pg_shard_t> > missing_loc;
    set<pg_shard_t> missing_loc_sources;

    // for every entry in missing_loc, we count how many of each type of shard we have,
    // and maintain totals here.  The sum of the values for this map will always equal
    // missing_loc.size().
    map<shard_id_t, map<loc_count_t,int> > missing_by_count;
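
    // Worked example (added for clarity; numbers are hypothetical): if
    // missing_loc holds three objects whose known locations yield
    // loc_count_t values (1+0), (1+0) and (0+2) under shard NO_SHARD, then
    // missing_by_count[NO_SHARD] == { (1+0): 2, (0+2): 1 }, and the sum of
    // the values (2 + 1) equals missing_loc.size() == 3.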

    void pgs_by_shard_id(const set<pg_shard_t>& s, map<shard_id_t, set<pg_shard_t> >& pgsbs) {
      if (pg->get_osdmap()->pg_is_ec(pg->info.pgid.pgid)) {
        int num_shards = pg->get_osdmap()->get_pg_size(pg->info.pgid.pgid);
        // For completely missing shards initialize with empty set<pg_shard_t>
        for (int i = 0; i < num_shards; ++i) {
          shard_id_t shard(i);
          pgsbs[shard];
        }
        for (auto pgs: s)
          pgsbs[pgs.shard].insert(pgs);
      } else {
        pgsbs[shard_id_t::NO_SHARD] = s;
      }
    }

    void _inc_count(const set<pg_shard_t>& s) {
      map<shard_id_t, set<pg_shard_t> > pgsbs;
      pgs_by_shard_id(s, pgsbs);
      for (auto shard: pgsbs)
        ++missing_by_count[shard.first][_get_count(shard.second)];
    }
    void _dec_count(const set<pg_shard_t>& s) {
      map<shard_id_t, set<pg_shard_t> > pgsbs;
      pgs_by_shard_id(s, pgsbs);
      for (auto shard: pgsbs) {
        auto p = missing_by_count[shard.first].find(_get_count(shard.second));
        assert(p != missing_by_count[shard.first].end());
        if (--p->second == 0) {
          missing_by_count[shard.first].erase(p);
        }
      }
    }

    PG *pg;
    set<pg_shard_t> empty_set;
  public:
    boost::scoped_ptr<IsPGReadablePredicate> is_readable;
    boost::scoped_ptr<IsPGRecoverablePredicate> is_recoverable;
    explicit MissingLoc(PG *pg)
      : pg(pg) { }
    void set_backend_predicates(
      IsPGReadablePredicate *_is_readable,
      IsPGRecoverablePredicate *_is_recoverable) {
      is_readable.reset(_is_readable);
      is_recoverable.reset(_is_recoverable);
    }
    string gen_prefix() const { return pg->gen_prefix(); }
    bool needs_recovery(
      const hobject_t &hoid,
      eversion_t *v = 0) const {
      map<hobject_t, pg_missing_item>::const_iterator i =
        needs_recovery_map.find(hoid);
      if (i == needs_recovery_map.end())
        return false;
      if (v)
        *v = i->second.need;
      return true;
    }
    bool is_deleted(const hobject_t &hoid) const {
      auto i = needs_recovery_map.find(hoid);
      if (i == needs_recovery_map.end())
        return false;
      return i->second.is_delete();
    }
    bool is_unfound(const hobject_t &hoid) const {
      return needs_recovery(hoid) && !is_deleted(hoid) && (
        !missing_loc.count(hoid) ||
        !(*is_recoverable)(missing_loc.find(hoid)->second));
    }
    bool readable_with_acting(
      const hobject_t &hoid,
      const set<pg_shard_t> &acting) const;
    uint64_t num_unfound() const {
      uint64_t ret = 0;
      for (map<hobject_t, pg_missing_item>::const_iterator i =
             needs_recovery_map.begin();
           i != needs_recovery_map.end();
           ++i) {
        if (is_unfound(i->first))
          ++ret;
      }
      return ret;
    }

    bool have_unfound() const {
      for (map<hobject_t, pg_missing_item>::const_iterator i =
             needs_recovery_map.begin();
           i != needs_recovery_map.end();
           ++i) {
        if (is_unfound(i->first))
          return true;
      }
      return false;
    }
    void clear() {
      needs_recovery_map.clear();
      missing_loc.clear();
      missing_loc_sources.clear();
      missing_by_count.clear();
    }

    void add_location(const hobject_t &hoid, pg_shard_t location) {
      auto p = missing_loc.find(hoid);
      if (p == missing_loc.end()) {
        p = missing_loc.emplace(hoid, set<pg_shard_t>()).first;
      } else {
        _dec_count(p->second);
      }
      p->second.insert(location);
      _inc_count(p->second);
    }
    void remove_location(const hobject_t &hoid, pg_shard_t location) {
      auto p = missing_loc.find(hoid);
      if (p != missing_loc.end()) {
        _dec_count(p->second);
        p->second.erase(location);
        _inc_count(p->second);
      }
    }
    void add_active_missing(const pg_missing_t &missing) {
      for (map<hobject_t, pg_missing_item>::const_iterator i =
             missing.get_items().begin();
           i != missing.get_items().end();
           ++i) {
        map<hobject_t, pg_missing_item>::const_iterator j =
          needs_recovery_map.find(i->first);
        if (j == needs_recovery_map.end()) {
          needs_recovery_map.insert(*i);
        } else {
          lgeneric_dout(pg->cct, 0) << this << " " << pg->info.pgid << " unexpected need for "
                                    << i->first << " have " << j->second
                                    << " tried to add " << i->second << dendl;
          assert(i->second.need == j->second.need);
        }
      }
    }

    void add_missing(const hobject_t &hoid, eversion_t need, eversion_t have) {
      needs_recovery_map[hoid] = pg_missing_item(need, have);
    }
    void revise_need(const hobject_t &hoid, eversion_t need) {
      assert(needs_recovery(hoid));
      needs_recovery_map[hoid].need = need;
    }

    /// Adds info about a possible recovery source
    bool add_source_info(
      pg_shard_t source,            ///< [in] source
      const pg_info_t &oinfo,       ///< [in] info
      const pg_missing_t &omissing, ///< [in] (optional) missing
      ThreadPool::TPHandle* handle  ///< [in] ThreadPool handle
      ); ///< @return whether a new object location was discovered

    /// Adds recovery sources in batch
    void add_batch_sources_info(
      const set<pg_shard_t> &sources, ///< [in] a set of sources which can be used for all objects
      ThreadPool::TPHandle* handle    ///< [in] ThreadPool handle
      );

    /// Uses osdmap to update structures for now down sources
    void check_recovery_sources(const OSDMapRef& osdmap);

    /// Call when hoid is no longer missing in acting set
    void recovered(const hobject_t &hoid) {
      needs_recovery_map.erase(hoid);
      auto p = missing_loc.find(hoid);
      if (p != missing_loc.end()) {
        _dec_count(p->second);
        missing_loc.erase(p);
      }
    }

    /// Call to update structures for hoid after a change
    void rebuild(
      const hobject_t &hoid,
      pg_shard_t self,
      const set<pg_shard_t> to_recover,
      const pg_info_t &info,
      const pg_missing_t &missing,
      const map<pg_shard_t, pg_missing_t> &pmissing,
      const map<pg_shard_t, pg_info_t> &pinfo) {
      recovered(hoid);
      boost::optional<pg_missing_item> item;
      auto miter = missing.get_items().find(hoid);
      if (miter != missing.get_items().end()) {
        item = miter->second;
      } else {
        for (auto &&i: to_recover) {
          if (i == self)
            continue;
          auto pmiter = pmissing.find(i);
          assert(pmiter != pmissing.end());
          miter = pmiter->second.get_items().find(hoid);
          if (miter != pmiter->second.get_items().end()) {
            item = miter->second;
            break;
          }
        }
      }
      if (!item)
        return; // recovered!

      needs_recovery_map[hoid] = *item;
      if (item->is_delete())
        return;
      auto mliter =
        missing_loc.insert(make_pair(hoid, set<pg_shard_t>())).first;
      assert(info.last_backfill.is_max());
      assert(info.last_update >= item->need);
      if (!missing.is_missing(hoid))
        mliter->second.insert(self);
      for (auto &&i: pmissing) {
        auto pinfoiter = pinfo.find(i.first);
        assert(pinfoiter != pinfo.end());
        if (item->need <= pinfoiter->second.last_update &&
            hoid <= pinfoiter->second.last_backfill &&
            !i.second.is_missing(hoid))
          mliter->second.insert(i.first);
      }
      _inc_count(mliter->second);
    }

    const set<pg_shard_t> &get_locations(const hobject_t &hoid) const {
      return missing_loc.count(hoid) ?
        missing_loc.find(hoid)->second : empty_set;
    }
    const map<hobject_t, set<pg_shard_t>> &get_missing_locs() const {
      return missing_loc;
    }
    const map<hobject_t, pg_missing_item> &get_needs_recovery() const {
      return needs_recovery_map;
    }
    const map<shard_id_t, map<loc_count_t,int> > &get_missing_by_count() const {
      return missing_by_count;
    }
  } missing_loc;

  PastIntervals past_intervals;

  interval_set<snapid_t> snap_trimq;

  /* You should not use these items without taking their respective queue locks
   * (if they have one) */
  xlist<PG*>::item stat_queue_item;
  bool scrub_queued;
  bool recovery_queued;

  int recovery_ops_active;
  set<pg_shard_t> waiting_on_backfill;
#ifdef DEBUG_RECOVERY_OIDS
  set<hobject_t> recovering_oids;
#endif

protected:
  int role;        // 0 = primary, 1 = replica, -1 = none.
  unsigned state;  // PG_STATE_*

  bool send_notify;  ///< true if we are non-primary and should notify the primary

public:
  eversion_t last_update_ondisk;    // last_update that has committed; ONLY DEFINED WHEN is_active()
  eversion_t last_complete_ondisk;  // last_complete that has committed.
  eversion_t last_update_applied;


  struct C_UpdateLastRollbackInfoTrimmedToApplied : Context {
    PGRef pg;
    epoch_t e;
    eversion_t v;
    C_UpdateLastRollbackInfoTrimmedToApplied(PG *pg, epoch_t e, eversion_t v)
      : pg(pg), e(e), v(v) {}
    void finish(int) override {
      pg->lock();
      if (!pg->pg_has_reset_since(e)) {
        pg->last_rollback_info_trimmed_to_applied = v;
      }
      pg->unlock();
    }
  };
  // entries <= last_rollback_info_trimmed_to_applied have been trimmed,
  // and the transaction has applied
  eversion_t last_rollback_info_trimmed_to_applied;

  // primary state
public:
  pg_shard_t primary;
  pg_shard_t pg_whoami;
  pg_shard_t up_primary;
  vector<int> up, acting, want_acting;
  set<pg_shard_t> actingbackfill, actingset, upset;
  map<pg_shard_t,eversion_t> peer_last_complete_ondisk;
  eversion_t min_last_complete_ondisk;  // up: min over last_complete_ondisk, peer_last_complete_ondisk
  eversion_t pg_trim_to;

  set<int> blocked_by;  ///< osds we are blocked by (for pg stats)

  // [primary only] content recovery state

public:
  struct BufferedRecoveryMessages {
    map<int, map<spg_t, pg_query_t> > query_map;
    map<int, vector<pair<pg_notify_t, PastIntervals> > > info_map;
    map<int, vector<pair<pg_notify_t, PastIntervals> > > notify_list;
  };

public:
  bool dne() { return info.dne(); }
  struct RecoveryCtx {
    utime_t start_time;
    map<int, map<spg_t, pg_query_t> > *query_map;
    map<int, vector<pair<pg_notify_t, PastIntervals> > > *info_map;
    map<int, vector<pair<pg_notify_t, PastIntervals> > > *notify_list;
    set<PGRef> created_pgs;
    C_Contexts *on_applied;
    C_Contexts *on_safe;
    ObjectStore::Transaction *transaction;
    ThreadPool::TPHandle* handle;
    RecoveryCtx(map<int, map<spg_t, pg_query_t> > *query_map,
                map<int,
                    vector<pair<pg_notify_t, PastIntervals> > > *info_map,
                map<int,
                    vector<pair<pg_notify_t, PastIntervals> > > *notify_list,
                C_Contexts *on_applied,
                C_Contexts *on_safe,
                ObjectStore::Transaction *transaction)
      : query_map(query_map), info_map(info_map),
        notify_list(notify_list),
        on_applied(on_applied),
        on_safe(on_safe),
        transaction(transaction),
        handle(NULL) {}

    RecoveryCtx(BufferedRecoveryMessages &buf, RecoveryCtx &rctx)
      : query_map(&(buf.query_map)),
        info_map(&(buf.info_map)),
        notify_list(&(buf.notify_list)),
        on_applied(rctx.on_applied),
        on_safe(rctx.on_safe),
        transaction(rctx.transaction),
        handle(rctx.handle) {}

    void accept_buffered_messages(BufferedRecoveryMessages &m) {
      assert(query_map);
      assert(info_map);
      assert(notify_list);
      for (map<int, map<spg_t, pg_query_t> >::iterator i = m.query_map.begin();
           i != m.query_map.end();
           ++i) {
        map<spg_t, pg_query_t> &omap = (*query_map)[i->first];
        for (map<spg_t, pg_query_t>::iterator j = i->second.begin();
             j != i->second.end();
             ++j) {
          omap[j->first] = j->second;
        }
      }
      for (map<int, vector<pair<pg_notify_t, PastIntervals> > >::iterator i
             = m.info_map.begin();
           i != m.info_map.end();
           ++i) {
        vector<pair<pg_notify_t, PastIntervals> > &ovec =
          (*info_map)[i->first];
        ovec.reserve(ovec.size() + i->second.size());
        ovec.insert(ovec.end(), i->second.begin(), i->second.end());
      }
      for (map<int, vector<pair<pg_notify_t, PastIntervals> > >::iterator i
             = m.notify_list.begin();
           i != m.notify_list.end();
           ++i) {
        vector<pair<pg_notify_t, PastIntervals> > &ovec =
          (*notify_list)[i->first];
        ovec.reserve(ovec.size() + i->second.size());
        ovec.insert(ovec.end(), i->second.begin(), i->second.end());
      }
    }

    void send_notify(pg_shard_t to,
                     const pg_notify_t &info, const PastIntervals &pi) {
      assert(notify_list);
      (*notify_list)[to.osd].push_back(make_pair(info, pi));
    }
  };


  PGStateHistory pgstate_history;

  struct NamedState {
    const char *state_name;
    utime_t enter_time;
    PG* pg;
    const char *get_state_name() { return state_name; }
    NamedState(PG *pg_, const char *state_name_)
      : state_name(state_name_), enter_time(ceph_clock_now()), pg(pg_) {
      pg->pgstate_history.enter(pg, enter_time, state_name);
    }
    virtual ~NamedState() { pg->pgstate_history.exit(state_name); }
  };



protected:

  /*
   * peer_info    -- projected (updates _before_ replicas ack)
   * peer_missing -- committed (updates _after_ replicas ack)
   */

  bool need_up_thru;
  set<pg_shard_t> stray_set;  // non-acting osds that have PG data.
  eversion_t oldest_update;   // acting: lowest (valid) last_update in active set
  map<pg_shard_t, pg_info_t> peer_info;  // info from peers (stray or prior)
  set<pg_shard_t> peer_purged;  // peers purged
  map<pg_shard_t, pg_missing_t> peer_missing;
  set<pg_shard_t> peer_log_requested;  // logs i've requested (and start stamps)
  set<pg_shard_t> peer_missing_requested;

  // i deleted these strays; ignore racing PGInfo from them
  set<pg_shard_t> peer_activated;

  // primary-only, recovery-only state
  set<pg_shard_t> might_have_unfound;  // These osds might have objects on them
                                       // which are unfound on the primary
  epoch_t last_peering_reset;


  /* heartbeat peers */
  void set_probe_targets(const set<pg_shard_t> &probe_set);
  void clear_probe_targets();
public:
  Mutex heartbeat_peer_lock;
  set<int> heartbeat_peers;
  set<int> probe_targets;

  /**
   * BackfillInterval
   *
   * Represents the objects in a range [begin, end)
   *
   * Possible states:
   * 1) begin == end == hobject_t() indicates that the interval is unpopulated
   * 2) Else, objects contains all objects in [begin, end)
   */
  struct BackfillInterval {
    // info about a backfill interval on a peer
    eversion_t version;  ///< version at which the scan occurred
    map<hobject_t,eversion_t> objects;
    hobject_t begin;
    hobject_t end;

    /// clear content
    void clear() {
      *this = BackfillInterval();
    }

    /// clear objects list only
    void clear_objects() {
      objects.clear();
    }

    /// reinstantiate with a new start+end position and sort order
    void reset(hobject_t start) {
      clear();
      begin = end = start;
    }

    /// true if there are no objects in this interval
    bool empty() const {
      return objects.empty();
    }

    /// true if interval extends to the end of the range
    bool extends_to_end() const {
      return end.is_max();
    }

    /// removes items <= soid and adjusts begin to the first object
    void trim_to(const hobject_t &soid) {
      trim();
      while (!objects.empty() &&
             objects.begin()->first <= soid) {
        pop_front();
      }
    }

    /// Adjusts begin to the first object
    void trim() {
      if (!objects.empty())
        begin = objects.begin()->first;
      else
        begin = end;
    }

    /// drop first entry, and adjust @begin accordingly
    void pop_front() {
      assert(!objects.empty());
      objects.erase(objects.begin());
      trim();
    }

    /// dump
    void dump(Formatter *f) const {
      f->dump_stream("begin") << begin;
      f->dump_stream("end") << end;
      f->open_array_section("objects");
      for (map<hobject_t, eversion_t>::const_iterator i =
             objects.begin();
           i != objects.end();
           ++i) {
        f->open_object_section("object");
        f->dump_stream("object") << i->first;
        f->dump_stream("version") << i->second;
        f->close_section();
      }
      f->close_section();
    }
  };
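
  // Worked example (added for clarity; object names are hypothetical):
  // given objects = {A, B, C, D} with begin == A, trim_to(B) pops A and B
  // (both <= B) and leaves begin == C; once objects empties entirely,
  // trim() collapses the interval to begin == end, so empty() is true.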

protected:
  BackfillInterval backfill_info;
  map<pg_shard_t, BackfillInterval> peer_backfill_info;
  bool backfill_reserved;
  bool backfill_reserving;

  friend class OSD;

public:
  set<pg_shard_t> backfill_targets;

  bool is_backfill_targets(pg_shard_t osd) {
    return backfill_targets.count(osd);
  }

protected:

  /*
   * blocked request wait hierarchy
   *
   * In order to preserve request ordering we need to be careful about the
   * order in which blocked requests get requeued.  Generally speaking, we
   * push the requests back up to the op_wq in reverse order (most recent
   * request first) so that they come back out again in the original order.
   * However, because there are multiple wait queues, we need to requeue
   * waitlists in order.  Generally speaking, we requeue the wait lists
   * that are checked first.
   *
   * Here are the various wait lists, in the order they are used during
   * request processing, with notes:
   *
   *  - waiting_for_map
   *    - may start or stop blocking at any time (depending on client epoch)
   *  - waiting_for_peered
   *    - !is_peered() or flushes_in_progress
   *    - only starts blocking on interval change; never restarts
   *  - waiting_for_active
   *    - !is_active()
   *    - only starts blocking on interval change; never restarts
   *  - waiting_for_flush
   *    - is_active() and flushes_in_progress
   *    - waiting for final flush during activate
   *  - waiting_for_scrub
   *    - starts and stops blocking for varying intervals during scrub
   *  - waiting_for_unreadable_object
   *    - never restarts once object is readable (* except for EIO?)
   *  - waiting_for_degraded_object
   *    - never restarts once object is writeable (* except for EIO?)
   *  - waiting_for_blocked_object
   *    - starts and stops based on proxied op activity
   *  - obc rwlocks
   *    - starts and stops based on read/write activity
   *
   * Notes:
   *
   * 1. During an interval change, we requeue *everything* in the above order.
   *
   * 2. When an obc rwlock is released, we check for a scrub block and requeue
   *    the op there if it applies.  We ignore the unreadable/degraded/blocked
   *    queues because we assume they cannot apply at that time (this is
   *    probably mostly true).
   *
   * 3. The requeue_ops helper will push ops onto the waiting_for_map list if
   *    it is non-empty.
   *
   * These three behaviors are generally sufficient to maintain ordering, with
   * the possible exception of cases where we make an object degraded or
   * unreadable that was previously okay, e.g. when scrub or op processing
   * encounter an unexpected error.  FIXME.
   */
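
  // Worked example (added for clarity; op names are hypothetical): if ops
  // A, B and C arrived in that order and all blocked on the same wait list,
  // we requeue C, then B, then A at the front of the op_wq, so they are
  // dequeued again as A, B, C -- the original arrival order.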

  // pg waiters
  unsigned flushes_in_progress;

  // ops with newer maps than ours (or blocked behind them)
  // track these by client, since inter-request ordering doesn't otherwise
  // matter.
  unordered_map<entity_name_t,list<OpRequestRef>> waiting_for_map;

  // ops waiting on peered
  list<OpRequestRef> waiting_for_peered;

  // ops waiting on active (require peered as well)
  list<OpRequestRef> waiting_for_active;
  list<OpRequestRef> waiting_for_flush;
  list<OpRequestRef> waiting_for_scrub;

  list<OpRequestRef> waiting_for_cache_not_full;
  list<OpRequestRef> waiting_for_clean_to_primary_repair;
  map<hobject_t, list<OpRequestRef>> waiting_for_unreadable_object,
                                     waiting_for_degraded_object,
                                     waiting_for_blocked_object;

  set<hobject_t> objects_blocked_on_cache_full;
  map<hobject_t,snapid_t> objects_blocked_on_degraded_snap;
  map<hobject_t,ObjectContextRef> objects_blocked_on_snap_promotion;

  // Callbacks should assume pg (and nothing else) is locked
  map<hobject_t, list<Context*>> callbacks_for_degraded_object;

  map<eversion_t,
      list<pair<OpRequestRef, version_t> > > waiting_for_ondisk;

  void requeue_object_waiters(map<hobject_t, list<OpRequestRef>>& m);
  void requeue_op(OpRequestRef op);
  void requeue_ops(list<OpRequestRef> &l);

  // stats that persist lazily
  object_stat_collection_t unstable_stats;

  // publish stats
  Mutex pg_stats_publish_lock;
  bool pg_stats_publish_valid;
  pg_stat_t pg_stats_publish;

  // for ordering writes
  ceph::shared_ptr<ObjectStore::Sequencer> osr;

  void _update_calc_stats();
  void _update_blocked_by();
  void publish_stats_to_osd();
  void clear_publish_stats();

public:
  void clear_primary_state();

  bool is_actingbackfill(pg_shard_t osd) const {
    return actingbackfill.count(osd);
  }
  bool is_acting(pg_shard_t osd) const {
    return has_shard(pool.info.ec_pool(), acting, osd);
  }
  bool is_up(pg_shard_t osd) const {
    return has_shard(pool.info.ec_pool(), up, osd);
  }
  static bool has_shard(bool ec, const vector<int>& v, pg_shard_t osd) {
    if (ec) {
      return v.size() > (unsigned)osd.shard && v[osd.shard] == osd.osd;
    } else {
      return std::find(v.begin(), v.end(), osd.osd) != v.end();
    }
  }
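
  // Worked example (added for clarity; values are hypothetical): for an EC
  // pg with acting = [3, 7, 9], has_shard(true, acting, 7(1)) is true
  // because acting[shard 1] == osd 7, while 7(0) is false.  For a
  // replicated pg the shard is ignored and we simply search for the osd id.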

  bool needs_recovery() const;
  bool needs_backfill() const;

  /// clip calculated priority to reasonable range
  inline int clamp_recovery_priority(int priority);
  /// get log recovery reservation priority
  unsigned get_recovery_priority();
  /// get backfill reservation priority
  unsigned get_backfill_priority();

  void mark_clean();  ///< mark an active pg clean
  void _change_recovery_force_mode(int new_mode, bool clear);

  /// return [start,end) bounds for required past_intervals
  static pair<epoch_t, epoch_t> get_required_past_interval_bounds(
    const pg_info_t &info,
    epoch_t oldest_map) {
    epoch_t start = MAX(
      info.history.last_epoch_clean ? info.history.last_epoch_clean :
        info.history.epoch_pool_created,
      oldest_map);
    epoch_t end = MAX(
      info.history.same_interval_since,
      info.history.epoch_pool_created);
    return make_pair(start, end);
  }
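
  // Worked example (added for clarity; epochs are hypothetical): with
  // last_epoch_clean = 50, oldest_map = 60, same_interval_since = 80 and
  // epoch_pool_created <= 80, the bounds are [max(50, 60), 80) = [60, 80);
  // the start is clamped to oldest_map since older maps are no longer kept.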
  void check_past_interval_bounds() const;
  PastIntervals::PriorSet build_prior();

  void remove_down_peer_info(const OSDMapRef osdmap);

  bool adjust_need_up_thru(const OSDMapRef osdmap);

  bool all_unfound_are_queried_or_lost(const OSDMapRef osdmap) const;
  virtual void dump_recovery_info(Formatter *f) const = 0;

  bool calc_min_last_complete_ondisk() {
    eversion_t min = last_complete_ondisk;
    assert(!actingbackfill.empty());
    for (set<pg_shard_t>::iterator i = actingbackfill.begin();
         i != actingbackfill.end();
         ++i) {
      if (*i == get_primary()) continue;
      if (peer_last_complete_ondisk.count(*i) == 0)
        return false;  // we don't have complete info
      eversion_t a = peer_last_complete_ondisk[*i];
      if (a < min)
        min = a;
    }
    if (min == min_last_complete_ondisk)
      return false;
    min_last_complete_ondisk = min;
    return true;
  }

  virtual void calc_trim_to() = 0;

  virtual void calc_trim_to_aggressive() = 0;

  void proc_replica_log(pg_info_t &oinfo, const pg_log_t &olog,
                        pg_missing_t& omissing, pg_shard_t from);
  void proc_master_log(ObjectStore::Transaction& t, pg_info_t &oinfo, pg_log_t &olog,
                       pg_missing_t& omissing, pg_shard_t from);
  bool proc_replica_info(
    pg_shard_t from, const pg_info_t &info, epoch_t send_epoch);

  struct PGLogEntryHandler : public PGLog::LogEntryHandler {
    PG *pg;
    ObjectStore::Transaction *t;
    PGLogEntryHandler(PG *pg, ObjectStore::Transaction *t) : pg(pg), t(t) {}

    // LogEntryHandler
    void remove(const hobject_t &hoid) override {
      pg->get_pgbackend()->remove(hoid, t);
    }
    void try_stash(const hobject_t &hoid, version_t v) override {
      pg->get_pgbackend()->try_stash(hoid, v, t);
    }
    void rollback(const pg_log_entry_t &entry) override {
      assert(entry.can_rollback());
      pg->get_pgbackend()->rollback(entry, t);
    }
    void rollforward(const pg_log_entry_t &entry) override {
      pg->get_pgbackend()->rollforward(entry, t);
    }
    void trim(const pg_log_entry_t &entry) override {
      pg->get_pgbackend()->trim(entry, t);
    }
  };

  void update_object_snap_mapping(
    ObjectStore::Transaction *t, const hobject_t &soid,
    const set<snapid_t> &snaps);
  void clear_object_snap_mapping(
    ObjectStore::Transaction *t, const hobject_t &soid);
  void remove_snap_mapped_object(
    ObjectStore::Transaction& t, const hobject_t& soid);
  void merge_log(
    ObjectStore::Transaction& t, pg_info_t &oinfo,
    pg_log_t &olog, pg_shard_t from);
  void rewind_divergent_log(ObjectStore::Transaction& t, eversion_t newhead);
  bool search_for_missing(
    const pg_info_t &oinfo, const pg_missing_t &omissing,
    pg_shard_t fromosd,
    RecoveryCtx*);

  void check_for_lost_objects();
  void forget_lost_objects();

  void discover_all_missing(std::map<int, map<spg_t,pg_query_t> > &query_map);

  void trim_write_ahead();

  map<pg_shard_t, pg_info_t>::const_iterator find_best_info(
    const map<pg_shard_t, pg_info_t> &infos,
    bool restrict_to_up_acting,
    bool *history_les_bound) const;
  static void calc_ec_acting(
    map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard,
    unsigned size,
    const vector<int> &acting,
    pg_shard_t acting_primary,
    const vector<int> &up,
    pg_shard_t up_primary,
    const map<pg_shard_t, pg_info_t> &all_info,
    bool restrict_to_up_acting,
    vector<int> *want,
    set<pg_shard_t> *backfill,
    set<pg_shard_t> *acting_backfill,
    pg_shard_t *want_primary,
    ostream &ss);
  static void calc_replicated_acting(
    map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard,
    unsigned size,
    const vector<int> &acting,
    pg_shard_t acting_primary,
    const vector<int> &up,
    pg_shard_t up_primary,
    const map<pg_shard_t, pg_info_t> &all_info,
    bool restrict_to_up_acting,
    vector<int> *want,
    set<pg_shard_t> *backfill,
    set<pg_shard_t> *acting_backfill,
    pg_shard_t *want_primary,
    ostream &ss);
  bool choose_acting(pg_shard_t &auth_log_shard,
                     bool restrict_to_up_acting,
                     bool *history_les_bound);
  void build_might_have_unfound();
  void activate(
    ObjectStore::Transaction& t,
    epoch_t activation_epoch,
    list<Context*>& tfin,
    map<int, map<spg_t,pg_query_t> >& query_map,
    map<int,
        vector<pair<pg_notify_t, PastIntervals> > > *activator_map,
    RecoveryCtx *ctx);
  void _activate_committed(epoch_t epoch, epoch_t activation_epoch);
  void all_activated_and_committed();

  void proc_primary_info(ObjectStore::Transaction &t, const pg_info_t &info);

  bool have_unfound() const {
    return missing_loc.have_unfound();
  }
  uint64_t get_num_unfound() const {
    return missing_loc.num_unfound();
  }

  virtual void check_local() = 0;

  /**
   * @param ops_begun returns how many recovery ops the function started
   * @returns true if any useful work was accomplished; false otherwise
   */
  virtual bool start_recovery_ops(
    uint64_t max,
    ThreadPool::TPHandle &handle,
    uint64_t *ops_begun) = 0;

  void purge_strays();

  void update_heartbeat_peers();

  Context *finish_sync_event;

  void finish_recovery(list<Context*>& tfin);
  void _finish_recovery(Context *c);
  void cancel_recovery();
  void clear_recovery_state();
  virtual void _clear_recovery_state() = 0;
  virtual void check_recovery_sources(const OSDMapRef& newmap) = 0;
  void start_recovery_op(const hobject_t& soid);
  void finish_recovery_op(const hobject_t& soid, bool dequeue=false);

  void split_into(pg_t child_pgid, PG *child, unsigned split_bits);
  virtual void _split_into(pg_t child_pgid, PG *child, unsigned split_bits) = 0;

  friend class C_OSD_RepModify_Commit;

  // -- backoff --
  Mutex backoff_lock;  // orders inside Backoff::lock
  map<hobject_t,set<BackoffRef>> backoffs;

  void add_backoff(SessionRef s, const hobject_t& begin, const hobject_t& end);
  void release_backoffs(const hobject_t& begin, const hobject_t& end);
  void release_backoffs(const hobject_t& o) {
    release_backoffs(o, o);
  }
  void clear_backoffs();

  void add_pg_backoff(SessionRef s) {
    hobject_t begin = info.pgid.pgid.get_hobj_start();
    hobject_t end = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num());
    add_backoff(s, begin, end);
  }
  void release_pg_backoffs() {
    hobject_t begin = info.pgid.pgid.get_hobj_start();
    hobject_t end = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num());
    release_backoffs(begin, end);
  }

  void rm_backoff(BackoffRef b);

  // -- scrub --
  struct Scrubber {
    Scrubber();
    ~Scrubber();

    // metadata
    set<pg_shard_t> reserved_peers;
    bool reserved, reserve_failed;
    epoch_t epoch_start;

    // common to both scrubs
    bool active;
    set<pg_shard_t> waiting_on_whom;
    int shallow_errors;
    int deep_errors;
    int large_omap_objects = 0;
    int fixed;
    ScrubMap primary_scrubmap;
    ScrubMapBuilder primary_scrubmap_pos;
    epoch_t replica_scrub_start = 0;
    ScrubMap replica_scrubmap;
    ScrubMapBuilder replica_scrubmap_pos;
    map<pg_shard_t, ScrubMap> received_maps;
    OpRequestRef active_rep_scrub;
    utime_t scrub_reg_stamp;  // stamp we registered for

    // For async sleep
    bool sleeping = false;
    bool needs_sleep = true;
    utime_t sleep_start;

    // flags to indicate explicitly requested scrubs (by admin)
    bool must_scrub, must_deep_scrub, must_repair;

    // Priority to use for scrub scheduling
    unsigned priority;

    // this flag indicates whether we would like to do auto-repair of the PG or not
    bool auto_repair;

    // Maps from objects with errors to missing/inconsistent peers
    map<hobject_t, set<pg_shard_t>> missing;
    map<hobject_t, set<pg_shard_t>> inconsistent;

    // Map from object with errors to good peers
    map<hobject_t, list<pair<ScrubMap::object, pg_shard_t> >> authoritative;

    // Cleaned map pending snap metadata scrub
    ScrubMap cleaned_meta_map;

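    // Note (added for clarity; not in the original header): clean_meta_map
    // hands off only the prefix of cleaned_meta_map whose clone groups are
    // known to be complete.  Unless the chunk reaches the end of the hash
    // range, the trailing run of entries that share a head but have not yet
    // seen a snapset-bearing object is held back for the next chunk, so the
    // snap metadata scrub never sees a clone without its head.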
    void clean_meta_map(ScrubMap &for_meta_scrub) {
      if (end.is_max() ||
          cleaned_meta_map.objects.empty()) {
        cleaned_meta_map.swap(for_meta_scrub);
      } else {
        auto iter = cleaned_meta_map.objects.end();
        --iter;  // not empty, see if clause
        auto begin = cleaned_meta_map.objects.begin();
        if (iter->first.has_snapset()) {
          ++iter;
        } else {
          while (iter != begin) {
            auto next = iter--;
            if (next->first.get_head() != iter->first.get_head()) {
              ++iter;
              break;
            }
          }
        }
        for_meta_scrub.objects.insert(begin, iter);
        cleaned_meta_map.objects.erase(begin, iter);
      }
    }

    // digest updates which we are waiting on
    int num_digest_updates_pending;

    // chunky scrub
    hobject_t start, end;  // [start,end)
    hobject_t max_end;     // Largest end that may have been sent to replicas
    eversion_t subset_last_update;

    // chunky scrub state
    enum State {
      INACTIVE,
      NEW_CHUNK,
      WAIT_PUSHES,
      WAIT_LAST_UPDATE,
      BUILD_MAP,
      BUILD_MAP_DONE,
      WAIT_REPLICAS,
      COMPARE_MAPS,
      WAIT_DIGEST_UPDATES,
      FINISH,
      BUILD_MAP_REPLICA,
    } state;

    std::unique_ptr<Scrub::Store> store;
    // deep scrub
    bool deep;
    int preempt_left;
    int preempt_divisor;

    list<Context*> callbacks;
    void add_callback(Context *context) {
      callbacks.push_back(context);
    }
    void run_callbacks() {
      list<Context*> to_run;
      to_run.swap(callbacks);
      for (list<Context*>::iterator i = to_run.begin();
           i != to_run.end();
           ++i) {
        (*i)->complete(0);
      }
    }

    static const char *state_string(const PG::Scrubber::State& state) {
      const char *ret = NULL;
      switch (state)
      {
        case INACTIVE: ret = "INACTIVE"; break;
        case NEW_CHUNK: ret = "NEW_CHUNK"; break;
        case WAIT_PUSHES: ret = "WAIT_PUSHES"; break;
        case WAIT_LAST_UPDATE: ret = "WAIT_LAST_UPDATE"; break;
        case BUILD_MAP: ret = "BUILD_MAP"; break;
        case BUILD_MAP_DONE: ret = "BUILD_MAP_DONE"; break;
        case WAIT_REPLICAS: ret = "WAIT_REPLICAS"; break;
        case COMPARE_MAPS: ret = "COMPARE_MAPS"; break;
        case WAIT_DIGEST_UPDATES: ret = "WAIT_DIGEST_UPDATES"; break;
        case FINISH: ret = "FINISH"; break;
        case BUILD_MAP_REPLICA: ret = "BUILD_MAP_REPLICA"; break;
      }
      return ret;
    }

    bool is_chunky_scrub_active() const { return state != INACTIVE; }

    // clear all state
    void reset() {
      active = false;
      waiting_on_whom.clear();
      if (active_rep_scrub) {
        active_rep_scrub = OpRequestRef();
      }
      received_maps.clear();

      must_scrub = false;
      must_deep_scrub = false;
      must_repair = false;
      auto_repair = false;

      state = PG::Scrubber::INACTIVE;
      start = hobject_t();
      end = hobject_t();
      max_end = hobject_t();
      subset_last_update = eversion_t();
      shallow_errors = 0;
      deep_errors = 0;
      large_omap_objects = 0;
      fixed = 0;
      deep = false;
      run_callbacks();
      inconsistent.clear();
      missing.clear();
      authoritative.clear();
      num_digest_updates_pending = 0;
      primary_scrubmap = ScrubMap();
      primary_scrubmap_pos.reset();
      replica_scrubmap = ScrubMap();
      replica_scrubmap_pos.reset();
      cleaned_meta_map = ScrubMap();
      sleeping = false;
      needs_sleep = true;
      sleep_start = utime_t();
    }

    void create_results(const hobject_t& obj);
    void cleanup_store(ObjectStore::Transaction *t);
  } scrubber;

  bool scrub_after_recovery;

  int active_pushes;

  bool scrub_can_preempt = false;
  bool scrub_preempted = false;

  // we allow some number of preemptions of the scrub, during which we do
  // not block.  then we start to block.  once we start blocking, we do
  // not stop until the scrub range is completed.
  bool write_blocked_by_scrub(const hobject_t &soid);

  /// true if the given range intersects the scrub interval in any way
  bool range_intersects_scrub(const hobject_t &start, const hobject_t& end);

  void repair_object(
    const hobject_t& soid, list<pair<ScrubMap::object, pg_shard_t> > *ok_peers,
    pg_shard_t bad_peer);

  void scrub(epoch_t queued, ThreadPool::TPHandle &handle);
  void chunky_scrub(ThreadPool::TPHandle &handle);
  void scrub_compare_maps();
  /**
   * return true if any inconsistency/missing is repaired, false otherwise
   */
  bool scrub_process_inconsistent();
  bool ops_blocked_by_scrub() const;
  void scrub_finish();
  void scrub_clear_state();
  void _scan_snaps(ScrubMap &map);
  void _repair_oinfo_oid(ScrubMap &map);
  void _scan_rollback_obs(
    const vector<ghobject_t> &rollback_obs,
    ThreadPool::TPHandle &handle);
  void _request_scrub_map(pg_shard_t replica, eversion_t version,
                          hobject_t start, hobject_t end, bool deep,
                          bool allow_preemption);
  int build_scrub_map_chunk(
    ScrubMap &map,
    ScrubMapBuilder &pos,
    hobject_t start, hobject_t end, bool deep,
    ThreadPool::TPHandle &handle);
  /**
   * returns true if [begin, end) is good to scrub at this time
   * a false return value obliges the implementer to requeue scrub when the
   * condition preventing scrub clears
   */
  virtual bool _range_available_for_scrub(
    const hobject_t &begin, const hobject_t &end) = 0;
  virtual void scrub_snapshot_metadata(
    ScrubMap &map,
    const std::map<hobject_t,
                   pair<boost::optional<uint32_t>,
                        boost::optional<uint32_t>>> &missing_digest) { }
  virtual void _scrub_clear_state() { }
  virtual void _scrub_finish() { }
  virtual void split_colls(
    spg_t child,
    int split_bits,
    int seed,
    const pg_pool_t *pool,
    ObjectStore::Transaction *t) = 0;
  void clear_scrub_reserved();
  void scrub_reserve_replicas();
  void scrub_unreserve_replicas();
  bool scrub_all_replicas_reserved() const;
  bool sched_scrub();
  void reg_next_scrub();
  void unreg_next_scrub();

  void replica_scrub(
    OpRequestRef op,
    ThreadPool::TPHandle &handle);
  void do_replica_scrub_map(OpRequestRef op);
  void sub_op_scrub_map(OpRequestRef op);

  void handle_scrub_reserve_request(OpRequestRef op);
  void handle_scrub_reserve_grant(OpRequestRef op, pg_shard_t from);
  void handle_scrub_reserve_reject(OpRequestRef op, pg_shard_t from);
  void handle_scrub_reserve_release(OpRequestRef op);

  void reject_reservation();
  void schedule_backfill_retry(float retry);
  void schedule_recovery_retry(float retry);

  // -- recovery state --

  template <class EVT>
  struct QueuePeeringEvt : Context {
    PGRef pg;
    epoch_t epoch;
    EVT evt;
    QueuePeeringEvt(PG *pg, epoch_t epoch, EVT evt) :
      pg(pg), epoch(epoch), evt(evt) {}
    void finish(int r) override {
      pg->lock();
      pg->queue_peering_event(PG::CephPeeringEvtRef(
        new PG::CephPeeringEvt(
          epoch,
          epoch,
          evt)));
      pg->unlock();
    }
  };

  class CephPeeringEvt {
    epoch_t epoch_sent;
    epoch_t epoch_requested;
    boost::intrusive_ptr< const boost::statechart::event_base > evt;
    string desc;
  public:
    MEMPOOL_CLASS_HELPERS();
    template <class T>
    CephPeeringEvt(epoch_t epoch_sent,
                   epoch_t epoch_requested,
                   const T &evt_) :
      epoch_sent(epoch_sent), epoch_requested(epoch_requested),
      evt(evt_.intrusive_from_this()) {
      stringstream out;
      out << "epoch_sent: " << epoch_sent
          << " epoch_requested: " << epoch_requested << " ";
      evt_.print(&out);
      desc = out.str();
    }
    epoch_t get_epoch_sent() { return epoch_sent; }
    epoch_t get_epoch_requested() { return epoch_requested; }
    const boost::statechart::event_base &get_event() { return *evt; }
    string get_desc() { return desc; }
  };
  typedef ceph::shared_ptr<CephPeeringEvt> CephPeeringEvtRef;
  list<CephPeeringEvtRef> peering_queue;  // op queue
  list<CephPeeringEvtRef> peering_waiters;

  struct QueryState : boost::statechart::event< QueryState > {
    Formatter *f;
    explicit QueryState(Formatter *f) : f(f) {}
    void print(std::ostream *out) const {
      *out << "Query";
    }
  };

  struct MInfoRec : boost::statechart::event< MInfoRec > {
    pg_shard_t from;
    pg_info_t info;
    epoch_t msg_epoch;
    MInfoRec(pg_shard_t from, const pg_info_t &info, epoch_t msg_epoch) :
      from(from), info(info), msg_epoch(msg_epoch) {}
    void print(std::ostream *out) const {
      *out << "MInfoRec from " << from << " info: " << info;
    }
  };

  struct MLogRec : boost::statechart::event< MLogRec > {
    pg_shard_t from;
    boost::intrusive_ptr<MOSDPGLog> msg;
    MLogRec(pg_shard_t from, MOSDPGLog *msg) :
      from(from), msg(msg) {}
    void print(std::ostream *out) const {
      *out << "MLogRec from " << from;
    }
  };

  struct MNotifyRec : boost::statechart::event< MNotifyRec > {
    pg_shard_t from;
    pg_notify_t notify;
    uint64_t features;
    MNotifyRec(pg_shard_t from, const pg_notify_t &notify, uint64_t f) :
      from(from), notify(notify), features(f) {}
    void print(std::ostream *out) const {
      *out << "MNotifyRec from " << from << " notify: " << notify
           << " features: 0x" << hex << features << dec;
    }
  };

  struct MQuery : boost::statechart::event< MQuery > {
    pg_shard_t from;
    pg_query_t query;
    epoch_t query_epoch;
    MQuery(pg_shard_t from, const pg_query_t &query, epoch_t query_epoch):
      from(from), query(query), query_epoch(query_epoch) {}
    void print(std::ostream *out) const {
      *out << "MQuery from " << from
           << " query_epoch " << query_epoch
           << " query: " << query;
    }
  };

  struct AdvMap : boost::statechart::event< AdvMap > {
    OSDMapRef osdmap;
    OSDMapRef lastmap;
    vector<int> newup, newacting;
    int up_primary, acting_primary;
    AdvMap(
      OSDMapRef osdmap, OSDMapRef lastmap,
      vector<int>& newup, int up_primary,
      vector<int>& newacting, int acting_primary):
      osdmap(osdmap), lastmap(lastmap),
      newup(newup),
      newacting(newacting),
      up_primary(up_primary),
      acting_primary(acting_primary) {}
    void print(std::ostream *out) const {
      *out << "AdvMap";
    }
  };

  struct ActMap : boost::statechart::event< ActMap > {
    ActMap() : boost::statechart::event< ActMap >() {}
    void print(std::ostream *out) const {
      *out << "ActMap";
    }
  };
  struct Activate : boost::statechart::event< Activate > {
    epoch_t activation_epoch;
    explicit Activate(epoch_t q) : boost::statechart::event< Activate >(),
                                   activation_epoch(q) {}
    void print(std::ostream *out) const {
      *out << "Activate from " << activation_epoch;
    }
  };
  struct RequestBackfillPrio : boost::statechart::event< RequestBackfillPrio > {
    unsigned priority;
    explicit RequestBackfillPrio(unsigned prio) :
      boost::statechart::event< RequestBackfillPrio >(),
      priority(prio) {}
    void print(std::ostream *out) const {
      *out << "RequestBackfillPrio: priority " << priority;
    }
  };
#define TrivialEvent(T) struct T : boost::statechart::event< T > { \
    T() : boost::statechart::event< T >() {}                       \
    void print(std::ostream *out) const {                          \
      *out << #T;                                                  \
    }                                                              \
  };
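  // Illustrative note (added for clarity): TrivialEvent(Backfilled), for
  // example, expands to
  //
  //   struct Backfilled : boost::statechart::event< Backfilled > {
  //     Backfilled() : boost::statechart::event< Backfilled >() {}
  //     void print(std::ostream *out) const { *out << "Backfilled"; }
  //   };
  //
  // i.e. a payload-free statechart event whose print() emits its own name.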
1725 struct DeferBackfill : boost::statechart::event<DeferBackfill> {
1726 float delay;
1727 explicit DeferBackfill(float delay) : delay(delay) {}
1728 void print(std::ostream *out) const {
1729 *out << "DeferBackfill: delay " << delay;
1730 }
1731 };
1732 struct DeferRecovery : boost::statechart::event<DeferRecovery> {
1733 float delay;
1734 explicit DeferRecovery(float delay) : delay(delay) {}
1735 void print(std::ostream *out) const {
1736 *out << "DeferRecovery: delay " << delay;
1737 }
1738 };
1739 struct UnfoundBackfill : boost::statechart::event<UnfoundBackfill> {
1740 explicit UnfoundBackfill() {}
1741 void print(std::ostream *out) const {
1742 *out << "UnfoundBackfill";
1743 }
1744 };
1745 struct UnfoundRecovery : boost::statechart::event<UnfoundRecovery> {
1746 explicit UnfoundRecovery() {}
1747 void print(std::ostream *out) const {
1748 *out << "UnfoundRecovery";
1749 }
1750 };
1751 protected:
1752 TrivialEvent(Initialize)
1753 TrivialEvent(Load)
1754 TrivialEvent(GotInfo)
1755 TrivialEvent(NeedUpThru)
1756 TrivialEvent(NullEvt)
1757 TrivialEvent(FlushedEvt)
1758 TrivialEvent(Backfilled)
1759 TrivialEvent(LocalBackfillReserved)
1760 TrivialEvent(RemoteBackfillReserved)
1761 TrivialEvent(RejectRemoteReservation)
1762 TrivialEvent(RemoteReservationRejected)
1763 TrivialEvent(RemoteReservationCanceled)
1764 TrivialEvent(RequestBackfill)
1765 TrivialEvent(RequestRecovery)
1766 TrivialEvent(RecoveryDone)
1767 TrivialEvent(BackfillTooFull)
1768 TrivialEvent(RecoveryTooFull)
1769
1770 TrivialEvent(MakePrimary)
1771 TrivialEvent(MakeStray)
1772 TrivialEvent(NeedActingChange)
1773 TrivialEvent(IsIncomplete)
1774 TrivialEvent(IsDown)
1775
1776 TrivialEvent(AllReplicasRecovered)
1777 TrivialEvent(DoRecovery)
1778 TrivialEvent(LocalRecoveryReserved)
1779 TrivialEvent(RemoteRecoveryReserved)
1780 TrivialEvent(AllRemotesReserved)
1781 TrivialEvent(AllBackfillsReserved)
1782 TrivialEvent(GoClean)
1783
1784 TrivialEvent(AllReplicasActivated)
1785
1786 TrivialEvent(IntervalFlush)
1787
1788 /* Encapsulates PG recovery process */
1789 class RecoveryState {
1790 void start_handle(RecoveryCtx *new_ctx);
1791 void end_handle();
1792 public:
1793 void begin_block_outgoing();
1794 void end_block_outgoing();
1795 void clear_blocked_outgoing();
1796 private:
1797
1798 /* States */
1799 struct Initial;
1800 class RecoveryMachine : public boost::statechart::state_machine< RecoveryMachine, Initial > {
1801 RecoveryState *state;
1802 public:
1803 PG *pg;
1804
1805 utime_t event_time;
1806 uint64_t event_count;
1807
1808 void clear_event_counters() {
1809 event_time = utime_t();
1810 event_count = 0;
1811 }
1812
1813 void log_enter(const char *state_name);
1814 void log_exit(const char *state_name, utime_t duration);
1815
1816 RecoveryMachine(RecoveryState *state, PG *pg) : state(state), pg(pg), event_count(0) {}
1817
1818 /* Accessor functions for state methods */
1819 ObjectStore::Transaction* get_cur_transaction() {
1820 assert(state->rctx);
1821 assert(state->rctx->transaction);
1822 return state->rctx->transaction;
1823 }
1824
1825 void send_query(pg_shard_t to, const pg_query_t &query) {
1826 assert(state->rctx);
1827 assert(state->rctx->query_map);
1828 (*state->rctx->query_map)[to.osd][spg_t(pg->info.pgid.pgid, to.shard)] =
1829 query;
1830 }
1831
1832 map<int, map<spg_t, pg_query_t> > *get_query_map() {
1833 assert(state->rctx);
1834 assert(state->rctx->query_map);
1835 return state->rctx->query_map;
1836 }
1837
1838 map<int, vector<pair<pg_notify_t, PastIntervals> > > *get_info_map() {
1839 assert(state->rctx);
1840 assert(state->rctx->info_map);
1841 return state->rctx->info_map;
1842 }
1843
1844 list< Context* > *get_on_safe_context_list() {
1845 assert(state->rctx);
1846 assert(state->rctx->on_safe);
1847 return &(state->rctx->on_safe->contexts);
1848 }
1849
1850 list< Context * > *get_on_applied_context_list() {
1851 assert(state->rctx);
1852 assert(state->rctx->on_applied);
1853 return &(state->rctx->on_applied->contexts);
1854 }
1855
1856 RecoveryCtx *get_recovery_ctx() { return &*(state->rctx); }
1857
1858 void send_notify(pg_shard_t to,
1859 const pg_notify_t &info, const PastIntervals &pi) {
1860 assert(state->rctx);
1861 state->rctx->send_notify(to, info, pi);
1862 }
1863 };
1864 friend class RecoveryMachine;
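// A sketch of how a state reaction typically uses these accessors; the
// pg_query_t construction mirrors the GetInfo code in PG.cc, but treat the
// exact arguments (and SomeState/SomeEvt) as illustrative assumptions:
#if 0
boost::statechart::result SomeState::react(const SomeEvt&) {
  PG *pg = context< RecoveryMachine >().pg;
  pg_shard_t peer;  // in real code, chosen from pg->actingbackfill etc.
  // send_query() files the query under rctx->query_map[osd][spg_t]; the
  // caller that supplied the RecoveryCtx dispatches it after end_handle().
  context< RecoveryMachine >().send_query(
    peer,
    pg_query_t(pg_query_t::INFO, peer.shard, pg->pg_whoami.shard,
               pg->info.history, pg->get_osdmap()->get_epoch()));
  return discard_event();
}
#endif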
1865
1866 /* States */
1867 // Initial
1868 // Reset
1869 // Started
1870 //   Start
1871 //   Primary
1872 //     WaitActingChange
1873 //     Peering
1874 //       GetInfo
1875 //       GetLog
1876 //       GetMissing
1877 //       WaitUpThru
1878 //       Incomplete
1879 //     Active
1880 //       Activating
1881 //       Clean
1882 //       Recovered
1883 //       Backfilling
1884 //       WaitRemoteBackfillReserved
1885 //       WaitLocalBackfillReserved
1886 //       NotBackfilling
1887 //       NotRecovering
1888 //       Recovering
1889 //       WaitRemoteRecoveryReserved
1890 //       WaitLocalRecoveryReserved
1891 //   ReplicaActive
1892 //     RepNotRecovering
1893 //     RepRecovering
1894 //     RepWaitBackfillReserved
1895 //     RepWaitRecoveryReserved
1896 //   Stray
1897
1898 struct Crashed : boost::statechart::state< Crashed, RecoveryMachine >, NamedState {
1899 explicit Crashed(my_context ctx);
1900 };
1901
1902 struct Reset;
1903
1904 struct Initial : boost::statechart::state< Initial, RecoveryMachine >, NamedState {
1905 explicit Initial(my_context ctx);
1906 void exit();
1907
1908 typedef boost::mpl::list <
1909 boost::statechart::transition< Initialize, Reset >,
1910 boost::statechart::custom_reaction< Load >,
1911 boost::statechart::custom_reaction< NullEvt >,
1912 boost::statechart::transition< boost::statechart::event_base, Crashed >
1913 > reactions;
1914
1915 boost::statechart::result react(const Load&);
1916 boost::statechart::result react(const MNotifyRec&);
1917 boost::statechart::result react(const MInfoRec&);
1918 boost::statechart::result react(const MLogRec&);
1919 boost::statechart::result react(const boost::statechart::event_base&) {
1920 return discard_event();
1921 }
1922 };
1923
1924 struct Reset : boost::statechart::state< Reset, RecoveryMachine >, NamedState {
1925 explicit Reset(my_context ctx);
1926 void exit();
1927
1928 typedef boost::mpl::list <
1929 boost::statechart::custom_reaction< QueryState >,
1930 boost::statechart::custom_reaction< AdvMap >,
1931 boost::statechart::custom_reaction< ActMap >,
1932 boost::statechart::custom_reaction< NullEvt >,
1933 boost::statechart::custom_reaction< FlushedEvt >,
1934 boost::statechart::custom_reaction< IntervalFlush >,
1935 boost::statechart::transition< boost::statechart::event_base, Crashed >
1936 > reactions;
1937 boost::statechart::result react(const QueryState& q);
1938 boost::statechart::result react(const AdvMap&);
1939 boost::statechart::result react(const ActMap&);
1940 boost::statechart::result react(const FlushedEvt&);
1941 boost::statechart::result react(const IntervalFlush&);
1942 boost::statechart::result react(const boost::statechart::event_base&) {
1943 return discard_event();
1944 }
1945 };
1946
1947 struct Start;
1948
1949 struct Started : boost::statechart::state< Started, RecoveryMachine, Start >, NamedState {
1950 explicit Started(my_context ctx);
1951 void exit();
1952
1953 typedef boost::mpl::list <
1954 boost::statechart::custom_reaction< QueryState >,
1955 boost::statechart::custom_reaction< AdvMap >,
1956 boost::statechart::custom_reaction< NullEvt >,
1957 boost::statechart::custom_reaction< FlushedEvt >,
1958 boost::statechart::custom_reaction< IntervalFlush >,
1959 boost::statechart::transition< boost::statechart::event_base, Crashed >
1960 > reactions;
1961 boost::statechart::result react(const QueryState& q);
1962 boost::statechart::result react(const AdvMap&);
1963 boost::statechart::result react(const FlushedEvt&);
1964 boost::statechart::result react(const IntervalFlush&);
1965 boost::statechart::result react(const boost::statechart::event_base&) {
1966 return discard_event();
1967 }
1968 };
1969
1970 struct Primary;
1971 struct Stray;
1972
1973 struct Start : boost::statechart::state< Start, Started >, NamedState {
1974 explicit Start(my_context ctx);
1975 void exit();
1976
1977 typedef boost::mpl::list <
1978 boost::statechart::transition< MakePrimary, Primary >,
1979 boost::statechart::transition< MakeStray, Stray >
1980 > reactions;
1981 };
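// Start has no custom reactions: its constructor (in PG.cc) inspects the PG
// and immediately posts one of the two events above. The decision, roughly:
#if 0
// sketch of the Start::Start() body, simplified
if (pg->is_primary())
  post_event(MakePrimary());
else  // not primary, so become a stray
  post_event(MakeStray());
#endif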
1982
1983 struct Peering;
1984 struct WaitActingChange;
1985 struct Incomplete;
1986 struct Down;
1987
1988 struct Primary : boost::statechart::state< Primary, Started, Peering >, NamedState {
1989 explicit Primary(my_context ctx);
1990 void exit();
1991
1992 typedef boost::mpl::list <
1993 boost::statechart::custom_reaction< ActMap >,
1994 boost::statechart::custom_reaction< MNotifyRec >,
1995 boost::statechart::transition< NeedActingChange, WaitActingChange >
1996 > reactions;
1997 boost::statechart::result react(const ActMap&);
1998 boost::statechart::result react(const MNotifyRec&);
1999 };
2000
2001 struct WaitActingChange : boost::statechart::state< WaitActingChange, Primary>,
2002 NamedState {
2003 typedef boost::mpl::list <
2004 boost::statechart::custom_reaction< QueryState >,
2005 boost::statechart::custom_reaction< AdvMap >,
2006 boost::statechart::custom_reaction< MLogRec >,
2007 boost::statechart::custom_reaction< MInfoRec >,
2008 boost::statechart::custom_reaction< MNotifyRec >
2009 > reactions;
2010 explicit WaitActingChange(my_context ctx);
2011 boost::statechart::result react(const QueryState& q);
2012 boost::statechart::result react(const AdvMap&);
2013 boost::statechart::result react(const MLogRec&);
2014 boost::statechart::result react(const MInfoRec&);
2015 boost::statechart::result react(const MNotifyRec&);
2016 void exit();
2017 };
2018
2019 struct GetInfo;
2020 struct Active;
2021
2022 struct Peering : boost::statechart::state< Peering, Primary, GetInfo >, NamedState {
2023 PastIntervals::PriorSet prior_set;
2024 bool history_les_bound;  ///< set if we need osd_find_best_info_ignore_history_les
2025
2026 explicit Peering(my_context ctx);
2027 void exit();
2028
2029 typedef boost::mpl::list <
2030 boost::statechart::custom_reaction< QueryState >,
2031 boost::statechart::transition< Activate, Active >,
2032 boost::statechart::custom_reaction< AdvMap >
2033 > reactions;
2034 boost::statechart::result react(const QueryState& q);
2035 boost::statechart::result react(const AdvMap &advmap);
2036 };
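// transition<> reactions (e.g. < Activate, Active > above) fire with no user
// code, while custom_reaction<> routes to the matching react() overload,
// which may itself transit<>, forward_event(), or discard_event(). A
// hypothetical react() showing the three outcomes:
#if 0
boost::statechart::result Peering::react(const AdvMap &advmap) {
  if (/* the new map invalidates this peering interval */ false)
    return transit< Reset >();  // explicit transition from user code
  if (/* fully handled here */ false)
    return discard_event();     // stop propagation
  return forward_event();       // let the outer Primary/Started state react
}
#endif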
2037
2038 struct WaitLocalRecoveryReserved;
2039 struct Activating;
2040 struct Active : boost::statechart::state< Active, Primary, Activating >, NamedState {
2041 explicit Active(my_context ctx);
2042 void exit();
2043
2044 const set<pg_shard_t> remote_shards_to_reserve_recovery;
2045 const set<pg_shard_t> remote_shards_to_reserve_backfill;
2046 bool all_replicas_activated;
2047
2048 typedef boost::mpl::list <
2049 boost::statechart::custom_reaction< QueryState >,
2050 boost::statechart::custom_reaction< ActMap >,
2051 boost::statechart::custom_reaction< AdvMap >,
2052 boost::statechart::custom_reaction< MInfoRec >,
2053 boost::statechart::custom_reaction< MNotifyRec >,
2054 boost::statechart::custom_reaction< MLogRec >,
2055 boost::statechart::custom_reaction< Backfilled >,
2056 boost::statechart::custom_reaction< AllReplicasActivated >,
2057 boost::statechart::custom_reaction< DeferRecovery >,
2058 boost::statechart::custom_reaction< DeferBackfill >,
2059 boost::statechart::custom_reaction< UnfoundRecovery >,
2060 boost::statechart::custom_reaction< UnfoundBackfill >,
2061 boost::statechart::custom_reaction< DoRecovery>
2062 > reactions;
2063 boost::statechart::result react(const QueryState& q);
2064 boost::statechart::result react(const ActMap&);
2065 boost::statechart::result react(const AdvMap&);
2066 boost::statechart::result react(const MInfoRec& infoevt);
2067 boost::statechart::result react(const MNotifyRec& notevt);
2068 boost::statechart::result react(const MLogRec& logevt);
2069 boost::statechart::result react(const Backfilled&) {
2070 return discard_event();
2071 }
2072 boost::statechart::result react(const AllReplicasActivated&);
2073 boost::statechart::result react(const DeferRecovery& evt) {
2074 return discard_event();
2075 }
2076 boost::statechart::result react(const DeferBackfill& evt) {
2077 return discard_event();
2078 }
2079 boost::statechart::result react(const UnfoundRecovery& evt) {
2080 return discard_event();
2081 }
2082 boost::statechart::result react(const UnfoundBackfill& evt) {
2083 return discard_event();
2084 }
2085 boost::statechart::result react(const DoRecovery&) {
2086 return discard_event();
2087 }
2088 };
2089
2090 struct Clean : boost::statechart::state< Clean, Active >, NamedState {
2091 typedef boost::mpl::list<
2092 boost::statechart::transition< DoRecovery, WaitLocalRecoveryReserved >
2093 > reactions;
2094 explicit Clean(my_context ctx);
2095 void exit();
2096 };
2097
2098 struct Recovered : boost::statechart::state< Recovered, Active >, NamedState {
2099 typedef boost::mpl::list<
2100 boost::statechart::transition< GoClean, Clean >,
2101 boost::statechart::transition< DoRecovery, WaitLocalRecoveryReserved >,
2102 boost::statechart::custom_reaction< AllReplicasActivated >
2103 > reactions;
2104 explicit Recovered(my_context ctx);
2105 void exit();
2106 boost::statechart::result react(const AllReplicasActivated&) {
2107 post_event(GoClean());
2108 return forward_event();
2109 }
2110 };
2111
2112 struct Backfilling : boost::statechart::state< Backfilling, Active >, NamedState {
2113 typedef boost::mpl::list<
2114 boost::statechart::transition< Backfilled, Recovered >,
2115 boost::statechart::custom_reaction< DeferBackfill >,
2116 boost::statechart::custom_reaction< UnfoundBackfill >,
2117 boost::statechart::custom_reaction< RemoteReservationRejected >
2118 > reactions;
2119 explicit Backfilling(my_context ctx);
2120 boost::statechart::result react(const RemoteReservationRejected& evt);
2121 boost::statechart::result react(const DeferBackfill& evt);
2122 boost::statechart::result react(const UnfoundBackfill& evt);
2123 void exit();
2124 };
2125
2126 struct WaitRemoteBackfillReserved : boost::statechart::state< WaitRemoteBackfillReserved, Active >, NamedState {
2127 typedef boost::mpl::list<
2128 boost::statechart::custom_reaction< RemoteBackfillReserved >,
2129 boost::statechart::custom_reaction< RemoteReservationRejected >,
2130 boost::statechart::transition< AllBackfillsReserved, Backfilling >
2131 > reactions;
2132 set<pg_shard_t>::const_iterator backfill_osd_it;
2133 explicit WaitRemoteBackfillReserved(my_context ctx);
2134 void exit();
2135 boost::statechart::result react(const RemoteBackfillReserved& evt);
2136 boost::statechart::result react(const RemoteReservationRejected& evt);
2137 };
2138
2139 struct WaitLocalBackfillReserved : boost::statechart::state< WaitLocalBackfillReserved, Active >, NamedState {
2140 typedef boost::mpl::list<
2141 boost::statechart::transition< LocalBackfillReserved, WaitRemoteBackfillReserved >
2142 > reactions;
2143 explicit WaitLocalBackfillReserved(my_context ctx);
2144 void exit();
2145 };
2146
2147 struct NotBackfilling : boost::statechart::state< NotBackfilling, Active>, NamedState {
2148 typedef boost::mpl::list<
2149 boost::statechart::transition< RequestBackfill, WaitLocalBackfillReserved>,
2150 boost::statechart::custom_reaction< RemoteBackfillReserved >,
2151 boost::statechart::custom_reaction< RemoteReservationRejected >
2152 > reactions;
2153 explicit NotBackfilling(my_context ctx);
2154 void exit();
2155 boost::statechart::result react(const RemoteBackfillReserved& evt);
2156 boost::statechart::result react(const RemoteReservationRejected& evt);
2157 };
2158
2159 struct NotRecovering : boost::statechart::state< NotRecovering, Active>, NamedState {
2160 typedef boost::mpl::list<
2161 boost::statechart::transition< DoRecovery, WaitLocalRecoveryReserved >,
2162 boost::statechart::custom_reaction< DeferRecovery >,
2163 boost::statechart::custom_reaction< UnfoundRecovery >
2164 > reactions;
2165 explicit NotRecovering(my_context ctx);
2166 boost::statechart::result react(const DeferRecovery& evt) {
2167 /* no-op */
2168 return discard_event();
2169 }
2170 boost::statechart::result react(const UnfoundRecovery& evt) {
2171 /* no-op */
2172 return discard_event();
2173 }
2174 void exit();
2175 };
2176
2177 struct RepNotRecovering;
2178 struct ReplicaActive : boost::statechart::state< ReplicaActive, Started, RepNotRecovering >, NamedState {
2179 explicit ReplicaActive(my_context ctx);
2180 void exit();
2181
2182 typedef boost::mpl::list <
2183 boost::statechart::custom_reaction< QueryState >,
2184 boost::statechart::custom_reaction< ActMap >,
2185 boost::statechart::custom_reaction< MQuery >,
2186 boost::statechart::custom_reaction< MInfoRec >,
2187 boost::statechart::custom_reaction< MLogRec >,
2188 boost::statechart::custom_reaction< Activate >,
2189 boost::statechart::custom_reaction< DeferRecovery >,
2190 boost::statechart::custom_reaction< DeferBackfill >,
2191 boost::statechart::custom_reaction< UnfoundRecovery >,
2192 boost::statechart::custom_reaction< UnfoundBackfill >
2193 > reactions;
2194 boost::statechart::result react(const QueryState& q);
2195 boost::statechart::result react(const MInfoRec& infoevt);
2196 boost::statechart::result react(const MLogRec& logevt);
2197 boost::statechart::result react(const ActMap&);
2198 boost::statechart::result react(const MQuery&);
2199 boost::statechart::result react(const Activate&);
2200 boost::statechart::result react(const DeferRecovery& evt) {
2201 return discard_event();
2202 }
2203 boost::statechart::result react(const DeferBackfill& evt) {
2204 return discard_event();
2205 }
2206 boost::statechart::result react(const UnfoundRecovery& evt) {
2207 return discard_event();
2208 }
2209 boost::statechart::result react(const UnfoundBackfill& evt) {
2210 return discard_event();
2211 }
2212 };
2213
2214 struct RepRecovering : boost::statechart::state< RepRecovering, ReplicaActive >, NamedState {
2215 typedef boost::mpl::list<
2216 boost::statechart::transition< RecoveryDone, RepNotRecovering >,
2217 // for compat with old peers
2218 boost::statechart::transition< RemoteReservationRejected, RepNotRecovering >,
2219 boost::statechart::transition< RemoteReservationCanceled, RepNotRecovering >,
2220 boost::statechart::custom_reaction< BackfillTooFull >
2221 > reactions;
2222 explicit RepRecovering(my_context ctx);
2223 boost::statechart::result react(const BackfillTooFull &evt);
2224 void exit();
2225 };
2226
2227 struct RepWaitBackfillReserved : boost::statechart::state< RepWaitBackfillReserved, ReplicaActive >, NamedState {
2228 typedef boost::mpl::list<
2229 boost::statechart::custom_reaction< RemoteBackfillReserved >,
2230 boost::statechart::custom_reaction< RejectRemoteReservation >,
2231 boost::statechart::custom_reaction< RemoteReservationRejected >,
2232 boost::statechart::custom_reaction< RemoteReservationCanceled >
2233 > reactions;
2234 explicit RepWaitBackfillReserved(my_context ctx);
2235 void exit();
2236 boost::statechart::result react(const RemoteBackfillReserved &evt);
2237 boost::statechart::result react(const RejectRemoteReservation &evt);
2238 boost::statechart::result react(const RemoteReservationRejected &evt);
2239 boost::statechart::result react(const RemoteReservationCanceled &evt);
2240 };
2241
2242 struct RepWaitRecoveryReserved : boost::statechart::state< RepWaitRecoveryReserved, ReplicaActive >, NamedState {
2243 typedef boost::mpl::list<
2244 boost::statechart::custom_reaction< RemoteRecoveryReserved >,
2245 // for compat with old peers
2246 boost::statechart::custom_reaction< RemoteReservationRejected >,
2247 boost::statechart::custom_reaction< RemoteReservationCanceled >
2248 > reactions;
2249 explicit RepWaitRecoveryReserved(my_context ctx);
2250 void exit();
2251 boost::statechart::result react(const RemoteRecoveryReserved &evt);
2252 boost::statechart::result react(const RemoteReservationRejected &evt) {
2253 // for compat with old peers
2254 post_event(RemoteReservationCanceled());
2255 return discard_event();
2256 }
2257 boost::statechart::result react(const RemoteReservationCanceled &evt);
2258 };
2259
2260 struct RepNotRecovering : boost::statechart::state< RepNotRecovering, ReplicaActive>, NamedState {
2261 typedef boost::mpl::list<
2262 boost::statechart::custom_reaction< RequestBackfillPrio >,
2263 boost::statechart::transition< RequestRecovery, RepWaitRecoveryReserved >,
2264 boost::statechart::custom_reaction< RejectRemoteReservation >,
2265 boost::statechart::transition< RemoteReservationRejected, RepNotRecovering >,
2266 boost::statechart::transition< RemoteReservationCanceled, RepNotRecovering >,
2267 boost::statechart::transition< RecoveryDone, RepNotRecovering > // for compat with pre-reservation peers
2268 > reactions;
2269 explicit RepNotRecovering(my_context ctx);
2270 boost::statechart::result react(const RequestBackfillPrio &evt);
2271 boost::statechart::result react(const RejectRemoteReservation &evt);
2272 void exit();
2273 };
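// Replica-side reservation flow encoded by the Rep* states above (the
// admission logic itself lives in the react() bodies in PG.cc):
//
//   RepNotRecovering --RequestBackfillPrio--> RepWaitBackfillReserved
//   RepNotRecovering --RequestRecovery------> RepWaitRecoveryReserved
//   RepWait*Reserved --Remote*Reserved------> RepRecovering
//   RepRecovering ----RecoveryDone----------> RepNotRecovering
//
// A hypothetical admission check, only to show how a custom_reaction here
// can either accept or reject a reservation (not the actual PG.cc logic):
#if 0
boost::statechart::result
RepNotRecovering::react(const RequestBackfillPrio &evt) {
  if (/* local OSD too full to accept backfill */ false) {
    post_event(RejectRemoteReservation());
    return discard_event();
  }
  return transit< RepWaitBackfillReserved >();
}
#endif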
2274
2275 struct Recovering : boost::statechart::state< Recovering, Active >, NamedState {
2276 typedef boost::mpl::list <
2277 boost::statechart::custom_reaction< AllReplicasRecovered >,
2278 boost::statechart::custom_reaction< DeferRecovery >,
2279 boost::statechart::custom_reaction< UnfoundRecovery >,
2280 boost::statechart::custom_reaction< RequestBackfill >
2281 > reactions;
2282 explicit Recovering(my_context ctx);
2283 void exit();
2284 void release_reservations(bool cancel = false);
2285 boost::statechart::result react(const AllReplicasRecovered &evt);
2286 boost::statechart::result react(const DeferRecovery& evt);
2287 boost::statechart::result react(const UnfoundRecovery& evt);
2288 boost::statechart::result react(const RequestBackfill &evt);
2289 };
2290
2291 struct WaitRemoteRecoveryReserved : boost::statechart::state< WaitRemoteRecoveryReserved, Active >, NamedState {
2292 typedef boost::mpl::list <
2293 boost::statechart::custom_reaction< RemoteRecoveryReserved >,
2294 boost::statechart::transition< AllRemotesReserved, Recovering >
2295 > reactions;
2296 set<pg_shard_t>::const_iterator remote_recovery_reservation_it;
2297 explicit WaitRemoteRecoveryReserved(my_context ctx);
2298 boost::statechart::result react(const RemoteRecoveryReserved &evt);
2299 void exit();
2300 };
2301
2302 struct WaitLocalRecoveryReserved : boost::statechart::state< WaitLocalRecoveryReserved, Active >, NamedState {
2303 typedef boost::mpl::list <
2304 boost::statechart::transition< LocalRecoveryReserved, WaitRemoteRecoveryReserved >,
2305 boost::statechart::custom_reaction< RecoveryTooFull >
2306 > reactions;
2307 explicit WaitLocalRecoveryReserved(my_context ctx);
2308 void exit();
2309 boost::statechart::result react(const RecoveryTooFull &evt);
2310 };
2311
2312 struct Activating : boost::statechart::state< Activating, Active >, NamedState {
2313 typedef boost::mpl::list <
2314 boost::statechart::transition< AllReplicasRecovered, Recovered >,
2315 boost::statechart::transition< DoRecovery, WaitLocalRecoveryReserved >,
2316 boost::statechart::transition< RequestBackfill, WaitLocalBackfillReserved >
2317 > reactions;
2318 explicit Activating(my_context ctx);
2319 void exit();
2320 };
2321
2322 struct Stray : boost::statechart::state< Stray, Started >, NamedState {
2323 map<int, pair<pg_query_t, epoch_t> > pending_queries;
2324
2325 explicit Stray(my_context ctx);
2326 void exit();
2327
2328 typedef boost::mpl::list <
2329 boost::statechart::custom_reaction< MQuery >,
2330 boost::statechart::custom_reaction< MLogRec >,
2331 boost::statechart::custom_reaction< MInfoRec >,
2332 boost::statechart::custom_reaction< ActMap >,
2333 boost::statechart::custom_reaction< RecoveryDone >
2334 > reactions;
2335 boost::statechart::result react(const MQuery& query);
2336 boost::statechart::result react(const MLogRec& logevt);
2337 boost::statechart::result react(const MInfoRec& infoevt);
2338 boost::statechart::result react(const ActMap&);
2339 boost::statechart::result react(const RecoveryDone&) {
2340 return discard_event();
2341 }
2342 };
2343
2344 struct GetLog;
2345
2346 struct GetInfo : boost::statechart::state< GetInfo, Peering >, NamedState {
2347 set<pg_shard_t> peer_info_requested;
2348
2349 explicit GetInfo(my_context ctx);
2350 void exit();
2351 void get_infos();
2352
2353 typedef boost::mpl::list <
2354 boost::statechart::custom_reaction< QueryState >,
2355 boost::statechart::transition< GotInfo, GetLog >,
2356 boost::statechart::custom_reaction< MNotifyRec >,
2357 boost::statechart::transition< IsDown, Down >
2358 > reactions;
2359 boost::statechart::result react(const QueryState& q);
2360 boost::statechart::result react(const MNotifyRec& infoevt);
2361 };
2362
2363 struct GotLog : boost::statechart::event< GotLog > {
2364 GotLog() : boost::statechart::event< GotLog >() {}
2365 };
2366
2367 struct GetLog : boost::statechart::state< GetLog, Peering >, NamedState {
2368 pg_shard_t auth_log_shard;
2369 boost::intrusive_ptr<MOSDPGLog> msg;
2370
2371 explicit GetLog(my_context ctx);
2372 void exit();
2373
2374 typedef boost::mpl::list <
2375 boost::statechart::custom_reaction< QueryState >,
2376 boost::statechart::custom_reaction< MLogRec >,
2377 boost::statechart::custom_reaction< GotLog >,
2378 boost::statechart::custom_reaction< AdvMap >,
2379 boost::statechart::transition< IsIncomplete, Incomplete >
2380 > reactions;
2381 boost::statechart::result react(const AdvMap&);
2382 boost::statechart::result react(const QueryState& q);
2383 boost::statechart::result react(const MLogRec& logevt);
2384 boost::statechart::result react(const GotLog&);
2385 };
2386
2387 struct WaitUpThru;
2388
2389 struct GetMissing : boost::statechart::state< GetMissing, Peering >, NamedState {
2390 set<pg_shard_t> peer_missing_requested;
2391
2392 explicit GetMissing(my_context ctx);
2393 void exit();
2394
2395 typedef boost::mpl::list <
2396 boost::statechart::custom_reaction< QueryState >,
2397 boost::statechart::custom_reaction< MLogRec >,
2398 boost::statechart::transition< NeedUpThru, WaitUpThru >
2399 > reactions;
2400 boost::statechart::result react(const QueryState& q);
2401 boost::statechart::result react(const MLogRec& logevt);
2402 };
2403
2404 struct WaitUpThru : boost::statechart::state< WaitUpThru, Peering >, NamedState {
2405 explicit WaitUpThru(my_context ctx);
2406 void exit();
2407
2408 typedef boost::mpl::list <
2409 boost::statechart::custom_reaction< QueryState >,
2410 boost::statechart::custom_reaction< ActMap >,
2411 boost::statechart::custom_reaction< MLogRec >
2412 > reactions;
2413 boost::statechart::result react(const QueryState& q);
2414 boost::statechart::result react(const ActMap& am);
2415 boost::statechart::result react(const MLogRec& logrec);
2416 };
2417
2418 struct Down : boost::statechart::state< Down, Peering>, NamedState {
2419 explicit Down(my_context ctx);
2420 typedef boost::mpl::list <
2421 boost::statechart::custom_reaction< QueryState >
2422 > reactions;
2423 boost::statechart::result react(const QueryState& infoevt);
2424 void exit();
2425 };
2426
2427 struct Incomplete : boost::statechart::state< Incomplete, Peering>, NamedState {
2428 typedef boost::mpl::list <
2429 boost::statechart::custom_reaction< AdvMap >,
2430 boost::statechart::custom_reaction< MNotifyRec >,
2431 boost::statechart::custom_reaction< QueryState >
2432 > reactions;
2433 explicit Incomplete(my_context ctx);
2434 boost::statechart::result react(const AdvMap &advmap);
2435 boost::statechart::result react(const MNotifyRec& infoevt);
2436 boost::statechart::result react(const QueryState& infoevt);
2437 void exit();
2438 };
2439
2440
2441 RecoveryMachine machine;
2442 PG *pg;
2443
2444 /// context passed in by state machine caller
2445 RecoveryCtx *orig_ctx;
2446
2447 /// populated if we are buffering messages pending a flush
2448 boost::optional<BufferedRecoveryMessages> messages_pending_flush;
2449
2450 /**
2451 * Populated between start_handle() and end_handle(). While outgoing
2452 * messages are blocked, this points into the message lists of
2453 * messages_pending_flush; otherwise it points into orig_ctx.
2454 */
2455 boost::optional<RecoveryCtx> rctx;
2456
2457 public:
2458 explicit RecoveryState(PG *pg)
2459 : machine(this, pg), pg(pg), orig_ctx(0) {
2460 machine.initiate();
2461 }
2462
2463 void handle_event(const boost::statechart::event_base &evt,
2464 RecoveryCtx *rctx) {
2465 start_handle(rctx);
2466 machine.process_event(evt);
2467 end_handle();
2468 }
2469
2470 void handle_event(CephPeeringEvtRef evt,
2471 RecoveryCtx *rctx) {
2472 start_handle(rctx);
2473 machine.process_event(evt->get_event());
2474 end_handle();
2475 }
2476
2477 } recovery_state;
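// How a caller drives the machine: every delivery is wrapped in a
// RecoveryCtx so that queries/notifies/transactions accumulated by the
// reactions can be dispatched afterwards. Sketch; create_context() stands in
// for however the caller builds its RecoveryCtx (the OSD keeps such a helper
// in OSD.cc):
#if 0
void example_drive(PG *pg) {
  RecoveryCtx rctx = create_context();  // assumed helper
  pg->recovery_state.handle_event(NullEvt(), &rctx);
  // ...then dispatch rctx's query_map/info_map/notify contents and queue its
  // transaction, as the surrounding OSD code normally does.
}
#endif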
2478
2479
2480 public:
2481 PG(OSDService *o, OSDMapRef curmap,
2482 const PGPool &pool, spg_t p);
2483 ~PG() override;
2484
2485 private:
2486 // Prevent copying
2487 explicit PG(const PG& rhs);
2488 PG& operator=(const PG& rhs);
2489 const spg_t pg_id;
2490 uint64_t peer_features;
2491 uint64_t acting_features;
2492 uint64_t upacting_features;
2493
2494 epoch_t last_epoch;
2495
2496 public:
2497 const spg_t& get_pgid() const { return pg_id; }
2498
2499 void reset_min_peer_features() {
2500 peer_features = CEPH_FEATURES_SUPPORTED_DEFAULT;
2501 }
2502 uint64_t get_min_peer_features() const { return peer_features; }
2503 void apply_peer_features(uint64_t f) { peer_features &= f; }
2504
2505 uint64_t get_min_acting_features() const { return acting_features; }
2506 uint64_t get_min_upacting_features() const { return upacting_features; }
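// The feature masks shrink monotonically: reset_min_peer_features() starts
// from everything supported, and each apply_peer_features() can only clear
// bits. Sketch (the peer feature values are hypothetical inputs):
#if 0
void example_feature_intersection(PG &pg,
                                  uint64_t peer1_features,
                                  uint64_t peer2_features) {
  pg.reset_min_peer_features();
  pg.apply_peer_features(peer1_features);
  pg.apply_peer_features(peer2_features);
  // get_min_peer_features() now equals
  // CEPH_FEATURES_SUPPORTED_DEFAULT & peer1_features & peer2_features.
  uint64_t common = pg.get_min_peer_features();
  (void)common;
}
#endif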
2507 bool perform_deletes_during_peering() const {
2508 return !(get_osdmap()->test_flag(CEPH_OSDMAP_RECOVERY_DELETES));
2509 }
2510
2511 bool hard_limit_pglog() const {
2512 return (get_osdmap()->test_flag(CEPH_OSDMAP_PGLOG_HARDLIMIT));
2513 }
2514
2515 void init_primary_up_acting(
2516 const vector<int> &newup,
2517 const vector<int> &newacting,
2518 int new_up_primary,
2519 int new_acting_primary) {
2520 actingset.clear();
2521 acting = newacting;
2522 for (uint8_t i = 0; i < acting.size(); ++i) {
2523 if (acting[i] != CRUSH_ITEM_NONE)
2524 actingset.insert(
2525 pg_shard_t(
2526 acting[i],
2527 pool.info.ec_pool() ? shard_id_t(i) : shard_id_t::NO_SHARD));
2528 }
2529 upset.clear();
2530 up = newup;
2531 for (uint8_t i = 0; i < up.size(); ++i) {
2532 if (up[i] != CRUSH_ITEM_NONE)
2533 upset.insert(
2534 pg_shard_t(
2535 up[i],
2536 pool.info.ec_pool() ? shard_id_t(i) : shard_id_t::NO_SHARD));
2537 }
2538 if (!pool.info.ec_pool()) {
2539 up_primary = pg_shard_t(new_up_primary, shard_id_t::NO_SHARD);
2540 primary = pg_shard_t(new_acting_primary, shard_id_t::NO_SHARD);
2541 return;
2542 }
2543 up_primary = pg_shard_t();
2544 primary = pg_shard_t();
2545 for (uint8_t i = 0; i < up.size(); ++i) {
2546 if (up[i] == new_up_primary) {
2547 up_primary = pg_shard_t(up[i], shard_id_t(i));
2548 break;
2549 }
2550 }
2551 for (uint8_t i = 0; i < acting.size(); ++i) {
2552 if (acting[i] == new_acting_primary) {
2553 primary = pg_shard_t(acting[i], shard_id_t(i));
2554 break;
2555 }
2556 }
2557 assert(up_primary.osd == new_up_primary);
2558 assert(primary.osd == new_acting_primary);
2559 }
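// Worked example (hypothetical OSD ids): for an EC pool with
// newacting = {3, CRUSH_ITEM_NONE, 7} and new_acting_primary = 3, the loops
// above yield actingset = { pg_shard_t(3, shard_id_t(0)),
// pg_shard_t(7, shard_id_t(2)) }: the NONE slot contributes nothing, and a
// slot's index becomes its shard id. primary becomes pg_shard_t(3,
// shard_id_t(0)). For a replicated pool every shard is shard_id_t::NO_SHARD.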
2560 pg_shard_t get_primary() const { return primary; }
2561
2562 int get_role() const { return role; }
2563 void set_role(int r) { role = r; }
2564
2565 bool is_primary() const { return pg_whoami == primary; }
2566 bool is_replica() const { return role > 0; }
2567
2568 epoch_t get_last_peering_reset() const { return last_peering_reset; }
2569
2571 bool state_test(int m) const { return (state & m) != 0; }
2572 void state_set(int m) { state |= m; }
2573 void state_clear(int m) { state &= ~m; }
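// PG_STATE_* values are bit flags, so these helpers compose; state_test()
// answers "is any of the given bits set?". Sketch:
#if 0
void example_state_bits(PG &pg) {
  pg.state_set(PG_STATE_ACTIVE | PG_STATE_DEGRADED);
  bool any = pg.state_test(PG_STATE_ACTIVE | PG_STATE_PEERED); // true: ACTIVE is set
  pg.state_clear(PG_STATE_DEGRADED);
  (void)any;
}
#endif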
2574
2575 bool is_complete() const { return info.last_complete == info.last_update; }
2576 bool should_send_notify() const { return send_notify; }
2577
2578 int get_state() const { return state; }
2579 bool is_active() const { return state_test(PG_STATE_ACTIVE); }
2580 bool is_activating() const { return state_test(PG_STATE_ACTIVATING); }
2581 bool is_peering() const { return state_test(PG_STATE_PEERING); }
2582 bool is_down() const { return state_test(PG_STATE_DOWN); }
2583 bool is_recovery_unfound() const { return state_test(PG_STATE_RECOVERY_UNFOUND); }
2584 bool is_backfill_unfound() const { return state_test(PG_STATE_BACKFILL_UNFOUND); }
2585 bool is_incomplete() const { return state_test(PG_STATE_INCOMPLETE); }
2586 bool is_clean() const { return state_test(PG_STATE_CLEAN); }
2587 bool is_degraded() const { return state_test(PG_STATE_DEGRADED); }
2588 bool is_undersized() const { return state_test(PG_STATE_UNDERSIZED); }
2589
2590 bool is_scrubbing() const { return state_test(PG_STATE_SCRUBBING); }
2591 bool is_remapped() const { return state_test(PG_STATE_REMAPPED); }
2592 bool is_peered() const {
2593 return state_test(PG_STATE_ACTIVE) || state_test(PG_STATE_PEERED);
2594 }
2595 bool is_recovering() const { return state_test(PG_STATE_RECOVERING); }
2596
2597 bool is_empty() const { return info.last_update == eversion_t(0,0); }
2598
2599 void init(
2600 int role,
2601 const vector<int>& up,
2602 int up_primary,
2603 const vector<int>& acting,
2604 int acting_primary,
2605 const pg_history_t& history,
2606 const PastIntervals& pim,
2607 bool backfill,
2608 ObjectStore::Transaction *t);
2609
2610 // pg on-disk state
2611 void do_pending_flush();
2612
2613 static void _create(ObjectStore::Transaction& t, spg_t pgid, int bits);
2614 static void _init(ObjectStore::Transaction& t,
2615 spg_t pgid, const pg_pool_t *pool);
2616
2617 private:
2618 void prepare_write_info(map<string,bufferlist> *km);
2619
2620 void update_store_with_options();
2621 void update_store_on_load();
2622
2623 public:
2624 static int _prepare_write_info(
2625 CephContext* cct,
2626 map<string,bufferlist> *km,
2627 epoch_t epoch,
2628 pg_info_t &info,
2629 pg_info_t &last_written_info,
2630 PastIntervals &past_intervals,
2631 bool dirty_big_info,
2632 bool dirty_epoch,
2633 bool try_fast_info,
2634 PerfCounters *logger = nullptr);
2635 void write_if_dirty(ObjectStore::Transaction& t);
2636
2637 PGLog::IndexedLog projected_log;
2638 bool check_in_progress_op(
2639 const osd_reqid_t &r,
2640 eversion_t *version,
2641 version_t *user_version,
2642 int *return_code) const;
2643 eversion_t projected_last_update;
2644 eversion_t get_next_version() const {
2645 eversion_t at_version(
2646 get_osdmap()->get_epoch(),
2647 projected_last_update.version+1);
2648 assert(at_version > info.last_update);
2649 assert(at_version > pg_log.get_head());
2650 assert(at_version > projected_last_update);
2651 return at_version;
2652 }
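// eversion_t orders lexicographically by (epoch, version), which is why the
// asserts above guarantee strictly increasing versions. Hypothetical values:
#if 0
eversion_t a(5, 10), b(5, 11), c(6, 1);
assert(a < b && b < c);  // epoch dominates, then version
#endif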
2653
2654 void add_log_entry(const pg_log_entry_t& e, bool applied);
2655 void append_log(
2656 const vector<pg_log_entry_t>& logv,
2657 eversion_t trim_to,
2658 eversion_t roll_forward_to,
2659 ObjectStore::Transaction &t,
2660 bool transaction_applied = true);
2661 bool check_log_for_corruption(ObjectStore *store);
2662
2663 std::string get_corrupt_pg_log_name() const;
2664 static int read_info(
2665 ObjectStore *store, spg_t pgid, const coll_t &coll,
2666 bufferlist &bl, pg_info_t &info, PastIntervals &past_intervals,
2667 __u8 &);
2668 void read_state(ObjectStore *store, bufferlist &bl);
2669 static bool _has_removal_flag(ObjectStore *store, spg_t pgid);
2670 static int peek_map_epoch(ObjectStore *store, spg_t pgid,
2671 epoch_t *pepoch, bufferlist *bl);
2672 void update_snap_map(
2673 const vector<pg_log_entry_t> &log_entries,
2674 ObjectStore::Transaction& t);
2675
2676 void filter_snapc(vector<snapid_t> &snaps);
2677
2678 void log_weirdness();
2679
2680 virtual void kick_snap_trim() = 0;
2681 virtual void snap_trimmer_scrub_complete() = 0;
2682 bool requeue_scrub(bool high_priority = false);
2683 void queue_recovery();
2684 bool queue_scrub();
2685 unsigned get_scrub_priority();
2686
2687 /// share pg info after a pg is active
2688 void share_pg_info();
2689
2690
2691 bool append_log_entries_update_missing(
2692 const mempool::osd_pglog::list<pg_log_entry_t> &entries,
2693 ObjectStore::Transaction &t,
2694 boost::optional<eversion_t> trim_to,
2695 boost::optional<eversion_t> roll_forward_to);
2696
2697 /**
2698 * Merge the new log entries, updating the missing sets as necessary on
2699 * all actingbackfill logs and missing sets (including missing_loc)
2700 */
2701 void merge_new_log_entries(
2702 const mempool::osd_pglog::list<pg_log_entry_t> &entries,
2703 ObjectStore::Transaction &t,
2704 boost::optional<eversion_t> trim_to,
2705 boost::optional<eversion_t> roll_forward_to);
2706
2707 void reset_interval_flush();
2708 void start_peering_interval(
2709 const OSDMapRef lastmap,
2710 const vector<int>& newup, int up_primary,
2711 const vector<int>& newacting, int acting_primary,
2712 ObjectStore::Transaction *t);
2713 void on_new_interval();
2714 virtual void _on_new_interval() = 0;
2715 void start_flush(ObjectStore::Transaction *t,
2716 list<Context *> *on_applied,
2717 list<Context *> *on_safe);
2718 void set_last_peering_reset();
2719 bool pg_has_reset_since(epoch_t e) {
2720 assert(is_locked());
2721 return deleting || e < get_last_peering_reset();
2722 }
2723
2724 void update_history(const pg_history_t& history);
2725 void fulfill_info(pg_shard_t from, const pg_query_t &query,
2726 pair<pg_shard_t, pg_info_t> &notify_info);
2727 void fulfill_log(pg_shard_t from, const pg_query_t &query, epoch_t query_epoch);
2728 void fulfill_query(const MQuery& q, RecoveryCtx *rctx);
2729 void check_full_transition(OSDMapRef lastmap, OSDMapRef osdmap);
2730
2731 bool should_restart_peering(
2732 int newupprimary,
2733 int newactingprimary,
2734 const vector<int>& newup,
2735 const vector<int>& newacting,
2736 OSDMapRef lastmap,
2737 OSDMapRef osdmap);
2738
2739 // OpRequest queueing
2740 bool can_discard_op(OpRequestRef& op);
2741 bool can_discard_scan(OpRequestRef op);
2742 bool can_discard_backfill(OpRequestRef op);
2743 bool can_discard_request(OpRequestRef& op);
2744
2745 template<typename T, int MSGTYPE>
2746 bool can_discard_replica_op(OpRequestRef& op);
2747
2748 bool old_peering_msg(epoch_t reply_epoch, epoch_t query_epoch);
2749 bool old_peering_evt(CephPeeringEvtRef evt) {
2750 return old_peering_msg(evt->get_epoch_sent(), evt->get_epoch_requested());
2751 }
2752 static bool have_same_or_newer_map(epoch_t cur_epoch, epoch_t e) {
2753 return e <= cur_epoch;
2754 }
2755 bool have_same_or_newer_map(epoch_t e) {
2756 return e <= get_osdmap()->get_epoch();
2757 }
2758
2759 bool op_has_sufficient_caps(OpRequestRef& op);
2760
2761
2762 // recovery bits
2763 void take_waiters();
2764 void queue_peering_event(CephPeeringEvtRef evt);
2765 void handle_peering_event(CephPeeringEvtRef evt, RecoveryCtx *rctx);
2766 void queue_query(epoch_t msg_epoch, epoch_t query_epoch,
2767 pg_shard_t from, const pg_query_t& q);
2768 void queue_null(epoch_t msg_epoch, epoch_t query_epoch);
2769 void queue_flushed(epoch_t started_at);
2770 void handle_advance_map(
2771 OSDMapRef osdmap, OSDMapRef lastmap,
2772 vector<int>& newup, int up_primary,
2773 vector<int>& newacting, int acting_primary,
2774 RecoveryCtx *rctx);
2775 void handle_activate_map(RecoveryCtx *rctx);
2776 void handle_create(RecoveryCtx *rctx);
2777 void handle_loaded(RecoveryCtx *rctx);
2778 void handle_query_state(Formatter *f);
2779
2780 virtual void on_removal(ObjectStore::Transaction *t) = 0;
2781
2782
2783 // abstract bits
2784 virtual void do_request(
2785 OpRequestRef& op,
2786 ThreadPool::TPHandle &handle
2787 ) = 0;
2788
2789 virtual void do_op(OpRequestRef& op) = 0;
2790 virtual void do_sub_op(OpRequestRef op) = 0;
2791 virtual void do_sub_op_reply(OpRequestRef op) = 0;
2792 virtual void do_scan(
2793 OpRequestRef op,
2794 ThreadPool::TPHandle &handle
2795 ) = 0;
2796 virtual void do_backfill(OpRequestRef op) = 0;
2797 virtual void snap_trimmer(epoch_t epoch_queued) = 0;
2798
2799 virtual int do_command(
2800 cmdmap_t cmdmap,
2801 ostream& ss,
2802 bufferlist& idata,
2803 bufferlist& odata,
2804 ConnectionRef conn,
2805 ceph_tid_t tid) = 0;
2806
2807 virtual void on_role_change() = 0;
2808 virtual void on_pool_change() = 0;
2809 virtual void on_change(ObjectStore::Transaction *t) = 0;
2810 virtual void on_activate() = 0;
2811 virtual void on_flushed() = 0;
2812 virtual void on_shutdown() = 0;
2813 virtual void check_blacklisted_watchers() = 0;
2814 virtual void get_watchers(std::list<obj_watch_item_t>&) = 0;
2815
2816 virtual bool agent_work(int max) = 0;
2817 virtual bool agent_work(int max, int agent_flush_quota) = 0;
2818 virtual void agent_stop() = 0;
2819 virtual void agent_delay() = 0;
2820 virtual void agent_clear() = 0;
2821 virtual void agent_choose_mode_restart() = 0;
2822 };
2823
2824 ostream& operator<<(ostream& out, const PG& pg);
2825
2826 ostream& operator<<(ostream& out, const PG::BackfillInterval& bi);
2827
2828 #endif