// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation. See file COPYING.
 *
 */

#ifndef CEPH_PG_H
#define CEPH_PG_H

#include <boost/statechart/custom_reaction.hpp>
#include <boost/statechart/event.hpp>
#include <boost/statechart/simple_state.hpp>
#include <boost/statechart/state.hpp>
#include <boost/statechart/state_machine.hpp>
#include <boost/statechart/transition.hpp>
#include <boost/statechart/event_base.hpp>
#include <boost/scoped_ptr.hpp>
#include <boost/circular_buffer.hpp>
#include <boost/container/flat_set.hpp>
#include "include/memory.h"
#include "include/mempool.h"

// re-include our assert to clobber boost's
#include "include/assert.h"

#include "include/types.h"
#include "include/stringify.h"
#include "osd_types.h"
#include "include/xlist.h"
#include "SnapMapper.h"
#include "Session.h"
#include "common/Timer.h"

#include "PGLog.h"
#include "OSDMap.h"
#include "messages/MOSDPGLog.h"
#include "include/str_list.h"
#include "PGBackend.h"

#include <atomic>
#include <list>
#include <memory>
#include <stack>
#include <string>
#include <tuple>
using namespace std;

// #include "include/unordered_map.h"
// #include "include/unordered_set.h"

//#define DEBUG_RECOVERY_OIDS  // track set of recovering oids explicitly, to find counting bugs

class OSD;
class OSDService;
class MOSDOp;
class MOSDPGScan;
class MOSDPGBackfill;
class MOSDPGInfo;

class PG;
struct OpRequest;
typedef OpRequest::Ref OpRequestRef;
class MOSDPGLog;
class CephContext;

namespace Scrub {
  class Store;
}

void intrusive_ptr_add_ref(PG *pg);
void intrusive_ptr_release(PG *pg);

using state_history_entry = std::tuple<utime_t, utime_t, const char*>;
using embedded_state = std::pair<utime_t, const char*>;

struct PGStateInstance {
  // Time spent in pg states

  void setepoch(const epoch_t current_epoch) {
    this_epoch = current_epoch;
  }

  void enter_state(const utime_t entime, const char* state) {
    embedded_states.push(std::make_pair(entime, state));
  }

  void exit_state(const utime_t extime) {
    embedded_state this_state = embedded_states.top();
    state_history.push_back(state_history_entry{
        this_state.first, extime, this_state.second});
    embedded_states.pop();
  }

  epoch_t this_epoch;
  utime_t enter_time;
  std::vector<state_history_entry> state_history;
  std::stack<embedded_state> embedded_states;
};
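// Illustrative example (not part of the original source): embedded states
// are recorded LIFO, so each exit_state() pops the most recent enter_state()
// and appends one {enter_time, exit_time, name} tuple to state_history:
//
//   PGStateInstance psi;
//   psi.enter_state(t0, "Started");
//   psi.enter_state(t1, "Started/Primary");
//   psi.exit_state(t2);   // records {t1, t2, "Started/Primary"}
//   psi.exit_state(t3);   // records {t0, t3, "Started"}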

class PGStateHistory {
  // Member access protected with the PG lock
public:
  PGStateHistory() : buffer(10) {}

  void enter(PG* pg, const utime_t entime, const char* state);

  void exit(const char* state);

  void reset() {
    pi = nullptr;
  }

  void set_pg_in_destructor() { pg_in_destructor = true; }

  void dump(Formatter* f) const;

  string get_current_state() {
    if (pi == nullptr) return "unknown";
    return std::get<1>(pi->embedded_states.top());
  }

private:
  bool pg_in_destructor = false;
  PG* thispg = nullptr;
  std::unique_ptr<PGStateInstance> tmppi;
  PGStateInstance* pi = nullptr;
  boost::circular_buffer<std::unique_ptr<PGStateInstance>> buffer;

};

#ifdef PG_DEBUG_REFS
#include "common/tracked_int_ptr.hpp"
  uint64_t get_with_id(PG *pg);
  void put_with_id(PG *pg, uint64_t id);
  typedef TrackedIntPtr<PG> PGRef;
#else
  typedef boost::intrusive_ptr<PG> PGRef;
#endif

class PGRecoveryStats {
  struct per_state_info {
    uint64_t enter, exit;     // enter/exit counts
    uint64_t events;
    utime_t event_time;       // time spent processing events
    utime_t total_time;       // total time in state
    utime_t min_time, max_time;

    // cppcheck-suppress unreachableCode
    per_state_info() : enter(0), exit(0), events(0) {}
  };
  map<const char *,per_state_info> info;
  Mutex lock;

public:
  PGRecoveryStats() : lock("PGRecoverStats::lock") {}

  void reset() {
    Mutex::Locker l(lock);
    info.clear();
  }
  void dump(ostream& out) {
    Mutex::Locker l(lock);
    for (map<const char *,per_state_info>::iterator p = info.begin(); p != info.end(); ++p) {
      per_state_info& i = p->second;
      out << i.enter << "\t" << i.exit << "\t"
          << i.events << "\t" << i.event_time << "\t"
          << i.total_time << "\t"
          << i.min_time << "\t" << i.max_time << "\t"
          << p->first << "\n";
    }
  }

  void dump_formatted(Formatter *f) {
    Mutex::Locker l(lock);
    f->open_array_section("pg_recovery_stats");
    for (map<const char *,per_state_info>::iterator p = info.begin();
         p != info.end(); ++p) {
      per_state_info& i = p->second;
      f->open_object_section("recovery_state");
      f->dump_int("enter", i.enter);
      f->dump_int("exit", i.exit);
      f->dump_int("events", i.events);
      f->dump_stream("event_time") << i.event_time;
      f->dump_stream("total_time") << i.total_time;
      f->dump_stream("min_time") << i.min_time;
      f->dump_stream("max_time") << i.max_time;
      vector<string> states;
      get_str_vec(p->first, "/", states);
      f->open_array_section("nested_states");
      for (vector<string>::iterator st = states.begin();
           st != states.end(); ++st) {
        f->dump_string("state", *st);
      }
      f->close_section();
      f->close_section();
    }
    f->close_section();
  }

  void log_enter(const char *s) {
    Mutex::Locker l(lock);
    info[s].enter++;
  }
  void log_exit(const char *s, utime_t dur, uint64_t events, utime_t event_dur) {
    Mutex::Locker l(lock);
    per_state_info &i = info[s];
    i.exit++;
    i.total_time += dur;
    if (dur > i.max_time)
      i.max_time = dur;
    if (dur < i.min_time || i.min_time == utime_t())
      i.min_time = dur;
    i.events += events;
    i.event_time += event_dur;
  }
};

struct PGPool {
  CephContext* cct;
  epoch_t cached_epoch;
  int64_t id;
  string name;
  uint64_t auid;

  pg_pool_t info;
  SnapContext snapc;   // the default pool snapc, ready to go.

  interval_set<snapid_t> cached_removed_snaps;  // current removed_snaps set
  interval_set<snapid_t> newly_removed_snaps;   // newly removed in the last epoch

  PGPool(CephContext* cct, OSDMapRef map, int64_t i)
    : cct(cct),
      cached_epoch(map->get_epoch()),
      id(i),
      name(map->get_pool_name(id)),
      auid(map->get_pg_pool(id)->auid) {
    const pg_pool_t *pi = map->get_pg_pool(id);
    assert(pi);
    info = *pi;
    snapc = pi->get_snap_context();
    pi->build_removed_snaps(cached_removed_snaps);
  }

  void update(OSDMapRef map);
};

/** PG - Replica Placement Group
 *
 */

class PG : public DoutPrefixProvider {
public:
  bool set_force_recovery(bool b);
  bool set_force_backfill(bool b);

protected:
  OSDService *osd;
  CephContext *cct;
  OSDriver osdriver;
  SnapMapper snap_mapper;
  bool eio_errors_to_process = false;

  virtual PGBackend *get_pgbackend() = 0;
public:
  std::string gen_prefix() const override;
  CephContext *get_cct() const override { return cct; }
  unsigned get_subsys() const override { return ceph_subsys_osd; }

  /*** PG ****/
  void update_snap_mapper_bits(uint32_t bits) {
    snap_mapper.update_bits(bits);
  }
  /// get_is_recoverable_predicate: caller owns returned pointer and must delete when done
  IsPGRecoverablePredicate *get_is_recoverable_predicate() {
    return get_pgbackend()->get_is_recoverable_predicate();
  }
protected:
  OSDMapRef osdmap_ref;
  OSDMapRef last_persisted_osdmap_ref;
  PGPool pool;

  void requeue_map_waiters();

  void update_osdmap_ref(OSDMapRef newmap) {
    assert(_lock.is_locked_by_me());
    osdmap_ref = std::move(newmap);
  }

public:
  OSDMapRef get_osdmap() const {
    assert(is_locked());
    assert(osdmap_ref);
    return osdmap_ref;
  }
protected:

  /** locking and reference counting.
   * I destroy myself when the reference count hits zero.
   * lock() should be called before doing anything.
   * get() should be called on pointer copy (to another thread, etc.).
   * put() should be called on destruction of some previously copied pointer.
   * unlock() when done with the current pointer (_most common_).
   */
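  // Illustrative sketch of the rules above (hypothetical caller):
  //
  //   pg->get("SomeTag");   // take a ref before copying the pointer elsewhere
  //   pg->lock();
  //   ... inspect or mutate PG state ...
  //   pg->unlock();         // most common: done with this use of the pointer
  //   pg->put("SomeTag");   // drop the ref; the PG frees itself at refcount 0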
  mutable Mutex _lock;
  std::atomic_uint ref{0};

#ifdef PG_DEBUG_REFS
  Mutex _ref_id_lock;
  map<uint64_t, string> _live_ids;
  map<string, uint64_t> _tag_counts;
  uint64_t _ref_id;
#endif

public:
  bool deleting;  // true while the PG is being removed or the OSD is shutting down

  ZTracer::Endpoint trace_endpoint;

  void lock_suspend_timeout(ThreadPool::TPHandle &handle);
  void lock(bool no_lockdep = false) const;
  void unlock() const {
    //generic_dout(0) << this << " " << info.pgid << " unlock" << dendl;
    assert(!dirty_info);
    assert(!dirty_big_info);
    _lock.Unlock();
  }

  bool is_locked() const {
    return _lock.is_locked();
  }

#ifdef PG_DEBUG_REFS
  uint64_t get_with_id();
  void put_with_id(uint64_t);
  void dump_live_ids();
#endif
  void get(const char* tag);
  void put(const char* tag);

  bool dirty_info, dirty_big_info;

public:
  bool is_ec_pg() const {
    return pool.info.ec_pool();
  }
  // pg state
  pg_info_t info;               ///< current pg info
  pg_info_t last_written_info;  ///< last written info
  __u8 info_struct_v;
  static const __u8 cur_struct_v = 10;
  // v10 is the new past_intervals encoding
  // v9 was fastinfo_key addition
  // v8 was the move to a per-pg pgmeta object
  // v7 was SnapMapper addition in 86658392516d5175b2756659ef7ffaaf95b0f8ad
  //     (first appeared in cuttlefish).
  static const __u8 compat_struct_v = 7;
  bool must_upgrade() {
    return info_struct_v < cur_struct_v;
  }
  bool can_upgrade() {
    return info_struct_v >= compat_struct_v;
  }
  void upgrade(ObjectStore *store);

  const coll_t coll;
  ObjectStore::CollectionHandle ch;
  PGLog pg_log;
  static string get_info_key(spg_t pgid) {
    return stringify(pgid) + "_info";
  }
  static string get_biginfo_key(spg_t pgid) {
    return stringify(pgid) + "_biginfo";
  }
  static string get_epoch_key(spg_t pgid) {
    return stringify(pgid) + "_epoch";
  }
  ghobject_t pgmeta_oid;

  // ------------------
  // MissingLoc

  class MissingLoc {
  public:
    // a loc_count indicates how many locations we know in each of
    // these distinct sets
    struct loc_count_t {
      int up = 0;     //< up
      int other = 0;  //< other

      friend bool operator<(const loc_count_t& l,
                            const loc_count_t& r) {
        return (l.up < r.up ||
                (l.up == r.up &&
                 (l.other < r.other)));
      }
      friend ostream& operator<<(ostream& out, const loc_count_t& l) {
        assert(l.up >= 0);
        assert(l.other >= 0);
        return out << "(" << l.up << "+" << l.other << ")";
      }
    };


  private:

    loc_count_t _get_count(const set<pg_shard_t>& shards) {
      loc_count_t r;
      for (auto s : shards) {
        if (pg->upset.count(s)) {
          r.up++;
        } else {
          r.other++;
        }
      }
      return r;
    }

    map<hobject_t, pg_missing_item> needs_recovery_map;
    map<hobject_t, set<pg_shard_t> > missing_loc;
    set<pg_shard_t> missing_loc_sources;

    // for every entry in missing_loc, we count how many of each type of shard we have,
    // and maintain totals here. The sum of the values for this map will always equal
    // missing_loc.size().
    map < shard_id_t, map<loc_count_t,int> > missing_by_count;
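    // For example (illustrative only): in a replicated pool all entries live
    // under shard_id_t::NO_SHARD, and an entry
    //   missing_by_count[shard_id_t::NO_SHARD][loc_count_t{1, 2}] == 5
    // would mean five missing objects are each known to have copies on one
    // up osd and two other osds.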

    void pgs_by_shard_id(const set<pg_shard_t>& s, map< shard_id_t, set<pg_shard_t> >& pgsbs) {
      if (pg->get_osdmap()->pg_is_ec(pg->info.pgid.pgid)) {
        int num_shards = pg->get_osdmap()->get_pg_size(pg->info.pgid.pgid);
        // For completely missing shards initialize with empty set<pg_shard_t>
        for (int i = 0 ; i < num_shards ; ++i) {
          shard_id_t shard(i);
          pgsbs[shard];
        }
        for (auto pgs: s)
          pgsbs[pgs.shard].insert(pgs);
      } else {
        pgsbs[shard_id_t::NO_SHARD] = s;
      }
    }

    void _inc_count(const set<pg_shard_t>& s) {
      map< shard_id_t, set<pg_shard_t> > pgsbs;
      pgs_by_shard_id(s, pgsbs);
      for (auto shard: pgsbs)
        ++missing_by_count[shard.first][_get_count(shard.second)];
    }
    void _dec_count(const set<pg_shard_t>& s) {
      map< shard_id_t, set<pg_shard_t> > pgsbs;
      pgs_by_shard_id(s, pgsbs);
      for (auto shard: pgsbs) {
        auto p = missing_by_count[shard.first].find(_get_count(shard.second));
        assert(p != missing_by_count[shard.first].end());
        if (--p->second == 0) {
          missing_by_count[shard.first].erase(p);
        }
      }
    }

    PG *pg;
    set<pg_shard_t> empty_set;
  public:
    boost::scoped_ptr<IsPGReadablePredicate> is_readable;
    boost::scoped_ptr<IsPGRecoverablePredicate> is_recoverable;
    explicit MissingLoc(PG *pg)
      : pg(pg) { }
    void set_backend_predicates(
      IsPGReadablePredicate *_is_readable,
      IsPGRecoverablePredicate *_is_recoverable) {
      is_readable.reset(_is_readable);
      is_recoverable.reset(_is_recoverable);
    }
    string gen_prefix() const { return pg->gen_prefix(); }
    bool needs_recovery(
      const hobject_t &hoid,
      eversion_t *v = 0) const {
      map<hobject_t, pg_missing_item>::const_iterator i =
        needs_recovery_map.find(hoid);
      if (i == needs_recovery_map.end())
        return false;
      if (v)
        *v = i->second.need;
      return true;
    }
    bool is_deleted(const hobject_t &hoid) const {
      auto i = needs_recovery_map.find(hoid);
      if (i == needs_recovery_map.end())
        return false;
      return i->second.is_delete();
    }
    bool is_unfound(const hobject_t &hoid) const {
      return needs_recovery(hoid) && !is_deleted(hoid) && (
        !missing_loc.count(hoid) ||
        !(*is_recoverable)(missing_loc.find(hoid)->second));
    }
    bool readable_with_acting(
      const hobject_t &hoid,
      const set<pg_shard_t> &acting) const;
    uint64_t num_unfound() const {
      uint64_t ret = 0;
      for (map<hobject_t, pg_missing_item>::const_iterator i =
             needs_recovery_map.begin();
           i != needs_recovery_map.end();
           ++i) {
        if (is_unfound(i->first))
          ++ret;
      }
      return ret;
    }

    bool have_unfound() const {
      for (map<hobject_t, pg_missing_item>::const_iterator i =
             needs_recovery_map.begin();
           i != needs_recovery_map.end();
           ++i) {
        if (is_unfound(i->first))
          return true;
      }
      return false;
    }
    void clear() {
      needs_recovery_map.clear();
      missing_loc.clear();
      missing_loc_sources.clear();
      missing_by_count.clear();
    }

    void add_location(const hobject_t &hoid, pg_shard_t location) {
      auto p = missing_loc.find(hoid);
      if (p == missing_loc.end()) {
        p = missing_loc.emplace(hoid, set<pg_shard_t>()).first;
      } else {
        _dec_count(p->second);
      }
      p->second.insert(location);
      _inc_count(p->second);
    }
    void remove_location(const hobject_t &hoid, pg_shard_t location) {
      auto p = missing_loc.find(hoid);
      if (p != missing_loc.end()) {
        _dec_count(p->second);
        p->second.erase(location);
        _inc_count(p->second);
      }
    }
    void add_active_missing(const pg_missing_t &missing) {
      for (map<hobject_t, pg_missing_item>::const_iterator i =
             missing.get_items().begin();
           i != missing.get_items().end();
           ++i) {
        map<hobject_t, pg_missing_item>::const_iterator j =
          needs_recovery_map.find(i->first);
        if (j == needs_recovery_map.end()) {
          needs_recovery_map.insert(*i);
        } else {
          lgeneric_dout(pg->cct, 0) << this << " " << pg->info.pgid << " unexpected need for "
                                    << i->first << " have " << j->second
                                    << " tried to add " << i->second << dendl;
          assert(i->second.need == j->second.need);
        }
      }
    }

    void add_missing(const hobject_t &hoid, eversion_t need, eversion_t have) {
      needs_recovery_map[hoid] = pg_missing_item(need, have);
    }
    void revise_need(const hobject_t &hoid, eversion_t need) {
      assert(needs_recovery(hoid));
      needs_recovery_map[hoid].need = need;
    }

    /// Adds info about a possible recovery source
    bool add_source_info(
      pg_shard_t source,            ///< [in] source
      const pg_info_t &oinfo,       ///< [in] info
      const pg_missing_t &omissing, ///< [in] (optional) missing
      ThreadPool::TPHandle* handle  ///< [in] ThreadPool handle
      ); ///< @return whether a new object location was discovered

    /// Adds recovery sources in batch
    void add_batch_sources_info(
      const set<pg_shard_t> &sources,  ///< [in] a set of resources which can be used for all objects
      ThreadPool::TPHandle* handle     ///< [in] ThreadPool handle
      );

    /// Uses osdmap to update structures for now down sources
    void check_recovery_sources(const OSDMapRef& osdmap);

    /// Call when hoid is no longer missing in acting set
    void recovered(const hobject_t &hoid) {
      needs_recovery_map.erase(hoid);
      auto p = missing_loc.find(hoid);
      if (p != missing_loc.end()) {
        _dec_count(p->second);
        missing_loc.erase(p);
      }
    }

    /// Call to update structures for hoid after a change
    void rebuild(
      const hobject_t &hoid,
      pg_shard_t self,
      const set<pg_shard_t> to_recover,
      const pg_info_t &info,
      const pg_missing_t &missing,
      const map<pg_shard_t, pg_missing_t> &pmissing,
      const map<pg_shard_t, pg_info_t> &pinfo) {
      recovered(hoid);
      boost::optional<pg_missing_item> item;
      auto miter = missing.get_items().find(hoid);
      if (miter != missing.get_items().end()) {
        item = miter->second;
      } else {
        for (auto &&i: to_recover) {
          if (i == self)
            continue;
          auto pmiter = pmissing.find(i);
          assert(pmiter != pmissing.end());
          miter = pmiter->second.get_items().find(hoid);
          if (miter != pmiter->second.get_items().end()) {
            item = miter->second;
            break;
          }
        }
      }
      if (!item)
        return; // recovered!

      needs_recovery_map[hoid] = *item;
      if (item->is_delete())
        return;
      auto mliter =
        missing_loc.insert(make_pair(hoid, set<pg_shard_t>())).first;
      assert(info.last_backfill.is_max());
      assert(info.last_update >= item->need);
      if (!missing.is_missing(hoid))
        mliter->second.insert(self);
      for (auto &&i: pmissing) {
        auto pinfoiter = pinfo.find(i.first);
        assert(pinfoiter != pinfo.end());
        if (item->need <= pinfoiter->second.last_update &&
            hoid <= pinfoiter->second.last_backfill &&
            !i.second.is_missing(hoid))
          mliter->second.insert(i.first);
      }
      _inc_count(mliter->second);
    }

    const set<pg_shard_t> &get_locations(const hobject_t &hoid) const {
      return missing_loc.count(hoid) ?
        missing_loc.find(hoid)->second : empty_set;
    }
    const map<hobject_t, set<pg_shard_t>> &get_missing_locs() const {
      return missing_loc;
    }
    const map<hobject_t, pg_missing_item> &get_needs_recovery() const {
      return needs_recovery_map;
    }
    const map < shard_id_t, map<loc_count_t,int> > &get_missing_by_count() const {
      return missing_by_count;
    }
  } missing_loc;

  PastIntervals past_intervals;

  interval_set<snapid_t> snap_trimq;

  /* You should not use these items without taking their respective queue locks
   * (if they have one) */
  xlist<PG*>::item stat_queue_item;
  bool scrub_queued;
  bool recovery_queued;

  int recovery_ops_active;
  set<pg_shard_t> waiting_on_backfill;
#ifdef DEBUG_RECOVERY_OIDS
  set<hobject_t> recovering_oids;
#endif

protected:
  int role;        // 0 = primary, 1 = replica, -1 = none.
  unsigned state;  // PG_STATE_*

  bool send_notify;  ///< true if we are non-primary and should notify the primary

public:
  eversion_t last_update_ondisk;    // last_update that has committed; ONLY DEFINED WHEN is_active()
  eversion_t last_complete_ondisk;  // last_complete that has committed.
  eversion_t last_update_applied;


  struct C_UpdateLastRollbackInfoTrimmedToApplied : Context {
    PGRef pg;
    epoch_t e;
    eversion_t v;
    C_UpdateLastRollbackInfoTrimmedToApplied(PG *pg, epoch_t e, eversion_t v)
      : pg(pg), e(e), v(v) {}
    void finish(int) override {
      pg->lock();
      if (!pg->pg_has_reset_since(e)) {
        pg->last_rollback_info_trimmed_to_applied = v;
      }
      pg->unlock();
    }
  };
  // entries <= last_rollback_info_trimmed_to_applied have been trimmed,
  // and the transaction has applied
  eversion_t last_rollback_info_trimmed_to_applied;

  // primary state
public:
  pg_shard_t primary;
  pg_shard_t pg_whoami;
  pg_shard_t up_primary;
  vector<int> up, acting, want_acting;
  set<pg_shard_t> actingbackfill, actingset, upset;
  map<pg_shard_t,eversion_t> peer_last_complete_ondisk;
  eversion_t min_last_complete_ondisk;  // up: min over last_complete_ondisk, peer_last_complete_ondisk
  eversion_t pg_trim_to;

  set<int> blocked_by;  ///< osds we are blocked by (for pg stats)

  // [primary only] content recovery state

public:
  struct BufferedRecoveryMessages {
    map<int, map<spg_t, pg_query_t> > query_map;
    map<int, vector<pair<pg_notify_t, PastIntervals> > > info_map;
    map<int, vector<pair<pg_notify_t, PastIntervals> > > notify_list;
  };

public:
  bool dne() { return info.dne(); }
  struct RecoveryCtx {
    utime_t start_time;
    map<int, map<spg_t, pg_query_t> > *query_map;
    map<int, vector<pair<pg_notify_t, PastIntervals> > > *info_map;
    map<int, vector<pair<pg_notify_t, PastIntervals> > > *notify_list;
    set<PGRef> created_pgs;
    C_Contexts *on_applied;
    C_Contexts *on_safe;
    ObjectStore::Transaction *transaction;
    ThreadPool::TPHandle* handle;
    RecoveryCtx(map<int, map<spg_t, pg_query_t> > *query_map,
                map<int,
                    vector<pair<pg_notify_t, PastIntervals> > > *info_map,
                map<int,
                    vector<pair<pg_notify_t, PastIntervals> > > *notify_list,
                C_Contexts *on_applied,
                C_Contexts *on_safe,
                ObjectStore::Transaction *transaction)
      : query_map(query_map), info_map(info_map),
        notify_list(notify_list),
        on_applied(on_applied),
        on_safe(on_safe),
        transaction(transaction),
        handle(NULL) {}

    RecoveryCtx(BufferedRecoveryMessages &buf, RecoveryCtx &rctx)
      : query_map(&(buf.query_map)),
        info_map(&(buf.info_map)),
        notify_list(&(buf.notify_list)),
        on_applied(rctx.on_applied),
        on_safe(rctx.on_safe),
        transaction(rctx.transaction),
        handle(rctx.handle) {}

    void accept_buffered_messages(BufferedRecoveryMessages &m) {
      assert(query_map);
      assert(info_map);
      assert(notify_list);
      for (map<int, map<spg_t, pg_query_t> >::iterator i = m.query_map.begin();
           i != m.query_map.end();
           ++i) {
        map<spg_t, pg_query_t> &omap = (*query_map)[i->first];
        for (map<spg_t, pg_query_t>::iterator j = i->second.begin();
             j != i->second.end();
             ++j) {
          omap[j->first] = j->second;
        }
      }
      for (map<int, vector<pair<pg_notify_t, PastIntervals> > >::iterator i
             = m.info_map.begin();
           i != m.info_map.end();
           ++i) {
        vector<pair<pg_notify_t, PastIntervals> > &ovec =
          (*info_map)[i->first];
        ovec.reserve(ovec.size() + i->second.size());
        ovec.insert(ovec.end(), i->second.begin(), i->second.end());
      }
      for (map<int, vector<pair<pg_notify_t, PastIntervals> > >::iterator i
             = m.notify_list.begin();
           i != m.notify_list.end();
           ++i) {
        vector<pair<pg_notify_t, PastIntervals> > &ovec =
          (*notify_list)[i->first];
        ovec.reserve(ovec.size() + i->second.size());
        ovec.insert(ovec.end(), i->second.begin(), i->second.end());
      }
    }

    void send_notify(pg_shard_t to,
                     const pg_notify_t &info, const PastIntervals &pi) {
      assert(notify_list);
      (*notify_list)[to.osd].push_back(make_pair(info, pi));
    }
  };


  PGStateHistory pgstate_history;

  struct NamedState {
    const char *state_name;
    utime_t enter_time;
    PG* pg;
    const char *get_state_name() { return state_name; }
    NamedState(PG *pg_, const char *state_name_)
      : state_name(state_name_), enter_time(ceph_clock_now()), pg(pg_) {
      pg->pgstate_history.enter(pg, enter_time, state_name);
    }
    virtual ~NamedState() { pg->pgstate_history.exit(state_name); }
  };



protected:

  /*
   * peer_info    -- projected (updates _before_ replicas ack)
   * peer_missing -- committed (updates _after_ replicas ack)
   */

  bool need_up_thru;
  set<pg_shard_t> stray_set;  // non-acting osds that have PG data.
  eversion_t oldest_update;   // acting: lowest (valid) last_update in active set
  map<pg_shard_t, pg_info_t> peer_info;  // info from peers (stray or prior)
  set<pg_shard_t> peer_purged;           // peers purged
  map<pg_shard_t, pg_missing_t> peer_missing;
  set<pg_shard_t> peer_log_requested;    // logs i've requested (and start stamps)
  set<pg_shard_t> peer_missing_requested;

  // i deleted these strays; ignore racing PGInfo from them
  set<pg_shard_t> peer_activated;

  // primary-only, recovery-only state
  set<pg_shard_t> might_have_unfound;  // These osds might have objects on them
                                       // which are unfound on the primary
  epoch_t last_peering_reset;


  /* heartbeat peers */
  void set_probe_targets(const set<pg_shard_t> &probe_set);
  void clear_probe_targets();
public:
  Mutex heartbeat_peer_lock;
  set<int> heartbeat_peers;
  set<int> probe_targets;

  /**
   * BackfillInterval
   *
   * Represents the objects in a range [begin, end)
   *
   * Possible states:
   * 1) begin == end == hobject_t() indicates that the interval is unpopulated
   * 2) Else, objects contains all objects in [begin, end)
   */
  struct BackfillInterval {
    // info about a backfill interval on a peer
    eversion_t version;  /// version at which the scan occurred
    map<hobject_t,eversion_t> objects;
    hobject_t begin;
    hobject_t end;

    /// clear content
    void clear() {
      *this = BackfillInterval();
    }

    /// clear objects list only
    void clear_objects() {
      objects.clear();
    }

    /// reinstantiate with a new start+end position and sort order
    void reset(hobject_t start) {
      clear();
      begin = end = start;
    }

    /// true if there are no objects in this interval
    bool empty() const {
      return objects.empty();
    }

    /// true if interval extends to the end of the range
    bool extends_to_end() const {
      return end.is_max();
    }

    /// removes items <= soid and adjusts begin to the first object
    void trim_to(const hobject_t &soid) {
      trim();
      while (!objects.empty() &&
             objects.begin()->first <= soid) {
        pop_front();
      }
    }

    /// Adjusts begin to the first object
    void trim() {
      if (!objects.empty())
        begin = objects.begin()->first;
      else
        begin = end;
    }

    /// drop first entry, and adjust @begin accordingly
    void pop_front() {
      assert(!objects.empty());
      objects.erase(objects.begin());
      trim();
    }

    /// dump
    void dump(Formatter *f) const {
      f->dump_stream("begin") << begin;
      f->dump_stream("end") << end;
      f->open_array_section("objects");
      for (map<hobject_t, eversion_t>::const_iterator i =
             objects.begin();
           i != objects.end();
           ++i) {
        f->open_object_section("object");
        f->dump_stream("object") << i->first;
        f->dump_stream("version") << i->second;
        f->close_section();
      }
      f->close_section();
    }
  };
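  // Typical lifecycle, as a sketch (illustrative only):
  //
  //   BackfillInterval bi;
  //   bi.reset(last_backfill);     // begin == end == last_backfill, no objects
  //   ... a scan fills bi.objects and advances bi.end ...
  //   bi.trim_to(recovered_oid);   // drop entries <= recovered_oid
  //   if (bi.empty() && bi.extends_to_end()) {
  //     // nothing further to scan in this interval
  //   }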

protected:
  BackfillInterval backfill_info;
  map<pg_shard_t, BackfillInterval> peer_backfill_info;
  bool backfill_reserved;
  bool backfill_reserving;

  friend class OSD;

public:
  set<pg_shard_t> backfill_targets;

  bool is_backfill_targets(pg_shard_t osd) {
    return backfill_targets.count(osd);
  }

protected:

  /*
   * blocked request wait hierarchy
   *
   * In order to preserve request ordering we need to be careful about the
   * order in which blocked requests get requeued. Generally speaking, we
   * push the requests back up to the op_wq in reverse order (most recent
   * request first) so that they come back out again in the original order.
   * However, because there are multiple wait queues, we need to requeue
   * waitlists in order. Generally speaking, we requeue the wait lists
   * that are checked first.
   *
   * Here are the various wait lists, in the order they are used during
   * request processing, with notes:
   *
   *  - waiting_for_map
   *    - may start or stop blocking at any time (depending on client epoch)
   *  - waiting_for_peered
   *    - !is_peered() or flushes_in_progress
   *    - only starts blocking on interval change; never restarts
   *  - waiting_for_active
   *    - !is_active()
   *    - only starts blocking on interval change; never restarts
   *  - waiting_for_flush
   *    - is_active() and flushes_in_progress
   *    - waiting for final flush during activate
   *  - waiting_for_scrub
   *    - starts and stops blocking for varying intervals during scrub
   *  - waiting_for_unreadable_object
   *    - never restarts once object is readable (* except for EIO?)
   *  - waiting_for_degraded_object
   *    - never restarts once object is writeable (* except for EIO?)
   *  - waiting_for_blocked_object
   *    - starts and stops based on proxied op activity
   *  - obc rwlocks
   *    - starts and stops based on read/write activity
   *
   * Notes:
   *
   * 1. During an interval change, we requeue *everything* in the above order.
   *
   * 2. When an obc rwlock is released, we check for a scrub block and requeue
   *    the op there if it applies. We ignore the unreadable/degraded/blocked
   *    queues because we assume they cannot apply at that time (this is
   *    probably mostly true).
   *
   * 3. The requeue_ops helper will push ops onto the waiting_for_map list if
   *    it is non-empty.
   *
   * These three behaviors are generally sufficient to maintain ordering, with
   * the possible exception of cases where we make an object degraded or
   * unreadable that was previously okay, e.g. when scrub or op processing
   * encounters an unexpected error. FIXME.
   */

  // pg waiters
  unsigned flushes_in_progress;

  // ops with newer maps than ours (or blocked behind them)
  // track these by client, since inter-request ordering doesn't otherwise
  // matter.
  unordered_map<entity_name_t,list<OpRequestRef>> waiting_for_map;

  // ops waiting on peered
  list<OpRequestRef>            waiting_for_peered;

  // ops waiting on active (require peered as well)
  list<OpRequestRef>            waiting_for_active;
  list<OpRequestRef>            waiting_for_flush;
  list<OpRequestRef>            waiting_for_scrub;

  list<OpRequestRef>            waiting_for_cache_not_full;
  list<OpRequestRef>            waiting_for_clean_to_primary_repair;
  map<hobject_t, list<OpRequestRef>> waiting_for_unreadable_object,
                                     waiting_for_degraded_object,
                                     waiting_for_blocked_object;

  set<hobject_t> objects_blocked_on_cache_full;
  map<hobject_t,snapid_t> objects_blocked_on_degraded_snap;
  map<hobject_t,ObjectContextRef> objects_blocked_on_snap_promotion;

  // Callbacks should assume pg (and nothing else) is locked
  map<hobject_t, list<Context*>> callbacks_for_degraded_object;

  map<eversion_t,
      list<pair<OpRequestRef, version_t> > > waiting_for_ondisk;

  void requeue_object_waiters(map<hobject_t, list<OpRequestRef>>& m);
  void requeue_op(OpRequestRef op);
  void requeue_ops(list<OpRequestRef> &l);

  // stats that persist lazily
  object_stat_collection_t unstable_stats;

  // publish stats
  Mutex pg_stats_publish_lock;
  bool pg_stats_publish_valid;
  pg_stat_t pg_stats_publish;

  // for ordering writes
  ceph::shared_ptr<ObjectStore::Sequencer> osr;

  void _update_calc_stats();
  void _update_blocked_by();
  friend class TestOpsSocketHook;
  void publish_stats_to_osd();
  void clear_publish_stats();

public:
  void clear_primary_state();

  bool is_actingbackfill(pg_shard_t osd) const {
    return actingbackfill.count(osd);
  }
  bool is_acting(pg_shard_t osd) const {
    return has_shard(pool.info.ec_pool(), acting, osd);
  }
  bool is_up(pg_shard_t osd) const {
    return has_shard(pool.info.ec_pool(), up, osd);
  }
  static bool has_shard(bool ec, const vector<int>& v, pg_shard_t osd) {
    if (ec) {
      return v.size() > (unsigned)osd.shard && v[osd.shard] == osd.osd;
    } else {
      return std::find(v.begin(), v.end(), osd.osd) != v.end();
    }
  }

  bool needs_recovery() const;
  bool needs_backfill() const;

  /// clip calculated priority to reasonable range
  inline int clamp_recovery_priority(int priority);
  /// get log recovery reservation priority
  unsigned get_recovery_priority();
  /// get backfill reservation priority
  unsigned get_backfill_priority();

  void mark_clean();  ///< mark an active pg clean

  /// return [start,end) bounds for required past_intervals
  static pair<epoch_t, epoch_t> get_required_past_interval_bounds(
    const pg_info_t &info,
    epoch_t oldest_map) {
    epoch_t start = MAX(
      info.history.last_epoch_clean ? info.history.last_epoch_clean :
      info.history.epoch_pool_created,
      oldest_map);
    epoch_t end = MAX(
      info.history.same_interval_since,
      info.history.epoch_pool_created);
    return make_pair(start, end);
  }
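  // Worked example (illustrative values): with last_epoch_clean = 550,
  // oldest_map = 600, same_interval_since = 700 and epoch_pool_created = 500,
  // start = MAX(550, 600) = 600 and end = MAX(700, 500) = 700, so past
  // intervals covering [600, 700) are required.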
  void check_past_interval_bounds() const;
  PastIntervals::PriorSet build_prior();

  void remove_down_peer_info(const OSDMapRef osdmap);

  bool adjust_need_up_thru(const OSDMapRef osdmap);

  bool all_unfound_are_queried_or_lost(const OSDMapRef osdmap) const;
  virtual void dump_recovery_info(Formatter *f) const = 0;

  bool calc_min_last_complete_ondisk() {
    eversion_t min = last_complete_ondisk;
    assert(!actingbackfill.empty());
    for (set<pg_shard_t>::iterator i = actingbackfill.begin();
         i != actingbackfill.end();
         ++i) {
      if (*i == get_primary()) continue;
      if (peer_last_complete_ondisk.count(*i) == 0)
        return false;  // we don't have complete info
      eversion_t a = peer_last_complete_ondisk[*i];
      if (a < min)
        min = a;
    }
    if (min == min_last_complete_ondisk)
      return false;
    min_last_complete_ondisk = min;
    return true;
  }

  virtual void calc_trim_to() = 0;

  virtual void calc_trim_to_aggressive() = 0;

  void proc_replica_log(pg_info_t &oinfo, const pg_log_t &olog,
                        pg_missing_t& omissing, pg_shard_t from);
  void proc_master_log(ObjectStore::Transaction& t, pg_info_t &oinfo, pg_log_t &olog,
                       pg_missing_t& omissing, pg_shard_t from);
  bool proc_replica_info(
    pg_shard_t from, const pg_info_t &info, epoch_t send_epoch);

  struct PGLogEntryHandler : public PGLog::LogEntryHandler {
    PG *pg;
    ObjectStore::Transaction *t;
    PGLogEntryHandler(PG *pg, ObjectStore::Transaction *t) : pg(pg), t(t) {}

    // LogEntryHandler
    void remove(const hobject_t &hoid) override {
      pg->get_pgbackend()->remove(hoid, t);
    }
    void try_stash(const hobject_t &hoid, version_t v) override {
      pg->get_pgbackend()->try_stash(hoid, v, t);
    }
    void rollback(const pg_log_entry_t &entry) override {
      assert(entry.can_rollback());
      pg->get_pgbackend()->rollback(entry, t);
    }
    void rollforward(const pg_log_entry_t &entry) override {
      pg->get_pgbackend()->rollforward(entry, t);
    }
    void trim(const pg_log_entry_t &entry) override {
      pg->get_pgbackend()->trim(entry, t);
    }
  };

  void update_object_snap_mapping(
    ObjectStore::Transaction *t, const hobject_t &soid,
    const set<snapid_t> &snaps);
  void clear_object_snap_mapping(
    ObjectStore::Transaction *t, const hobject_t &soid);
  void remove_snap_mapped_object(
    ObjectStore::Transaction& t, const hobject_t& soid);
  void merge_log(
    ObjectStore::Transaction& t, pg_info_t &oinfo,
    pg_log_t &olog, pg_shard_t from);
  void rewind_divergent_log(ObjectStore::Transaction& t, eversion_t newhead);
  bool search_for_missing(
    const pg_info_t &oinfo, const pg_missing_t &omissing,
    pg_shard_t fromosd,
    RecoveryCtx*);

  void check_for_lost_objects();
  void forget_lost_objects();

  void discover_all_missing(std::map<int, map<spg_t,pg_query_t> > &query_map);

  void trim_write_ahead();

  map<pg_shard_t, pg_info_t>::const_iterator find_best_info(
    const map<pg_shard_t, pg_info_t> &infos,
    bool restrict_to_up_acting,
    bool *history_les_bound) const;
  static void calc_ec_acting(
    map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard,
    unsigned size,
    const vector<int> &acting,
    pg_shard_t acting_primary,
    const vector<int> &up,
    pg_shard_t up_primary,
    const map<pg_shard_t, pg_info_t> &all_info,
    bool restrict_to_up_acting,
    vector<int> *want,
    set<pg_shard_t> *backfill,
    set<pg_shard_t> *acting_backfill,
    pg_shard_t *want_primary,
    ostream &ss);
  static void calc_replicated_acting(
    map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard,
    unsigned size,
    const vector<int> &acting,
    pg_shard_t acting_primary,
    const vector<int> &up,
    pg_shard_t up_primary,
    const map<pg_shard_t, pg_info_t> &all_info,
    bool restrict_to_up_acting,
    vector<int> *want,
    set<pg_shard_t> *backfill,
    set<pg_shard_t> *acting_backfill,
    pg_shard_t *want_primary,
    ostream &ss);
  bool choose_acting(pg_shard_t &auth_log_shard,
                     bool restrict_to_up_acting,
                     bool *history_les_bound);
  void build_might_have_unfound();
  void activate(
    ObjectStore::Transaction& t,
    epoch_t activation_epoch,
    list<Context*>& tfin,
    map<int, map<spg_t,pg_query_t> >& query_map,
    map<int,
        vector<pair<pg_notify_t, PastIntervals> > > *activator_map,
    RecoveryCtx *ctx);
  void _activate_committed(epoch_t epoch, epoch_t activation_epoch);
  void all_activated_and_committed();

  void proc_primary_info(ObjectStore::Transaction &t, const pg_info_t &info);

  bool have_unfound() const {
    return missing_loc.have_unfound();
  }
  uint64_t get_num_unfound() const {
    return missing_loc.num_unfound();
  }

  virtual void check_local() = 0;

  /**
   * @param ops_begun returns how many recovery ops the function started
   * @returns true if any useful work was accomplished; false otherwise
   */
  virtual bool start_recovery_ops(
    uint64_t max,
    ThreadPool::TPHandle &handle,
    uint64_t *ops_begun) = 0;

  void purge_strays();

  void update_heartbeat_peers();

  Context *finish_sync_event;

  void finish_recovery(list<Context*>& tfin);
  void _finish_recovery(Context *c);
  void cancel_recovery();
  void clear_recovery_state();
  virtual void _clear_recovery_state() = 0;
  virtual void check_recovery_sources(const OSDMapRef& newmap) = 0;
  void start_recovery_op(const hobject_t& soid);
  void finish_recovery_op(const hobject_t& soid, bool dequeue=false);

  void split_into(pg_t child_pgid, PG *child, unsigned split_bits);
  virtual void _split_into(pg_t child_pgid, PG *child, unsigned split_bits) = 0;

  friend class C_OSD_RepModify_Commit;

  // -- backoff --
  Mutex backoff_lock;  // orders inside Backoff::lock
  map<hobject_t,set<BackoffRef>> backoffs;

  void add_backoff(SessionRef s, const hobject_t& begin, const hobject_t& end);
  void release_backoffs(const hobject_t& begin, const hobject_t& end);
  void release_backoffs(const hobject_t& o) {
    release_backoffs(o, o);
  }
  void clear_backoffs();

  void add_pg_backoff(SessionRef s) {
    hobject_t begin = info.pgid.pgid.get_hobj_start();
    hobject_t end = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num());
    add_backoff(s, begin, end);
  }
  void release_pg_backoffs() {
    hobject_t begin = info.pgid.pgid.get_hobj_start();
    hobject_t end = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num());
    release_backoffs(begin, end);
  }

  void rm_backoff(BackoffRef b);

  // -- scrub --
  struct Scrubber {
    Scrubber();
    ~Scrubber();

    // metadata
    set<pg_shard_t> reserved_peers;
    bool reserved, reserve_failed;
    epoch_t epoch_start;

    // common to both scrubs
    bool active;
    set<pg_shard_t> waiting_on_whom;
    int shallow_errors;
    int deep_errors;
    int large_omap_objects = 0;
    int fixed;
    ScrubMap primary_scrubmap;
    ScrubMapBuilder primary_scrubmap_pos;
    epoch_t replica_scrub_start = 0;
    ScrubMap replica_scrubmap;
    ScrubMapBuilder replica_scrubmap_pos;
    map<pg_shard_t, ScrubMap> received_maps;
    OpRequestRef active_rep_scrub;
    utime_t scrub_reg_stamp;  // stamp we registered for

    // For async sleep
    bool sleeping = false;
    bool needs_sleep = true;
    utime_t sleep_start;

    // flags to indicate explicitly requested scrubs (by admin)
    bool must_scrub, must_deep_scrub, must_repair;

    // Priority to use for scrub scheduling
    unsigned priority;

    // this flag indicates whether we would like to do auto-repair of the PG or not
    bool auto_repair;

    // Maps from objects with errors to missing/inconsistent peers
    map<hobject_t, set<pg_shard_t>> missing;
    map<hobject_t, set<pg_shard_t>> inconsistent;

    // Map from object with errors to good peers
    map<hobject_t, list<pair<ScrubMap::object, pg_shard_t> >> authoritative;

    // Cleaned map pending snap metadata scrub
    ScrubMap cleaned_meta_map;

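    // clean_meta_map() below hands fully-scanned object groups (a head
    // object preceded by its clones) from cleaned_meta_map over to
    // for_meta_scrub. Unless the scrubbed range already extends to the end,
    // a trailing group whose head has not been seen yet is held back for
    // the next chunk.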
    void clean_meta_map(ScrubMap &for_meta_scrub) {
      if (end.is_max() ||
          cleaned_meta_map.objects.empty()) {
        cleaned_meta_map.swap(for_meta_scrub);
      } else {
        auto iter = cleaned_meta_map.objects.end();
        --iter; // not empty, see if clause
        auto begin = cleaned_meta_map.objects.begin();
        if (iter->first.has_snapset()) {
          ++iter;
        } else {
          while (iter != begin) {
            auto next = iter--;
            if (next->first.get_head() != iter->first.get_head()) {
              ++iter;
              break;
            }
          }
        }
        for_meta_scrub.objects.insert(begin, iter);
        cleaned_meta_map.objects.erase(begin, iter);
      }
    }

    // digest updates which we are waiting on
    int num_digest_updates_pending;

    // chunky scrub
    hobject_t start, end;  // [start,end)
    hobject_t max_end;     // Largest end that may have been sent to replicas
    eversion_t subset_last_update;

    // chunky scrub state
    enum State {
      INACTIVE,
      NEW_CHUNK,
      WAIT_PUSHES,
      WAIT_LAST_UPDATE,
      BUILD_MAP,
      BUILD_MAP_DONE,
      WAIT_REPLICAS,
      COMPARE_MAPS,
      WAIT_DIGEST_UPDATES,
      FINISH,
      BUILD_MAP_REPLICA,
    } state;

    std::unique_ptr<Scrub::Store> store;
    // deep scrub
    bool deep;
    int preempt_left;
    int preempt_divisor;

    list<Context*> callbacks;
    void add_callback(Context *context) {
      callbacks.push_back(context);
    }
    void run_callbacks() {
      list<Context*> to_run;
      to_run.swap(callbacks);
      for (list<Context*>::iterator i = to_run.begin();
           i != to_run.end();
           ++i) {
        (*i)->complete(0);
      }
    }

    static const char *state_string(const PG::Scrubber::State& state) {
      const char *ret = NULL;
      switch( state )
      {
        case INACTIVE: ret = "INACTIVE"; break;
        case NEW_CHUNK: ret = "NEW_CHUNK"; break;
        case WAIT_PUSHES: ret = "WAIT_PUSHES"; break;
        case WAIT_LAST_UPDATE: ret = "WAIT_LAST_UPDATE"; break;
        case BUILD_MAP: ret = "BUILD_MAP"; break;
        case BUILD_MAP_DONE: ret = "BUILD_MAP_DONE"; break;
        case WAIT_REPLICAS: ret = "WAIT_REPLICAS"; break;
        case COMPARE_MAPS: ret = "COMPARE_MAPS"; break;
        case WAIT_DIGEST_UPDATES: ret = "WAIT_DIGEST_UPDATES"; break;
        case FINISH: ret = "FINISH"; break;
        case BUILD_MAP_REPLICA: ret = "BUILD_MAP_REPLICA"; break;
      }
      return ret;
    }

    bool is_chunky_scrub_active() const { return state != INACTIVE; }

    // clear all state
    void reset() {
      active = false;
      waiting_on_whom.clear();
      if (active_rep_scrub) {
        active_rep_scrub = OpRequestRef();
      }
      received_maps.clear();

      must_scrub = false;
      must_deep_scrub = false;
      must_repair = false;
      auto_repair = false;

      state = PG::Scrubber::INACTIVE;
      start = hobject_t();
      end = hobject_t();
      max_end = hobject_t();
      subset_last_update = eversion_t();
      shallow_errors = 0;
      deep_errors = 0;
      large_omap_objects = 0;
      fixed = 0;
      deep = false;
      run_callbacks();
      inconsistent.clear();
      missing.clear();
      authoritative.clear();
      num_digest_updates_pending = 0;
      primary_scrubmap = ScrubMap();
      primary_scrubmap_pos.reset();
      replica_scrubmap = ScrubMap();
      replica_scrubmap_pos.reset();
      cleaned_meta_map = ScrubMap();
      sleeping = false;
      needs_sleep = true;
      sleep_start = utime_t();
    }

    void create_results(const hobject_t& obj);
    void cleanup_store(ObjectStore::Transaction *t);
  } scrubber;

  bool scrub_after_recovery;

  int active_pushes;

  bool scrub_can_preempt = false;
  bool scrub_preempted = false;

  // We allow some number of preemptions of the scrub, which means we do
  // not block; then we start to block. Once we start blocking, we do
  // not stop until the scrub range is completed.
  bool write_blocked_by_scrub(const hobject_t &soid);

  /// true if the given range intersects the scrub interval in any way
  bool range_intersects_scrub(const hobject_t &start, const hobject_t& end);

  void repair_object(
    const hobject_t& soid, list<pair<ScrubMap::object, pg_shard_t> > *ok_peers,
    pg_shard_t bad_peer);

  void scrub(epoch_t queued, ThreadPool::TPHandle &handle);
  void chunky_scrub(ThreadPool::TPHandle &handle);
  void scrub_compare_maps();
  /**
   * return true if any inconsistency/missing is repaired, false otherwise
   */
  bool scrub_process_inconsistent();
  bool ops_blocked_by_scrub() const;
  void scrub_finish();
  void scrub_clear_state();
  void _scan_snaps(ScrubMap &map);
  void _repair_oinfo_oid(ScrubMap &map);
  void _scan_rollback_obs(
    const vector<ghobject_t> &rollback_obs,
    ThreadPool::TPHandle &handle);
  void _request_scrub_map(pg_shard_t replica, eversion_t version,
                          hobject_t start, hobject_t end, bool deep,
                          bool allow_preemption);
  int build_scrub_map_chunk(
    ScrubMap &map,
    ScrubMapBuilder &pos,
    hobject_t start, hobject_t end, bool deep,
    ThreadPool::TPHandle &handle);
  /**
   * returns true if [begin, end) is good to scrub at this time
   * a false return value obliges the implementer to requeue scrub when the
   * condition preventing scrub clears
   */
  virtual bool _range_available_for_scrub(
    const hobject_t &begin, const hobject_t &end) = 0;
  virtual void scrub_snapshot_metadata(
    ScrubMap &map,
    const std::map<hobject_t,
                   pair<boost::optional<uint32_t>,
                        boost::optional<uint32_t>>> &missing_digest) { }
  virtual void _scrub_clear_state() { }
  virtual void _scrub_finish() { }
  virtual void split_colls(
    spg_t child,
    int split_bits,
    int seed,
    const pg_pool_t *pool,
    ObjectStore::Transaction *t) = 0;
  void clear_scrub_reserved();
  void scrub_reserve_replicas();
  void scrub_unreserve_replicas();
  bool scrub_all_replicas_reserved() const;
  bool sched_scrub();
  void reg_next_scrub();
  void unreg_next_scrub();

  void replica_scrub(
    OpRequestRef op,
    ThreadPool::TPHandle &handle);
  void do_replica_scrub_map(OpRequestRef op);
  void sub_op_scrub_map(OpRequestRef op);

  void handle_scrub_reserve_request(OpRequestRef op);
  void handle_scrub_reserve_grant(OpRequestRef op, pg_shard_t from);
  void handle_scrub_reserve_reject(OpRequestRef op, pg_shard_t from);
  void handle_scrub_reserve_release(OpRequestRef op);

  void reject_reservation();
  void schedule_backfill_retry(float retry);
  void schedule_recovery_retry(float retry);

  // -- recovery state --

  template <class EVT>
  struct QueuePeeringEvt : Context {
    PGRef pg;
    epoch_t epoch;
    EVT evt;
    QueuePeeringEvt(PG *pg, epoch_t epoch, EVT evt) :
      pg(pg), epoch(epoch), evt(evt) {}
    void finish(int r) override {
      pg->lock();
      pg->queue_peering_event(PG::CephPeeringEvtRef(
                                new PG::CephPeeringEvt(
                                  epoch,
                                  epoch,
                                  evt)));
      pg->unlock();
    }
  };

  class CephPeeringEvt {
    epoch_t epoch_sent;
    epoch_t epoch_requested;
    boost::intrusive_ptr< const boost::statechart::event_base > evt;
    string desc;
  public:
    MEMPOOL_CLASS_HELPERS();
    template <class T>
    CephPeeringEvt(epoch_t epoch_sent,
                   epoch_t epoch_requested,
                   const T &evt_) :
      epoch_sent(epoch_sent), epoch_requested(epoch_requested),
      evt(evt_.intrusive_from_this()) {
      stringstream out;
      out << "epoch_sent: " << epoch_sent
          << " epoch_requested: " << epoch_requested << " ";
      evt_.print(&out);
      desc = out.str();
    }
    epoch_t get_epoch_sent() { return epoch_sent; }
    epoch_t get_epoch_requested() { return epoch_requested; }
    const boost::statechart::event_base &get_event() { return *evt; }
    string get_desc() { return desc; }
  };
  typedef ceph::shared_ptr<CephPeeringEvt> CephPeeringEvtRef;
  list<CephPeeringEvtRef> peering_queue;  // op queue
  list<CephPeeringEvtRef> peering_waiters;
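  // Illustrative sketch: peering events are wrapped in CephPeeringEvt and
  // queued on the PG (QueuePeeringEvt::finish() above does exactly this):
  //
  //   pg->queue_peering_event(CephPeeringEvtRef(
  //     new CephPeeringEvt(epoch_sent, epoch_requested,
  //                        MNotifyRec(from, notify, features))));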

  struct QueryState : boost::statechart::event< QueryState > {
    Formatter *f;
    explicit QueryState(Formatter *f) : f(f) {}
    void print(std::ostream *out) const {
      *out << "Query";
    }
  };

  struct MInfoRec : boost::statechart::event< MInfoRec > {
    pg_shard_t from;
    pg_info_t info;
    epoch_t msg_epoch;
    MInfoRec(pg_shard_t from, const pg_info_t &info, epoch_t msg_epoch) :
      from(from), info(info), msg_epoch(msg_epoch) {}
    void print(std::ostream *out) const {
      *out << "MInfoRec from " << from << " info: " << info;
    }
  };

  struct MLogRec : boost::statechart::event< MLogRec > {
    pg_shard_t from;
    boost::intrusive_ptr<MOSDPGLog> msg;
    MLogRec(pg_shard_t from, MOSDPGLog *msg) :
      from(from), msg(msg) {}
    void print(std::ostream *out) const {
      *out << "MLogRec from " << from;
    }
  };

  struct MNotifyRec : boost::statechart::event< MNotifyRec > {
    pg_shard_t from;
    pg_notify_t notify;
    uint64_t features;
    MNotifyRec(pg_shard_t from, const pg_notify_t &notify, uint64_t f) :
      from(from), notify(notify), features(f) {}
    void print(std::ostream *out) const {
      *out << "MNotifyRec from " << from << " notify: " << notify
           << " features: 0x" << hex << features << dec;
    }
  };

  struct MQuery : boost::statechart::event< MQuery > {
    pg_shard_t from;
    pg_query_t query;
    epoch_t query_epoch;
    MQuery(pg_shard_t from, const pg_query_t &query, epoch_t query_epoch):
      from(from), query(query), query_epoch(query_epoch) {}
    void print(std::ostream *out) const {
      *out << "MQuery from " << from
           << " query_epoch " << query_epoch
           << " query: " << query;
    }
  };

  struct AdvMap : boost::statechart::event< AdvMap > {
    OSDMapRef osdmap;
    OSDMapRef lastmap;
    vector<int> newup, newacting;
    int up_primary, acting_primary;
    AdvMap(
      OSDMapRef osdmap, OSDMapRef lastmap,
      vector<int>& newup, int up_primary,
      vector<int>& newacting, int acting_primary):
      osdmap(osdmap), lastmap(lastmap),
      newup(newup),
      newacting(newacting),
      up_primary(up_primary),
      acting_primary(acting_primary) {}
    void print(std::ostream *out) const {
      *out << "AdvMap";
    }
  };

  struct ActMap : boost::statechart::event< ActMap > {
    ActMap() : boost::statechart::event< ActMap >() {}
    void print(std::ostream *out) const {
      *out << "ActMap";
    }
  };
  struct Activate : boost::statechart::event< Activate > {
    epoch_t activation_epoch;
    explicit Activate(epoch_t q) : boost::statechart::event< Activate >(),
                                   activation_epoch(q) {}
    void print(std::ostream *out) const {
      *out << "Activate from " << activation_epoch;
    }
  };
  struct RequestBackfillPrio : boost::statechart::event< RequestBackfillPrio > {
    unsigned priority;
    explicit RequestBackfillPrio(unsigned prio) :
      boost::statechart::event< RequestBackfillPrio >(),
      priority(prio) {}
    void print(std::ostream *out) const {
      *out << "RequestBackfillPrio: priority " << priority;
    }
  };
#define TrivialEvent(T) struct T : boost::statechart::event< T > { \
    T() : boost::statechart::event< T >() {}                       \
    void print(std::ostream *out) const {                          \
      *out << #T;                                                  \
    }                                                              \
  };
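  // For reference, TrivialEvent(Example) expands to (hypothetical event name):
  //   struct Example : boost::statechart::event< Example > {
  //     Example() : boost::statechart::event< Example >() {}
  //     void print(std::ostream *out) const { *out << "Example"; }
  //   };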
1734 struct DeferBackfill : boost::statechart::event<DeferBackfill> {
1735 float delay;
1736 explicit DeferBackfill(float delay) : delay(delay) {}
1737 void print(std::ostream *out) const {
1738 *out << "DeferBackfill: delay " << delay;
1739 }
1740 };
1741 struct DeferRecovery : boost::statechart::event<DeferRecovery> {
1742 float delay;
1743 explicit DeferRecovery(float delay) : delay(delay) {}
1744 void print(std::ostream *out) const {
1745 *out << "DeferRecovery: delay " << delay;
1746 }
1747 };
1748 struct UnfoundBackfill : boost::statechart::event<UnfoundBackfill> {
1749 explicit UnfoundBackfill() {}
1750 void print(std::ostream *out) const {
1751 *out << "UnfoundBackfill";
1752 }
1753 };
1754 struct UnfoundRecovery : boost::statechart::event<UnfoundRecovery> {
1755 explicit UnfoundRecovery() {}
1756 void print(std::ostream *out) const {
1757 *out << "UnfoundRecovery";
1758 }
1759 };
1760 protected:
1761 TrivialEvent(Initialize)
1762 TrivialEvent(Load)
1763 TrivialEvent(GotInfo)
1764 TrivialEvent(NeedUpThru)
1765 TrivialEvent(NullEvt)
1766 TrivialEvent(FlushedEvt)
1767 TrivialEvent(Backfilled)
1768 TrivialEvent(LocalBackfillReserved)
1769 TrivialEvent(RemoteBackfillReserved)
1770 TrivialEvent(RejectRemoteReservation)
1771 TrivialEvent(RemoteReservationRejected)
1772 TrivialEvent(RemoteReservationCanceled)
1773 TrivialEvent(RequestBackfill)
1774 TrivialEvent(RequestRecovery)
1775 TrivialEvent(RecoveryDone)
1776 TrivialEvent(BackfillTooFull)
1777 TrivialEvent(RecoveryTooFull)
1778
1779 TrivialEvent(MakePrimary)
1780 TrivialEvent(MakeStray)
1781 TrivialEvent(NeedActingChange)
1782 TrivialEvent(IsIncomplete)
1783 TrivialEvent(IsDown)
1784
1785 TrivialEvent(AllReplicasRecovered)
1786 TrivialEvent(DoRecovery)
1787 TrivialEvent(LocalRecoveryReserved)
1788 TrivialEvent(RemoteRecoveryReserved)
1789 TrivialEvent(AllRemotesReserved)
1790 TrivialEvent(AllBackfillsReserved)
1791 TrivialEvent(GoClean)
1792
1793 TrivialEvent(AllReplicasActivated)
1794
1795 TrivialEvent(IntervalFlush)
1796
1797 /* Encapsulates PG recovery process */
1798 class RecoveryState {
1799 void start_handle(RecoveryCtx *new_ctx);
1800 void end_handle();
1801 public:
1802 void begin_block_outgoing();
1803 void end_block_outgoing();
1804 void clear_blocked_outgoing();
1805 private:
1806
1807 /* States */
1808 struct Initial;
1809 class RecoveryMachine : public boost::statechart::state_machine< RecoveryMachine, Initial > {
1810 RecoveryState *state;
1811 public:
1812 PG *pg;
1813
1814 utime_t event_time;
1815 uint64_t event_count;
1816
1817 void clear_event_counters() {
1818 event_time = utime_t();
1819 event_count = 0;
1820 }
1821
1822 void log_enter(const char *state_name);
1823 void log_exit(const char *state_name, utime_t duration);
1824
1825 RecoveryMachine(RecoveryState *state, PG *pg) : state(state), pg(pg), event_count(0) {}
1826
1827 /* Accessor functions for state methods */
1828 ObjectStore::Transaction* get_cur_transaction() {
1829 assert(state->rctx);
1830 assert(state->rctx->transaction);
1831 return state->rctx->transaction;
1832 }
1833
1834 void send_query(pg_shard_t to, const pg_query_t &query) {
1835 assert(state->rctx);
1836 assert(state->rctx->query_map);
1837 (*state->rctx->query_map)[to.osd][spg_t(pg->info.pgid.pgid, to.shard)] =
1838 query;
1839 }
1840
1841 map<int, map<spg_t, pg_query_t> > *get_query_map() {
1842 assert(state->rctx);
1843 assert(state->rctx->query_map);
1844 return state->rctx->query_map;
1845 }
1846
1847 map<int, vector<pair<pg_notify_t, PastIntervals> > > *get_info_map() {
1848 assert(state->rctx);
1849 assert(state->rctx->info_map);
1850 return state->rctx->info_map;
1851 }
1852
1853 list< Context* > *get_on_safe_context_list() {
1854 assert(state->rctx);
1855 assert(state->rctx->on_safe);
1856 return &(state->rctx->on_safe->contexts);
1857 }
1858
1859 list< Context * > *get_on_applied_context_list() {
1860 assert(state->rctx);
1861 assert(state->rctx->on_applied);
1862 return &(state->rctx->on_applied->contexts);
1863 }
1864
1865 RecoveryCtx *get_recovery_ctx() { return &*(state->rctx); }
1866
1867 void send_notify(pg_shard_t to,
1868 const pg_notify_t &info, const PastIntervals &pi) {
1869 assert(state->rctx);
1870 state->rctx->send_notify(to, info, pi);
1871 }
1872 };
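  // States reach the accessors above through boost::statechart's context<>()
  // mechanism; a reaction body typically looks like this (sketch only, the
  // state and event names are illustrative):
  //
  //   boost::statechart::result SomeState::react(const SomeEvt &evt) {
  //     context< RecoveryMachine >().send_query(peer, query);
  //     return discard_event();
  //   }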
1873 friend class RecoveryMachine;
1874
  /* States (nested as defined below) */
  // Initial
  // Reset
  // Started
  //   Start
  //   Primary
  //     WaitActingChange
  //     Peering
  //       GetInfo
  //       GetLog
  //       GetMissing
  //       WaitUpThru
  //       Down
  //       Incomplete
  //     Active
  //       Activating
  //       Clean
  //       Recovered
  //       Backfilling
  //       WaitRemoteBackfillReserved
  //       WaitLocalBackfillReserved
  //       NotBackfilling
  //       NotRecovering
  //       Recovering
  //       WaitRemoteRecoveryReserved
  //       WaitLocalRecoveryReserved
  //   ReplicaActive
  //     RepNotRecovering
  //     RepRecovering
  //     RepWaitBackfillReserved
  //     RepWaitRecoveryReserved
  //   Stray
  // Crashed
1906
1907 struct Crashed : boost::statechart::state< Crashed, RecoveryMachine >, NamedState {
1908 explicit Crashed(my_context ctx);
1909 };
1910
1911 struct Reset;
1912
1913 struct Initial : boost::statechart::state< Initial, RecoveryMachine >, NamedState {
1914 explicit Initial(my_context ctx);
1915 void exit();
1916
1917 typedef boost::mpl::list <
1918 boost::statechart::transition< Initialize, Reset >,
1919 boost::statechart::custom_reaction< Load >,
1920 boost::statechart::custom_reaction< NullEvt >,
1921 boost::statechart::transition< boost::statechart::event_base, Crashed >
1922 > reactions;
1923
1924 boost::statechart::result react(const Load&);
1925 boost::statechart::result react(const MNotifyRec&);
1926 boost::statechart::result react(const MInfoRec&);
1927 boost::statechart::result react(const MLogRec&);
1928 boost::statechart::result react(const boost::statechart::event_base&) {
1929 return discard_event();
1930 }
1931 };
1932
1933 struct Reset : boost::statechart::state< Reset, RecoveryMachine >, NamedState {
1934 explicit Reset(my_context ctx);
1935 void exit();
1936
1937 typedef boost::mpl::list <
1938 boost::statechart::custom_reaction< QueryState >,
1939 boost::statechart::custom_reaction< AdvMap >,
1940 boost::statechart::custom_reaction< ActMap >,
1941 boost::statechart::custom_reaction< NullEvt >,
1942 boost::statechart::custom_reaction< FlushedEvt >,
1943 boost::statechart::custom_reaction< IntervalFlush >,
1944 boost::statechart::transition< boost::statechart::event_base, Crashed >
1945 > reactions;
1946 boost::statechart::result react(const QueryState& q);
1947 boost::statechart::result react(const AdvMap&);
1948 boost::statechart::result react(const ActMap&);
1949 boost::statechart::result react(const FlushedEvt&);
1950 boost::statechart::result react(const IntervalFlush&);
1951 boost::statechart::result react(const boost::statechart::event_base&) {
1952 return discard_event();
1953 }
1954 };
1955
1956 struct Start;
1957
1958 struct Started : boost::statechart::state< Started, RecoveryMachine, Start >, NamedState {
1959 explicit Started(my_context ctx);
1960 void exit();
1961
1962 typedef boost::mpl::list <
1963 boost::statechart::custom_reaction< QueryState >,
1964 boost::statechart::custom_reaction< AdvMap >,
1965 boost::statechart::custom_reaction< NullEvt >,
1966 boost::statechart::custom_reaction< FlushedEvt >,
1967 boost::statechart::custom_reaction< IntervalFlush >,
1968 boost::statechart::transition< boost::statechart::event_base, Crashed >
1969 > reactions;
1970 boost::statechart::result react(const QueryState& q);
1971 boost::statechart::result react(const AdvMap&);
1972 boost::statechart::result react(const FlushedEvt&);
1973 boost::statechart::result react(const IntervalFlush&);
1974 boost::statechart::result react(const boost::statechart::event_base&) {
1975 return discard_event();
1976 }
1977 };
1978
1979 struct Primary;
1980 struct Stray;
1981
1982 struct Start : boost::statechart::state< Start, Started >, NamedState {
1983 explicit Start(my_context ctx);
1984 void exit();
1985
1986 typedef boost::mpl::list <
1987 boost::statechart::transition< MakePrimary, Primary >,
1988 boost::statechart::transition< MakeStray, Stray >
1989 > reactions;
1990 };
1991
1992 struct Peering;
1993 struct WaitActingChange;
1994 struct Incomplete;
1995 struct Down;
1996
1997 struct Primary : boost::statechart::state< Primary, Started, Peering >, NamedState {
1998 explicit Primary(my_context ctx);
1999 void exit();
2000
2001 typedef boost::mpl::list <
2002 boost::statechart::custom_reaction< ActMap >,
2003 boost::statechart::custom_reaction< MNotifyRec >,
2004 boost::statechart::transition< NeedActingChange, WaitActingChange >
2005 > reactions;
2006 boost::statechart::result react(const ActMap&);
2007 boost::statechart::result react(const MNotifyRec&);
2008 };
2009
2010 struct WaitActingChange : boost::statechart::state< WaitActingChange, Primary>,
2011 NamedState {
2012 typedef boost::mpl::list <
2013 boost::statechart::custom_reaction< QueryState >,
2014 boost::statechart::custom_reaction< AdvMap >,
2015 boost::statechart::custom_reaction< MLogRec >,
2016 boost::statechart::custom_reaction< MInfoRec >,
2017 boost::statechart::custom_reaction< MNotifyRec >
2018 > reactions;
2019 explicit WaitActingChange(my_context ctx);
2020 boost::statechart::result react(const QueryState& q);
2021 boost::statechart::result react(const AdvMap&);
2022 boost::statechart::result react(const MLogRec&);
2023 boost::statechart::result react(const MInfoRec&);
2024 boost::statechart::result react(const MNotifyRec&);
2025 void exit();
2026 };
2027
2028 struct GetInfo;
2029 struct Active;
2030
2031 struct Peering : boost::statechart::state< Peering, Primary, GetInfo >, NamedState {
2032 PastIntervals::PriorSet prior_set;
    bool history_les_bound; ///< need osd_find_best_info_ignore_history_les
2034
2035 explicit Peering(my_context ctx);
2036 void exit();
2037
2038 typedef boost::mpl::list <
2039 boost::statechart::custom_reaction< QueryState >,
2040 boost::statechart::transition< Activate, Active >,
2041 boost::statechart::custom_reaction< AdvMap >
2042 > reactions;
2043 boost::statechart::result react(const QueryState& q);
2044 boost::statechart::result react(const AdvMap &advmap);
2045 };
2046
2047 struct WaitLocalRecoveryReserved;
2048 struct Activating;
2049 struct Active : boost::statechart::state< Active, Primary, Activating >, NamedState {
2050 explicit Active(my_context ctx);
2051 void exit();
2052
2053 const set<pg_shard_t> remote_shards_to_reserve_recovery;
2054 const set<pg_shard_t> remote_shards_to_reserve_backfill;
2055 bool all_replicas_activated;
2056
2057 typedef boost::mpl::list <
2058 boost::statechart::custom_reaction< QueryState >,
2059 boost::statechart::custom_reaction< ActMap >,
2060 boost::statechart::custom_reaction< AdvMap >,
2061 boost::statechart::custom_reaction< MInfoRec >,
2062 boost::statechart::custom_reaction< MNotifyRec >,
2063 boost::statechart::custom_reaction< MLogRec >,
2064 boost::statechart::custom_reaction< Backfilled >,
2065 boost::statechart::custom_reaction< AllReplicasActivated >,
2066 boost::statechart::custom_reaction< DeferRecovery >,
2067 boost::statechart::custom_reaction< DeferBackfill >,
2068 boost::statechart::custom_reaction< UnfoundRecovery >,
2069 boost::statechart::custom_reaction< UnfoundBackfill >,
2070 boost::statechart::custom_reaction< DoRecovery>
2071 > reactions;
2072 boost::statechart::result react(const QueryState& q);
2073 boost::statechart::result react(const ActMap&);
2074 boost::statechart::result react(const AdvMap&);
2075 boost::statechart::result react(const MInfoRec& infoevt);
2076 boost::statechart::result react(const MNotifyRec& notevt);
2077 boost::statechart::result react(const MLogRec& logevt);
2078 boost::statechart::result react(const Backfilled&) {
2079 return discard_event();
2080 }
2081 boost::statechart::result react(const AllReplicasActivated&);
2082 boost::statechart::result react(const DeferRecovery& evt) {
2083 return discard_event();
2084 }
2085 boost::statechart::result react(const DeferBackfill& evt) {
2086 return discard_event();
2087 }
2088 boost::statechart::result react(const UnfoundRecovery& evt) {
2089 return discard_event();
2090 }
2091 boost::statechart::result react(const UnfoundBackfill& evt) {
2092 return discard_event();
2093 }
2094 boost::statechart::result react(const DoRecovery&) {
2095 return discard_event();
2096 }
2097 };
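    // Note: boost::statechart delivers an event to the innermost active state
    // first, so the discard_event() reactions above are fallbacks: if no
    // substate of Active consumes e.g. DeferRecovery, it is dropped here
    // rather than bubbling out to Started's catch-all transition to Crashed.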
2098
2099 struct Clean : boost::statechart::state< Clean, Active >, NamedState {
2100 typedef boost::mpl::list<
2101 boost::statechart::transition< DoRecovery, WaitLocalRecoveryReserved >
2102 > reactions;
2103 explicit Clean(my_context ctx);
2104 void exit();
2105 };
2106
2107 struct Recovered : boost::statechart::state< Recovered, Active >, NamedState {
2108 typedef boost::mpl::list<
2109 boost::statechart::transition< GoClean, Clean >,
2110 boost::statechart::transition< DoRecovery, WaitLocalRecoveryReserved >,
2111 boost::statechart::custom_reaction< AllReplicasActivated >
2112 > reactions;
2113 explicit Recovered(my_context ctx);
2114 void exit();
2115 boost::statechart::result react(const AllReplicasActivated&) {
2116 post_event(GoClean());
2117 return forward_event();
2118 }
2119 };
2120
2121 struct Backfilling : boost::statechart::state< Backfilling, Active >, NamedState {
2122 typedef boost::mpl::list<
2123 boost::statechart::transition< Backfilled, Recovered >,
2124 boost::statechart::custom_reaction< DeferBackfill >,
2125 boost::statechart::custom_reaction< UnfoundBackfill >,
2126 boost::statechart::custom_reaction< RemoteReservationRejected >
2127 > reactions;
2128 explicit Backfilling(my_context ctx);
2129 boost::statechart::result react(const RemoteReservationRejected& evt);
2130 boost::statechart::result react(const DeferBackfill& evt);
2131 boost::statechart::result react(const UnfoundBackfill& evt);
2132 void exit();
2133 };
2134
2135 struct WaitRemoteBackfillReserved : boost::statechart::state< WaitRemoteBackfillReserved, Active >, NamedState {
2136 typedef boost::mpl::list<
2137 boost::statechart::custom_reaction< RemoteBackfillReserved >,
2138 boost::statechart::custom_reaction< RemoteReservationRejected >,
2139 boost::statechart::transition< AllBackfillsReserved, Backfilling >
2140 > reactions;
2141 set<pg_shard_t>::const_iterator backfill_osd_it;
2142 explicit WaitRemoteBackfillReserved(my_context ctx);
2143 void exit();
2144 boost::statechart::result react(const RemoteBackfillReserved& evt);
2145 boost::statechart::result react(const RemoteReservationRejected& evt);
2146 };
2147
2148 struct WaitLocalBackfillReserved : boost::statechart::state< WaitLocalBackfillReserved, Active >, NamedState {
2149 typedef boost::mpl::list<
2150 boost::statechart::transition< LocalBackfillReserved, WaitRemoteBackfillReserved >
2151 > reactions;
2152 explicit WaitLocalBackfillReserved(my_context ctx);
2153 void exit();
2154 };
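    // Primary-side backfill reservation flow, as wired by the transitions in
    // the surrounding states:
    //
    //   RequestBackfill        -> WaitLocalBackfillReserved
    //   LocalBackfillReserved  -> WaitRemoteBackfillReserved
    //     (one RemoteBackfillReserved expected per shard in
    //      remote_shards_to_reserve_backfill, tracked by backfill_osd_it)
    //   AllBackfillsReserved   -> Backfilling
    //
    // A RemoteReservationRejected along the way backs off to NotBackfilling
    // (see the react() implementations in PG.cc).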
2155
2156 struct NotBackfilling : boost::statechart::state< NotBackfilling, Active>, NamedState {
2157 typedef boost::mpl::list<
2158 boost::statechart::transition< RequestBackfill, WaitLocalBackfillReserved>,
2159 boost::statechart::custom_reaction< RemoteBackfillReserved >,
2160 boost::statechart::custom_reaction< RemoteReservationRejected >
2161 > reactions;
2162 explicit NotBackfilling(my_context ctx);
2163 void exit();
2164 boost::statechart::result react(const RemoteBackfillReserved& evt);
2165 boost::statechart::result react(const RemoteReservationRejected& evt);
2166 };
2167
2168 struct NotRecovering : boost::statechart::state< NotRecovering, Active>, NamedState {
2169 typedef boost::mpl::list<
2170 boost::statechart::transition< DoRecovery, WaitLocalRecoveryReserved >,
2171 boost::statechart::custom_reaction< DeferRecovery >,
2172 boost::statechart::custom_reaction< UnfoundRecovery >
2173 > reactions;
2174 explicit NotRecovering(my_context ctx);
2175 boost::statechart::result react(const DeferRecovery& evt) {
2176 /* no-op */
2177 return discard_event();
2178 }
2179 boost::statechart::result react(const UnfoundRecovery& evt) {
2180 /* no-op */
2181 return discard_event();
2182 }
2183 void exit();
2184 };
2185
2186 struct RepNotRecovering;
2187 struct ReplicaActive : boost::statechart::state< ReplicaActive, Started, RepNotRecovering >, NamedState {
2188 explicit ReplicaActive(my_context ctx);
2189 void exit();
2190
2191 typedef boost::mpl::list <
2192 boost::statechart::custom_reaction< QueryState >,
2193 boost::statechart::custom_reaction< ActMap >,
2194 boost::statechart::custom_reaction< MQuery >,
2195 boost::statechart::custom_reaction< MInfoRec >,
2196 boost::statechart::custom_reaction< MLogRec >,
2197 boost::statechart::custom_reaction< Activate >,
2198 boost::statechart::custom_reaction< DeferRecovery >,
2199 boost::statechart::custom_reaction< DeferBackfill >,
2200 boost::statechart::custom_reaction< UnfoundRecovery >,
2201 boost::statechart::custom_reaction< UnfoundBackfill >
2202 > reactions;
2203 boost::statechart::result react(const QueryState& q);
2204 boost::statechart::result react(const MInfoRec& infoevt);
2205 boost::statechart::result react(const MLogRec& logevt);
2206 boost::statechart::result react(const ActMap&);
2207 boost::statechart::result react(const MQuery&);
2208 boost::statechart::result react(const Activate&);
2209 boost::statechart::result react(const DeferRecovery& evt) {
2210 return discard_event();
2211 }
2212 boost::statechart::result react(const DeferBackfill& evt) {
2213 return discard_event();
2214 }
2215 boost::statechart::result react(const UnfoundRecovery& evt) {
2216 return discard_event();
2217 }
2218 boost::statechart::result react(const UnfoundBackfill& evt) {
2219 return discard_event();
2220 }
2221 };
2222
2223 struct RepRecovering : boost::statechart::state< RepRecovering, ReplicaActive >, NamedState {
2224 typedef boost::mpl::list<
2225 boost::statechart::transition< RecoveryDone, RepNotRecovering >,
2226 // for compat with old peers
2227 boost::statechart::transition< RemoteReservationRejected, RepNotRecovering >,
2228 boost::statechart::transition< RemoteReservationCanceled, RepNotRecovering >,
2229 boost::statechart::custom_reaction< BackfillTooFull >
2230 > reactions;
2231 explicit RepRecovering(my_context ctx);
2232 boost::statechart::result react(const BackfillTooFull &evt);
2233 void exit();
2234 };
2235
2236 struct RepWaitBackfillReserved : boost::statechart::state< RepWaitBackfillReserved, ReplicaActive >, NamedState {
2237 typedef boost::mpl::list<
2238 boost::statechart::custom_reaction< RemoteBackfillReserved >,
2239 boost::statechart::custom_reaction< RejectRemoteReservation >,
2240 boost::statechart::custom_reaction< RemoteReservationRejected >,
2241 boost::statechart::custom_reaction< RemoteReservationCanceled >
2242 > reactions;
2243 explicit RepWaitBackfillReserved(my_context ctx);
2244 void exit();
2245 boost::statechart::result react(const RemoteBackfillReserved &evt);
2246 boost::statechart::result react(const RejectRemoteReservation &evt);
2247 boost::statechart::result react(const RemoteReservationRejected &evt);
2248 boost::statechart::result react(const RemoteReservationCanceled &evt);
2249 };
2250
2251 struct RepWaitRecoveryReserved : boost::statechart::state< RepWaitRecoveryReserved, ReplicaActive >, NamedState {
2252 typedef boost::mpl::list<
2253 boost::statechart::custom_reaction< RemoteRecoveryReserved >,
2254 // for compat with old peers
2255 boost::statechart::custom_reaction< RemoteReservationRejected >,
2256 boost::statechart::custom_reaction< RemoteReservationCanceled >
2257 > reactions;
2258 explicit RepWaitRecoveryReserved(my_context ctx);
2259 void exit();
2260 boost::statechart::result react(const RemoteRecoveryReserved &evt);
2261 boost::statechart::result react(const RemoteReservationRejected &evt) {
2262 // for compat with old peers
2263 post_event(RemoteReservationCanceled());
2264 return discard_event();
2265 }
2266 boost::statechart::result react(const RemoteReservationCanceled &evt);
2267 };
2268
2269 struct RepNotRecovering : boost::statechart::state< RepNotRecovering, ReplicaActive>, NamedState {
2270 typedef boost::mpl::list<
2271 boost::statechart::custom_reaction< RequestBackfillPrio >,
2272 boost::statechart::transition< RequestRecovery, RepWaitRecoveryReserved >,
2273 boost::statechart::custom_reaction< RejectRemoteReservation >,
2274 boost::statechart::transition< RemoteReservationRejected, RepNotRecovering >,
2275 boost::statechart::transition< RemoteReservationCanceled, RepNotRecovering >,
2276 boost::statechart::transition< RecoveryDone, RepNotRecovering > // for compat with pre-reservation peers
2277 > reactions;
2278 explicit RepNotRecovering(my_context ctx);
2279 boost::statechart::result react(const RequestBackfillPrio &evt);
2280 boost::statechart::result react(const RejectRemoteReservation &evt);
2281 void exit();
2282 };
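    // Replica-side reservation flow: from RepNotRecovering, the primary's
    // RequestBackfillPrio or RequestRecovery moves to RepWaitBackfillReserved
    // / RepWaitRecoveryReserved while a local reservation is acquired; a
    // RemoteBackfillReserved / RemoteRecoveryReserved grant enters
    // RepRecovering, and RecoveryDone (or a rejection/cancel) returns to
    // RepNotRecovering.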
2283
2284 struct Recovering : boost::statechart::state< Recovering, Active >, NamedState {
2285 typedef boost::mpl::list <
2286 boost::statechart::custom_reaction< AllReplicasRecovered >,
2287 boost::statechart::custom_reaction< DeferRecovery >,
2288 boost::statechart::custom_reaction< UnfoundRecovery >,
2289 boost::statechart::custom_reaction< RequestBackfill >
2290 > reactions;
2291 explicit Recovering(my_context ctx);
2292 void exit();
2293 void release_reservations(bool cancel = false);
2294 boost::statechart::result react(const AllReplicasRecovered &evt);
2295 boost::statechart::result react(const DeferRecovery& evt);
2296 boost::statechart::result react(const UnfoundRecovery& evt);
2297 boost::statechart::result react(const RequestBackfill &evt);
2298 };
2299
2300 struct WaitRemoteRecoveryReserved : boost::statechart::state< WaitRemoteRecoveryReserved, Active >, NamedState {
2301 typedef boost::mpl::list <
2302 boost::statechart::custom_reaction< RemoteRecoveryReserved >,
2303 boost::statechart::transition< AllRemotesReserved, Recovering >
2304 > reactions;
2305 set<pg_shard_t>::const_iterator remote_recovery_reservation_it;
2306 explicit WaitRemoteRecoveryReserved(my_context ctx);
2307 boost::statechart::result react(const RemoteRecoveryReserved &evt);
2308 void exit();
2309 };
2310
2311 struct WaitLocalRecoveryReserved : boost::statechart::state< WaitLocalRecoveryReserved, Active >, NamedState {
2312 typedef boost::mpl::list <
2313 boost::statechart::transition< LocalRecoveryReserved, WaitRemoteRecoveryReserved >,
2314 boost::statechart::custom_reaction< RecoveryTooFull >
2315 > reactions;
2316 explicit WaitLocalRecoveryReserved(my_context ctx);
2317 void exit();
2318 boost::statechart::result react(const RecoveryTooFull &evt);
2319 };
2320
2321 struct Activating : boost::statechart::state< Activating, Active >, NamedState {
2322 typedef boost::mpl::list <
2323 boost::statechart::transition< AllReplicasRecovered, Recovered >,
2324 boost::statechart::transition< DoRecovery, WaitLocalRecoveryReserved >,
2325 boost::statechart::transition< RequestBackfill, WaitLocalBackfillReserved >
2326 > reactions;
2327 explicit Activating(my_context ctx);
2328 void exit();
2329 };
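    // Primary-side recovery flow, as wired by the transitions above:
    //
    //   DoRecovery             -> WaitLocalRecoveryReserved
    //   LocalRecoveryReserved  -> WaitRemoteRecoveryReserved
    //     (one RemoteRecoveryReserved expected per shard in
    //      remote_shards_to_reserve_recovery)
    //   AllRemotesReserved     -> Recovering
    //   AllReplicasRecovered   -> Recovered, then GoClean -> Clean
    //
    // Recovering may instead emit RequestBackfill to fall through to the
    // backfill path above.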
2330
2331 struct Stray : boost::statechart::state< Stray, Started >, NamedState {
2332 map<int, pair<pg_query_t, epoch_t> > pending_queries;
2333
2334 explicit Stray(my_context ctx);
2335 void exit();
2336
2337 typedef boost::mpl::list <
2338 boost::statechart::custom_reaction< MQuery >,
2339 boost::statechart::custom_reaction< MLogRec >,
2340 boost::statechart::custom_reaction< MInfoRec >,
2341 boost::statechart::custom_reaction< ActMap >,
2342 boost::statechart::custom_reaction< RecoveryDone >
2343 > reactions;
2344 boost::statechart::result react(const MQuery& query);
2345 boost::statechart::result react(const MLogRec& logevt);
2346 boost::statechart::result react(const MInfoRec& infoevt);
2347 boost::statechart::result react(const ActMap&);
2348 boost::statechart::result react(const RecoveryDone&) {
2349 return discard_event();
2350 }
2351 };
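    // A Stray PG holds data for a PG it does not (currently) serve in the
    // acting set: it answers the primary's queries and, once handed the
    // authoritative info/log, typically activates and transits to
    // ReplicaActive (see the react() implementations in PG.cc).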
2352
2353 struct GetLog;
2354
2355 struct GetInfo : boost::statechart::state< GetInfo, Peering >, NamedState {
2356 set<pg_shard_t> peer_info_requested;
2357
2358 explicit GetInfo(my_context ctx);
2359 void exit();
2360 void get_infos();
2361
2362 typedef boost::mpl::list <
2363 boost::statechart::custom_reaction< QueryState >,
2364 boost::statechart::transition< GotInfo, GetLog >,
2365 boost::statechart::custom_reaction< MNotifyRec >,
2366 boost::statechart::transition< IsDown, Down >
2367 > reactions;
2368 boost::statechart::result react(const QueryState& q);
2369 boost::statechart::result react(const MNotifyRec& infoevt);
2370 };
2371
2372 struct GotLog : boost::statechart::event< GotLog > {
2373 GotLog() : boost::statechart::event< GotLog >() {}
2374 };
2375
2376 struct GetLog : boost::statechart::state< GetLog, Peering >, NamedState {
2377 pg_shard_t auth_log_shard;
2378 boost::intrusive_ptr<MOSDPGLog> msg;
2379
2380 explicit GetLog(my_context ctx);
2381 void exit();
2382
2383 typedef boost::mpl::list <
2384 boost::statechart::custom_reaction< QueryState >,
2385 boost::statechart::custom_reaction< MLogRec >,
2386 boost::statechart::custom_reaction< GotLog >,
2387 boost::statechart::custom_reaction< AdvMap >,
2388 boost::statechart::transition< IsIncomplete, Incomplete >
2389 > reactions;
2390 boost::statechart::result react(const AdvMap&);
2391 boost::statechart::result react(const QueryState& q);
2392 boost::statechart::result react(const MLogRec& logevt);
2393 boost::statechart::result react(const GotLog&);
2394 };
2395
2396 struct WaitUpThru;
2397
2398 struct GetMissing : boost::statechart::state< GetMissing, Peering >, NamedState {
2399 set<pg_shard_t> peer_missing_requested;
2400
2401 explicit GetMissing(my_context ctx);
2402 void exit();
2403
2404 typedef boost::mpl::list <
2405 boost::statechart::custom_reaction< QueryState >,
2406 boost::statechart::custom_reaction< MLogRec >,
2407 boost::statechart::transition< NeedUpThru, WaitUpThru >
2408 > reactions;
2409 boost::statechart::result react(const QueryState& q);
2410 boost::statechart::result react(const MLogRec& logevt);
2411 };
2412
2413 struct WaitUpThru : boost::statechart::state< WaitUpThru, Peering >, NamedState {
2414 explicit WaitUpThru(my_context ctx);
2415 void exit();
2416
2417 typedef boost::mpl::list <
2418 boost::statechart::custom_reaction< QueryState >,
2419 boost::statechart::custom_reaction< ActMap >,
2420 boost::statechart::custom_reaction< MLogRec >
2421 > reactions;
2422 boost::statechart::result react(const QueryState& q);
2423 boost::statechart::result react(const ActMap& am);
2424 boost::statechart::result react(const MLogRec& logrec);
2425 };
2426
2427 struct Down : boost::statechart::state< Down, Peering>, NamedState {
2428 explicit Down(my_context ctx);
2429 typedef boost::mpl::list <
2430 boost::statechart::custom_reaction< QueryState >
2431 > reactions;
2432 boost::statechart::result react(const QueryState& infoevt);
2433 void exit();
2434 };
2435
2436 struct Incomplete : boost::statechart::state< Incomplete, Peering>, NamedState {
2437 typedef boost::mpl::list <
2438 boost::statechart::custom_reaction< AdvMap >,
2439 boost::statechart::custom_reaction< MNotifyRec >,
2440 boost::statechart::custom_reaction< QueryState >
2441 > reactions;
2442 explicit Incomplete(my_context ctx);
2443 boost::statechart::result react(const AdvMap &advmap);
2444 boost::statechart::result react(const MNotifyRec& infoevt);
2445 boost::statechart::result react(const QueryState& infoevt);
2446 void exit();
2447 };
2448
2449
2450 RecoveryMachine machine;
2451 PG *pg;
2452
2453 /// context passed in by state machine caller
2454 RecoveryCtx *orig_ctx;
2455
2456 /// populated if we are buffering messages pending a flush
2457 boost::optional<BufferedRecoveryMessages> messages_pending_flush;
2458
    /**
     * Populated between start_handle() and end_handle(); points into the
     * message lists of messages_pending_flush while outgoing messages are
     * being blocked, or into orig_ctx otherwise.
     */
2464 boost::optional<RecoveryCtx> rctx;
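    /* Handling flow (sketch): start_handle() points rctx at orig_ctx, or,
     * while outgoing messages are blocked via begin_block_outgoing(), at a
     * copy whose message lists live in messages_pending_flush; end_handle()
     * unwinds this, and end_block_outgoing() flushes whatever was buffered
     * (see PG.cc for the exact splice). */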
2465
2466 public:
2467 explicit RecoveryState(PG *pg)
2468 : machine(this, pg), pg(pg), orig_ctx(0) {
2469 machine.initiate();
2470 }
2471
2472 void handle_event(const boost::statechart::event_base &evt,
2473 RecoveryCtx *rctx) {
2474 start_handle(rctx);
2475 machine.process_event(evt);
2476 end_handle();
2477 }
2478
2479 void handle_event(CephPeeringEvtRef evt,
2480 RecoveryCtx *rctx) {
2481 start_handle(rctx);
2482 machine.process_event(evt->get_event());
2483 end_handle();
2484 }
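    // Typical driver (sketch): PG::handle_peering_event() feeds a queued
    // event through the machine while the PG lock is held, roughly:
    //
    //   if (!old_peering_evt(evt))
    //     recovery_state.handle_event(evt, rctx);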
2485
2486 } recovery_state;
2487
2488
2489 public:
2490 PG(OSDService *o, OSDMapRef curmap,
2491 const PGPool &pool, spg_t p);
2492 ~PG() override;
2493 const spg_t pg_id;
2494
2495 private:
2496 // Prevent copying
2497 explicit PG(const PG& rhs);
2498 PG& operator=(const PG& rhs);
2499 uint64_t peer_features;
2500 uint64_t acting_features;
2501 uint64_t upacting_features;
2502
2503 epoch_t last_epoch;
2504
2505 public:
2506 const spg_t& get_pgid() const { return pg_id; }
2507
2508 void set_last_scrub_stamp(utime_t t) {
2509 info.stats.last_scrub_stamp = t;
2510 info.history.last_scrub_stamp = t;
2511 }
2512
2513 void set_last_deep_scrub_stamp(utime_t t) {
2514 info.stats.last_deep_scrub_stamp = t;
2515 info.history.last_deep_scrub_stamp = t;
2516 }
2517
2518 void reset_min_peer_features() {
2519 peer_features = CEPH_FEATURES_SUPPORTED_DEFAULT;
2520 }
2521 uint64_t get_min_peer_features() const { return peer_features; }
2522 void apply_peer_features(uint64_t f) { peer_features &= f; }
2523
2524 uint64_t get_min_acting_features() const { return acting_features; }
2525 uint64_t get_min_upacting_features() const { return upacting_features; }
2526 bool perform_deletes_during_peering() const {
2527 return !(get_osdmap()->test_flag(CEPH_OSDMAP_RECOVERY_DELETES));
2528 }
2529
2530 bool hard_limit_pglog() const {
2531 return (get_osdmap()->test_flag(CEPH_OSDMAP_PGLOG_HARDLIMIT));
2532 }
2533
2534 void init_primary_up_acting(
2535 const vector<int> &newup,
2536 const vector<int> &newacting,
2537 int new_up_primary,
2538 int new_acting_primary) {
2539 actingset.clear();
2540 acting = newacting;
2541 for (uint8_t i = 0; i < acting.size(); ++i) {
2542 if (acting[i] != CRUSH_ITEM_NONE)
2543 actingset.insert(
2544 pg_shard_t(
2545 acting[i],
2546 pool.info.ec_pool() ? shard_id_t(i) : shard_id_t::NO_SHARD));
2547 }
2548 upset.clear();
2549 up = newup;
2550 for (uint8_t i = 0; i < up.size(); ++i) {
2551 if (up[i] != CRUSH_ITEM_NONE)
2552 upset.insert(
2553 pg_shard_t(
2554 up[i],
2555 pool.info.ec_pool() ? shard_id_t(i) : shard_id_t::NO_SHARD));
2556 }
2557 if (!pool.info.ec_pool()) {
2558 up_primary = pg_shard_t(new_up_primary, shard_id_t::NO_SHARD);
2559 primary = pg_shard_t(new_acting_primary, shard_id_t::NO_SHARD);
2560 return;
2561 }
2562 up_primary = pg_shard_t();
2563 primary = pg_shard_t();
2564 for (uint8_t i = 0; i < up.size(); ++i) {
2565 if (up[i] == new_up_primary) {
2566 up_primary = pg_shard_t(up[i], shard_id_t(i));
2567 break;
2568 }
2569 }
2570 for (uint8_t i = 0; i < acting.size(); ++i) {
2571 if (acting[i] == new_acting_primary) {
2572 primary = pg_shard_t(acting[i], shard_id_t(i));
2573 break;
2574 }
2575 }
2576 assert(up_primary.osd == new_up_primary);
2577 assert(primary.osd == new_acting_primary);
2578 }
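  // Worked example: for an EC pool with newup = [3,1,5] and
  // new_up_primary = 3, the loops above yield
  // up_primary = pg_shard_t(3, shard_id_t(0)), i.e. the shard id is the
  // position in the up set.  For a replicated pool both primaries carry
  // shard_id_t::NO_SHARD.  A uint8_t index suffices because shard ids
  // are 8-bit.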
2579 pg_shard_t get_primary() const { return primary; }
2580
2581 int get_role() const { return role; }
2582 void set_role(int r) { role = r; }
2583
2584 bool is_primary() const { return pg_whoami == primary; }
2585 bool is_replica() const { return role > 0; }
2586
2587 epoch_t get_last_peering_reset() const { return last_peering_reset; }
2588
2590 bool state_test(int m) const { return (state & m) != 0; }
2591 void state_set(int m) { state |= m; }
2592 void state_clear(int m) { state &= ~m; }
2593
2594 bool is_complete() const { return info.last_complete == info.last_update; }
2595 bool should_send_notify() const { return send_notify; }
2596
2597 int get_state() const { return state; }
2598 bool is_active() const { return state_test(PG_STATE_ACTIVE); }
2599 bool is_activating() const { return state_test(PG_STATE_ACTIVATING); }
2600 bool is_peering() const { return state_test(PG_STATE_PEERING); }
2601 bool is_down() const { return state_test(PG_STATE_DOWN); }
2602 bool is_recovery_unfound() const { return state_test(PG_STATE_RECOVERY_UNFOUND); }
2603 bool is_backfill_unfound() const { return state_test(PG_STATE_BACKFILL_UNFOUND); }
2604 bool is_incomplete() const { return state_test(PG_STATE_INCOMPLETE); }
2605 bool is_clean() const { return state_test(PG_STATE_CLEAN); }
2606 bool is_degraded() const { return state_test(PG_STATE_DEGRADED); }
2607 bool is_undersized() const { return state_test(PG_STATE_UNDERSIZED); }
2608
2609 bool is_scrubbing() const { return state_test(PG_STATE_SCRUBBING); }
2610 bool is_remapped() const { return state_test(PG_STATE_REMAPPED); }
2611 bool is_peered() const {
2612 return state_test(PG_STATE_ACTIVE) || state_test(PG_STATE_PEERED);
2613 }
2614 bool is_recovering() const { return state_test(PG_STATE_RECOVERING); }
2615
2616 bool is_empty() const { return info.last_update == eversion_t(0,0); }
2617
2618 void init(
2619 int role,
2620 const vector<int>& up,
2621 int up_primary,
2622 const vector<int>& acting,
2623 int acting_primary,
2624 const pg_history_t& history,
2625 const PastIntervals& pim,
2626 bool backfill,
2627 ObjectStore::Transaction *t);
2628
2629 // pg on-disk state
2630 void do_pending_flush();
2631
2632 static void _create(ObjectStore::Transaction& t, spg_t pgid, int bits);
2633 static void _init(ObjectStore::Transaction& t,
2634 spg_t pgid, const pg_pool_t *pool);
2635
2636 private:
2637 void prepare_write_info(map<string,bufferlist> *km);
2638
2639 void update_store_with_options();
2640 void update_store_on_load();
2641
2642 public:
2643 static int _prepare_write_info(
2644 CephContext* cct,
2645 map<string,bufferlist> *km,
2646 epoch_t epoch,
2647 pg_info_t &info,
2648 pg_info_t &last_written_info,
2649 PastIntervals &past_intervals,
2650 bool dirty_big_info,
2651 bool dirty_epoch,
2652 bool try_fast_info,
2653 PerfCounters *logger = nullptr);
2654 void write_if_dirty(ObjectStore::Transaction& t);
2655
2656 PGLog::IndexedLog projected_log;
2657 bool check_in_progress_op(
2658 const osd_reqid_t &r,
2659 eversion_t *version,
2660 version_t *user_version,
2661 int *return_code) const;
2662 eversion_t projected_last_update;
2663 eversion_t get_next_version() const {
2664 eversion_t at_version(
2665 get_osdmap()->get_epoch(),
2666 projected_last_update.version+1);
2667 assert(at_version > info.last_update);
2668 assert(at_version > pg_log.get_head());
2669 assert(at_version > projected_last_update);
2670 return at_version;
2671 }
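  // e.g. with projected_last_update == eversion_t(42, 107) and an OSDMap at
  // epoch 45, this returns eversion_t(45, 108).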
2672
2673 void add_log_entry(const pg_log_entry_t& e, bool applied);
2674 void append_log(
2675 const vector<pg_log_entry_t>& logv,
2676 eversion_t trim_to,
2677 eversion_t roll_forward_to,
2678 ObjectStore::Transaction &t,
2679 bool transaction_applied = true);
2680 bool check_log_for_corruption(ObjectStore *store);
2681
2682 std::string get_corrupt_pg_log_name() const;
2683 static int read_info(
2684 ObjectStore *store, spg_t pgid, const coll_t &coll,
2685 bufferlist &bl, pg_info_t &info, PastIntervals &past_intervals,
2686 __u8 &);
2687 void read_state(ObjectStore *store, bufferlist &bl);
2688 static bool _has_removal_flag(ObjectStore *store, spg_t pgid);
2689 static int peek_map_epoch(ObjectStore *store, spg_t pgid,
2690 epoch_t *pepoch, bufferlist *bl);
2691 void update_snap_map(
2692 const vector<pg_log_entry_t> &log_entries,
2693 ObjectStore::Transaction& t);
2694
2695 void filter_snapc(vector<snapid_t> &snaps);
2696
2697 void log_weirdness();
2698
2699 virtual void kick_snap_trim() = 0;
2700 virtual void snap_trimmer_scrub_complete() = 0;
2701 bool requeue_scrub(bool high_priority = false);
2702 void queue_recovery();
2703 bool queue_scrub();
2704 unsigned get_scrub_priority();
2705
2706 /// share pg info after a pg is active
2707 void share_pg_info();
2708
2709
2710 bool append_log_entries_update_missing(
2711 const mempool::osd_pglog::list<pg_log_entry_t> &entries,
2712 ObjectStore::Transaction &t,
2713 boost::optional<eversion_t> trim_to,
2714 boost::optional<eversion_t> roll_forward_to);
2715
  /**
   * Merge new log entries, updating the missing set as necessary, into the
   * logs and missing sets of all actingbackfill peers (and missing_loc)
   */
2720 void merge_new_log_entries(
2721 const mempool::osd_pglog::list<pg_log_entry_t> &entries,
2722 ObjectStore::Transaction &t,
2723 boost::optional<eversion_t> trim_to,
2724 boost::optional<eversion_t> roll_forward_to);
2725
2726 void reset_interval_flush();
2727 void start_peering_interval(
2728 const OSDMapRef lastmap,
2729 const vector<int>& newup, int up_primary,
2730 const vector<int>& newacting, int acting_primary,
2731 ObjectStore::Transaction *t);
2732 void on_new_interval();
2733 virtual void _on_new_interval() = 0;
2734 void start_flush(ObjectStore::Transaction *t,
2735 list<Context *> *on_applied,
2736 list<Context *> *on_safe);
2737 void set_last_peering_reset();
2738 bool pg_has_reset_since(epoch_t e) {
2739 assert(is_locked());
2740 return deleting || e < get_last_peering_reset();
2741 }
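  // Used by queued callbacks and events to drop work that raced with an
  // interval change, e.g. (sketch):
  //
  //   if (pg->pg_has_reset_since(epoch_started))
  //     return; // a new peering interval began; this work is stale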
2742
2743 void update_history(const pg_history_t& history);
2744 void fulfill_info(pg_shard_t from, const pg_query_t &query,
2745 pair<pg_shard_t, pg_info_t> &notify_info);
2746 void fulfill_log(pg_shard_t from, const pg_query_t &query, epoch_t query_epoch);
2747 void fulfill_query(const MQuery& q, RecoveryCtx *rctx);
2748 void check_full_transition(OSDMapRef lastmap, OSDMapRef osdmap);
2749
2750 bool should_restart_peering(
2751 int newupprimary,
2752 int newactingprimary,
2753 const vector<int>& newup,
2754 const vector<int>& newacting,
2755 OSDMapRef lastmap,
2756 OSDMapRef osdmap);
2757
2758 // OpRequest queueing
2759 bool can_discard_op(OpRequestRef& op);
2760 bool can_discard_scan(OpRequestRef op);
2761 bool can_discard_backfill(OpRequestRef op);
2762 bool can_discard_request(OpRequestRef& op);
2763
2764 template<typename T, int MSGTYPE>
2765 bool can_discard_replica_op(OpRequestRef& op);
2766
2767 bool old_peering_msg(epoch_t reply_epoch, epoch_t query_epoch);
2768 bool old_peering_evt(CephPeeringEvtRef evt) {
2769 return old_peering_msg(evt->get_epoch_sent(), evt->get_epoch_requested());
2770 }
2771 static bool have_same_or_newer_map(epoch_t cur_epoch, epoch_t e) {
2772 return e <= cur_epoch;
2773 }
2774 bool have_same_or_newer_map(epoch_t e) {
2775 return e <= get_osdmap()->get_epoch();
2776 }
2777
2778 bool op_has_sufficient_caps(OpRequestRef& op);
2779
2780
2781 // recovery bits
2782 void take_waiters();
2783 void queue_peering_event(CephPeeringEvtRef evt);
2784 void handle_peering_event(CephPeeringEvtRef evt, RecoveryCtx *rctx);
2785 void queue_query(epoch_t msg_epoch, epoch_t query_epoch,
2786 pg_shard_t from, const pg_query_t& q);
2787 void queue_null(epoch_t msg_epoch, epoch_t query_epoch);
2788 void queue_flushed(epoch_t started_at);
2789 void handle_advance_map(
2790 OSDMapRef osdmap, OSDMapRef lastmap,
2791 vector<int>& newup, int up_primary,
2792 vector<int>& newacting, int acting_primary,
2793 RecoveryCtx *rctx);
2794 void handle_activate_map(RecoveryCtx *rctx);
2795 void handle_create(RecoveryCtx *rctx);
2796 void handle_loaded(RecoveryCtx *rctx);
2797 void handle_query_state(Formatter *f);
2798
2799 virtual void on_removal(ObjectStore::Transaction *t) = 0;
2800
2801
2802 // abstract bits
2803 virtual void do_request(
2804 OpRequestRef& op,
2805 ThreadPool::TPHandle &handle
2806 ) = 0;
2807
2808 virtual void do_op(OpRequestRef& op) = 0;
2809 virtual void do_sub_op(OpRequestRef op) = 0;
2810 virtual void do_sub_op_reply(OpRequestRef op) = 0;
2811 virtual void do_scan(
2812 OpRequestRef op,
2813 ThreadPool::TPHandle &handle
2814 ) = 0;
2815 virtual void do_backfill(OpRequestRef op) = 0;
2816 virtual void snap_trimmer(epoch_t epoch_queued) = 0;
2817
2818 virtual int do_command(
2819 cmdmap_t cmdmap,
2820 ostream& ss,
2821 bufferlist& idata,
2822 bufferlist& odata,
2823 ConnectionRef conn,
2824 ceph_tid_t tid) = 0;
2825
2826 virtual void on_role_change() = 0;
2827 virtual void on_pool_change() = 0;
2828 virtual void on_change(ObjectStore::Transaction *t) = 0;
2829 virtual void on_activate() = 0;
2830 virtual void on_flushed() = 0;
2831 virtual void on_shutdown() = 0;
2832 virtual void check_blacklisted_watchers() = 0;
2833 virtual void get_watchers(std::list<obj_watch_item_t>&) = 0;
2834
2835 virtual bool agent_work(int max) = 0;
2836 virtual bool agent_work(int max, int agent_flush_quota) = 0;
2837 virtual void agent_stop() = 0;
2838 virtual void agent_delay() = 0;
2839 virtual void agent_clear() = 0;
2840 virtual void agent_choose_mode_restart() = 0;
2841 };
2842
2843 ostream& operator<<(ostream& out, const PG& pg);
2844
2845 ostream& operator<<(ostream& out, const PG::BackfillInterval& bi);
2846
2847 #endif